-rw-r--r--  config-generic                        16
-rw-r--r--  config-x86-generic                     6
-rw-r--r--  config-x86_64-generic                  4
-rw-r--r--  kernel.spec                          276
-rw-r--r--  xen-fix-typo-in-xen-irq-fix.patch     13
-rw-r--r--  xen.pvops.patch                    37027
-rw-r--r--  xen.pvops.post.patch                  68
-rw-r--r--  xen.pvops.pre.patch                   69
8 files changed, 37450 insertions, 29 deletions
diff --git a/config-generic b/config-generic
index 1a949eb..87a4b60 100644
--- a/config-generic
+++ b/config-generic
@@ -4059,3 +4059,19 @@ CONFIG_DETECT_HUNG_TASK=y
CONFIG_MEMORY_FAILURE=y
CONFIG_HWPOISON_INJECT=m
+
+# added for xen pvops patch
+#
+CONFIG_XEN_DOM0=y
+CONFIG_XEN_BACKEND=y
+CONFIG_XEN_BLKDEV_BACKEND=m
+CONFIG_XEN_NETDEV_BACKEND=m
+CONFIG_XEN_GNTDEV=m
+CONFIG_XEN_PCI_PASSTHROUGH=y
+CONFIG_XEN_PCIDEV_BACKEND=m
+# CONFIG_XEN_PCIDEV_BE_DEBUG is not set
+CONFIG_XEN_PCIDEV_FRONTEND=m
+CONFIG_XEN_BLKDEV_TAP=m
+CONFIG_XEN_PLATFORM_PCI=m
+CONFIG_NET_SCH_PLUG=m
+CONFIG_XEN_WDT=m
diff --git a/config-x86-generic b/config-x86-generic
index e0f1d1e..24c48d1 100644
--- a/config-x86-generic
+++ b/config-x86-generic
@@ -72,7 +72,7 @@ CONFIG_EDD=m
CONFIG_HIGHMEM4G=y
# CONFIG_HIGHMEM64G is not set
CONFIG_HIGHMEM=y
-CONFIG_HIGHPTE=y
+CONFIG_HIGHPTE=n
# CONFIG_MATH_EMULATION is not set
CONFIG_MTRR=y
@@ -364,8 +364,8 @@ CONFIG_XEN_BALLOON=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_SAVE_RESTORE=y
CONFIG_HVC_XEN=y
-CONFIG_XEN_FBDEV_FRONTEND=y
-CONFIG_XEN_KBDDEV_FRONTEND=y
+CONFIG_XEN_FBDEV_FRONTEND=m
+CONFIG_XEN_KBDDEV_FRONTEND=m
CONFIG_XEN_BLKDEV_FRONTEND=m
CONFIG_XEN_NETDEV_FRONTEND=m
CONFIG_XENFS=m
diff --git a/config-x86_64-generic b/config-x86_64-generic
index 175f57b..1f7b440 100644
--- a/config-x86_64-generic
+++ b/config-x86_64-generic
@@ -292,8 +292,8 @@ CONFIG_XEN_BALLOON=y
CONFIG_XEN_SCRUB_PAGES=y
CONFIG_XEN_SAVE_RESTORE=y
CONFIG_HVC_XEN=y
-CONFIG_XEN_FBDEV_FRONTEND=y
-CONFIG_XEN_KBDDEV_FRONTEND=y
+CONFIG_XEN_FBDEV_FRONTEND=m
+CONFIG_XEN_KBDDEV_FRONTEND=m
CONFIG_XEN_BLKDEV_FRONTEND=m
CONFIG_XEN_NETDEV_FRONTEND=m
CONFIG_XENFS=m
diff --git a/kernel.spec b/kernel.spec
index 94f99a1..4025f32 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -23,6 +23,7 @@ Summary: The Linux kernel
#
# (Uncomment the '#' and the first two spaces below to set buildid.)
# % define buildid .local
+%define buildid .xendom0
###################################################################
# The buildid can also be specified on the rpmbuild command line
@@ -130,7 +131,7 @@ Summary: The Linux kernel
%define doc_build_fail true
%endif
-%define rawhide_skip_docs 0
+%define rawhide_skip_docs 1
%if 0%{?rawhide_skip_docs}
%define with_doc 0
%endif
@@ -150,7 +151,7 @@ Summary: The Linux kernel
# Set debugbuildsenabled to 1 for production (build separate debug kernels)
# and 0 for rawhide (all kernels are debug kernels).
# See also 'make debug' and 'make release'.
-%define debugbuildsenabled 1
+%define debugbuildsenabled 0
# Want to build a vanilla kernel build without any non-upstream patches?
# (well, almost none, we need nonintconfig for build purposes). Default to 0 (off).
@@ -225,6 +226,7 @@ Summary: The Linux kernel
# kernel-PAE is only built on i686.
%ifarch i686
%define with_pae 1
+%define with_up 0
%else
%define with_pae 0
%endif
@@ -407,7 +409,7 @@ Summary: The Linux kernel
# We don't build a kernel on i386; we only do kernel-headers there,
# and we no longer build for 31bit S390. Same for 32bit sparc and arm.
-%define nobuildarches i386 s390 sparc %{arm}
+%define nobuildarches i386 ppc ppc64 ia64 sparc sparc64 390 s390x alpha alphaev56 %{arm}
%ifarch %nobuildarches
%define with_up 0
@@ -441,7 +443,7 @@ Summary: The Linux kernel
# problems with the newer kernel or lack certain things that make
# integration in the distro harder than needed.
#
-%define package_conflicts initscripts < 7.23, udev < 063-6, iptables < 1.3.2-1, ipw2200-firmware < 2.4, iwl4965-firmware < 228.57.2, selinux-policy-targeted < 1.25.3-14, squashfs-tools < 4.0, wireless-tools < 29-3
+%define package_conflicts initscripts < 7.23, udev < 063-6, iptables < 1.3.2-1, ipw2200-firmware < 2.4, iwl4965-firmware < 228.57.2, selinux-policy-targeted < 1.25.3-14, squashfs-tools < 4.0, wireless-tools < 29-3, xen < 3.4.3
#
# The ld.so.conf.d file we install uses syntax older ldconfig's don't grok.
@@ -508,7 +510,7 @@ Version: %{rpmversion}
Release: %{pkg_release}
# DO NOT CHANGE THE 'ExclusiveArch' LINE TO TEMPORARILY EXCLUDE AN ARCHITECTURE BUILD.
# SET %%nobuildarches (ABOVE) INSTEAD
-ExclusiveArch: noarch %{all_x86} x86_64 ppc ppc64 ia64 sparc sparc64 s390x alpha alphaev56 %{arm}
+ExclusiveArch: noarch %{all_x86} x86_64 ia64 sparc sparc64 s390x alpha alphaev56 %{arm}
ExclusiveOS: Linux
%kernel_reqprovconf
@@ -838,8 +840,6 @@ Patch14210: execve-improve-interactivity-with-large-arguments.patch
Patch14211: execve-make-responsive-to-sigkill-with-large-arguments.patch
Patch14212: setup_arg_pages-diagnose-excessive-argument-size.patch
-Patch14220: xen-fix-typo-in-xen-irq-fix.patch
-
# rhbz#447489
Patch14224: skge-quirk-to-4gb-dma.patch
@@ -849,6 +849,10 @@ Patch14225: r8169-fix-dma-allocations.patch
# rhbz#596475
Patch14226: add-support-for-ricoh-e822-sdhci.patch
+Patch19997: xen.pvops.pre.patch
+Patch19998: xen.pvops.patch
+Patch19999: xen.pvops.post.patch
+
# ==============================================================================
%endif
@@ -1553,9 +1557,6 @@ ApplyPatch execve-improve-interactivity-with-large-arguments.patch
ApplyPatch execve-make-responsive-to-sigkill-with-large-arguments.patch
ApplyPatch setup_arg_pages-diagnose-excessive-argument-size.patch
-# Fix typo in Xen patch from 2.6.22 that causes hang on boot.
-ApplyPatch xen-fix-typo-in-xen-irq-fix.patch
-
# rhbz#629158
ApplyPatch r8169-fix-dma-allocations.patch
@@ -1565,6 +1566,10 @@ ApplyPatch skge-quirk-to-4gb-dma.patch
# rhbz#596475
ApplyPatch add-support-for-ricoh-e822-sdhci.patch
+ApplyPatch xen.pvops.pre.patch
+ApplyPatch xen.pvops.patch
+ApplyPatch xen.pvops.post.patch
+
# END OF PATCH APPLICATIONS ====================================================
%endif
@@ -1775,7 +1780,7 @@ hwcap 0 nosegneg"
fi
mkdir -p $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include
cd include
- cp -a acpi config crypto keys linux math-emu media mtd net pcmcia rdma rxrpc scsi sound trace video drm asm-generic $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include
+ cp -a acpi config crypto keys linux math-emu media mtd net pcmcia rdma rxrpc scsi sound trace video drm asm-generic xen $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include
asmdir=$(readlink asm)
cp -a $asmdir $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include/
pushd $RPM_BUILD_ROOT/lib/modules/$KernelVer/build/include
@@ -2222,6 +2227,11 @@ fi
- rhbz629158: r8169-fix-dma-allocations.patch
- rhbz596475: add-support-for-ricoh-e822-sdhci.patch
+* Wed Oct 06 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops including memory and irq fixes
+- Drop xen-fix-typo-in-xen-irq-fix.patch as it is in the pvops patch
+- Build new xen watchdog driver CONFIG_XEN_WDT=m
+
* Mon Sep 27 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.23-170
- Linux 2.6.32.23
- Drop merged patches:
@@ -2244,6 +2254,9 @@ fi
alsa-seq-oss-fix-double-free-at-error-path-of-snd_seq_oss_open.patch
tracing-do-not-allow-llseek-to-set_ftrace_filter.patch
+* Mon Sep 20 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops
+
* Tue Sep 14 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.21-168
- Fix three CVEs:
CVE-2010-3080: /dev/sequencer open failure is not handled correctly
@@ -2262,6 +2275,10 @@ fi
- Backport two fixes from Eric Paris to resolve #598796 which avoids a
capability check if the request comes from the kernel.
+* Fri Sep 03 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops to 2.6.32.21
+- Set new dom0 related option CONFIG_NET_SCH_PLUG=m
+
* Thu Sep 02 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.21-167
- irda-correctly-clean-up-self-ias_obj-on-irda_bind-failure.patch (CVE-2010-2954)
@@ -2329,6 +2346,10 @@ fi
xfs-prevent-swapext-from-operating-on-write-only-files.patch
cifs-fix-dns-resolver.patch
+* Fri Aug 06 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops to 2.6.32.17
+- try removing patch that set CONFIG_XEN_XENBUS_FRONTEND=y
+
* Fri Aug 06 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.17-157
- Fix USB HID initialization (#592785)
@@ -2362,6 +2383,11 @@ fi
- kvm-mmu-fix-conflict-access-permissions-in-direct-sp.patch:
Fix crash in guest Python programs (#610911)
+* Tue Jul 27 2010 Michael Young <m.a.young@durham.ac.uk>
+- remove some obsolete or unnecessary additions to config-generic
+- try building XEN_BLKDEV_BACKEND XEN_NETDEV_BACKEND XEN_GNTDEV
+ as modules and adding XEN_PCIDEV_FRONTEND XEN_PLATFORM_PCI as modules
+
* Mon Jul 26 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.16-152
- usb-obey-the-sysfs-power-wakeup-setting.patch:
Restore ability of USB devices to wake the machine (F13#617559)
@@ -2407,6 +2433,9 @@ fi
* Wed Jul 14 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.16-142
- Drop Intel Moorestown support.
+* Wed Jul 12 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops
+
* Wed Jul 07 2010 Jarod Wilson <jarod@redhat.com> 2.6.32.16-141
- Really make hdpvr i2c IR part register this time, so something can
actually be bound to it (like, say, lirc_zilog)
@@ -2440,6 +2469,10 @@ fi
* Wed Jun 23 2010 Kyle McMartin <kyle@redhat.com> 2.6.32.14-135
- l2tp: fix oops in pppol2tp_xmit (rhbz#607054)
+* Wed Jun 16 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops
+- undo ballooning patch as it is upstream in xen
+
* Tue Jun 15 2010 Kyle McMartin <kyle@redhat.com> 2.6.32.14-134
- Fix btrfs ACL fixes... commit 431547b3c4533b8c7fd150ab36980b9a3147797b
changed them to take a struct dentry instead of struct inode after
@@ -2455,9 +2488,16 @@ fi
- mac80211/iwlwifi fix connections to some APs (rhbz#558002)
patches from sgruszka@.
+* Sat Jun 5 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops (only change is 2.6.32.14 -> 2.6.32.15)
+- try a ballooning patch
+
* Wed Jun 2 2010 John W. Linville <linville@redhat.com>
- iwlwifi: update supported PCI_ID list for 5xx0 series (#599153)
+* Sat May 29 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops
+
* Thu May 27 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.14-127
- CVE-2010-1437: keyrings: find_keyring_by_name() can gain the freed keyring
@@ -2476,6 +2516,9 @@ fi
- iwlwifi: fix scan races (#592011)
- iwlwifi: fix internal scan race (#592011)
+* Tue May 18 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops
+
* Tue May 18 2010 Kyle McMartin <kyle@redhat.com>
- btrfs: check for read permission on src file in the clone ioctl
(rhbz#593226)
@@ -2509,9 +2552,20 @@ fi
* Wed May 12 2010 Roland McGrath <roland@redhat.com> 2.6.32.12-116
- utrace update (#590954)
+* Thu May 6 2010 Michael Young <m.a.young@durham.ac.uk>
+- pvops update which adds XEN_PLATFORM_PCI support
+- current pvops kernel won't build unless CONFIG_XEN_XENBUS_FRONTEND=y
+ if XEN and PM_SLEEP are selected
+- add a patch to ensure this as we can't set CONFIG_XEN_XENBUS_FRONTEND
+ directly
+- build problems with CONFIG_XEN_PLATFORM_PCI=m so unset it again
+
* Fri Apr 30 2010 John W. Linville <linville@redhat.com> 2.6.32.12-115
- Revert "ath9k: fix lockdep warning when unloading module"
+* Wed Apr 28 2010 Michael Young <m.a.young@durham.ac.uk>
+- another pvops update
+
* Tue Apr 27 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.12-114
- libiscsi-regression-fix-header-digest-errors.patch:
fix iscsi header authentication broken in .32 (#583581)
@@ -2576,6 +2630,9 @@ fi
* Mon Apr 12 2010 John W. Linville <linville@redhat.com> 2.6.32.11-102
- patches from Intel to address intermittent firmware failures with iwlagn
+* Sat Apr 10 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops
+
* Tue Apr 07 2010 Chuck Ebbert <cebbert@redhat.com>
- Disable async RAID4/5/6 processing (#575402)
@@ -2615,6 +2672,9 @@ fi
- drm-intel-make-lvds-work.patch: Fix screen not turning back on on lid open
- linux-2.6-usb-wwan-update.patch: Update wwan code and fix qcserial
+* Wed Mar 31 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops (xen/stable-2.6.32.x branch) which introduces PAT support
+
* Tue Mar 30 2010 John W. Linville <linville@redhat.com> 2.6.32.10-94
- Avoid null pointer dereference introduced by 'ssb: check for sprom' (#577463)
@@ -2626,6 +2686,14 @@ fi
* Mon Mar 29 2010 Ben Skeggs <bskeggs@redhat.com> 2.6.32.10-91
- nouveau: fix display issues on Dell D620 laptops
+* Sun Mar 28 2010 Michael Young <m.a.young@durham.ac.uk>
+- try turning STACKPROTECTOR back on
+- add xen includes to -devel package
+
+* Tue Mar 23 2010 Michael Young <m.a.young@durham.ac.uk>
+- update pvops which should hopefully fix ix86
+- Add in a conflicts xen < 3.4.3 rule
+
* Mon Mar 22 2010 Jarod Wilson <jarod@redhat.com> 2.6.32.10-90
- A few more imon driver button additions
- Fix minor init issue w/topseed 0x0008 mceusb transceivers
@@ -2668,6 +2736,10 @@ fi
- Rebase lirc drivers to latest git tree
- Copious amounts of imon driver update
+* Mon Mar 15 2010 Michael Young <m.a.young@durham.ac.uk>
+- another pvops update
+- try pvops with dracut
+
* Mon Mar 15 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.10-75.rc1
- Make the perf package require libdwarf; fix up description (#568309)
@@ -2710,6 +2782,9 @@ fi
* Tue Mar 02 2010 Chuck Ebbert <cebbert@redhat.com>
- Include examples.txt in the perf package (#569506)
+* Mon Mar 01 2010 Michael Young <m.a.young@durham.ac.uk>
+- another pvops update
+
* Mon Mar 01 2010 Dave Jones <davej@redhat.com>
- Don't own /usr/src/kernels any more, it's now owned by filesystem. (#569438)
@@ -2723,6 +2798,9 @@ fi
* Thu Feb 25 2010 Ben Skeggs <bskeggs@redhat.com> 2.6.32.9-65
- nouveau: DP fix for cards with version 0x20 DP tables
+* Wed Feb 24 2010 Michael Young <m.a.young@durham.ac.uk>
+- switch to xen/stable pvops branch
+
* Tue Feb 23 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.9-64
- Linux 2.6.32.9
@@ -2754,6 +2832,10 @@ fi
- ice1712-fix-revo71-mixer-names.patch: fix mixer names for
monty. (rhbz#566565)
+* Wed Feb 17 2010 Michael Young <m.a.young@durham.ac.uk>
+- try another pvops update
+- fix an incorrect git conflict merge in generating pvops patch
+
* Wed Feb 17 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.8-58
- fix-race-in-tty_fasync-properly.patch: fix for deadlock caused
by original patch in 2.6.32.6
@@ -2785,6 +2867,12 @@ fi
- fix-abrtd.patch: backport of nhorman's call_usermode_helper changes
from devel/ & linux-next.
+* Mon Feb 15 2010 Michael Young <m.a.young@durham.ac.uk>
+- more pvops updates and try with STACKPROTECTOR off
+
+* Tue Feb 09 2010 Michael Young <m.a.young@durham.ac.uk>
+- try pvops again with xen/next after some more updates
+
* Tue Feb 09 2010 Kyle McMartin <kyle@redhat.com> 2.6.32.8-49
- Linux 2.6.32.8
- futex-handle-user-space-corruption-gracefully.patch: Fix oops in
@@ -2833,6 +2921,39 @@ fi
* Mon Feb 01 2010 Dave Airlie <airlied@redhat.com> 2.6.32.7-39
- Add two input quirks for HP and eGalax touchscreens.
+* Sat Jan 31 2010 Michael Young <m.a.young@durham.ac.uk>
+- switch pvops to xen/next branch for the moment
+- update to 2.6.32
+- orphaned comments
+-* Wed Dec 23 2009 Michael Young <m.a.young@durham.ac.uk>
+-- update to latest pvops patch
+-* Wed Dec 09 2009 Michael Young <m.a.young@durham.ac.uk>
+-- update to latest pvops patch
+-* Fri Nov 13 2009 Michael Young <m.a.young@durham.ac.uk>
+-- fix typo in drm-edid-retry.patch
+-* Thu Nov 12 2009 Michael Young <m.a.young@durham.ac.uk>
+-- XEN_NETCHANNEL2 depends on XEN_XENBUS_FRONTEND
+-- Disable XEN_PCIDEV_FRONTEND for the moment (compile issues)
+-* Mon Nov 09 2009 Michael Young <m.a.young@durham.ac.uk>
+-- update pvops which adds XEN_NETCHANNEL2 and XEN_PCIDEV_FRONTEND
+-* Fri Oct 16 2009 Michael Young <m.a.young@durham.ac.uk>
+-- update pvops patch to 2.6.31.4
+-- add configuration options for XEN_PCIDEV_BACKEND and XEN_PCIDEV_BE_DEBUG
+-* Sat Oct 10 2009 Michael Young <m.a.young@durham.ac.uk>
+-- update pvops patch
+-- try putting DRM_RADEON and DRM_NOUVEAU back in
+-* Sat Oct 3 2009 Michael Young <m.a.young@durham.ac.uk>
+-- update pvops patch
+-* Sat Sep 26 2009 Michael Young <m.a.young@durham.ac.uk>
+-- disable DRM_RADEON and DRM_NOUVEAU due to build problems
+-* Thu Sep 24 2009 Michael Young <m.a.young@durham.ac.uk>
+-- Try a dri fix in the latest xen/master
+-* Sat Sep 19 2009 Michael Young <m.a.young@durham.ac.uk>
+-- Switch pvops from rebase/master to xen/master branch
+-* Tue Sep 15 2009 Michael Young <m.a.young@durham.ac.uk>
+-- switch to the F-12 branch for the moment
+-- try an NX related fix
+
* Sat Jan 30 2010 Chuck Ebbert <cebbert@redhat.com> 2.6.32.7-38
- Fix possible oops in bio-integrity code.
@@ -3343,6 +3464,9 @@ fi
- linux-2.6-rtc-show-hctosys.patch: Export the hctosys state of an rtc
- linux-2.6-rfkill-all.patch: Support for keys that toggle all rfkill state
+* Thu Sep 10 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops and get to 2.6.31
+
* Thu Sep 10 2009 Ben Skeggs <bskeggs@redhat.com>
- drm-nouveau.patch: add some scaler-only modes for LVDS, GEM/TTM fixes
@@ -3416,6 +3540,11 @@ fi
* Sat Sep 05 2009 Chuck Ebbert <cebbert@redhat.com> 2.6.31-0.204.rc9
- 2.6.31-rc9
+* Sat Sep 04 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops which includes swiotlb updates and a network fix
+- try a drm build fix
+- therefore re-enable CONFIG_DRM_NOUVEAU and CONFIG_DRM_RADEON_KMS options
+
* Fri Sep 04 2009 Chuck Ebbert <cebbert@redhat.com> 2.6.31-0.203.rc8.git2
- Fix kernel build errors when building firmware by removing the
.config file before that step and restoring it afterward.
@@ -3427,6 +3556,10 @@ fi
* Thu Sep 03 2009 Jarod Wilson <jarod@redhat.com>
- Update hdpvr and lirc_zilog drivers for 2.6.31 i2c
+* Thu Sep 03 2009 Michael Young <m.a.young@durham.ac.uk>
+- Update pvops patch to try stack protector on i686 again
+- disable linux-2.6-xen-stack-protector-fix.patch as we already have it
+
* Thu Sep 03 2009 Justin M.Forbes <jforbes@redhat.com>
- Fix xen guest with stack protector. (#508120)
- Small kvm fixes.
@@ -3493,6 +3626,9 @@ fi
- Fix munlock with KSM (#516909)
- Re-enable KSM
+* Wed Aug 26 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops again.
+
* Wed Aug 26 2009 Chuck Ebbert <cebbert@redhat.com>
- 2.6.31-rc7-git4
- Drop patches merged upstream:
@@ -3521,12 +3657,18 @@ fi
* Mon Aug 24 2009 Chuck Ebbert <cebbert@redhat.com>
- 2.6.31-rc7-git2
+* Mon Aug 24 2009 Michael Young <m.a.young@durham.ac.uk>
+- turn off stackprotector on i686 for a working build
+
* Mon Aug 24 2009 Chuck Ebbert <cebbert@redhat.com>
- 2.6.31-rc7-git1
* Sat Aug 22 2009 Chuck Ebbert <cebbert@redhat.com>
- 2.6.31-rc7
+* Thu Aug 20 2009 Michael Young <m.a.young@durham.ac.uk>
+- test an i686 stackprotector patch
+
* Thu Aug 20 2009 Mark McLoughlin <markmc@redhat.com>
- Disable LZMA for xen (#515831)
@@ -3535,6 +3677,9 @@ fi
- Fix up drm-r600-kms.patch
- Drop fix-perf-make-man-failure.patch
+* Wed Aug 19 2009 Michael Young <m.a.young@durham.ac.uk>
+- update rebase/master to test i686 stackprotector issue
+
* Wed Aug 19 2009 Chuck Ebbert <cebbert@redhat.com>
- 2.6.31-rc6-git5
- Revert linux-2.6-debug-vm-would-have-oomkilled.patch to v1.2
@@ -3545,6 +3690,13 @@ fi
- with_docs disables perf docs too. be warned. (logic is that the
build deps are (mostly) the same, so if you don't want one, odds are...)
+* Tue Aug 18 2009 Michael Young <m.a.young@durham.ac.uk>
+- another rebase/master update
+ - try upstream STACKPROTECTOR fixes
+ - MCE/MCA support
+- remove grubby dependency to make it more F11 friendly, it isn't needed
+ until dracut is re-enabled
+
* Tue Aug 18 2009 Dave Jones <davej@redhat.com>
- 2.6.31-rc6-git3
@@ -3564,9 +3716,16 @@ fi
* Sat Aug 15 2009 Dave Jones <davej@redhat.com> 2.6.31-0.157.rc6
- Disable KSM patches on a hunch. Chasing the "encrypted VGs don't work" bug.
+* Fri Aug 14 2009 Michael Young <m.a.young@durham.ac.uk>
+- another rebase/master update
+- make perf a Source file so the kernel builds
+
* Fri Aug 14 2009 Dave Jones <davej@redhat.com> 2.6.31-0.155.rc6
- 2.6.31-rc6
+* Thu Aug 13 2009 Michael Young <m.a.young@durham.ac.uk>
+- add another rebase/master update
+
* Wed Aug 12 2009 Kyle McMartin <kyle@redhat.com>
- fix perf.
- move perf to perf.$ver instead of perf-$ver...
@@ -3589,9 +3748,19 @@ fi
* Tue Aug 11 2009 Eric Paris <eparis@redhat.com>
- Enable config IMA
+* Tue Aug 11 2009 Michael Young <m.a.young@durham.ac.uk>
+- CONFIG_CC_STACKPROTECTOR was probably innocent but leave it off
+ a bit longer just in case
+- add rebase/master update
+
* Tue Aug 11 2009 Ben Skeggs <bskeggs@redhat.com>
- nouveau: various cleanups and fixes + more sanity checking in dma paths
+* Mon Aug 10 2009 Michael Young <m.a.young@durham.ac.uk>
+- disable CONFIG_CC_STACKPROTECTOR for x86_64 again (the workaround
+ needs revising)
+- disable dracut until the issue with livecd is fixed
+
* Mon Aug 10 2009 Jarod Wilson <jarod@redhat.com>
- Add new device ID to lirc_mceusb (#512483)
- Fix some lockdep false positives
@@ -3612,6 +3781,10 @@ fi
- set max cpus to 256 on sparc64
- enable AT keyboard on sparc64
+* Sat Aug 08 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops patch to latest rebase/master and current rawhide
+- tell kernel.spec not to build non-PAE kernel for i686
+
* Fri Aug 07 2009 Justin M. Forbes <jforbes@redhat.com>
- Apply KSM updates from upstream
@@ -3827,6 +4000,9 @@ fi
- linux-2.6-vga-arb.patch - add VGA arbiter.
- drm-vga-arb.patch - add VGA arbiter support to drm
+* Wed Jul 15 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops patch x2
+
* Tue Jul 14 2009 Kyle McMartin <kyle@redhat.com> 2.6.31-0.68-rc3
- 2.6.31-rc3
- config changes:
@@ -3851,6 +4027,9 @@ fi
* Fri Jul 10 2009 Dave Jones <davej@redhat.com>
- 2.6.31-rc2-git5
+* Thu Jul 09 2009 Michael Young <m.a.young@durham.ac.uk>
+- disable CONFIG_KERNEL_LZMA as xen doesn't like it
+
* Thu Jul 09 2009 Dave Jones <davej@redhat.com> 2.6.31-0.62.rc2.git4
- Use correct spinlock initialization in dma-debug
@@ -3862,6 +4041,9 @@ fi
- Trim the changelog, axing everything before 2.6.29 (see cvs
if you still really want to see that far back)
+* Thu Jul 09 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops and see if CONFIG_KERNEL_LZMA=y is compatible with xen
+
* Wed Jul 08 2009 Dave Jones <davej@redhat.com>
- Enable a bunch of debugging options that were missed somehow.
@@ -3922,6 +4104,10 @@ fi
* Tue Jun 30 2009 Dave Jones <davej@redhat.com> 2.6.31-0.37.rc1.git5
- Disable kmemleak. Way too noisy, and not finding any real bugs.
+* Tue Jun 30 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops from xen/rebase/master branch which should return disk
+ and network support
+
* Tue Jun 30 2009 Ben Skeggs <bskeggs@redhat.com>
- drm-nouveau.patch: match upstream
@@ -3932,6 +4118,13 @@ fi
* Mon Jun 29 2009 Chuck Ebbert <cebbert@redhat.com>
- Try to fix the dm overlay bug for real (#505121)
+* Sat Jun 27 2009 Michael Young <m.a.young@durham.ac.uk>
+- switch pvops to xen/rebase/master branch
+- rebase pvops on 2.6.31-rc1-git2
+- drivers/gpu/drm/ttm/ttm_agp_backend.c doesn't like
+ include/linux/swiotlb.h so disable the options CONFIG_DRM_RADEON_KMS
+ and CONFIG_DRM_NOUVEAU that use it.
+
* Sat Jun 27 2009 Ben Skeggs <bskeggs@redhat.com> 2.6.31-0.33.rc1.git2
- drm-nouveau.patch: fix conflicts from 2.6.31-rc1-git2
@@ -4164,6 +4357,9 @@ fi
* Mon Jun 15 2009 Jarod Wilson <jarod@redhat.com>
- Update lirc patches w/new imon hotness
+* Sat Jun 13 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops to 2.6.30
+
* Fri Jun 12 2009 Chuck Ebbert <cebbert@redhat.com>
- Update VIA temp sensor and mmc drivers.
@@ -4192,6 +4388,9 @@ fi
* Fri Jun 05 2009 Chuck Ebbert <cebbert@redhat.com>
- Linux 2.6.30-rc8-git1
+* Thu Jun 04 2009 Michael Young <m.a.young@durham.ac.uk>
+- pvops update to 2.6.30-rc8
+
* Wed Jun 03 2009 Kyle McMartin <kyle@redhat.com>
- Linux 2.6.30-rc8
@@ -4229,6 +4428,14 @@ fi
* Sat May 23 2009 Dave Jones <davej@redhat.com>
- 2.6.30-rc7
+* Fri May 22 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops patch to latest xen-tip/next version
+- pull in patch for !PERF_COUNTERS build failure
+- That still doesn't work so enable PERF_COUNTERS for the moment
+
+* Thu May 21 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops patch to latest xen-tip/next version
+
* Thu May 21 2009 Dave Jones <davej@redhat.com>
- 2.6.30-rc6-git6
@@ -4256,6 +4463,12 @@ fi
* Fri May 08 2009 Kyle McMartin <kyle@redhat.com>
- Linux 2.6.30-rc4-git4
+* Thu May 07 2009 Michael Young <m.a.young@durham.ac.uk>
+- i686 CONFIG_CC_STACKPROTECTOR is still broken so disable it
+
+* Wed May 06 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops patch from xen-tip/master - hopefully i686 will work again
+
* Wed May 06 2009 Kyle McMartin <kyle@redhat.com>
- Linux 2.6.30-rc4-git3
- linux-2.6-cdrom-door-status.patch: merged upstream.
@@ -4263,6 +4476,12 @@ fi
- linux-2.6-utrace.patch: rebase against changes to fs/proc/array.c
- USB_NET_CDC_EEM=m
+* Sat May 02 2009 Michael Young <m.a.young@durham.ac.uk>
+- update pvops patch from xen-tip/master
+- Try enabling CONFIG_XEN_GNTDEV and CONFIG_XEN_PCI_PASSTHROUGH
+- test a patch to allow CONFIG_CC_STACKPROTECTOR to be enabled
+- patch to allow kernel to build with XEN_PCI_PASSTHROUGH and current config
+
* Fri May 01 2009 Eric Sandeen <sandeen@redhat.com>
- Fix ext4 corruption on partial write into prealloc block
@@ -4294,6 +4513,18 @@ fi
* Sun Apr 26 2009 Chuck Ebbert <cebbert@redhat.com> 2.6.30-0.68.rc3.git1
- Linux 2.6.30-rc3-git1
+* Fri Apr 24 2009 Michael Young <m.a.young@durham.ac.uk>
+- switch back to devel kernel branch
+- switch pvops to xen-tip/next branch
+- remove added config options now in main fedora configuration
+- add in new config options required by new pvops patch
+- relocate the below comment stranded by the devel switch
+-* Tue Apr 14 2009 Michael Young <m.a.young@durham.ac.uk>
+-- follow the 2.6.29.1 stable branch for the moment
+-- reset a few config settings to Fedora defaults
+-- drop the squashfs 3 patches as we can't build both
+-- docs won't build, so don't build them
+
* Wed Apr 22 2009 Dave Jones <davej@redhat.com> 2.6.30-0.67.rc3
- Disable SYSFS_DEPRECATED on ia64
@@ -4453,6 +4684,10 @@ fi
* Mon Mar 30 2009 Dave Jones <davej@redhat.com>
- Add a strict-devmem=0 boot argument (#492803)
+* Mon Mar 30 2009 Michael Young <m.a.young@durham.ac.uk>
+- pvops update and merging of patches
+- disable squashfs 3 again since we can't build both versions at the same time
+
* Mon Mar 30 2009 Adam Jackson <ajax@redhat.com>
- linux-2.6.29-pat-fixes.patch: Fix PAT/GTT interaction
@@ -4474,6 +4709,18 @@ fi
* Sun Mar 29 2009 Chuck Ebbert <cebbert@redhat.com>
- More fixes for ALSA hardware pointer updating.
+* Sun Mar 29 2009 Michael Young <m.a.young@durham.ac.uk>
+- try push2/xen/dom0/master branch rather than xen/dom0/hackery as it should
+ be closer to the proposed 2.6.30 merge
+- add squashfs 3 support from F-10 to make kernel more friendly to fc10
+- Set CONFIG_HIGHPTE=n in config-x86-generic to avoid eventual crash problem
+- comment out linux-2.6-net-fix-gro-bug.patch which is in push2/xen/dom0/master
+
+* Sun Mar 29 2009 Michael Young <m.a.young@durham.ac.uk>
+- drop dropwatch patch due to compile problems
+- revert pvops patches bd4a7874716d1b1f69cacfef4adf9f94050ecd82 and
+ cfb667260eb7f6dd26ceb6d49da818978396757d to get the kernel to boot
+
* Sat Mar 28 2009 Mauro Carvalho Chehab <mchehab@redhat.com>
- linux-2.6-revert-dvb-net-kabi-change.patch: attempt to fix dvb net breakage
- update v4l fixes patch to reflect what's ready for 2.6.30
@@ -4516,6 +4763,10 @@ fi
* Wed Mar 25 2009 Mauro Carvalho Chehab <mchehab@redhat.com>
- remove duplicated Cinergy T2 entry at config-generic
+* Wed Mar 25 2009 Michael Young <m.a.young@durham.ac.uk>
+- disable linux-2.6-utrace-ftrace.patch due to merge problems
+- minor pvops update
+
* Wed Mar 25 2009 Neil Horman <nhorman@redhat.com>
- Add dropmonitor/dropwatch protocol from 2.6.30
@@ -4530,6 +4781,9 @@ fi
* Tue Mar 24 2009 Kyle McMartin <kyle@redhat.com>
- Disable DMAR by default until suspend & resume is fixed.
+* Tue Mar 24 2009 Michael Young <m.a.young@durham.ac.uk>
+- Update pvops patch and fix package numbering for 2.6.29
+
* Tue Mar 24 2009 Josef Bacik <josef@toxicpanda.com>
- fsync replay fixes for btrfs
diff --git a/xen-fix-typo-in-xen-irq-fix.patch b/xen-fix-typo-in-xen-irq-fix.patch
deleted file mode 100644
index 3a9fb62..0000000
--- a/xen-fix-typo-in-xen-irq-fix.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-Fix typo in Xen patch from 2.6.35.5
-
---- linux-2.6.35.noarch.orig/drivers/xen/events.c
-+++ linux-2.6.35.noarch/drivers/xen/events.c
-@@ -935,7 +935,7 @@ static struct irq_chip xen_dynamic_chip
- .retrigger = retrigger_dynirq,
- };
-
--static struct irq_chip en_percpu_chip __read_mostly = {
-+static struct irq_chip xen_percpu_chip __read_mostly = {
- .name = "xen-percpu",
-
- .disable = disable_dynirq,
diff --git a/xen.pvops.patch b/xen.pvops.patch
new file mode 100644
index 0000000..62e411a
--- /dev/null
+++ b/xen.pvops.patch
@@ -0,0 +1,37027 @@
+diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
+index 5f6aa11..9ec8558 100644
+--- a/Documentation/kernel-parameters.txt
++++ b/Documentation/kernel-parameters.txt
+@@ -113,6 +113,7 @@ parameter is applicable:
+ More X86-64 boot options can be found in
+ Documentation/x86/x86_64/boot-options.txt .
+ X86 Either 32bit or 64bit x86 (same as X86-32+X86-64)
++ XEN Xen support is enabled
+
+ In addition, the following text indicates that the option:
+
+@@ -2760,6 +2761,18 @@ and is between 256 and 4096 characters. It is defined in the file
+ xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
+ xd_geo= See header of drivers/block/xd.c.
+
++ xen_emul_unplug= [HW,X86,XEN]
++ Unplug Xen emulated devices
++ Format: [unplug0,][unplug1]
++ ide-disks -- unplug primary master IDE devices
++ aux-ide-disks -- unplug non-primary-master IDE devices
++ nics -- unplug network devices
++ all -- unplug all emulated devices (NICs and IDE disks)
++ unnecessary -- unplugging emulated devices is
++ unnecessary even if the host did not respond to
++ the unplug protocol
++ never -- do not unplug even if version check succeeds
++
+ xirc2ps_cs= [NET,PCMCIA]
+ Format:
+ <irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
+diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
+index 29a6ff8..81f9b94 100644
+--- a/Documentation/x86/x86_64/boot-options.txt
++++ b/Documentation/x86/x86_64/boot-options.txt
+@@ -267,10 +267,14 @@ IOMMU (input/output memory management unit)
+
+ iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
+ implementation:
+- swiotlb=<pages>[,force]
++ swiotlb=[npages=<pages>]
++ swiotlb=[force]
++ swiotlb=[overflow=<size>]
++
+ <pages> Prereserve that many 128K pages for the software IO
+ bounce buffering.
+ force Force all IO through the software TLB.
++ <size> Size in bytes of the overflow buffer.
+
+ Settings for the IBM Calgary hardware IOMMU currently found in IBM
+ pSeries and xSeries machines:
+diff --git a/Makefile b/Makefile
+index 6a29b82..83813cc 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 32
+-EXTRAVERSION = .23
++EXTRAVERSION = .24
+ NAME = Man-Eating Seals of Antiquity
+
+ # *DOCUMENTATION*
+diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h
+index 8d3c79c..7d09a09 100644
+--- a/arch/ia64/include/asm/dma-mapping.h
++++ b/arch/ia64/include/asm/dma-mapping.h
+@@ -73,7 +73,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+ if (!dev->dma_mask)
+ return 0;
+
+- return addr + size <= *dev->dma_mask;
++ return addr + size - 1 <= *dev->dma_mask;
+ }
+
+ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+diff --git a/arch/ia64/include/asm/swiotlb.h b/arch/ia64/include/asm/swiotlb.h
+index dcbaea7..f0acde6 100644
+--- a/arch/ia64/include/asm/swiotlb.h
++++ b/arch/ia64/include/asm/swiotlb.h
+@@ -4,8 +4,6 @@
+ #include <linux/dma-mapping.h>
+ #include <linux/swiotlb.h>
+
+-extern int swiotlb_force;
+-
+ #ifdef CONFIG_SWIOTLB
+ extern int swiotlb;
+ extern void pci_swiotlb_init(void);
+diff --git a/arch/ia64/include/asm/xen/events.h b/arch/ia64/include/asm/xen/events.h
+index b8370c8..baa74c8 100644
+--- a/arch/ia64/include/asm/xen/events.h
++++ b/arch/ia64/include/asm/xen/events.h
+@@ -36,10 +36,6 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
+ return !(ia64_psr(regs)->i);
+ }
+
+-static inline void handle_irq(int irq, struct pt_regs *regs)
+-{
+- __do_IRQ(irq);
+-}
+ #define irq_ctx_init(cpu) do { } while (0)
+
+ #endif /* _ASM_IA64_XEN_EVENTS_H */
+diff --git a/arch/ia64/kernel/pci-swiotlb.c b/arch/ia64/kernel/pci-swiotlb.c
+index 285aae8..53292ab 100644
+--- a/arch/ia64/kernel/pci-swiotlb.c
++++ b/arch/ia64/kernel/pci-swiotlb.c
+@@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = {
+ void __init swiotlb_dma_init(void)
+ {
+ dma_ops = &swiotlb_dma_ops;
+- swiotlb_init();
++ swiotlb_init(1);
+ }
+
+ void __init pci_swiotlb_init(void)
+@@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void)
+ swiotlb = 1;
+ printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
+ machvec_init("dig");
+- swiotlb_init();
++ swiotlb_init(1);
+ dma_ops = &swiotlb_dma_ops;
+ #else
+ panic("Unable to find Intel IOMMU");
+diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
+index e281dae..80a973b 100644
+--- a/arch/powerpc/include/asm/dma-mapping.h
++++ b/arch/powerpc/include/asm/dma-mapping.h
+@@ -197,7 +197,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+ if (!dev->dma_mask)
+ return 0;
+
+- return addr + size <= *dev->dma_mask;
++ return addr + size - 1 <= *dev->dma_mask;
+ }
+
+ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
+index 53bcf3d..b152de3 100644
+--- a/arch/powerpc/kernel/setup_32.c
++++ b/arch/powerpc/kernel/setup_32.c
+@@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p)
+
+ #ifdef CONFIG_SWIOTLB
+ if (ppc_swiotlb_enable)
+- swiotlb_init();
++ swiotlb_init(1);
+ #endif
+
+ paging_init();
+diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
+index 04f638d..df2c9e9 100644
+--- a/arch/powerpc/kernel/setup_64.c
++++ b/arch/powerpc/kernel/setup_64.c
+@@ -550,7 +550,7 @@ void __init setup_arch(char **cmdline_p)
+
+ #ifdef CONFIG_SWIOTLB
+ if (ppc_swiotlb_enable)
+- swiotlb_init();
++ swiotlb_init(1);
+ #endif
+
+ paging_init();
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index cb5a57c..a3b7475 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -1885,6 +1885,10 @@ config PCI_OLPC
+ def_bool y
+ depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
+
++config PCI_XEN
++ bool
++ select SWIOTLB
++
+ config PCI_DOMAINS
+ def_bool y
+ depends on PCI
+diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
+index 18aa3f8..4413ba4 100644
+--- a/arch/x86/include/asm/amd_iommu.h
++++ b/arch/x86/include/asm/amd_iommu.h
+@@ -23,20 +23,16 @@
+ #include <linux/irqreturn.h>
+
+ #ifdef CONFIG_AMD_IOMMU
+-extern int amd_iommu_init(void);
+ extern int amd_iommu_init_dma_ops(void);
+ extern int amd_iommu_init_passthrough(void);
+ extern void amd_iommu_detect(void);
+ extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+ extern void amd_iommu_flush_all_domains(void);
+ extern void amd_iommu_flush_all_devices(void);
+-extern void amd_iommu_shutdown(void);
+ extern void amd_iommu_apply_erratum_63(u16 devid);
+ extern void amd_iommu_init_api(void);
+ #else
+-static inline int amd_iommu_init(void) { return -ENODEV; }
+ static inline void amd_iommu_detect(void) { }
+-static inline void amd_iommu_shutdown(void) { }
+ #endif
+
+ #endif /* _ASM_X86_AMD_IOMMU_H */
+diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
+index b03bedb..0918654 100644
+--- a/arch/x86/include/asm/calgary.h
++++ b/arch/x86/include/asm/calgary.h
+@@ -62,10 +62,8 @@ struct cal_chipset_ops {
+ extern int use_calgary;
+
+ #ifdef CONFIG_CALGARY_IOMMU
+-extern int calgary_iommu_init(void);
+ extern void detect_calgary(void);
+ #else
+-static inline int calgary_iommu_init(void) { return 1; }
+ static inline void detect_calgary(void) { return; }
+ #endif
+
+diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
+index 6a25d5d..ac91eed 100644
+--- a/arch/x86/include/asm/dma-mapping.h
++++ b/arch/x86/include/asm/dma-mapping.h
+@@ -20,7 +20,8 @@
+ # define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
+ #endif
+
+-extern dma_addr_t bad_dma_address;
++#define DMA_ERROR_CODE 0
++
+ extern int iommu_merge;
+ extern struct device x86_dma_fallback_dev;
+ extern int panic_on_overflow;
+@@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+ if (ops->mapping_error)
+ return ops->mapping_error(dev, dma_addr);
+
+- return (dma_addr == bad_dma_address);
++ return (dma_addr == DMA_ERROR_CODE);
+ }
+
+ #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
+@@ -66,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
+ if (!dev->dma_mask)
+ return 0;
+
+- return addr + size <= *dev->dma_mask;
++ return addr + size - 1 <= *dev->dma_mask;
+ }
+
+ static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
+index 40b4e61..fa3fd43 100644
+--- a/arch/x86/include/asm/e820.h
++++ b/arch/x86/include/asm/e820.h
+@@ -109,6 +109,8 @@ extern void reserve_early(u64 start, u64 end, char *name);
+ extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
+ extern void free_early(u64 start, u64 end);
+ extern void early_res_to_bootmem(u64 start, u64 end);
++extern u64 early_res_next_free(u64 start);
++extern u64 early_res_next_reserved(u64 addr, u64 max);
+ extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+
+ extern unsigned long e820_end_of_ram_pfn(void);
+diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
+index 6cfdafa..4ac5b0f 100644
+--- a/arch/x86/include/asm/gart.h
++++ b/arch/x86/include/asm/gart.h
+@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
+ extern int gart_iommu_aperture_disabled;
+
+ extern void early_gart_iommu_check(void);
+-extern void gart_iommu_init(void);
+-extern void gart_iommu_shutdown(void);
++extern int gart_iommu_init(void);
+ extern void __init gart_parse_options(char *);
+ extern void gart_iommu_hole_init(void);
+
+@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
+ static inline void early_gart_iommu_check(void)
+ {
+ }
+-static inline void gart_iommu_init(void)
+-{
+-}
+-static inline void gart_iommu_shutdown(void)
+-{
+-}
+ static inline void gart_parse_options(char *options)
+ {
+ }
+diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
+index 3251e23..fa152cb 100644
+--- a/arch/x86/include/asm/hpet.h
++++ b/arch/x86/include/asm/hpet.h
+@@ -68,6 +68,7 @@ extern unsigned long force_hpet_address;
+ extern int hpet_force_user;
+ extern u8 hpet_msi_disable;
+ extern int is_hpet_enabled(void);
++extern int disable_hpet(char *);
+ extern int hpet_enable(void);
+ extern void hpet_disable(void);
+ extern unsigned long hpet_readl(unsigned long a);
+@@ -108,6 +109,7 @@ extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
+ #else /* CONFIG_HPET_TIMER */
+
+ static inline int hpet_enable(void) { return 0; }
++static inline int disable_hpet(char *s) { return 0; }
+ static inline int is_hpet_enabled(void) { return 0; }
+ #define hpet_readl(a) 0
+
+diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
+index 439a9ac..bf88684 100644
+--- a/arch/x86/include/asm/hugetlb.h
++++ b/arch/x86/include/asm/hugetlb.h
+@@ -36,16 +36,28 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+ free_pgd_range(tlb, addr, end, floor, ceiling);
+ }
+
++static inline pte_t huge_ptep_get(pte_t *ptep)
++{
++ return *ptep;
++}
++
+ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+ {
+- set_pte_at(mm, addr, ptep, pte);
++#if PAGETABLE_LEVELS >= 3
++ set_pmd((pmd_t *)ptep, native_make_pmd(native_pte_val(pte)));
++#else
++ set_pgd((pgd_t *)ptep, native_make_pgd(native_pte_val(pte)));
++#endif
+ }
+
+ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+ {
+- return ptep_get_and_clear(mm, addr, ptep);
++ pte_t pte = huge_ptep_get(ptep);
++
++ set_huge_pte_at(mm, addr, ptep, __pte(0));
++ return pte;
+ }
+
+ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
+@@ -66,19 +78,25 @@ static inline pte_t huge_pte_wrprotect(pte_t pte)
+ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+ {
+- ptep_set_wrprotect(mm, addr, ptep);
++ pte_t pte = huge_ptep_get(ptep);
++
++ pte = pte_wrprotect(pte);
++ set_huge_pte_at(mm, addr, ptep, pte);
+ }
+
+ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+ {
+- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+-}
++ pte_t oldpte = huge_ptep_get(ptep);
++ int changed = !pte_same(oldpte, pte);
+
+-static inline pte_t huge_ptep_get(pte_t *ptep)
+-{
+- return *ptep;
++ if (changed && dirty) {
++ set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
++ flush_tlb_page(vma, addr);
++ }
++
++ return changed;
+ }
+
+ static inline int arch_prepare_hugepage(struct page *page)
+diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
+index 7373932..49ee1a9 100644
+--- a/arch/x86/include/asm/io.h
++++ b/arch/x86/include/asm/io.h
+@@ -7,6 +7,10 @@
+ #include <asm-generic/int-ll64.h>
+ #include <asm/page.h>
+
++#include <xen/xen.h>
++
++extern int isapnp_disable;
++
+ #define build_mmio_read(name, size, type, reg, barrier) \
+ static inline type name(const volatile void __iomem *addr) \
+ { type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+@@ -199,6 +203,17 @@ extern void __iomem *early_memremap(resource_size_t phys_addr,
+ unsigned long size);
+ extern void early_iounmap(void __iomem *addr, unsigned long size);
+
++#ifdef CONFIG_XEN
++struct bio_vec;
++
++extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
++ const struct bio_vec *vec2);
++
++#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
++ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
++ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
++#endif /* CONFIG_XEN */
++
+ #define IO_SPACE_LIMIT 0xffff
+
+ #endif /* _ASM_X86_IO_H */
+diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
+index 5f61f6e..b852da9 100644
+--- a/arch/x86/include/asm/io_apic.h
++++ b/arch/x86/include/asm/io_apic.h
+@@ -172,6 +172,7 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+ extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
+
+ extern void probe_nr_irqs_gsi(void);
++extern int get_nr_irqs_gsi(void);
+
+ extern int setup_ioapic_entry(int apic, int irq,
+ struct IO_APIC_route_entry *entry,
+@@ -201,4 +202,6 @@ static inline void probe_nr_irqs_gsi(void) { }
+
+ #endif
+
++void xen_io_apic_init(void);
++
+ #endif /* _ASM_X86_IO_APIC_H */
+diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
+index fd6d21b..345c99c 100644
+--- a/arch/x86/include/asm/iommu.h
++++ b/arch/x86/include/asm/iommu.h
+@@ -1,8 +1,6 @@
+ #ifndef _ASM_X86_IOMMU_H
+ #define _ASM_X86_IOMMU_H
+
+-extern void pci_iommu_shutdown(void);
+-extern void no_iommu_init(void);
+ extern struct dma_map_ops nommu_dma_ops;
+ extern int force_iommu, no_iommu;
+ extern int iommu_detected;
+diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
+index 6e90a04..ba4dc7b 100644
+--- a/arch/x86/include/asm/irq_vectors.h
++++ b/arch/x86/include/asm/irq_vectors.h
+@@ -120,6 +120,12 @@
+ */
+ #define MCE_SELF_VECTOR 0xeb
+
++#ifdef CONFIG_XEN
++/* Xen vector callback to receive events in a HVM domain */
++#define XEN_HVM_EVTCHN_CALLBACK 0xe9
++#endif
++
++
+ /*
+ * First APIC vector available to drivers: (vectors 0x30-0xee) we
+ * start at 0x31(0x41) to spread out vectors evenly between priority
+@@ -157,6 +163,14 @@ static inline int invalid_vm86_irq(int irq)
+ #define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
+ #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
+
++#ifndef __ASSEMBLY__
++# if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SPARSE_IRQ)
++extern int nr_dynamic_irqs;
++# else
++# define NR_DYNAMIC_IRQS 256
++# endif
++#endif
++
+ #ifdef CONFIG_X86_IO_APIC
+ # ifdef CONFIG_SPARSE_IRQ
+ # define NR_IRQS \
+@@ -165,13 +179,13 @@ static inline int invalid_vm86_irq(int irq)
+ (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
+ # else
+ # if NR_CPUS < MAX_IO_APICS
+-# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT)
++# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) + NR_DYNAMIC_IRQS
+ # else
+-# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT)
++# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) + NR_DYNAMIC_IRQS
+ # endif
+ # endif
+ #else /* !CONFIG_X86_IO_APIC: */
+-# define NR_IRQS NR_IRQS_LEGACY
++# define NR_IRQS NR_IRQS_LEGACY + NR_DYNAMIC_IRQS
+ #endif
+
+ #endif /* _ASM_X86_IRQ_VECTORS_H */
+diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
+index ef51b50..e15fca1 100644
+--- a/arch/x86/include/asm/microcode.h
++++ b/arch/x86/include/asm/microcode.h
+@@ -55,4 +55,13 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
+ }
+ #endif
+
++#ifdef CONFIG_MICROCODE_XEN
++extern struct microcode_ops * __init init_xen_microcode(void);
++#else
++static inline struct microcode_ops * __init init_xen_microcode(void)
++{
++ return NULL;
++}
++#endif
++
+ #endif /* _ASM_X86_MICROCODE_H */
+diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
+index 80a1dee..67eaa91 100644
+--- a/arch/x86/include/asm/mmu.h
++++ b/arch/x86/include/asm/mmu.h
+@@ -13,6 +13,9 @@ typedef struct {
+ int size;
+ struct mutex lock;
+ void *vdso;
++#ifdef CONFIG_XEN
++ int has_foreign_mappings;
++#endif
+ } mm_context_t;
+
+ #ifdef CONFIG_SMP
+diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
+index efb3899..e571db4 100644
+--- a/arch/x86/include/asm/paravirt.h
++++ b/arch/x86/include/asm/paravirt.h
+@@ -330,11 +330,18 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
+ {
+ PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
+ }
++
+ static inline void set_iopl_mask(unsigned mask)
+ {
+ PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
+ }
+
++static inline void set_io_bitmap(struct thread_struct *thread,
++ unsigned long bytes_updated)
++{
++ PVOP_VCALL2(pv_cpu_ops.set_io_bitmap, thread, bytes_updated);
++}
++
+ /* The paravirtualized I/O functions */
+ static inline void slow_down_io(void)
+ {
+@@ -770,15 +777,28 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+ #define PV_RESTORE_REGS "popl %edx; popl %ecx;"
+
+ /* save and restore all caller-save registers, except return value */
+-#define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
+-#define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
++#define __PV_SAVE_ALL_CALLER_REGS "pushl %ecx;"
++#define __PV_RESTORE_ALL_CALLER_REGS "popl %ecx;"
++
++#ifdef CONFIG_FRAME_POINTER
++#define PV_SAVE_ALL_CALLER_REGS \
++ "push %ebp;" \
++ "mov %esp, %ebp;" \
++ __PV_SAVE_ALL_CALLER_REGS
++#define PV_RESTORE_ALL_CALLER_REGS \
++ __PV_RESTORE_ALL_CALLER_REGS \
++ "leave;"
++#else
++#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS
++#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS
++#endif
+
+ #define PV_FLAGS_ARG "0"
+ #define PV_EXTRA_CLOBBERS
+ #define PV_VEXTRA_CLOBBERS
+ #else
+ /* save and restore all caller-save registers, except return value */
+-#define PV_SAVE_ALL_CALLER_REGS \
++#define __PV_SAVE_ALL_CALLER_REGS \
+ "push %rcx;" \
+ "push %rdx;" \
+ "push %rsi;" \
+@@ -787,7 +807,7 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+ "push %r9;" \
+ "push %r10;" \
+ "push %r11;"
+-#define PV_RESTORE_ALL_CALLER_REGS \
++#define __PV_RESTORE_ALL_CALLER_REGS \
+ "pop %r11;" \
+ "pop %r10;" \
+ "pop %r9;" \
+@@ -797,6 +817,19 @@ static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
+ "pop %rdx;" \
+ "pop %rcx;"
+
++#ifdef CONFIG_FRAME_POINTER
++#define PV_SAVE_ALL_CALLER_REGS \
++ "push %rbp;" \
++ "mov %rsp, %rbp;" \
++ __PV_SAVE_ALL_CALLER_REGS
++#define PV_RESTORE_ALL_CALLER_REGS \
++ __PV_RESTORE_ALL_CALLER_REGS \
++ "leaveq;"
++#else
++#define PV_SAVE_ALL_CALLER_REGS __PV_SAVE_ALL_CALLER_REGS
++#define PV_RESTORE_ALL_CALLER_REGS __PV_RESTORE_ALL_CALLER_REGS
++#endif
++
+ /* We save some registers, but all of them, that's too much. We clobber all
+ * caller saved registers but the argument parameter */
+ #define PV_SAVE_REGS "pushq %%rdi;"
+diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
+index 9357473..3202dcc 100644
+--- a/arch/x86/include/asm/paravirt_types.h
++++ b/arch/x86/include/asm/paravirt_types.h
+@@ -135,6 +135,8 @@ struct pv_cpu_ops {
+ void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+
+ void (*set_iopl_mask)(unsigned mask);
++ void (*set_io_bitmap)(struct thread_struct *thread,
++ unsigned long bytes_updated);
+
+ void (*wbinvd)(void);
+ void (*io_delay)(void);
+diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
+index ada8c20..faa0af1 100644
+--- a/arch/x86/include/asm/pci.h
++++ b/arch/x86/include/asm/pci.h
+@@ -21,6 +21,7 @@ struct pci_sysdata {
+ extern int pci_routeirq;
+ extern int noioapicquirk;
+ extern int noioapicreroute;
++extern int pci_scan_all_fns;
+
+ /* scan a bus after allocating a pci_sysdata for it */
+ extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+@@ -49,6 +50,11 @@ extern unsigned int pcibios_assign_all_busses(void);
+ #define pcibios_assign_all_busses() 0
+ #endif
+
++static inline int pcibios_scan_all_fns(struct pci_bus *bus, int devfn)
++{
++ return pci_scan_all_fns;
++}
++
+ extern unsigned long pci_mem_start;
+ #define PCIBIOS_MIN_IO 0x1000
+ #define PCIBIOS_MIN_MEM (pci_mem_start)
+@@ -87,6 +93,7 @@ extern void pci_iommu_alloc(void);
+
+ /* MSI arch hook */
+ #define arch_setup_msi_irqs arch_setup_msi_irqs
++#define arch_teardown_msi_irqs arch_teardown_msi_irqs
+
+ #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
+
+@@ -128,6 +135,7 @@ extern void pci_iommu_alloc(void);
+ #include <asm-generic/pci-dma-compat.h>
+
+ /* generic pci stuff */
++#define HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS
+ #include <asm-generic/pci.h>
+ #define PCIBIOS_MAX_MEM_32 0xffffffff
+
+diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
+index b399988..30cbf49 100644
+--- a/arch/x86/include/asm/pci_x86.h
++++ b/arch/x86/include/asm/pci_x86.h
+@@ -45,6 +45,7 @@ enum pci_bf_sort_state {
+ extern unsigned int pcibios_max_latency;
+
+ void pcibios_resource_survey(void);
++void pcibios_set_cache_line_size(void);
+
+ /* pci-pc.c */
+
+@@ -106,6 +107,7 @@ extern int pci_direct_probe(void);
+ extern void pci_direct_init(int type);
+ extern void pci_pcbios_init(void);
+ extern int pci_olpc_init(void);
++extern int pci_xen_init(void);
+ extern void __init dmi_check_pciprobe(void);
+ extern void __init dmi_check_skip_isa_align(void);
+
+diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
+index af6fd36..430e3cc 100644
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -15,7 +15,6 @@
+ : (prot))
+
+ #ifndef __ASSEMBLY__
+-
+ /*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+@@ -26,6 +25,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+ extern spinlock_t pgd_lock;
+ extern struct list_head pgd_list;
+
++extern struct mm_struct *pgd_page_get_mm(struct page *page);
++
+ #ifdef CONFIG_PARAVIRT
+ #include <asm/paravirt.h>
+ #else /* !CONFIG_PARAVIRT */
+@@ -76,6 +77,11 @@ extern struct list_head pgd_list;
+
+ #endif /* CONFIG_PARAVIRT */
+
++static inline pteval_t pte_flags(pte_t pte)
++{
++ return pte_val(pte) & PTE_FLAGS_MASK;
++}
++
+ /*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+@@ -397,6 +403,9 @@ static inline unsigned long pages_to_mb(unsigned long npg)
+ #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
+ remap_pfn_range(vma, vaddr, pfn, size, prot)
+
++#define arch_vm_get_page_prot arch_vm_get_page_prot
++extern pgprot_t arch_vm_get_page_prot(unsigned vm_flags);
++
+ #if PAGETABLE_LEVELS > 2
+ static inline int pud_none(pud_t pud)
+ {
+@@ -616,6 +625,9 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+ memcpy(dst, src, count * sizeof(pgd_t));
+ }
+
++int create_lookup_pte_addr(struct mm_struct *mm,
++ unsigned long address,
++ uint64_t *ptep);
+
+ #include <asm-generic/pgtable.h>
+ #endif /* __ASSEMBLY__ */
+diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
+index c57a301..4e46931 100644
+--- a/arch/x86/include/asm/pgtable_64.h
++++ b/arch/x86/include/asm/pgtable_64.h
+@@ -160,7 +160,7 @@ extern void cleanup_highmap(void);
+ #define pgtable_cache_init() do { } while (0)
+ #define check_pgt_cache() do { } while (0)
+
+-#define PAGE_AGP PAGE_KERNEL_NOCACHE
++#define PAGE_AGP PAGE_KERNEL_IO_NOCACHE
+ #define HAVE_PAGE_AGP 1
+
+ /* fs/proc/kcore.c */
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
+index d1f4a76..a81b0ed 100644
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -265,11 +265,6 @@ static inline pteval_t native_pte_val(pte_t pte)
+ return pte.pte;
+ }
+
+-static inline pteval_t pte_flags(pte_t pte)
+-{
+- return native_pte_val(pte) & PTE_FLAGS_MASK;
+-}
+-
+ #define pgprot_val(x) ((x).pgprot)
+ #define __pgprot(x) ((pgprot_t) { (x) } )
+
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index 13b1885..0aac25a 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -551,6 +551,9 @@ static inline void native_set_iopl_mask(unsigned mask)
+ #endif
+ }
+
++extern void native_set_io_bitmap(struct thread_struct *thread,
++ unsigned long updated_bytes);
++
+ static inline void
+ native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+ {
+@@ -592,6 +595,7 @@ static inline void load_sp0(struct tss_struct *tss,
+ }
+
+ #define set_iopl_mask native_set_iopl_mask
++#define set_io_bitmap native_set_io_bitmap
+ #endif /* CONFIG_PARAVIRT */
+
+ /*
+diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
+index 18e496c..154a5f1 100644
+--- a/arch/x86/include/asm/setup.h
++++ b/arch/x86/include/asm/setup.h
+@@ -95,6 +95,11 @@ void *extend_brk(size_t size, size_t align);
+ : : "i" (sz)); \
+ }
+
++/* Helper for reserving space for arrays of things */
++#define RESERVE_BRK_ARRAY(type, name, entries) \
++ type *name; \
++ RESERVE_BRK(name, sizeof(type) * entries)
++
+ #ifdef __i386__
+
+ void __init i386_start_kernel(void);
+diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
+index b9e4e20..8085277 100644
+--- a/arch/x86/include/asm/swiotlb.h
++++ b/arch/x86/include/asm/swiotlb.h
+@@ -3,15 +3,16 @@
+
+ #include <linux/swiotlb.h>
+
+-/* SWIOTLB interface */
+-
+-extern int swiotlb_force;
+-
+ #ifdef CONFIG_SWIOTLB
+ extern int swiotlb;
+-extern void pci_swiotlb_init(void);
++extern int __init pci_swiotlb_detect(void);
++extern void __init pci_swiotlb_init(void);
+ #else
+ #define swiotlb 0
++static inline int pci_swiotlb_detect(void)
++{
++ return 0;
++}
+ static inline void pci_swiotlb_init(void)
+ {
+ }
+diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
+index 1bb6e39..ef0fa4d 100644
+--- a/arch/x86/include/asm/syscalls.h
++++ b/arch/x86/include/asm/syscalls.h
+@@ -33,11 +33,11 @@ long sys_rt_sigreturn(struct pt_regs *);
+ asmlinkage int sys_set_thread_area(struct user_desc __user *);
+ asmlinkage int sys_get_thread_area(struct user_desc __user *);
+
+-/* X86_32 only */
+-#ifdef CONFIG_X86_32
+ /* kernel/ioport.c */
+-long sys_iopl(struct pt_regs *);
++asmlinkage long sys_iopl(unsigned int);
+
++/* X86_32 only */
++#ifdef CONFIG_X86_32
+ /* kernel/process_32.c */
+ int sys_clone(struct pt_regs *);
+ int sys_execve(struct pt_regs *);
+@@ -68,8 +68,6 @@ int sys_vm86(struct pt_regs *);
+ #else /* CONFIG_X86_32 */
+
+ /* X86_64 only */
+-/* kernel/ioport.c */
+-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
+ /* kernel/process_64.c */
+ asmlinkage long sys_clone(unsigned long, unsigned long,
+diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
+index 7f3eba0..e4fc8ea 100644
+--- a/arch/x86/include/asm/tlbflush.h
++++ b/arch/x86/include/asm/tlbflush.h
+@@ -89,6 +89,10 @@ static inline void __flush_tlb_one(unsigned long addr)
+
+ #ifndef CONFIG_SMP
+
++static inline void __init init_smp_flush(void)
++{
++}
++
+ #define flush_tlb() __flush_tlb()
+ #define flush_tlb_all() __flush_tlb_all()
+ #define local_flush_tlb() __flush_tlb()
+@@ -129,6 +133,8 @@ static inline void reset_lazy_tlbstate(void)
+
+ #define local_flush_tlb() __flush_tlb()
+
++extern void init_smp_flush(void);
++
+ extern void flush_tlb_all(void);
+ extern void flush_tlb_current_task(void);
+ extern void flush_tlb_mm(struct mm_struct *);
+diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
+index 2c756fd..d8e7145 100644
+--- a/arch/x86/include/asm/x86_init.h
++++ b/arch/x86/include/asm/x86_init.h
+@@ -91,6 +91,14 @@ struct x86_init_timers {
+ };
+
+ /**
++ * struct x86_init_iommu - platform specific iommu setup
++ * @iommu_init: platform specific iommu setup
++ */
++struct x86_init_iommu {
++ int (*iommu_init)(void);
++};
++
++/**
+ * struct x86_init_ops - functions for platform specific setup
+ *
+ */
+@@ -101,6 +109,7 @@ struct x86_init_ops {
+ struct x86_init_oem oem;
+ struct x86_init_paging paging;
+ struct x86_init_timers timers;
++ struct x86_init_iommu iommu;
+ };
+
+ /**
+@@ -121,6 +130,7 @@ struct x86_platform_ops {
+ unsigned long (*calibrate_tsc)(void);
+ unsigned long (*get_wallclock)(void);
+ int (*set_wallclock)(unsigned long nowtime);
++ void (*iommu_shutdown)(void);
+ };
+
+ extern struct x86_init_ops x86_init;
+diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
+index 9c371e4..41c4be0 100644
+--- a/arch/x86/include/asm/xen/hypercall.h
++++ b/arch/x86/include/asm/xen/hypercall.h
+@@ -45,6 +45,8 @@
+ #include <xen/interface/xen.h>
+ #include <xen/interface/sched.h>
+ #include <xen/interface/physdev.h>
++#include <xen/interface/platform.h>
++#include <xen/interface/xen-mca.h>
+
+ /*
+ * The hypercall asms have to meet several constraints:
+@@ -200,6 +202,23 @@ extern struct { char _entry[32]; } hypercall_page[];
+ (type)__res; \
+ })
+
++static inline long
++privcmd_call(unsigned call,
++ unsigned long a1, unsigned long a2,
++ unsigned long a3, unsigned long a4,
++ unsigned long a5)
++{
++ __HYPERCALL_DECLS;
++ __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
++
++ asm volatile("call *%[call]"
++ : __HYPERCALL_5PARAM
++ : [call] "a" (&hypercall_page[call])
++ : __HYPERCALL_CLOBBER5);
++
++ return (long)__res;
++}
++
+ static inline int
+ HYPERVISOR_set_trap_table(struct trap_info *table)
+ {
+@@ -282,6 +301,20 @@ HYPERVISOR_set_timer_op(u64 timeout)
+ }
+
+ static inline int
++HYPERVISOR_mca(struct xen_mc *mc_op)
++{
++ mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
++ return _hypercall1(int, mca, mc_op);
++}
++
++static inline int
++HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
++{
++ platform_op->interface_version = XENPF_INTERFACE_VERSION;
++ return _hypercall1(int, dom0_op, platform_op);
++}
++
++static inline int
+ HYPERVISOR_set_debugreg(int reg, unsigned long value)
+ {
+ return _hypercall2(int, set_debugreg, reg, value);
+@@ -417,6 +450,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
+ return _hypercall2(int, nmi_op, op, arg);
+ }
+
++static inline unsigned long __must_check
++HYPERVISOR_hvm_op(int op, void *arg)
++{
++ return _hypercall2(unsigned long, hvm_op, op, arg);
++}
++
+ static inline void
+ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+ {
+@@ -424,6 +463,14 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
+ mcl->args[0] = set;
+ }
+
++#if defined(CONFIG_X86_64)
++#define MULTI_UVMFLAGS_INDEX 2
++#define MULTI_UVMDOMID_INDEX 3
++#else
++#define MULTI_UVMFLAGS_INDEX 3
++#define MULTI_UVMDOMID_INDEX 4
++#endif
++
+ static inline void
+ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ pte_t new_val, unsigned long flags)
+@@ -432,12 +479,11 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
+ mcl->args[0] = va;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+- mcl->args[2] = flags;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+- mcl->args[3] = flags;
+ }
++ mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
+ }
+
+ static inline void
+diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
+index d5b7e90..396ff4c 100644
+--- a/arch/x86/include/asm/xen/hypervisor.h
++++ b/arch/x86/include/asm/xen/hypervisor.h
+@@ -37,31 +37,4 @@
+ extern struct shared_info *HYPERVISOR_shared_info;
+ extern struct start_info *xen_start_info;
+
+-enum xen_domain_type {
+- XEN_NATIVE, /* running on bare hardware */
+- XEN_PV_DOMAIN, /* running in a PV domain */
+- XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
+-};
+-
+-#ifdef CONFIG_XEN
+-extern enum xen_domain_type xen_domain_type;
+-#else
+-#define xen_domain_type XEN_NATIVE
+-#endif
+-
+-#define xen_domain() (xen_domain_type != XEN_NATIVE)
+-#define xen_pv_domain() (xen_domain() && \
+- xen_domain_type == XEN_PV_DOMAIN)
+-#define xen_hvm_domain() (xen_domain() && \
+- xen_domain_type == XEN_HVM_DOMAIN)
+-
+-#ifdef CONFIG_XEN_DOM0
+-#include <xen/interface/xen.h>
+-
+-#define xen_initial_domain() (xen_pv_domain() && \
+- xen_start_info->flags & SIF_INITDOMAIN)
+-#else /* !CONFIG_XEN_DOM0 */
+-#define xen_initial_domain() (0)
+-#endif /* CONFIG_XEN_DOM0 */
+-
+ #endif /* _ASM_X86_XEN_HYPERVISOR_H */
+diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
+index e8506c1..9539998 100644
+--- a/arch/x86/include/asm/xen/interface.h
++++ b/arch/x86/include/asm/xen/interface.h
+@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
+ #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+ #endif
+
+-#ifndef machine_to_phys_mapping
+-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+-#endif
++#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
++#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
++#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
+
+ /* Maximum number of virtual CPUs in multi-processor guests. */
+ #define MAX_VIRT_CPUS 32
+@@ -97,6 +97,8 @@ DEFINE_GUEST_HANDLE(void);
+ #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
+
+ #ifndef __ASSEMBLY__
++#include <linux/types.h>
++
+ struct trap_info {
+ uint8_t vector; /* exception vector */
+ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
+diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
+index 42a7e00..8413688 100644
+--- a/arch/x86/include/asm/xen/interface_32.h
++++ b/arch/x86/include/asm/xen/interface_32.h
+@@ -32,6 +32,11 @@
+ /* And the trap vector is... */
+ #define TRAP_INSTR "int $0x82"
+
++#define __MACH2PHYS_VIRT_START 0xF5800000
++#define __MACH2PHYS_VIRT_END 0xF6800000
++
++#define __MACH2PHYS_SHIFT 2
++
+ /*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
+index 100d266..839a481 100644
+--- a/arch/x86/include/asm/xen/interface_64.h
++++ b/arch/x86/include/asm/xen/interface_64.h
+@@ -39,18 +39,7 @@
+ #define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+ #define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+ #define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+-
+-#ifndef HYPERVISOR_VIRT_START
+-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
+-#endif
+-
+-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+-#ifndef machine_to_phys_mapping
+-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+-#endif
++#define __MACH2PHYS_SHIFT 3
+
+ /*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+diff --git a/arch/x86/include/asm/xen/iommu.h b/arch/x86/include/asm/xen/iommu.h
+new file mode 100644
+index 0000000..75df312
+--- /dev/null
++++ b/arch/x86/include/asm/xen/iommu.h
+@@ -0,0 +1,13 @@
++#ifndef ASM_X86__XEN_IOMMU_H
++#define ASM_X86__XEN_IOMMU_H
++
++#ifdef CONFIG_PCI_XEN
++extern void xen_iommu_init(void);
++#else
++static inline void xen_iommu_init(void)
++{
++}
++#endif
++
++#endif
++
+diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
+index 018a0a4..8760cc6 100644
+--- a/arch/x86/include/asm/xen/page.h
++++ b/arch/x86/include/asm/xen/page.h
+@@ -5,6 +5,7 @@
+ #include <linux/types.h>
+ #include <linux/spinlock.h>
+ #include <linux/pfn.h>
++#include <linux/mm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/page.h>
+@@ -35,16 +36,25 @@ typedef struct xpaddr {
+ #define MAX_DOMAIN_PAGES \
+ ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+
++extern unsigned long *machine_to_phys_mapping;
++extern unsigned int machine_to_phys_order;
+
+ extern unsigned long get_phys_to_machine(unsigned long pfn);
+-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
++extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+
+ static inline unsigned long pfn_to_mfn(unsigned long pfn)
+ {
++ unsigned long mfn;
++
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+
+- return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
++ mfn = get_phys_to_machine(pfn);
++
++ if (mfn != INVALID_P2M_ENTRY)
++ mfn &= ~FOREIGN_FRAME_BIT;
++
++ return mfn;
+ }
+
+ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+@@ -62,10 +72,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return mfn;
+
+-#if 0
+ if (unlikely((mfn >> machine_to_phys_order) != 0))
+- return max_mapnr;
+-#endif
++ return ~0;
+
+ pfn = 0;
+ /*
+@@ -112,13 +120,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+ */
+ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+ {
+- extern unsigned long max_mapnr;
+ unsigned long pfn = mfn_to_pfn(mfn);
+- if ((pfn < max_mapnr)
+- && !xen_feature(XENFEAT_auto_translated_physmap)
+- && (get_phys_to_machine(pfn) != mfn))
+- return max_mapnr; /* force !pfn_valid() */
+- /* XXX fixme; not true with sparsemem */
++ if (get_phys_to_machine(pfn) != mfn)
++ return -1; /* force !pfn_valid() */
+ return pfn;
+ }
+
+@@ -163,6 +167,7 @@ static inline pte_t __pte_ma(pteval_t x)
+
+ #define pgd_val_ma(x) ((x).pgd)
+
++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
+
+ xmaddr_t arbitrary_virt_to_machine(void *address);
+ unsigned long arbitrary_virt_to_mfn(void *vaddr);
+diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
+new file mode 100644
+index 0000000..6683196
+--- /dev/null
++++ b/arch/x86/include/asm/xen/pci.h
+@@ -0,0 +1,104 @@
++#ifndef _ASM_X86_XEN_PCI_H
++#define _ASM_X86_XEN_PCI_H
++
++#if defined(CONFIG_PCI_MSI)
++#if defined(CONFIG_PCI_XEN)
++int xen_register_pirq(u32 gsi, int triggering);
++int xen_register_gsi(u32 gsi, int triggering, int polarity);
++int xen_create_msi_irq(struct pci_dev *dev,
++ struct msi_desc *msidesc,
++ int type);
++void xen_pci_teardown_msi_dev(struct pci_dev *dev);
++void xen_pci_teardown_msi_irq(int irq);
++int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
++
++/* The driver in drivers/pci/xen-pcifront.c fills in this
++ * structure with its own functions.
++ */
++struct xen_pci_frontend_ops {
++ int (*enable_msi)(struct pci_dev *dev, int **vectors);
++ void (*disable_msi)(struct pci_dev *dev);
++ int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
++ void (*disable_msix)(struct pci_dev *dev);
++};
++
++extern struct xen_pci_frontend_ops *xen_pci_frontend;
++
++static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
++ int **vectors)
++{
++ if (xen_pci_frontend && xen_pci_frontend->enable_msi)
++ return xen_pci_frontend->enable_msi(dev, vectors);
++ return -ENODEV;
++}
++static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
++{
++ if (xen_pci_frontend && xen_pci_frontend->disable_msi)
++ xen_pci_frontend->disable_msi(dev);
++}
++static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
++ int **vectors, int nvec)
++{
++ if (xen_pci_frontend && xen_pci_frontend->enable_msix)
++ return xen_pci_frontend->enable_msix(dev, vectors, nvec);
++ return -ENODEV;
++}
++static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
++{
++ if (xen_pci_frontend && xen_pci_frontend->disable_msix)
++ xen_pci_frontend->disable_msix(dev);
++}
++#else
++static inline int xen_create_msi_irq(struct pci_dev *dev,
++ struct msi_desc *msidesc,
++ int type)
++{
++ return -1;
++}
++static inline void xen_pci_teardown_msi_dev(struct pci_dev *dev) { }
++static inline void xen_pci_teardown_msi_irq(int irq) { }
++static inline int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ return -ENODEV;
++}
++#endif /* CONFIG_PCI_XEN */
++
++#endif /* CONFIG_PCI_MSI */
++
++#ifdef CONFIG_XEN_DOM0_PCI
++int xen_register_gsi(u32 gsi, int triggering, int polarity);
++int xen_find_device_domain_owner(struct pci_dev *dev);
++int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
++int xen_unregister_device_domain_owner(struct pci_dev *dev);
++
++#else
++static inline int xen_register_gsi(u32 gsi, int triggering, int polarity)
++{
++ return -1;
++}
++
++static inline int xen_find_device_domain_owner(struct pci_dev *dev)
++{
++ return -1;
++}
++static inline int xen_register_device_domain_owner(struct pci_dev *dev,
++ uint16_t domain)
++{
++ return -1;
++}
++static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
++{
++ return -1;
++}
++#endif
++
++#if defined(CONFIG_PCI_MSI) && defined(CONFIG_XEN_DOM0_PCI)
++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
++#else
++static inline int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ return -1;
++}
++#endif
++
++#endif /* _ASM_X86_XEN_PCI_H */
+diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h
+new file mode 100644
+index 0000000..e4fe299
+--- /dev/null
++++ b/arch/x86/include/asm/xen/swiotlb-xen.h
+@@ -0,0 +1,14 @@
++#ifndef _ASM_X86_SWIOTLB_XEN_H
++#define _ASM_X86_SWIOTLB_XEN_H
++
++#ifdef CONFIG_PCI_XEN
++extern int xen_swiotlb;
++extern int __init pci_xen_swiotlb_detect(void);
++extern void __init pci_xen_swiotlb_init(void);
++#else
++#define xen_swiotlb 0
++static inline int __init pci_xen_swiotlb_detect(void) { return 0; }
++static inline void __init pci_xen_swiotlb_init(void) { }
++#endif
++
++#endif /* _ASM_X86_SWIOTLB_XEN_H */
+diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
+index d8e5d0c..d4f3b05 100644
+--- a/arch/x86/kernel/Makefile
++++ b/arch/x86/kernel/Makefile
+@@ -11,6 +11,7 @@ ifdef CONFIG_FUNCTION_TRACER
+ CFLAGS_REMOVE_tsc.o = -pg
+ CFLAGS_REMOVE_rtc.o = -pg
+ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
++CFLAGS_REMOVE_pvclock.o = -pg
+ CFLAGS_REMOVE_ftrace.o = -pg
+ CFLAGS_REMOVE_early_printk.o = -pg
+ endif
+@@ -111,6 +112,7 @@ obj-$(CONFIG_X86_MRST) += mrst.o
+ microcode-y := microcode_core.o
+ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
+ microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
++microcode-$(CONFIG_MICROCODE_XEN) += microcode_xen.o
+ obj-$(CONFIG_MICROCODE) += microcode.o
+
+ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
+index 23c2da8..a2a5125 100644
+--- a/arch/x86/kernel/acpi/boot.c
++++ b/arch/x86/kernel/acpi/boot.c
+@@ -42,6 +42,10 @@
+ #include <asm/mpspec.h>
+ #include <asm/smp.h>
+
++#include <asm/xen/pci.h>
++
++#include <asm/xen/hypervisor.h>
++
+ static int __initdata acpi_force = 0;
+ u32 acpi_rsdt_forced;
+ int acpi_disabled;
+@@ -149,6 +153,10 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
+ {
+ unsigned int ver = 0;
+
++ /* We don't want to register lapics when in Xen dom0 */
++ if (xen_initial_domain())
++ return;
++
+ if (!enabled) {
+ ++disabled_cpus;
+ return;
+@@ -461,9 +469,13 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+ */
+ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+ {
+- unsigned int irq;
++ int irq;
+ unsigned int plat_gsi = gsi;
+
++ irq = xen_register_gsi(gsi, trigger, polarity);
++ if (irq >= 0)
++ return irq;
++
+ #ifdef CONFIG_PCI
+ /*
+ * Make sure all (legacy) PCI IRQs are set as level-triggered.
+@@ -740,6 +752,10 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
+
+ static void __init acpi_register_lapic_address(unsigned long address)
+ {
++ /* Xen dom0 doesn't have usable lapics */
++ if (xen_initial_domain())
++ return;
++
+ mp_lapic_addr = address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+@@ -860,6 +876,9 @@ int __init acpi_probe_gsi(void)
+ max_gsi = gsi;
+ }
+
++ if (xen_initial_domain())
++ max_gsi += 255; /* Plus maximum entries of an ioapic. */
++
+ return max_gsi + 1;
+ }
+
+diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
+index d85d1b2..8aabedd 100644
+--- a/arch/x86/kernel/acpi/processor.c
++++ b/arch/x86/kernel/acpi/processor.c
+@@ -12,6 +12,8 @@
+ #include <acpi/processor.h>
+ #include <asm/acpi.h>
+
++#include <xen/xen.h>
++
+ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
+ {
+ struct acpi_object_list *obj_list;
+@@ -59,7 +61,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
+ /*
+ * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
+ */
+- if (!cpu_has(c, X86_FEATURE_MWAIT))
++ if (!cpu_has(c, X86_FEATURE_MWAIT) && !xen_initial_domain())
+ buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
+
+ obj->type = ACPI_TYPE_BUFFER;
+@@ -88,6 +90,19 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
+
+ EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
+
++/* Initialize _PDC data based on the CPU vendor */
++void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr)
++{
++ struct cpuinfo_x86 *c = &cpu_data(0);
++
++ pr->pdc = NULL;
++ if (c->x86_vendor == X86_VENDOR_INTEL)
++ init_intel_pdc(pr, c);
++
++ return;
++}
++EXPORT_SYMBOL(xen_arch_acpi_processor_init_pdc);
++
+ void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
+ {
+ if (pr->pdc) {
+diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
+index ca93638..9eff23c 100644
+--- a/arch/x86/kernel/acpi/sleep.c
++++ b/arch/x86/kernel/acpi/sleep.c
+@@ -12,6 +12,8 @@
+ #include <asm/segment.h>
+ #include <asm/desc.h>
+
++#include <xen/acpi.h>
++
+ #include "realmode/wakeup.h"
+ #include "sleep.h"
+
+diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
+index f0fa7a1..0c1876b 100644
+--- a/arch/x86/kernel/amd_iommu.c
++++ b/arch/x86/kernel/amd_iommu.c
+@@ -928,7 +928,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
+ }
+
+ if (unlikely(address == -1))
+- address = bad_dma_address;
++ address = DMA_ERROR_CODE;
+
+ WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
+
+@@ -1545,7 +1545,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
+
+ pte = dma_ops_get_pte(dom, address);
+ if (!pte)
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+
+ __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
+
+@@ -1626,7 +1626,7 @@ static dma_addr_t __map_single(struct device *dev,
+ retry:
+ address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
+ dma_mask);
+- if (unlikely(address == bad_dma_address)) {
++ if (unlikely(address == DMA_ERROR_CODE)) {
+ /*
+ * setting next_address here will let the address
+ * allocator only scan the new allocated range in the
+@@ -1647,7 +1647,7 @@ retry:
+ start = address;
+ for (i = 0; i < pages; ++i) {
+ ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+- if (ret == bad_dma_address)
++ if (ret == DMA_ERROR_CODE)
+ goto out_unmap;
+
+ paddr += PAGE_SIZE;
+@@ -1675,7 +1675,7 @@ out_unmap:
+
+ dma_ops_free_addresses(dma_dom, address, pages);
+
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+
+ /*
+@@ -1691,7 +1691,7 @@ static void __unmap_single(struct amd_iommu *iommu,
+ dma_addr_t i, start;
+ unsigned int pages;
+
+- if ((dma_addr == bad_dma_address) ||
++ if ((dma_addr == DMA_ERROR_CODE) ||
+ (dma_addr + size > dma_dom->aperture_size))
+ return;
+
+@@ -1733,7 +1733,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
+ INC_STATS_COUNTER(cnt_map_single);
+
+ if (!check_device(dev))
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+
+ dma_mask = *dev->dma_mask;
+
+@@ -1744,12 +1744,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
+ return (dma_addr_t)paddr;
+
+ if (!dma_ops_domain(domain))
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+ dma_mask);
+- if (addr == bad_dma_address)
++ if (addr == DMA_ERROR_CODE)
+ goto out;
+
+ iommu_completion_wait(iommu);
+@@ -1958,7 +1958,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
+ *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+ size, DMA_BIDIRECTIONAL, true, dma_mask);
+
+- if (*dma_addr == bad_dma_address) {
++ if (*dma_addr == DMA_ERROR_CODE) {
+ spin_unlock_irqrestore(&domain->lock, flags);
+ goto out_free;
+ }
+@@ -2120,8 +2120,7 @@ int __init amd_iommu_init_dma_ops(void)
+ prealloc_protection_domains();
+
+ iommu_detected = 1;
+- force_iommu = 1;
+- bad_dma_address = 0;
++ swiotlb = 0;
+ #ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
+index 3925adf..642793e 100644
+--- a/arch/x86/kernel/amd_iommu_init.c
++++ b/arch/x86/kernel/amd_iommu_init.c
+@@ -29,6 +29,7 @@
+ #include <asm/amd_iommu.h>
+ #include <asm/iommu.h>
+ #include <asm/gart.h>
++#include <asm/x86_init.h>
+
+ /*
+ * definitions for the ACPI scanning code
+@@ -1183,19 +1184,10 @@ static struct sys_device device_amd_iommu = {
+ * functions. Finally it prints some information about AMD IOMMUs and
+ * the driver state and enables the hardware.
+ */
+-int __init amd_iommu_init(void)
++static int __init amd_iommu_init(void)
+ {
+ int i, ret = 0;
+
+-
+- if (no_iommu) {
+- printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
+- return 0;
+- }
+-
+- if (!amd_iommu_detected)
+- return -ENODEV;
+-
+ /*
+ * First parse ACPI tables to find the largest Bus/Dev/Func
+ * we need to handle. Upon this information the shared data
+@@ -1310,6 +1302,7 @@ int __init amd_iommu_init(void)
+ else
+ printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
+
++ x86_platform.iommu_shutdown = disable_iommus;
+ out:
+ return ret;
+
+@@ -1338,11 +1331,6 @@ free:
+ goto out;
+ }
+
+-void amd_iommu_shutdown(void)
+-{
+- disable_iommus();
+-}
+-
+ /****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+@@ -1357,16 +1345,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+
+ void __init amd_iommu_detect(void)
+ {
+- if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
++ if (no_iommu || (iommu_detected && !gart_iommu_aperture))
+ return;
+
+ if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
+ iommu_detected = 1;
+ amd_iommu_detected = 1;
+-#ifdef CONFIG_GART_IOMMU
+- gart_iommu_aperture_disabled = 1;
+- gart_iommu_aperture = 0;
+-#endif
++ x86_init.iommu.iommu_init = amd_iommu_init;
+ }
+ }
+
+diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
+index 082089e..8d34362 100644
+--- a/arch/x86/kernel/aperture_64.c
++++ b/arch/x86/kernel/aperture_64.c
+@@ -28,6 +28,7 @@
+ #include <asm/pci-direct.h>
+ #include <asm/dma.h>
+ #include <asm/k8.h>
++#include <asm/x86_init.h>
+
+ int gart_iommu_aperture;
+ int gart_iommu_aperture_disabled __initdata;
+@@ -401,6 +402,7 @@ void __init gart_iommu_hole_init(void)
+
+ iommu_detected = 1;
+ gart_iommu_aperture = 1;
++ x86_init.iommu.iommu_init = gart_iommu_init;
+
+ ctl = read_pci_config(bus, slot, 3,
+ AMD64_GARTAPERTURECTL);
+@@ -469,7 +471,7 @@ out:
+
+ if (aper_alloc) {
+ /* Got the aperture from the AGP bridge */
+- } else if (swiotlb && !valid_agp) {
++ } else if (!valid_agp) {
+ /* Do nothing */
+ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ force_iommu ||
+diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
+index 0da6495..42d1fe2 100644
+--- a/arch/x86/kernel/apic/io_apic.c
++++ b/arch/x86/kernel/apic/io_apic.c
+@@ -63,7 +63,9 @@
+ #include <asm/uv/uv_hub.h>
+ #include <asm/uv/uv_irq.h>
+
++#include <asm/xen/hypervisor.h>
+ #include <asm/apic.h>
++#include <asm/xen/pci.h>
+
+ #define __apicdebuginit(type) static type __init
+ #define for_each_irq_pin(entry, head) \
+@@ -390,14 +395,18 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+
+ static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+ {
+- struct io_apic __iomem *io_apic = io_apic_base(apic);
++ struct io_apic __iomem *io_apic;
++
++ io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+ }
+
+ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+ {
+- struct io_apic __iomem *io_apic = io_apic_base(apic);
++ struct io_apic __iomem *io_apic;
++
++ io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+ }
+@@ -410,7 +419,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
+ */
+ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+ {
+- struct io_apic __iomem *io_apic = io_apic_base(apic);
++ struct io_apic __iomem *io_apic;
++
++ io_apic = io_apic_base(apic);
+
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
+@@ -3487,6 +3498,9 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
++ if (xen_pv_domain())
++ return xen_pci_setup_msi_irqs(dev, nvec, type);
++
+ node = dev_to_node(&dev->dev);
+ irq_want = nr_irqs_gsi;
+ sub_handle = 0;
+@@ -3536,7 +3550,29 @@ error:
+
+ void arch_teardown_msi_irq(unsigned int irq)
+ {
+- destroy_irq(irq);
++ if (xen_domain())
++ xen_pci_teardown_msi_irq(irq);
++ else
++ destroy_irq(irq);
++}
++
++void arch_teardown_msi_irqs(struct pci_dev *dev)
++{
++ struct msi_desc *entry;
++
++	/* If we are a non-privileged PV domain, we have
++	 * to call xen_pci_teardown_msi_dev first. */
++ if (xen_domain())
++ xen_pci_teardown_msi_dev(dev);
++
++ list_for_each_entry(entry, &dev->msi_list, list) {
++ int i, nvec;
++ if (entry->irq == 0)
++ continue;
++ nvec = 1 << entry->msi_attrib.multiple;
++ for (i = 0; i < nvec; i++)
++ arch_teardown_msi_irq(entry->irq + i);
++ }
+ }
+
+ #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
+@@ -3852,7 +3888,14 @@ void __init probe_nr_irqs_gsi(void)
+ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
+ }
+
++int get_nr_irqs_gsi(void)
++{
++ return nr_irqs_gsi;
++}
++
+ #ifdef CONFIG_SPARSE_IRQ
++int nr_dynamic_irqs;
++
+ int __init arch_probe_nr_irqs(void)
+ {
+ int nr;
+@@ -3870,6 +3913,8 @@ int __init arch_probe_nr_irqs(void)
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
++ nr_irqs += nr_dynamic_irqs;
++
+ return 0;
+ }
+ #endif
+diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
+index 7ff61d6..d1e6e60 100644
+--- a/arch/x86/kernel/apic/nmi.c
++++ b/arch/x86/kernel/apic/nmi.c
+@@ -558,6 +558,9 @@ void arch_trigger_all_cpu_backtrace(void)
+ {
+ int i;
+
++ if (!cpu_has_apic)
++ return;
++
+ cpumask_copy(&backtrace_mask, cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
+index f4361b5..404e458 100644
+--- a/arch/x86/kernel/cpu/mtrr/Makefile
++++ b/arch/x86/kernel/cpu/mtrr/Makefile
+@@ -1,3 +1,4 @@
+ obj-y := main.o if.o generic.o state.o cleanup.o
+ obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
++obj-$(CONFIG_XEN_DOM0) += xen.o
+
+diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
+index 33af141..378f8dc 100644
+--- a/arch/x86/kernel/cpu/mtrr/amd.c
++++ b/arch/x86/kernel/cpu/mtrr/amd.c
+@@ -108,6 +108,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
+ return 0;
+ }
+
++static int amd_num_var_ranges(void)
++{
++ return 2;
++}
++
+ static struct mtrr_ops amd_mtrr_ops = {
+ .vendor = X86_VENDOR_AMD,
+ .set = amd_set_mtrr,
+@@ -115,6 +120,7 @@ static struct mtrr_ops amd_mtrr_ops = {
+ .get_free_region = generic_get_free_region,
+ .validate_add_page = amd_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = amd_num_var_ranges,
+ };
+
+ int __init amd_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
+index de89f14..7c686a0 100644
+--- a/arch/x86/kernel/cpu/mtrr/centaur.c
++++ b/arch/x86/kernel/cpu/mtrr/centaur.c
+@@ -110,6 +110,11 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
+ return 0;
+ }
+
++static int centaur_num_var_ranges(void)
++{
++ return 8;
++}
++
+ static struct mtrr_ops centaur_mtrr_ops = {
+ .vendor = X86_VENDOR_CENTAUR,
+ .set = centaur_set_mcr,
+@@ -117,6 +122,7 @@ static struct mtrr_ops centaur_mtrr_ops = {
+ .get_free_region = centaur_get_free_region,
+ .validate_add_page = centaur_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = centaur_num_var_ranges,
+ };
+
+ int __init centaur_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
+index 228d982..fd6edcc 100644
+--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
++++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
+@@ -265,6 +265,11 @@ static void cyrix_set_all(void)
+ post_set();
+ }
+
++static int cyrix_num_var_ranges(void)
++{
++ return 8;
++}
++
+ static struct mtrr_ops cyrix_mtrr_ops = {
+ .vendor = X86_VENDOR_CYRIX,
+ .set_all = cyrix_set_all,
+@@ -273,6 +278,7 @@ static struct mtrr_ops cyrix_mtrr_ops = {
+ .get_free_region = cyrix_get_free_region,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = positive_have_wrcomb,
++ .num_var_ranges = cyrix_num_var_ranges,
+ };
+
+ int __init cyrix_init_mtrr(void)
+diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
+index 55da0c5..42f30cd 100644
+--- a/arch/x86/kernel/cpu/mtrr/generic.c
++++ b/arch/x86/kernel/cpu/mtrr/generic.c
+@@ -749,8 +749,16 @@ int positive_have_wrcomb(void)
+ return 1;
+ }
+
+-/*
+- * Generic structure...
++static int generic_num_var_ranges(void)
++{
++ unsigned long config = 0, dummy;
++
++ rdmsr(MSR_MTRRcap, config, dummy);
++
++ return config & 0xff;
++}
++
++/* Generic structure...
+ */
+ struct mtrr_ops generic_mtrr_ops = {
+ .use_intel_if = 1,
+@@ -760,4 +768,5 @@ struct mtrr_ops generic_mtrr_ops = {
+ .set = generic_set_mtrr,
+ .validate_add_page = generic_validate_add_page,
+ .have_wrcomb = generic_have_wrcomb,
++ .num_var_ranges = generic_num_var_ranges,
+ };
+diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
+index 84e83de..c8cb9ed 100644
+--- a/arch/x86/kernel/cpu/mtrr/main.c
++++ b/arch/x86/kernel/cpu/mtrr/main.c
+@@ -110,21 +110,6 @@ static int have_wrcomb(void)
+ return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
+ }
+
+-/* This function returns the number of variable MTRRs */
+-static void __init set_num_var_ranges(void)
+-{
+- unsigned long config = 0, dummy;
+-
+- if (use_intel())
+- rdmsr(MSR_MTRRcap, config, dummy);
+- else if (is_cpu(AMD))
+- config = 2;
+- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
+- config = 8;
+-
+- num_var_ranges = config & 0xff;
+-}
+-
+ static void __init init_table(void)
+ {
+ int i, max;
+@@ -711,8 +696,11 @@ void __init mtrr_bp_init(void)
+ }
+ }
+
++ /* Let Xen code override the above if it wants */
++ xen_init_mtrr();
++
+ if (mtrr_if) {
+- set_num_var_ranges();
++ num_var_ranges = mtrr_if->num_var_ranges();
+ init_table();
+ if (use_intel()) {
+ get_mtrr_state();
+diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
+index a501dee..98569c3 100644
+--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
++++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
+@@ -5,6 +5,8 @@
+ #include <linux/types.h>
+ #include <linux/stddef.h>
+
++#include <asm/mtrr.h>
++
+ #define MTRR_CHANGE_MASK_FIXED 0x01
+ #define MTRR_CHANGE_MASK_VARIABLE 0x02
+ #define MTRR_CHANGE_MASK_DEFTYPE 0x04
+@@ -25,6 +27,8 @@ struct mtrr_ops {
+ int (*validate_add_page)(unsigned long base, unsigned long size,
+ unsigned int type);
+ int (*have_wrcomb)(void);
++
++ int (*num_var_ranges)(void);
+ };
+
+ extern int generic_get_free_region(unsigned long base, unsigned long size,
+@@ -73,6 +77,13 @@ void mtrr_wrmsr(unsigned, unsigned, unsigned);
+ int amd_init_mtrr(void);
+ int cyrix_init_mtrr(void);
+ int centaur_init_mtrr(void);
++#ifdef CONFIG_XEN_DOM0
++void xen_init_mtrr(void);
++#else
++static inline void xen_init_mtrr(void)
++{
++}
++#endif
+
+ extern int changed_by_mtrr_cleanup;
+ extern int mtrr_cleanup(unsigned address_bits);
+diff --git a/arch/x86/kernel/cpu/mtrr/xen.c b/arch/x86/kernel/cpu/mtrr/xen.c
+new file mode 100644
+index 0000000..852018b
+--- /dev/null
++++ b/arch/x86/kernel/cpu/mtrr/xen.c
+@@ -0,0 +1,109 @@
++#include <linux/init.h>
++#include <linux/mm.h>
++
++#include <asm/pat.h>
++
++#include "mtrr.h"
++
++#include <xen/xen.h>
++#include <xen/interface/platform.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++static void xen_set_mtrr(unsigned int reg, unsigned long base,
++ unsigned long size, mtrr_type type)
++{
++ struct xen_platform_op op;
++ int error;
++
++ /* mtrr_ops->set() is called once per CPU,
++ * but Xen's ops apply to all CPUs.
++ */
++ if (smp_processor_id())
++ return;
++
++ if (size == 0) {
++ op.cmd = XENPF_del_memtype;
++ op.u.del_memtype.handle = 0;
++ op.u.del_memtype.reg = reg;
++ } else {
++ op.cmd = XENPF_add_memtype;
++ op.u.add_memtype.mfn = base;
++ op.u.add_memtype.nr_mfns = size;
++ op.u.add_memtype.type = type;
++ }
++
++ error = HYPERVISOR_dom0_op(&op);
++ BUG_ON(error != 0);
++}
++
++static void xen_get_mtrr(unsigned int reg, unsigned long *base,
++ unsigned long *size, mtrr_type *type)
++{
++ struct xen_platform_op op;
++
++ op.cmd = XENPF_read_memtype;
++ op.u.read_memtype.reg = reg;
++ if (HYPERVISOR_dom0_op(&op) != 0) {
++ *base = 0;
++ *size = 0;
++ *type = 0;
++ return;
++ }
++
++ *size = op.u.read_memtype.nr_mfns;
++ *base = op.u.read_memtype.mfn;
++ *type = op.u.read_memtype.type;
++}
++
++static int __init xen_num_var_ranges(void)
++{
++ int ranges;
++ struct xen_platform_op op;
++
++ op.cmd = XENPF_read_memtype;
++
++ for (ranges = 0; ; ranges++) {
++ op.u.read_memtype.reg = ranges;
++ if (HYPERVISOR_dom0_op(&op) != 0)
++ break;
++ }
++ return ranges;
++}
++
++/*
++ * DOM0 TODO: Need to fill in the remaining mtrr methods to have full
++ * working userland mtrr support.
++ */
++static struct mtrr_ops xen_mtrr_ops = {
++ .vendor = X86_VENDOR_UNKNOWN,
++ .get_free_region = generic_get_free_region,
++ .set = xen_set_mtrr,
++ .get = xen_get_mtrr,
++ .have_wrcomb = positive_have_wrcomb,
++ .validate_add_page = generic_validate_add_page,
++ .use_intel_if = 0,
++ .num_var_ranges = xen_num_var_ranges,
++};
++
++void __init xen_init_mtrr(void)
++{
++ /*
++ * Check that we're running under Xen, and privileged enough
++ * to play with MTRRs.
++ */
++ if (!xen_initial_domain())
++ return;
++
++ /*
++ * Check that the CPU has an MTRR implementation we can
++ * support.
++ */
++ if (cpu_has_mtrr ||
++ cpu_has_k6_mtrr ||
++ cpu_has_cyrix_arr ||
++ cpu_has_centaur_mcr) {
++ mtrr_if = &xen_mtrr_ops;
++ pat_init();
++ }
++}
+diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
+index ff95824..ebd4c51 100644
+--- a/arch/x86/kernel/crash.c
++++ b/arch/x86/kernel/crash.c
+@@ -28,7 +28,6 @@
+ #include <asm/reboot.h>
+ #include <asm/virtext.h>
+
+-
+ #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
+
+ static void kdump_nmi_callback(int cpu, struct die_args *args)
+diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
+index d17d482..4d0aded 100644
+--- a/arch/x86/kernel/e820.c
++++ b/arch/x86/kernel/e820.c
+@@ -750,6 +750,36 @@ static int __init find_overlapped_early(u64 start, u64 end)
+ return i;
+ }
+
++u64 __init early_res_next_free(u64 addr)
++{
++ int i;
++ u64 end = addr;
++ struct early_res *r;
++
++ for (i = 0; i < MAX_EARLY_RES; i++) {
++ r = &early_res[i];
++ if (addr >= r->start && addr < r->end) {
++ end = r->end;
++ break;
++ }
++ }
++ return end;
++}
++
++u64 __init early_res_next_reserved(u64 addr, u64 max)
++{
++ int i;
++ struct early_res *r;
++ u64 next_res = max;
++
++ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
++ r = &early_res[i];
++ if ((r->start >= addr) && (r->start < next_res))
++ next_res = r->start;
++ }
++ return next_res;
++}
++
+ /*
+ * Drop the i-th range from the early reservation map,
+ * by copying any higher ranges down one over it, and
+diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
+index c097e7d..7764118 100644
+--- a/arch/x86/kernel/entry_32.S
++++ b/arch/x86/kernel/entry_32.S
+@@ -1088,6 +1088,9 @@ ENTRY(xen_failsafe_callback)
+ .previous
+ ENDPROC(xen_failsafe_callback)
+
++BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
++ xen_evtchn_do_upcall)
++
+ #endif /* CONFIG_XEN */
+
+ #ifdef CONFIG_FUNCTION_TRACER
+diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
+index b5c061f..a626344 100644
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1364,6 +1364,9 @@ ENTRY(xen_failsafe_callback)
+ CFI_ENDPROC
+ END(xen_failsafe_callback)
+
++apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
++ xen_hvm_callback_vector xen_evtchn_do_upcall
++
+ #endif /* CONFIG_XEN */
+
+ /*
+diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
+index 19528ef..40e47cd 100644
+--- a/arch/x86/kernel/hpet.c
++++ b/arch/x86/kernel/hpet.c
+@@ -98,7 +98,7 @@ static int __init hpet_setup(char *str)
+ }
+ __setup("hpet=", hpet_setup);
+
+-static int __init disable_hpet(char *str)
++int __init disable_hpet(char *str)
+ {
+ boot_hpet_disable = 1;
+ return 1;
+diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
+index 99c4d30..919c1a8 100644
+--- a/arch/x86/kernel/ioport.c
++++ b/arch/x86/kernel/ioport.c
+@@ -30,13 +30,29 @@ static void set_bitmap(unsigned long *bitmap, unsigned int base,
+ }
+ }
+
++void native_set_io_bitmap(struct thread_struct *t,
++ unsigned long bytes_updated)
++{
++ struct tss_struct *tss;
++
++ if (!bytes_updated)
++ return;
++
++ tss = &__get_cpu_var(init_tss);
++
++ /* Update the TSS: */
++ if (t->io_bitmap_ptr)
++ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
++ else
++ memset(tss->io_bitmap, 0xff, bytes_updated);
++}
++
+ /*
+ * this changes the io permissions bitmap in the current task.
+ */
+ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+ {
+ struct thread_struct *t = &current->thread;
+- struct tss_struct *tss;
+ unsigned int i, max_long, bytes, bytes_updated;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+@@ -61,13 +77,13 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+ }
+
+ /*
+- * do it in the per-thread copy and in the TSS ...
++ * do it in the per-thread copy
+ *
+- * Disable preemption via get_cpu() - we must not switch away
++ * Disable preemption - we must not switch away
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+- tss = &per_cpu(init_tss, get_cpu());
++ preempt_disable();
+
+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+@@ -85,10 +101,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+
+ t->io_bitmap_max = bytes;
+
+- /* Update the TSS: */
+- memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
++ set_io_bitmap(t, bytes_updated);
+
+- put_cpu();
++ preempt_enable();
+
+ return 0;
+ }
+@@ -119,11 +134,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
+ return 0;
+ }
+
+-#ifdef CONFIG_X86_32
+-long sys_iopl(struct pt_regs *regs)
++asmlinkage long sys_iopl(unsigned int level)
+ {
+- unsigned int level = regs->bx;
+ struct thread_struct *t = &current->thread;
++ struct pt_regs *regs = task_pt_regs(current);
+ int rc;
+
+ rc = do_iopl(level, regs);
+@@ -135,9 +149,3 @@ long sys_iopl(struct pt_regs *regs)
+ out:
+ return rc;
+ }
+-#else
+-asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+-{
+- return do_iopl(level, regs);
+-}
+-#endif
+diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
+index ec6ef60..fa5b061 100644
+--- a/arch/x86/kernel/ldt.c
++++ b/arch/x86/kernel/ldt.c
+@@ -109,6 +109,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+
+ mutex_init(&mm->context.lock);
+ mm->context.size = 0;
++#ifdef CONFIG_XEN
++ mm->context.has_foreign_mappings = 0;
++#endif
+ old_mm = current->mm;
+ if (old_mm && old_mm->context.size > 0) {
+ mutex_lock(&old_mm->context.lock);
+diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
+index 378e9a8..86ca771 100644
+--- a/arch/x86/kernel/microcode_core.c
++++ b/arch/x86/kernel/microcode_core.c
+@@ -81,6 +81,8 @@
+ #include <linux/fs.h>
+ #include <linux/mm.h>
+
++#include <xen/xen.h>
++#include <asm/xen/hypervisor.h>
+ #include <asm/microcode.h>
+ #include <asm/processor.h>
+
+@@ -503,7 +505,9 @@ static int __init microcode_init(void)
+ struct cpuinfo_x86 *c = &cpu_data(0);
+ int error;
+
+- if (c->x86_vendor == X86_VENDOR_INTEL)
++ if (xen_pv_domain())
++ microcode_ops = init_xen_microcode();
++ else if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+diff --git a/arch/x86/kernel/microcode_xen.c b/arch/x86/kernel/microcode_xen.c
+new file mode 100644
+index 0000000..16c742e
+--- /dev/null
++++ b/arch/x86/kernel/microcode_xen.c
+@@ -0,0 +1,201 @@
++/*
++ * Xen microcode update driver
++ *
++ * Xen does most of the work here. We just pass the whole blob into
++ * Xen, and it will apply it to all CPUs as appropriate. Xen will
++ * worry about how different CPU models are actually updated.
++ */
++#include <linux/sched.h>
++#include <linux/module.h>
++#include <linux/firmware.h>
++#include <linux/vmalloc.h>
++#include <linux/uaccess.h>
++
++#include <asm/microcode.h>
++
++#include <xen/xen.h>
++#include <xen/interface/platform.h>
++#include <xen/interface/xen.h>
++
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
++MODULE_DESCRIPTION("Xen microcode update driver");
++MODULE_LICENSE("GPL");
++
++struct xen_microcode {
++ size_t len;
++ char data[0];
++};
++
++static int xen_microcode_update(int cpu)
++{
++ int err;
++ struct xen_platform_op op;
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++ struct xen_microcode *uc = uci->mc;
++
++ if (uc == NULL || uc->len == 0) {
++ /*
++ * We do all cpus at once, so we don't need to do
++ * other cpus explicitly (besides, these vcpu numbers
++ * have no relationship to underlying physical cpus).
++ */
++ return 0;
++ }
++
++ op.cmd = XENPF_microcode_update;
++ set_xen_guest_handle(op.u.microcode.data, uc->data);
++ op.u.microcode.length = uc->len;
++
++ err = HYPERVISOR_dom0_op(&op);
++
++ if (err != 0)
++ printk(KERN_WARNING "microcode_xen: microcode update failed: %d\n", err);
++
++ return err;
++}
++
++static enum ucode_state xen_request_microcode_fw(int cpu, struct device *device)
++{
++ char name[30];
++ struct cpuinfo_x86 *c = &cpu_data(cpu);
++ const struct firmware *firmware;
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++ enum ucode_state ret;
++ struct xen_microcode *uc;
++ size_t size;
++ int err;
++
++ switch (c->x86_vendor) {
++ case X86_VENDOR_INTEL:
++ snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x",
++ c->x86, c->x86_model, c->x86_mask);
++ break;
++
++ case X86_VENDOR_AMD:
++ snprintf(name, sizeof(name), "amd-ucode/microcode_amd.bin");
++ break;
++
++ default:
++ return UCODE_NFOUND;
++ }
++
++ err = request_firmware(&firmware, name, device);
++ if (err) {
++ pr_debug("microcode: data file %s load failed\n", name);
++ return UCODE_NFOUND;
++ }
++
++ /*
++ * Only bother getting real firmware for cpu 0; the others get
++ * dummy placeholders.
++ */
++ if (cpu == 0)
++ size = firmware->size;
++ else
++ size = 0;
++
++ if (uci->mc != NULL) {
++ vfree(uci->mc);
++ uci->mc = NULL;
++ }
++
++ ret = UCODE_ERROR;
++ uc = vmalloc(sizeof(*uc) + size);
++ if (uc == NULL)
++ goto out;
++
++ ret = UCODE_OK;
++ uc->len = size;
++ memcpy(uc->data, firmware->data, uc->len);
++
++ uci->mc = uc;
++
++out:
++ release_firmware(firmware);
++
++ return ret;
++}
++
++static enum ucode_state xen_request_microcode_user(int cpu,
++ const void __user *buf, size_t size)
++{
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++ struct xen_microcode *uc;
++ enum ucode_state ret;
++ size_t unread;
++
++ if (cpu != 0) {
++ /* No real firmware for non-zero cpus; just store a
++ placeholder */
++ size = 0;
++ }
++
++ if (uci->mc != NULL) {
++ vfree(uci->mc);
++ uci->mc = NULL;
++ }
++
++ ret = UCODE_ERROR;
++ uc = vmalloc(sizeof(*uc) + size);
++ if (uc == NULL)
++ goto out;
++
++ uc->len = size;
++
++ ret = UCODE_NFOUND;
++
++ /* XXX This sporadically returns uncopied bytes, so we return
++ EFAULT. As far as I can see, the usermode code
++ (microcode_ctl) isn't doing anything wrong... */
++ unread = copy_from_user(uc->data, buf, size);
++
++ if (unread != 0) {
++ printk(KERN_WARNING "failed to read %zd of %zd bytes at %p -> %p\n",
++ unread, size, buf, uc->data);
++ goto out;
++ }
++
++ ret = UCODE_OK;
++
++out:
++	if (ret == UCODE_OK)
++ uci->mc = uc;
++ else
++ vfree(uc);
++
++ return ret;
++}
++
++static void xen_microcode_fini_cpu(int cpu)
++{
++ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
++
++ vfree(uci->mc);
++ uci->mc = NULL;
++}
++
++static int xen_collect_cpu_info(int cpu, struct cpu_signature *sig)
++{
++ sig->sig = 0;
++ sig->pf = 0;
++ sig->rev = 0;
++
++ return 0;
++}
++
++static struct microcode_ops microcode_xen_ops = {
++ .request_microcode_user = xen_request_microcode_user,
++ .request_microcode_fw = xen_request_microcode_fw,
++ .collect_cpu_info = xen_collect_cpu_info,
++ .apply_microcode = xen_microcode_update,
++ .microcode_fini_cpu = xen_microcode_fini_cpu,
++};
++
++struct microcode_ops * __init init_xen_microcode(void)
++{
++ if (!xen_initial_domain())
++ return NULL;
++ return &microcode_xen_ops;
++}
+diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
+index 1b1739d..f7e115c 100644
+--- a/arch/x86/kernel/paravirt.c
++++ b/arch/x86/kernel/paravirt.c
+@@ -376,6 +376,7 @@ struct pv_cpu_ops pv_cpu_ops = {
+ .swapgs = native_swapgs,
+
+ .set_iopl_mask = native_set_iopl_mask,
++ .set_io_bitmap = native_set_io_bitmap,
+ .io_delay = native_io_delay,
+
+ .start_context_switch = paravirt_nop,
+diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
+index 1a2d4b1..2f158a5 100644
+--- a/arch/x86/kernel/pci-calgary_64.c
++++ b/arch/x86/kernel/pci-calgary_64.c
+@@ -46,6 +46,7 @@
+ #include <asm/dma.h>
+ #include <asm/rio.h>
+ #include <asm/bios_ebda.h>
++#include <asm/x86_init.h>
+
+ #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
+ int use_calgary __read_mostly = 1;
+@@ -249,7 +250,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
+ if (panic_on_overflow)
+ panic("Calgary: fix the allocator.\n");
+ else
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+ }
+
+@@ -265,11 +266,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+ void *vaddr, unsigned int npages, int direction)
+ {
+ unsigned long entry;
+- dma_addr_t ret = bad_dma_address;
++ dma_addr_t ret = DMA_ERROR_CODE;
+
+ entry = iommu_range_alloc(dev, tbl, npages);
+
+- if (unlikely(entry == bad_dma_address))
++ if (unlikely(entry == DMA_ERROR_CODE))
+ goto error;
+
+ /* set the return dma address */
+@@ -284,7 +285,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
+ error:
+ printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+ "iommu %p\n", npages, tbl);
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ }
+
+ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+@@ -295,8 +296,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+ unsigned long flags;
+
+ /* were we called with bad_dma_address? */
+- badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
+- if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
++ badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
++ if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
+ WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
+ "address 0x%Lx\n", dma_addr);
+ return;
+@@ -380,7 +381,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+ npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
+
+ entry = iommu_range_alloc(dev, tbl, npages);
+- if (entry == bad_dma_address) {
++ if (entry == DMA_ERROR_CODE) {
+ /* makes sure unmap knows to stop */
+ s->dma_length = 0;
+ goto error;
+@@ -398,7 +399,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+ error:
+ calgary_unmap_sg(dev, sg, nelems, dir, NULL);
+ for_each_sg(sg, s, nelems, i) {
+- sg->dma_address = bad_dma_address;
++ sg->dma_address = DMA_ERROR_CODE;
+ sg->dma_length = 0;
+ }
+ return 0;
+@@ -453,7 +454,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
+
+ /* set up tces to cover the allocated range */
+ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+- if (mapping == bad_dma_address)
++ if (mapping == DMA_ERROR_CODE)
+ goto free;
+ *dma_handle = mapping;
+ return ret;
+@@ -734,7 +735,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
+ struct iommu_table *tbl = pci_iommu(dev->bus);
+
+ /* reserve EMERGENCY_PAGES from bad_dma_address and up */
+- iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES);
++ iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
+
+ /* avoid the BIOS/VGA first 640KB-1MB region */
+ /* for CalIOC2 - avoid the entire first MB */
+@@ -1349,6 +1350,23 @@ static void __init get_tce_space_from_tar(void)
+ return;
+ }
+
++static int __init calgary_iommu_init(void)
++{
++ int ret;
++
++ /* ok, we're trying to use Calgary - let's roll */
++ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
++
++ ret = calgary_init();
++ if (ret) {
++ printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
++ "falling back to no_iommu\n", ret);
++ return ret;
++ }
++
++ return 0;
++}
++
+ void __init detect_calgary(void)
+ {
+ int bus;
+@@ -1362,7 +1380,7 @@ void __init detect_calgary(void)
+ * if the user specified iommu=off or iommu=soft or we found
+ * another HW IOMMU already, bail out.
+ */
+- if (swiotlb || no_iommu || iommu_detected)
++ if (no_iommu || iommu_detected)
+ return;
+
+ if (!use_calgary)
+@@ -1447,9 +1465,7 @@ void __init detect_calgary(void)
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+ specified_table_size);
+
+- /* swiotlb for devices that aren't behind the Calgary. */
+- if (max_pfn > MAX_DMA32_PFN)
+- swiotlb = 1;
++ x86_init.iommu.iommu_init = calgary_iommu_init;
+ }
+ return;
+
+@@ -1462,35 +1478,6 @@ cleanup:
+ }
+ }
+
+-int __init calgary_iommu_init(void)
+-{
+- int ret;
+-
+- if (no_iommu || (swiotlb && !calgary_detected))
+- return -ENODEV;
+-
+- if (!calgary_detected)
+- return -ENODEV;
+-
+- /* ok, we're trying to use Calgary - let's roll */
+- printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+-
+- ret = calgary_init();
+- if (ret) {
+- printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+- "falling back to no_iommu\n", ret);
+- return ret;
+- }
+-
+- force_iommu = 1;
+- bad_dma_address = 0x0;
+- /* dma_ops is set to swiotlb or nommu */
+- if (!dma_ops)
+- dma_ops = &nommu_dma_ops;
+-
+- return 0;
+-}
+-
+ static int __init calgary_parse_options(char *p)
+ {
+ unsigned int bridge;
+diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
+index 6ac3931..3e57c58 100644
+--- a/arch/x86/kernel/pci-dma.c
++++ b/arch/x86/kernel/pci-dma.c
+@@ -11,10 +11,12 @@
+ #include <asm/gart.h>
+ #include <asm/calgary.h>
+ #include <asm/amd_iommu.h>
++#include <asm/x86_init.h>
++#include <asm/xen/swiotlb-xen.h>
+
+ static int forbid_dac __read_mostly;
+
+-struct dma_map_ops *dma_ops;
++struct dma_map_ops *dma_ops = &nommu_dma_ops;
+ EXPORT_SYMBOL(dma_ops);
+
+ static int iommu_sac_force __read_mostly;
+@@ -42,9 +44,6 @@ int iommu_detected __read_mostly = 0;
+ */
+ int iommu_pass_through __read_mostly;
+
+-dma_addr_t bad_dma_address __read_mostly = 0;
+-EXPORT_SYMBOL(bad_dma_address);
+-
+ /* Dummy device used for NULL arguments (normally ISA). */
+ struct device x86_dma_fallback_dev = {
+ .init_name = "fallback device",
+@@ -126,18 +125,19 @@ void __init pci_iommu_alloc(void)
+ /* free the range so iommu could get some range less than 4G */
+ dma32_free_bootmem();
+ #endif
++ if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
++ goto out;
+
+- /*
+- * The order of these functions is important for
+- * fall-back/fail-over reasons
+- */
+ gart_iommu_hole_init();
+
+ detect_calgary();
+
+ detect_intel_iommu();
+
++ /* needs to be called after gart_iommu_hole_init */
+ amd_iommu_detect();
++out:
++ pci_xen_swiotlb_init();
+
+ pci_swiotlb_init();
+ }
+@@ -289,25 +289,17 @@ static int __init pci_iommu_init(void)
+ #ifdef CONFIG_PCI
+ dma_debug_add_bus(&pci_bus_type);
+ #endif
++ x86_init.iommu.iommu_init();
+
+- calgary_iommu_init();
+-
+- intel_iommu_init();
+-
+- amd_iommu_init();
++ if (swiotlb || xen_swiotlb) {
++ printk(KERN_INFO "PCI-DMA: "
++ "Using software bounce buffering for IO (SWIOTLB)\n");
++ swiotlb_print_info();
++ } else
++ swiotlb_free();
+
+- gart_iommu_init();
+-
+- no_iommu_init();
+ return 0;
+ }
+-
+-void pci_iommu_shutdown(void)
+-{
+- gart_iommu_shutdown();
+-
+- amd_iommu_shutdown();
+-}
+ /* Must execute after PCI subsystem */
+ rootfs_initcall(pci_iommu_init);
+
+diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
+index 1c76691..8c9dd05 100644
+--- a/arch/x86/kernel/pci-gart_64.c
++++ b/arch/x86/kernel/pci-gart_64.c
+@@ -39,6 +39,7 @@
+ #include <asm/swiotlb.h>
+ #include <asm/dma.h>
+ #include <asm/k8.h>
++#include <asm/x86_init.h>
+
+ static unsigned long iommu_bus_base; /* GART remapping area (physical) */
+ static unsigned long iommu_size; /* size of remapping area bytes */
+@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
+
+ static u32 *iommu_gatt_base; /* Remapping table */
+
++static dma_addr_t bad_dma_addr;
++
+ /*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when an mapping is reused. With it true the GART is
+@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+ if (panic_on_overflow)
+ panic("dma_map_area overflow %lu bytes\n", size);
+ iommu_full(dev, size, dir);
+- return bad_dma_address;
++ return bad_dma_addr;
+ }
+
+ for (i = 0; i < npages; i++) {
+@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+
+ if (nonforced_iommu(dev, addr, s->length)) {
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
+- if (addr == bad_dma_address) {
++ if (addr == bad_dma_addr) {
+ if (i > 0)
+ gart_unmap_sg(dev, sg, i, dir, NULL);
+ nents = 0;
+@@ -455,7 +458,7 @@ error:
+
+ iommu_full(dev, pages << PAGE_SHIFT, dir);
+ for_each_sg(sg, s, nents, i)
+- s->dma_address = bad_dma_address;
++ s->dma_address = bad_dma_addr;
+ return 0;
+ }
+
+@@ -479,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+ DMA_BIDIRECTIONAL, align_mask);
+
+ flush_gart();
+- if (paddr != bad_dma_address) {
++ if (paddr != bad_dma_addr) {
+ *dma_addr = paddr;
+ return page_address(page);
+ }
+@@ -499,6 +502,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ free_pages((unsigned long)vaddr, get_order(size));
+ }
+
++static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
++{
++ return (dma_addr == bad_dma_addr);
++}
++
+ static int no_agp;
+
+ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+@@ -689,14 +697,15 @@ static struct dma_map_ops gart_dma_ops = {
+ .unmap_page = gart_unmap_page,
+ .alloc_coherent = gart_alloc_coherent,
+ .free_coherent = gart_free_coherent,
++ .mapping_error = gart_mapping_error,
+ };
+
+-void gart_iommu_shutdown(void)
++static void gart_iommu_shutdown(void)
+ {
+ struct pci_dev *dev;
+ int i;
+
+- if (no_agp && (dma_ops != &gart_dma_ops))
++ if (no_agp)
+ return;
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+@@ -711,7 +720,7 @@ void gart_iommu_shutdown(void)
+ }
+ }
+
+-void __init gart_iommu_init(void)
++int __init gart_iommu_init(void)
+ {
+ struct agp_kern_info info;
+ unsigned long iommu_start;
+@@ -721,7 +730,7 @@ void __init gart_iommu_init(void)
+ long i;
+
+ if (num_k8_northbridges == 0)
+- return;
++ return 0;
+
+ #ifndef CONFIG_AGP_AMD64
+ no_agp = 1;
+@@ -733,13 +742,6 @@ void __init gart_iommu_init(void)
+ (agp_copy_info(agp_bridge, &info) < 0);
+ #endif
+
+- if (swiotlb)
+- return;
+-
+- /* Did we detect a different HW IOMMU? */
+- if (iommu_detected && !gart_iommu_aperture)
+- return;
+-
+ if (no_iommu ||
+ (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
+ !gart_iommu_aperture ||
+@@ -749,7 +751,7 @@ void __init gart_iommu_init(void)
+ "but GART IOMMU not available.\n");
+ printk(KERN_WARNING "falling back to iommu=soft.\n");
+ }
+- return;
++ return 0;
+ }
+
+ /* need to map that range */
+@@ -794,7 +796,7 @@ void __init gart_iommu_init(void)
+
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+- bad_dma_address = iommu_bus_base;
++ bad_dma_addr = iommu_bus_base;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+ /*
+@@ -841,6 +843,10 @@ void __init gart_iommu_init(void)
+
+ flush_gart();
+ dma_ops = &gart_dma_ops;
++ x86_platform.iommu_shutdown = gart_iommu_shutdown;
++ swiotlb = 0;
++
++ return 0;
+ }
+
+ void __init gart_parse_options(char *p)
+diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
+index a3933d4..22be12b 100644
+--- a/arch/x86/kernel/pci-nommu.c
++++ b/arch/x86/kernel/pci-nommu.c
+@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
+ dma_addr_t bus = page_to_phys(page) + offset;
+ WARN_ON(size == 0);
+ if (!check_addr("map_single", dev, bus, size))
+- return bad_dma_address;
++ return DMA_ERROR_CODE;
+ flush_write_buffers();
+ return bus;
+ }
+@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
+ .sync_sg_for_device = nommu_sync_sg_for_device,
+ .is_phys = 1,
+ };
+-
+-void __init no_iommu_init(void)
+-{
+- if (dma_ops)
+- return;
+-
+- force_iommu = 0; /* no HW IOMMU */
+- dma_ops = &nommu_dma_ops;
+-}
+diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
+index aaa6b78..7d2829d 100644
+--- a/arch/x86/kernel/pci-swiotlb.c
++++ b/arch/x86/kernel/pci-swiotlb.c
+@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
+ .dma_supported = NULL,
+ };
+
+-void __init pci_swiotlb_init(void)
++/*
++ * pci_swiotlb_detect - set swiotlb to 1 if necessary
++ *
++ * This returns non-zero if we are forced to use swiotlb (by the boot
++ * option).
++ */
++int __init pci_swiotlb_detect(void)
+ {
++ int use_swiotlb = swiotlb | swiotlb_force;
++
+ /* don't initialize swiotlb if iommu=off (no_iommu=1) */
+ #ifdef CONFIG_X86_64
+- if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
++ if (!no_iommu && max_pfn > MAX_DMA32_PFN)
+ swiotlb = 1;
+ #endif
+ if (swiotlb_force)
+ swiotlb = 1;
++
++ return use_swiotlb;
++}
++
++void __init pci_swiotlb_init(void)
++{
+ if (swiotlb) {
+- printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+- swiotlb_init();
++ swiotlb_init(0);
+ dma_ops = &swiotlb_dma_ops;
+ }
+ }
+diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
+index 5fd5b07..11d8667 100644
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -73,16 +73,12 @@ void exit_thread(void)
+ unsigned long *bp = t->io_bitmap_ptr;
+
+ if (bp) {
+- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+-
++ preempt_disable();
+ t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
+- /*
+- * Careful, clear this in the TSS too:
+- */
+- memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
++ set_io_bitmap(t, t->io_bitmap_max);
+ t->io_bitmap_max = 0;
+- put_cpu();
++ preempt_enable();
+ kfree(bp);
+ }
+ }
+@@ -199,19 +195,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ hard_enable_TSC();
+ }
+
+- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+- /*
+- * Copy the relevant range of the IO bitmap.
+- * Normally this is 128 bytes or less:
+- */
+- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+- max(prev->io_bitmap_max, next->io_bitmap_max));
+- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+- /*
+- * Clear any possible leftover bits:
+- */
+- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+- }
++ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP) ||
++ test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
++ set_io_bitmap(next,
++ max(prev->io_bitmap_max, next->io_bitmap_max));
+ }
+
+ int sys_fork(struct pt_regs *regs)
+diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
+index 269c2a3..8e1aac8 100644
+--- a/arch/x86/kernel/reboot.c
++++ b/arch/x86/kernel/reboot.c
+@@ -23,7 +23,7 @@
+ # include <linux/ctype.h>
+ # include <linux/mc146818rtc.h>
+ #else
+-# include <asm/iommu.h>
++# include <asm/x86_init.h>
+ #endif
+
+ /*
+@@ -647,7 +647,7 @@ void native_machine_shutdown(void)
+ #endif
+
+ #ifdef CONFIG_X86_64
+- pci_iommu_shutdown();
++ x86_platform.iommu_shutdown();
+ #endif
+ }
+
+diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
+index d7a0888..594e324 100644
+--- a/arch/x86/kernel/setup.c
++++ b/arch/x86/kernel/setup.c
+@@ -70,6 +70,7 @@
+ #include <linux/tboot.h>
+
+ #include <video/edid.h>
++#include <xen/xen.h>
+
+ #include <asm/mtrr.h>
+ #include <asm/apic.h>
+@@ -89,6 +90,7 @@
+ #include <asm/cacheflush.h>
+ #include <asm/processor.h>
+ #include <asm/bugs.h>
++#include <asm/tlbflush.h>
+
+ #include <asm/system.h>
+ #include <asm/vsyscall.h>
+@@ -966,6 +968,9 @@ void __init setup_arch(char **cmdline_p)
+
+ initmem_init(0, max_pfn);
+
++ /* Initialize cross-cpu tlb flushes */
++ init_smp_flush();
++
+ #ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+@@ -1034,6 +1039,7 @@ void __init setup_arch(char **cmdline_p)
+ probe_nr_irqs_gsi();
+
+ kvm_guest_init();
++ xen_hvm_guest_init();
+
+ e820_reserve_resources();
+ e820_mark_nosave_regions(max_low_pfn);
+diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
+index 4449a4a..d11c5ff 100644
+--- a/arch/x86/kernel/x86_init.c
++++ b/arch/x86/kernel/x86_init.c
+@@ -14,10 +14,13 @@
+ #include <asm/time.h>
+ #include <asm/irq.h>
+ #include <asm/tsc.h>
++#include <asm/iommu.h>
+
+ void __cpuinit x86_init_noop(void) { }
+ void __init x86_init_uint_noop(unsigned int unused) { }
+ void __init x86_init_pgd_noop(pgd_t *unused) { }
++int __init iommu_init_noop(void) { return 0; }
++void iommu_shutdown_noop(void) { }
+
+ /*
+ * The platform setup functions are preset with the default functions
+@@ -62,6 +65,10 @@ struct x86_init_ops x86_init __initdata = {
+ .tsc_pre_init = x86_init_noop,
+ .timer_init = hpet_time_init,
+ },
++
++ .iommu = {
++ .iommu_init = iommu_init_noop,
++ },
+ };
+
+ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+@@ -72,4 +79,5 @@ struct x86_platform_ops x86_platform = {
+ .calibrate_tsc = native_calibrate_tsc,
+ .get_wallclock = mach_get_cmos_time,
+ .set_wallclock = mach_set_rtc_mmss,
++ .iommu_shutdown = iommu_shutdown_noop,
+ };
+diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
+index 06630d2..ad895ae 100644
+--- a/arch/x86/mm/Makefile
++++ b/arch/x86/mm/Makefile
+@@ -6,6 +6,11 @@ nostackp := $(call cc-option, -fno-stack-protector)
+ CFLAGS_physaddr.o := $(nostackp)
+ CFLAGS_setup_nx.o := $(nostackp)
+
++# Make sure __phys_addr has no stackprotector
++nostackp := $(call cc-option, -fno-stack-protector)
++CFLAGS_ioremap.o := $(nostackp)
++CFLAGS_init.o := $(nostackp)
++
+ obj-$(CONFIG_SMP) += tlb.o
+
+ obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
+diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
+index 1739358..e003b83 100644
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -228,7 +228,16 @@ void vmalloc_sync_all(void)
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+- if (!vmalloc_sync_one(page_address(page), address))
++ spinlock_t *pgt_lock;
++ int ret;
++
++ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
++
++ spin_lock(pgt_lock);
++ ret = vmalloc_sync_one(page_address(page), address);
++ spin_unlock(pgt_lock);
++
++ if (!ret)
+ break;
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+@@ -340,11 +349,19 @@ void vmalloc_sync_all(void)
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_for_each_entry(page, &pgd_list, lru) {
+ pgd_t *pgd;
++ spinlock_t *pgt_lock;
++
+ pgd = (pgd_t *)page_address(page) + pgd_index(address);
++
++ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
++ spin_lock(pgt_lock);
++
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ else
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
++
++ spin_unlock(pgt_lock);
+ }
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ }
+diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
+index 71da1bc..892b8eb 100644
+--- a/arch/x86/mm/gup.c
++++ b/arch/x86/mm/gup.c
+@@ -313,6 +313,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ goto slow_irqon;
+ #endif
+
++#ifdef CONFIG_XEN
++ if (unlikely(mm->context.has_foreign_mappings))
++ goto slow_irqon;
++#endif
++
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
+index 30938c1..10c3719 100644
+--- a/arch/x86/mm/init_32.c
++++ b/arch/x86/mm/init_32.c
+@@ -430,22 +430,45 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
+ {
+ int node_pfn;
+ struct page *page;
++ phys_addr_t chunk_end, chunk_max;
+ unsigned long final_start_pfn, final_end_pfn;
+- struct add_highpages_data *data;
+-
+- data = (struct add_highpages_data *)datax;
++ struct add_highpages_data *data = (struct add_highpages_data *)datax;
+
+ final_start_pfn = max(start_pfn, data->start_pfn);
+ final_end_pfn = min(end_pfn, data->end_pfn);
+ if (final_start_pfn >= final_end_pfn)
+ return 0;
+
+- for (node_pfn = final_start_pfn; node_pfn < final_end_pfn;
+- node_pfn++) {
+- if (!pfn_valid(node_pfn))
+- continue;
+- page = pfn_to_page(node_pfn);
+- add_one_highpage_init(page, node_pfn);
++ chunk_end = PFN_PHYS(final_start_pfn);
++ chunk_max = PFN_PHYS(final_end_pfn);
++
++ /*
++ * Check for reserved areas.
++ */
++ for (;;) {
++ phys_addr_t chunk_start;
++ chunk_start = early_res_next_free(chunk_end);
++
++ /*
++ * Reserved area. Just count high mem pages.
++ */
++ for (node_pfn = PFN_DOWN(chunk_end);
++ node_pfn < PFN_DOWN(chunk_start); node_pfn++) {
++ if (pfn_valid(node_pfn))
++ totalhigh_pages++;
++ }
++
++ if (chunk_start >= chunk_max)
++ break;
++
++ chunk_end = early_res_next_reserved(chunk_start, chunk_max);
++ for (node_pfn = PFN_DOWN(chunk_start);
++ node_pfn < PFN_DOWN(chunk_end); node_pfn++) {
++ if (!pfn_valid(node_pfn))
++ continue;
++ page = pfn_to_page(node_pfn);
++ add_one_highpage_init(page, node_pfn);
++ }
+ }
+
+ return 0;
+@@ -459,7 +482,6 @@ void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+
+ data.start_pfn = start_pfn;
+ data.end_pfn = end_pfn;
+-
+ work_with_active_regions(nid, add_highpages_work_fn, &data);
+ }
+
+diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
+index e78cd0e..fb91994 100644
+--- a/arch/x86/mm/pat.c
++++ b/arch/x86/mm/pat.c
+@@ -666,7 +666,7 @@ void io_free_memtype(resource_size_t start, resource_size_t end)
+ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot)
+ {
+- return vma_prot;
++ return __pgprot(pgprot_val(vma_prot) | _PAGE_IOMAP);
+ }
+
+ #ifdef CONFIG_STRICT_DEVMEM
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index c9ba9de..1fcc191 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -4,6 +4,9 @@
+ #include <asm/tlb.h>
+ #include <asm/fixmap.h>
+
++#include <xen/xen.h>
++#include <asm/xen/hypervisor.h>
++
+ #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
+ #ifdef CONFIG_HIGHPTE
+@@ -14,6 +17,16 @@
+
+ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
++pgprot_t arch_vm_get_page_prot(unsigned vm_flags)
++{
++ pgprot_t ret = __pgprot(0);
++
++ if (vm_flags & VM_IO)
++ ret = __pgprot(_PAGE_IOMAP);
++
++ return ret;
++}
++
+ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+ {
+ return (pte_t *)__get_free_page(PGALLOC_GFP);
+@@ -86,7 +99,19 @@ static inline void pgd_list_del(pgd_t *pgd)
+ #define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+-static void pgd_ctor(pgd_t *pgd)
++
++static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
++{
++ BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
++ virt_to_page(pgd)->index = (pgoff_t)mm;
++}
++
++struct mm_struct *pgd_page_get_mm(struct page *page)
++{
++ return (struct mm_struct *)page->index;
++}
++
++static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
+ {
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+@@ -104,8 +129,10 @@ static void pgd_ctor(pgd_t *pgd)
+ }
+
+ /* list required to sync kernel mapping updates */
+- if (!SHARED_KERNEL_PMD)
++ if (!SHARED_KERNEL_PMD) {
++ pgd_set_mm(pgd, mm);
+ pgd_list_add(pgd);
++ }
+ }
+
+ static void pgd_dtor(pgd_t *pgd)
+@@ -271,7 +298,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+ */
+ spin_lock_irqsave(&pgd_lock, flags);
+
+- pgd_ctor(pgd);
++ pgd_ctor(mm, pgd);
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+@@ -288,6 +315,12 @@ out:
+
+ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ {
++#ifdef CONFIG_XEN
++ /* EEW */
++ extern void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd);
++ if (xen_pv_domain())
++ xen_late_unpin_pgd(mm, pgd);
++#endif
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ paravirt_pgd_free(mm, pgd);
+diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
+index 36fe08e..7317947 100644
+--- a/arch/x86/mm/tlb.c
++++ b/arch/x86/mm/tlb.c
+@@ -148,13 +148,25 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
+ * BUG();
+ */
+
+- if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+- if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
++ if (f->flush_mm == NULL ||
++ f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
++ int tlbstate = percpu_read(cpu_tlbstate.state);
++
++ /*
++ * flush_mm == NULL means flush everything, including
++ * global tlbs, which will only happen when flushing
++ * kernel mappings.
++ */
++ if (f->flush_mm == NULL)
++ __flush_tlb_all();
++ else if (tlbstate == TLBSTATE_OK) {
+ if (f->flush_va == TLB_FLUSH_ALL)
+ local_flush_tlb();
+ else
+ __flush_tlb_one(f->flush_va);
+- } else
++ }
++
++ if (tlbstate == TLBSTATE_LAZY)
+ leave_mm(cpu);
+ }
+ out:
+@@ -217,16 +229,13 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
+ flush_tlb_others_ipi(cpumask, mm, va);
+ }
+
+-static int __cpuinit init_smp_flush(void)
++void __init init_smp_flush(void)
+ {
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(flush_state); i++)
+ spin_lock_init(&flush_state[i].tlbstate_lock);
+-
+- return 0;
+ }
+-core_initcall(init_smp_flush);
+
+ void flush_tlb_current_task(void)
+ {
+@@ -274,17 +283,19 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+
+ preempt_enable();
+ }
++EXPORT_SYMBOL_GPL(flush_tlb_page);
+
+-static void do_flush_tlb_all(void *info)
++void flush_tlb_all(void)
+ {
+- unsigned long cpu = smp_processor_id();
++ /* flush_tlb_others expects preempt to be disabled */
++ int cpu = get_cpu();
++
++ flush_tlb_others(cpu_online_mask, NULL, TLB_FLUSH_ALL);
+
+ __flush_tlb_all();
+ if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+ leave_mm(cpu);
+-}
+
+-void flush_tlb_all(void)
+-{
+- on_each_cpu(do_flush_tlb_all, NULL, 1);
++ put_cpu();
+ }
++EXPORT_SYMBOL_GPL(flush_tlb_all);
+diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
+index d49202e..64182c5 100644
+--- a/arch/x86/pci/Makefile
++++ b/arch/x86/pci/Makefile
+@@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o
+ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
+ obj-$(CONFIG_PCI_DIRECT) += direct.o
+ obj-$(CONFIG_PCI_OLPC) += olpc.o
++obj-$(CONFIG_PCI_XEN) += xen.o
+
+ obj-y += fixup.o
+ obj-$(CONFIG_ACPI) += acpi.o
+diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
+index 1331fcf..30a9808 100644
+--- a/arch/x86/pci/common.c
++++ b/arch/x86/pci/common.c
+@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
+ unsigned int pci_early_dump_regs;
+ static int pci_bf_sort;
+ int pci_routeirq;
++int pci_scan_all_fns;
+ int noioapicquirk;
+ #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ int noioapicreroute = 0;
+@@ -412,26 +413,31 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
+
+ extern u8 pci_cache_line_size;
+
+-int __init pcibios_init(void)
++void __init pcibios_set_cache_line_size(void)
+ {
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+- if (!raw_pci_ops) {
+- printk(KERN_WARNING "PCI: System does not support PCI\n");
+- return 0;
+- }
+-
+ /*
+ * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8
+ * and P4. It's also good for 386/486s (which actually have 16)
+ * as quite a few PCI devices do not support smaller values.
+ */
++
+ pci_cache_line_size = 32 >> 2;
+ if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
+ pci_cache_line_size = 64 >> 2; /* K7 & K8 */
+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
+ pci_cache_line_size = 128 >> 2; /* P4 */
++}
++
++int __init pcibios_init(void)
++{
++ if (!raw_pci_ops) {
++ printk(KERN_WARNING "PCI: System does not support PCI\n");
++ return 0;
++ }
+
++ pcibios_set_cache_line_size();
+ pcibios_resource_survey();
+
+ if (pci_bf_sort >= pci_force_bf)
+diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
+index a672f12..91d040e 100644
+--- a/arch/x86/pci/i386.c
++++ b/arch/x86/pci/i386.c
+@@ -283,6 +283,8 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+
+ prot = pgprot_val(vma->vm_page_prot);
+
++ prot |= _PAGE_IOMAP; /* creating a mapping for IO */
++
+ /*
+ * Return error if pat is not enabled and write_combine is requested.
+ * Caller can followup with UC MINUS request and add a WC mtrr if there
+diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
+index 25a1f8e..4e2f90a 100644
+--- a/arch/x86/pci/init.c
++++ b/arch/x86/pci/init.c
+@@ -15,10 +15,16 @@ static __init int pci_arch_init(void)
+ if (!(pci_probe & PCI_PROBE_NOEARLY))
+ pci_mmcfg_early_init();
+
++#ifdef CONFIG_PCI_XEN
++ if (!pci_xen_init())
++ return 0;
++#endif
++
+ #ifdef CONFIG_PCI_OLPC
+ if (!pci_olpc_init())
+ return 0; /* skip additional checks if it's an XO */
+ #endif
++
+ #ifdef CONFIG_PCI_BIOS
+ pci_pcbios_init();
+ #endif
+diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
+new file mode 100644
+index 0000000..67fa926
+--- /dev/null
++++ b/arch/x86/pci/xen.c
+@@ -0,0 +1,154 @@
++/*
++ * Xen PCI Frontend Stub - puts some "dummy" functions into the Linux
++ * x86 PCI core to support the Xen PCI Frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/pci.h>
++#include <linux/acpi.h>
++
++#include <asm/io.h>
++#include <asm/pci_x86.h>
++
++#include <asm/xen/hypervisor.h>
++
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++
++#if defined(CONFIG_PCI_MSI)
++#include <linux/msi.h>
++
++struct xen_pci_frontend_ops *xen_pci_frontend;
++EXPORT_SYMBOL_GPL(xen_pci_frontend);
++
++/*
++ * For MSI interrupts we have to use the functions in drivers/xen/events.c
++ * to allocate an irq_desc and set it up correctly. */
++
++
++int xen_pci_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ int irq, ret, i;
++ struct msi_desc *msidesc;
++ int *v;
++
++ /* Dom0 has another mechanism for this. The exit path
++ * (xen_pci_teardown_msi_irq) is shared with Dom0.
++ */
++ if (xen_initial_domain())
++ return xen_setup_msi_irqs(dev, nvec, type);
++
++ v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
++ if (!v)
++ return -ENOMEM;
++
++ if (!xen_initial_domain()) {
++ if (type == PCI_CAP_ID_MSIX)
++ ret = xen_pci_frontend_enable_msix(dev, &v, nvec);
++ else
++ ret = xen_pci_frontend_enable_msi(dev, &v);
++ if (ret)
++ goto error;
++ }
++ i = 0;
++ list_for_each_entry(msidesc, &dev->msi_list, list) {
++ irq = xen_allocate_pirq(v[i], 0, /* not sharable */
++ (type == PCI_CAP_ID_MSIX) ?
++ "pcifront-msi-x":"pcifront-msi");
++ if (irq < 0)
++ return -1;
++
++ ret = set_irq_msi(irq, msidesc);
++ if (ret)
++ goto error_while;
++ i++;
++ }
++ kfree(v);
++ return 0;
++
++error_while:
++ unbind_from_irqhandler(irq, NULL);
++error:
++ if (ret == -ENODEV)
++		dev_err(&dev->dev, "Xen PCI frontend has not registered"
++ " MSI/MSI-X support!\n");
++
++ kfree(v);
++ return ret;
++}
++
++void xen_pci_teardown_msi_dev(struct pci_dev *dev)
++{
++	/* Only do this when we are in non-privileged mode. */
++ if (!xen_initial_domain()) {
++ struct msi_desc *msidesc;
++
++ msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
++ if (msidesc->msi_attrib.is_msix)
++ xen_pci_frontend_disable_msix(dev);
++ else
++ xen_pci_frontend_disable_msi(dev);
++ }
++
++}
++
++void xen_pci_teardown_msi_irq(int irq)
++{
++ xen_destroy_irq(irq);
++}
++#endif
++
++static int xen_pcifront_enable_irq(struct pci_dev *dev)
++{
++ int rc;
++ int share = 1;
++
++ dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq);
++
++ if (dev->irq < 0)
++ return -EINVAL;
++
++ if (dev->irq < NR_IRQS_LEGACY)
++ share = 0;
++
++ rc = xen_allocate_pirq(dev->irq, share, "pcifront");
++ if (rc < 0) {
++ dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n",
++ dev->irq, rc);
++ return rc;
++ }
++ return 0;
++}
++
++int __init pci_xen_init(void)
++{
++ if (!xen_pv_domain() || xen_initial_domain())
++ return -ENODEV;
++
++ printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
++
++ pcibios_set_cache_line_size();
++
++ pcibios_enable_irq = xen_pcifront_enable_irq;
++ pcibios_disable_irq = NULL;
++
++#ifdef CONFIG_ACPI
++ /* Keep ACPI out of the picture */
++ acpi_noirq = 1;
++#endif
++
++#ifdef CONFIG_ISAPNP
++ /* Stop isapnp from probing */
++ isapnp_disable = 1;
++#endif
++
++	/* Ensure a device still gets scanned even if its fn number
++ * is non-zero.
++ */
++ pci_scan_all_fns = 1;
++
++ return 0;
++}
++
+diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
+index b83e119..3f9f4a0 100644
+--- a/arch/x86/xen/Kconfig
++++ b/arch/x86/xen/Kconfig
+@@ -13,16 +13,18 @@ config XEN
+ kernel to boot in a paravirtualized environment under the
+ Xen hypervisor.
+
++config XEN_PVHVM
++ def_bool y
++ depends on XEN
++ depends on X86_LOCAL_APIC
++
+ config XEN_MAX_DOMAIN_MEMORY
+- int "Maximum allowed size of a domain in gigabytes"
+- default 8 if X86_32
+- default 32 if X86_64
++ int
++ default 128
+ depends on XEN
+ help
+- The pseudo-physical to machine address array is sized
+- according to the maximum possible memory size of a Xen
+- domain. This array uses 1 page per gigabyte, so there's no
+- need to be too stingy here.
++ This only affects the sizing of some bss arrays, the unused
++ portions of which are freed.
+
+ config XEN_SAVE_RESTORE
+ bool
+@@ -36,3 +38,40 @@ config XEN_DEBUG_FS
+ help
+ Enable statistics output and various tuning options in debugfs.
+ Enabling this option may incur a significant performance overhead.
++
++config SWIOTLB_XEN
++ def_bool y
++ depends on XEN && SWIOTLB
++
++config MICROCODE_XEN
++ def_bool y
++ depends on XEN_DOM0 && MICROCODE
++
++config XEN_DOM0
++ bool "Enable Xen privileged domain support"
++ depends on XEN && X86_IO_APIC && ACPI
++ help
++ The Xen hypervisor requires a privileged domain ("dom0") to
++	  actually manage the machine, provide device drivers, etc.
++ This option enables dom0 support. A dom0 kernel can also
++	  run as an unprivileged domU kernel, or run natively on
++	  bare hardware.
++
++# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
++# name in tools.
++config XEN_PRIVILEGED_GUEST
++ def_bool XEN_DOM0
++
++config XEN_DOM0_PCI
++ def_bool y
++ depends on XEN_DOM0 && PCI
++ select PCI_XEN
++
++config XEN_PCI_PASSTHROUGH
++ bool "Enable support for Xen PCI passthrough devices"
++ depends on XEN && PCI
++ select PCI_XEN
++ select SWIOTLB_XEN
++ help
++ Enable support for passing PCI devices through to
++ unprivileged domains. (COMPLETELY UNTESTED)
+diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
+index 3bb4fc2..13ca65c 100644
+--- a/arch/x86/xen/Makefile
++++ b/arch/x86/xen/Makefile
+@@ -12,9 +12,12 @@ CFLAGS_mmu.o := $(nostackp)
+
+ obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
+ time.o xen-asm.o xen-asm_$(BITS).o \
+- grant-table.o suspend.o
++ grant-table.o suspend.o platform-pci-unplug.o
+
+ obj-$(CONFIG_SMP) += smp.o
+ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
+ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
+-
++obj-$(CONFIG_XEN_DOM0) += vga.o
++obj-$(CONFIG_XEN_DOM0) += apic.o
++obj-$(CONFIG_SWIOTLB) += pci-swiotlb-xen.o
++obj-$(CONFIG_XEN_DOM0_PCI) += pci.o
+\ No newline at end of file
+diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
+new file mode 100644
+index 0000000..21a3089
+--- /dev/null
++++ b/arch/x86/xen/apic.c
+@@ -0,0 +1,33 @@
++#include <linux/kernel.h>
++#include <linux/threads.h>
++#include <linux/bitmap.h>
++
++#include <asm/io_apic.h>
++#include <asm/acpi.h>
++#include <asm/hw_irq.h>
++
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++#include <xen/xen.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
++
++void __init xen_io_apic_init(void)
++{
++ enable_IO_APIC();
++}
++
++void xen_init_apic(void)
++{
++ if (!xen_initial_domain())
++ return;
++
++#ifdef CONFIG_ACPI
++ /*
++ * Pretend ACPI found our lapic even though we've disabled it,
++ * to prevent MP tables from setting up lapics.
++ */
++ acpi_lapic = 1;
++#endif
++}
+diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
+index 942ccf1..fd3803e 100644
+--- a/arch/x86/xen/enlighten.c
++++ b/arch/x86/xen/enlighten.c
+@@ -11,6 +11,7 @@
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+
++#include <linux/cpu.h>
+ #include <linux/kernel.h>
+ #include <linux/init.h>
+ #include <linux/smp.h>
+@@ -28,12 +29,15 @@
+ #include <linux/highmem.h>
+ #include <linux/console.h>
+
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/version.h>
+ #include <xen/interface/physdev.h>
+ #include <xen/interface/vcpu.h>
++#include <xen/interface/memory.h>
+ #include <xen/features.h>
+ #include <xen/page.h>
++#include <xen/hvm.h>
+ #include <xen/hvc-console.h>
+
+ #include <asm/paravirt.h>
+@@ -53,6 +57,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/reboot.h>
+ #include <asm/stackprotector.h>
++#include <asm/hypervisor.h>
+
+ #include "xen-ops.h"
+ #include "mmu.h"
+@@ -66,6 +71,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+ enum xen_domain_type xen_domain_type = XEN_NATIVE;
+ EXPORT_SYMBOL_GPL(xen_domain_type);
+
++unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
++EXPORT_SYMBOL(machine_to_phys_mapping);
++unsigned int machine_to_phys_order;
++EXPORT_SYMBOL(machine_to_phys_order);
++
+ struct start_info *xen_start_info;
+ EXPORT_SYMBOL_GPL(xen_start_info);
+
+@@ -73,6 +83,9 @@ struct shared_info xen_dummy_shared_info;
+
+ void *xen_initial_gdt;
+
++__read_mostly int xen_have_vector_callback;
++EXPORT_SYMBOL_GPL(xen_have_vector_callback);
++
+ /*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+@@ -94,6 +107,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+ */
+ static int have_vcpu_info_placement = 1;
+
++static void clamp_max_cpus(void)
++{
++#ifdef CONFIG_SMP
++ if (setup_max_cpus > MAX_VIRT_CPUS)
++ setup_max_cpus = MAX_VIRT_CPUS;
++#endif
++}
++
+ static void xen_vcpu_setup(int cpu)
+ {
+ struct vcpu_register_vcpu_info info;
+@@ -101,19 +122,20 @@ static void xen_vcpu_setup(int cpu)
+ struct vcpu_info *vcpup;
+
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+- if (!have_vcpu_info_placement)
+- return; /* already tested, not available */
++ if (cpu < MAX_VIRT_CPUS)
++ per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+
+- vcpup = &per_cpu(xen_vcpu_info, cpu);
++ if (!have_vcpu_info_placement) {
++ if (cpu >= MAX_VIRT_CPUS)
++ clamp_max_cpus();
++ return;
++ }
+
++ vcpup = &per_cpu(xen_vcpu_info, cpu);
+ info.mfn = arbitrary_virt_to_mfn(vcpup);
+ info.offset = offset_in_page(vcpup);
+
+- printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
+- cpu, vcpup, info.mfn, info.offset);
+-
+ /* Check to see if the hypervisor will put the vcpu_info
+ structure where we want it, which allows direct access via
+ a percpu-variable. */
+@@ -122,13 +144,11 @@ static void xen_vcpu_setup(int cpu)
+ if (err) {
+ printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
+ have_vcpu_info_placement = 0;
++ clamp_max_cpus();
+ } else {
+ /* This cpu is using the registered vcpu info, even if
+ later ones fail to. */
+ per_cpu(xen_vcpu, cpu) = vcpup;
+-
+- printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
+- cpu, vcpup);
+ }
+ }
+
+@@ -167,13 +187,16 @@ static void __init xen_banner(void)
+
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ pv_info.name);
+- printk(KERN_INFO "Xen version: %d.%d%s%s\n",
++ printk(KERN_INFO "Xen version: %d.%d%s%s%s\n",
+ version >> 16, version & 0xffff, extra.extraversion,
+- xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
++ xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ?
++ " (preserve-AD)" : "",
++ xen_initial_domain() ? " (dom0)" : "");
+ }
+
+ static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+ static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
++static __read_mostly unsigned int cpuid_leaf81_edx_mask = ~0;
+
+ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+ unsigned int *cx, unsigned int *dx)
+@@ -187,7 +210,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+ * unsupported kernel subsystems as possible.
+ */
+ switch (*ax) {
+- case 1:
++ case 0x1:
+ maskecx = cpuid_leaf1_ecx_mask;
+ maskedx = cpuid_leaf1_edx_mask;
+ break;
+@@ -196,6 +219,10 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
+ /* Suppress extended topology stuff */
+ maskebx = 0;
+ break;
++
++ case 0x80000001:
++ maskedx = cpuid_leaf81_edx_mask;
++ break;
+ }
+
+ asm(XEN_EMULATE_PREFIX "cpuid"
+@@ -215,32 +242,18 @@ static __init void xen_init_cpuid_mask(void)
+ unsigned int ax, bx, cx, dx;
+
+ cpuid_leaf1_edx_mask =
+- ~((1 << X86_FEATURE_MCE) | /* disable MCE */
+- (1 << X86_FEATURE_MCA) | /* disable MCA */
+- (1 << X86_FEATURE_ACC)); /* thermal monitoring */
++ ~(1 << X86_FEATURE_ACC); /* thermal monitoring */
++
++ cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32));
+
+ if (!xen_initial_domain())
+ cpuid_leaf1_edx_mask &=
+- ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
++ ~((1 << X86_FEATURE_MCE) | /* disable MCE */
++ (1 << X86_FEATURE_MCA) | /* disable MCA */
++ (1 << X86_FEATURE_APIC) | /* disable local APIC */
+ (1 << X86_FEATURE_ACPI)); /* disable ACPI */
+
+- ax = 1;
+- cx = 0;
+- xen_cpuid(&ax, &bx, &cx, &dx);
+-
+- /* cpuid claims we support xsave; try enabling it to see what happens */
+- if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
+- unsigned long cr4;
+-
+- set_in_cr4(X86_CR4_OSXSAVE);
+-
+- cr4 = read_cr4();
+-
+- if ((cr4 & X86_CR4_OSXSAVE) == 0)
+- cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
+-
+- clear_in_cr4(X86_CR4_OSXSAVE);
+- }
++ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); /* disable XSAVE */
+ }
+
+ static void xen_set_debugreg(int reg, unsigned long val)
+@@ -406,7 +419,7 @@ static __init void xen_load_gdt_boot(const struct desc_ptr *dtr)
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_RO);
+
+- if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
++ if (HYPERVISOR_update_va_mapping(va, pte, 0))
+ BUG();
+
+ frames[f] = mfn;
+@@ -517,13 +530,13 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
+ return 0;
+ #ifdef CONFIG_X86_MCE
+ } else if (addr == (unsigned long)machine_check) {
+- return 0;
++ /* We can use the original machine_check handler,
++ despite IST. */
+ #endif
+- } else {
+- /* Some other trap using IST? */
+- if (WARN_ON(val->ist != 0))
+- return 0;
+- }
++ } else if (WARN(val->ist != 0,
++ "Unknown IST-using trap: vector %d, %pF, val->ist=%d\n",
++ vector, (void *)addr, val->ist))
++ return 0;
+ #endif /* CONFIG_X86_64 */
+ info->address = addr;
+
+@@ -679,6 +692,18 @@ static void xen_set_iopl_mask(unsigned mask)
+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ }
+
++static void xen_set_io_bitmap(struct thread_struct *thread,
++ unsigned long bytes_updated)
++{
++ struct physdev_set_iobitmap set_iobitmap;
++
++ set_xen_guest_handle(set_iobitmap.bitmap,
++ (char *)thread->io_bitmap_ptr);
++ set_iobitmap.nr_ports = thread->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
++ WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
++ &set_iobitmap));
++}
++
+ static void xen_io_delay(void)
+ {
+ }
+@@ -716,7 +741,7 @@ static u32 xen_safe_apic_wait_icr_idle(void)
+ return 0;
+ }
+
+-static void set_xen_basic_apic_ops(void)
++static __init void set_xen_basic_apic_ops(void)
+ {
+ apic->read = xen_apic_read;
+ apic->write = xen_apic_write;
+@@ -728,7 +753,6 @@ static void set_xen_basic_apic_ops(void)
+
+ #endif
+
+-
+ static void xen_clts(void)
+ {
+ struct multicall_space mcs;
+@@ -811,6 +835,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+ Xen console noise. */
+ break;
+
++ case MSR_IA32_CR_PAT:
++ if (smp_processor_id() == 0)
++ xen_set_pat(((u64)high << 32) | low);
++ break;
++
+ default:
+ ret = native_write_msr_safe(msr, low, high);
+ }
+@@ -849,8 +878,6 @@ void xen_setup_vcpu_info_placement(void)
+ /* xen_vcpu_setup managed to place the vcpu_info within the
+ percpu area for all cpus, so make use of it */
+ if (have_vcpu_info_placement) {
+- printk(KERN_INFO "Xen: using vcpu_info placement\n");
+-
+ pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+ pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+@@ -923,10 +950,6 @@ static const struct pv_init_ops xen_init_ops __initdata = {
+ .patch = xen_patch,
+ };
+
+-static const struct pv_time_ops xen_time_ops __initdata = {
+- .sched_clock = xen_clocksource_read,
+-};
+-
+ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+ .cpuid = xen_cpuid,
+
+@@ -978,6 +1001,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+ .load_sp0 = xen_load_sp0,
+
+ .set_iopl_mask = xen_set_iopl_mask,
++ .set_io_bitmap = xen_set_io_bitmap,
+ .io_delay = xen_io_delay,
+
+ /* Xen takes care of %gs when switching to usermode for us */
+@@ -1020,15 +1044,40 @@ static void xen_machine_halt(void)
+ xen_reboot(SHUTDOWN_poweroff);
+ }
+
++static void xen_machine_power_off(void)
++{
++ if (pm_power_off)
++ pm_power_off();
++ else
++ xen_reboot(SHUTDOWN_poweroff);
++}
++
+ static void xen_crash_shutdown(struct pt_regs *regs)
+ {
+ xen_reboot(SHUTDOWN_crash);
+ }
+
++static int
++xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
++{
++ xen_reboot(SHUTDOWN_crash);
++ return NOTIFY_DONE;
++}
++
++static struct notifier_block xen_panic_block = {
++ .notifier_call= xen_panic_event,
++};
++
++int xen_panic_handler_init(void)
++{
++ atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
++ return 0;
++}
++
+ static const struct machine_ops __initdata xen_machine_ops = {
+ .restart = xen_restart,
+ .halt = xen_machine_halt,
+- .power_off = xen_machine_halt,
++ .power_off = xen_machine_power_off,
+ .shutdown = xen_machine_halt,
+ .crash_shutdown = xen_crash_shutdown,
+ .emergency_restart = xen_emergency_restart,
+@@ -1061,10 +1110,11 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_domain_type = XEN_PV_DOMAIN;
+
++ xen_setup_machphys_mapping();
++
+ /* Install Xen paravirt ops */
+ pv_info = xen_info;
+ pv_init_ops = xen_init_ops;
+- pv_time_ops = xen_time_ops;
+ pv_cpu_ops = xen_cpu_ops;
+ pv_apic_ops = xen_apic_ops;
+
+@@ -1072,13 +1122,7 @@ asmlinkage void __init xen_start_kernel(void)
+ x86_init.oem.arch_setup = xen_arch_setup;
+ x86_init.oem.banner = xen_banner;
+
+- x86_init.timers.timer_init = xen_time_init;
+- x86_init.timers.setup_percpu_clockev = x86_init_noop;
+- x86_cpuinit.setup_percpu_clockev = x86_init_noop;
+-
+- x86_platform.calibrate_tsc = xen_tsc_khz;
+- x86_platform.get_wallclock = xen_get_wallclock;
+- x86_platform.set_wallclock = xen_set_wallclock;
++ xen_init_time_ops();
+
+ /*
+ * Set up some pagetable state before starting to set any ptes.
+@@ -1116,6 +1160,10 @@ asmlinkage void __init xen_start_kernel(void)
+ */
+ xen_setup_stackprotector();
+
++#ifdef CONFIG_SPARSE_IRQ
++ nr_dynamic_irqs += 256;
++#endif
++
+ xen_init_irq_ops();
+ xen_init_cpuid_mask();
+
+@@ -1144,6 +1192,8 @@ asmlinkage void __init xen_start_kernel(void)
+
+ pgd = (pgd_t *)xen_start_info->pt_base;
+
++ __supported_pte_mask |= _PAGE_IOMAP;
++
+ /* Don't do the full vcpu_info placement stuff until we have a
+ possible map and a non-dummy shared_info. */
+ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+@@ -1153,6 +1203,10 @@ asmlinkage void __init xen_start_kernel(void)
+
+ xen_raw_console_write("mapping kernel into physical memory\n");
+ pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
++ xen_ident_map_ISA();
++
++ /* Allocate and initialize top and mid mfn levels for p2m structure */
++ xen_build_mfn_list_list();
+
+ init_mm.pgd = pgd;
+
+@@ -1162,6 +1216,14 @@ asmlinkage void __init xen_start_kernel(void)
+ if (xen_feature(XENFEAT_supervisor_mode_kernel))
+ pv_info.kernel_rpl = 0;
+
++ if (xen_initial_domain()) {
++ struct physdev_set_iopl set_iopl;
++ set_iopl.iopl = 1;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl) == -1)
++ BUG();
++ xen_init_apic();
++ }
++
+ /* set the limit of our address space */
+ xen_reserve_top();
+
+@@ -1184,6 +1246,16 @@ asmlinkage void __init xen_start_kernel(void)
+ add_preferred_console("xenboot", 0, NULL);
+ add_preferred_console("tty", 0, NULL);
+ add_preferred_console("hvc", 0, NULL);
++
++ boot_params.screen_info.orig_video_isVGA = 0;
++ } else {
++ const struct dom0_vga_console_info *info =
++ (void *)((char *)xen_start_info +
++ xen_start_info->console.dom0.info_off);
++
++ xen_init_vga(info, xen_start_info->console.dom0.info_size);
++ xen_start_info->console.domU.mfn = 0;
++ xen_start_info->console.domU.evtchn = 0;
+ }
+
+ xen_raw_console_write("about to get started...\n");
+@@ -1197,3 +1269,126 @@ asmlinkage void __init xen_start_kernel(void)
+ x86_64_start_reservations((char *)__pa_symbol(&boot_params));
+ #endif
+ }
++
++static uint32_t xen_cpuid_base(void)
++{
++ uint32_t base, eax, ebx, ecx, edx;
++ char signature[13];
++
++ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
++ cpuid(base, &eax, &ebx, &ecx, &edx);
++ *(uint32_t *)(signature + 0) = ebx;
++ *(uint32_t *)(signature + 4) = ecx;
++ *(uint32_t *)(signature + 8) = edx;
++ signature[12] = 0;
++
++ if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
++ return base;
++ }
++
++ return 0;
++}
++
++static int init_hvm_pv_info(int *major, int *minor)
++{
++ uint32_t eax, ebx, ecx, edx, pages, msr, base;
++ u64 pfn;
++
++ base = xen_cpuid_base();
++ if (!base)
++ return -EINVAL;
++
++ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
++
++ *major = eax >> 16;
++ *minor = eax & 0xffff;
++ printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
++
++ cpuid(base + 2, &pages, &msr, &ecx, &edx);
++
++ pfn = __pa(hypercall_page);
++ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
++
++ xen_setup_features();
++
++ pv_info = xen_info;
++ pv_info.kernel_rpl = 0;
++
++ xen_domain_type = XEN_HVM_DOMAIN;
++
++ return 0;
++}
++
++void xen_hvm_init_shared_info(void)
++{
++ int cpu;
++ struct xen_add_to_physmap xatp;
++ static struct shared_info *shared_info_page = 0;
++
++ if (!shared_info_page)
++ shared_info_page = (struct shared_info *) alloc_bootmem_pages(PAGE_SIZE);
++ xatp.domid = DOMID_SELF;
++ xatp.idx = 0;
++ xatp.space = XENMAPSPACE_shared_info;
++ xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
++ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
++ BUG();
++
++ HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
++
++ /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
++ * page, we use it in the event channel upcall and in some pvclock
++ * related functions. We don't need the vcpu_info placement
++ * optimizations because we don't use any pv_mmu or pv_irq op on
++ * HVM.
++ * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
++ * online, but it is also run at resume time, and in that case
++ * multiple vcpus might be online. */
++ for_each_online_cpu(cpu) {
++ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
++ }
++}
++
++#ifdef CONFIG_XEN_PVHVM
++static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
++ unsigned long action, void *hcpu)
++{
++ int cpu = (long)hcpu;
++ switch (action) {
++ case CPU_UP_PREPARE:
++ per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
++ break;
++ default:
++ break;
++ }
++ return NOTIFY_OK;
++}
++
++static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = {
++ .notifier_call = xen_hvm_cpu_notify,
++};
++
++void __init xen_hvm_guest_init(void)
++{
++ int r;
++ int major, minor;
++
++ if (xen_pv_domain())
++ return;
++
++ r = init_hvm_pv_info(&major, &minor);
++ if (r < 0)
++ return;
++
++ xen_hvm_init_shared_info();
++
++ if (xen_feature(XENFEAT_hvm_callback_vector))
++ xen_have_vector_callback = 1;
++ register_cpu_notifier(&xen_hvm_cpu_notifier);
++ xen_unplug_emulated_devices();
++ have_vcpu_info_placement = 0;
++ x86_init.irqs.intr_init = xen_init_IRQ;
++ xen_hvm_init_time_ops();
++ xen_hvm_init_mmu_ops();
++}
++#endif
+diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
+index 350a3de..16a8e25 100644
+--- a/arch/x86/xen/mmu.c
++++ b/arch/x86/xen/mmu.c
+@@ -42,6 +42,7 @@
+ #include <linux/highmem.h>
+ #include <linux/debugfs.h>
+ #include <linux/bug.h>
++#include <linux/vmalloc.h>
+ #include <linux/module.h>
+
+ #include <asm/pgtable.h>
+@@ -50,14 +51,19 @@
+ #include <asm/mmu_context.h>
+ #include <asm/setup.h>
+ #include <asm/paravirt.h>
++#include <asm/e820.h>
+ #include <asm/linkage.h>
++#include <asm/pat.h>
++#include <asm/page.h>
+
+ #include <asm/xen/hypercall.h>
+ #include <asm/xen/hypervisor.h>
+
+ #include <xen/page.h>
+ #include <xen/interface/xen.h>
++#include <xen/interface/hvm/hvm_op.h>
+ #include <xen/interface/version.h>
++#include <xen/interface/memory.h>
+ #include <xen/hvc-console.h>
+
+ #include "multicalls.h"
+@@ -66,6 +72,13 @@
+
+ #define MMU_UPDATE_HISTO 30
+
++/*
++ * Protects atomic reservation decrease/increase against concurrent increases.
++ * Also protects non-atomic updates of current_pages and driver_pages, and
++ * balloon lists.
++ */
++DEFINE_SPINLOCK(xen_reservation_lock);
++
+ #ifdef CONFIG_XEN_DEBUG_FS
+
+ static struct {
+@@ -124,7 +137,8 @@ static inline void check_zero(void)
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
++#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
++static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
+
+ #ifdef CONFIG_X86_64
+ /* l3 pud for userspace vsyscall mapping */
+@@ -155,49 +169,182 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+ */
+ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
++/*
++ * Xen leaves the responsibility for maintaining p2m mappings to the
++ * guests themselves, but it must also access and update the p2m array
++ * during suspend/resume when all the pages are reallocated.
++ *
++ * The p2m table is logically a flat array, but we implement it as a
++ * three-level tree to allow the address space to be sparse.
++ *
++ * Xen
++ * |
++ * p2m_top p2m_top_mfn
++ * / \ / \
++ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
++ * / \ / \ / /
++ * p2m p2m p2m p2m p2m p2m p2m ...
++ *
++ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
++ * maximum representable pseudo-physical address space is:
++ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
++ *
++ * P2M_PER_PAGE depends on the architecture, as an mfn is always an
++ * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
++ * 512 and 1024 entries respectively.
++ */
+
+-#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+-#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
++unsigned long xen_max_p2m_pfn __read_mostly;
+
+-/* Placeholder for holes in the address space */
+-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
+- { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
++#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
++#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
++#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+- /* Array of pointers to pages containing p2m entries */
+-static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
+- { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
++#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+-/* Arrays of p2m arrays expressed in mfns used for save/restore */
+-static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
++/* Placeholders for holes in the address space */
++static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
++static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
++static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+-static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
+- __page_aligned_bss;
++static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
++static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
++
++RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
++RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+
+ static inline unsigned p2m_top_index(unsigned long pfn)
+ {
+- BUG_ON(pfn >= MAX_DOMAIN_PAGES);
+- return pfn / P2M_ENTRIES_PER_PAGE;
++ BUG_ON(pfn >= MAX_P2M_PFN);
++ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
++}
++
++static inline unsigned p2m_mid_index(unsigned long pfn)
++{
++ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+ }
+
+ static inline unsigned p2m_index(unsigned long pfn)
+ {
+- return pfn % P2M_ENTRIES_PER_PAGE;
++ return pfn % P2M_PER_PAGE;
++}
++
++static void p2m_top_init(unsigned long ***top)
++{
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = p2m_mid_missing;
++}
++
++static void p2m_top_mfn_init(unsigned long *top)
++{
++ unsigned i;
++
++ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
++ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
++}
++
++static void p2m_mid_init(unsigned long **mid)
++{
++ unsigned i;
++
++ for (i = 0; i < P2M_MID_PER_PAGE; i++)
++ mid[i] = p2m_missing;
++}
++
++static void p2m_mid_mfn_init(unsigned long *mid)
++{
++ unsigned i;
++
++ for (i = 0; i < P2M_MID_PER_PAGE; i++)
++ mid[i] = virt_to_mfn(p2m_missing);
++}
++
++static void p2m_init(unsigned long *p2m)
++{
++ unsigned i;
++
++ for (i = 0; i < P2M_MID_PER_PAGE; i++)
++ p2m[i] = INVALID_P2M_ENTRY;
++}
++
++static int lookup_pte_fn(
++ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
++{
++ uint64_t *ptep = (uint64_t *)data;
++ if (ptep)
++ *ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) <<
++ PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
++ return 0;
+ }
+
+-/* Build the parallel p2m_top_mfn structures */
++int create_lookup_pte_addr(struct mm_struct *mm,
++ unsigned long address,
++ uint64_t *ptep)
++{
++ return apply_to_page_range(mm, address, PAGE_SIZE,
++ lookup_pte_fn, ptep);
++}
++
++EXPORT_SYMBOL(create_lookup_pte_addr);
++
++/*
++ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
++ *
++ * This is called both at boot time and after resuming from suspend:
++ * - At boot time we're called very early, and must use extend_brk()
++ * to allocate memory.
++ *
++ * - After resume we're called from within stop_machine, but the mfn
++ * tree should alreay be completely allocated.
++ *   tree should already be completely allocated.
+ void xen_build_mfn_list_list(void)
+ {
+- unsigned pfn, idx;
++ unsigned pfn;
+
+- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
+- unsigned topidx = p2m_top_index(pfn);
++ /* Pre-initialize p2m_top_mfn to be completely missing */
++ if (p2m_top_mfn == NULL) {
++ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+
+- p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
++ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_mfn_init(p2m_top_mfn);
+ }
+
+- for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
+- unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
+- p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
++ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
++ unsigned topidx = p2m_top_index(pfn);
++ unsigned mididx = p2m_mid_index(pfn);
++ unsigned long **mid;
++ unsigned long mid_mfn;
++ unsigned long *mid_mfn_p;
++
++ mid = p2m_top[topidx];
++
++ /* Don't bother allocating any mfn mid levels if
++ they're just missing */
++ if (mid[mididx] == p2m_missing)
++ continue;
++
++ mid_mfn = p2m_top_mfn[topidx];
++ mid_mfn_p = mfn_to_virt(mid_mfn);
++
++ if (mid_mfn_p == p2m_mid_missing_mfn) {
++ /*
++ * XXX boot-time only! We should never find
++ * missing parts of the mfn tree after
++ * runtime. extend_brk() will BUG if we call
++ * it too late.
++ */
++ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_mfn_init(mid_mfn_p);
++
++ mid_mfn = virt_to_mfn(mid_mfn_p);
++
++ p2m_top_mfn[topidx] = mid_mfn;
++ }
++
++ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ }
+ }
+
+@@ -206,8 +353,8 @@ void xen_setup_mfn_list_list(void)
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+- virt_to_mfn(p2m_top_mfn_list);
+- HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
++ virt_to_mfn(p2m_top_mfn);
++ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+ }
+
+ /* Set up p2m_top to point to the domain-builder provided p2m pages */
+@@ -217,96 +364,170 @@ void __init xen_build_dynamic_phys_to_machine(void)
+ unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ unsigned pfn;
+
+- for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
++ xen_max_p2m_pfn = max_pfn;
++
++ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_init(p2m_missing);
++
++ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_init(p2m_mid_missing);
++
++ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_top_init(p2m_top);
++
++ /*
++ * The domain builder gives us a pre-constructed p2m array in
++ * mfn_list for all the pages initially given to us, so we just
++ * need to graft that into our tree structure.
++ */
++ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
++ unsigned mididx = p2m_mid_index(pfn);
+
+- p2m_top[topidx] = &mfn_list[pfn];
+- }
++ if (p2m_top[topidx] == p2m_mid_missing) {
++ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
++ p2m_mid_init(mid);
++
++ p2m_top[topidx] = mid;
++ }
+
+- xen_build_mfn_list_list();
++ p2m_top[topidx][mididx] = &mfn_list[pfn];
++ }
+ }
+
+ unsigned long get_phys_to_machine(unsigned long pfn)
+ {
+- unsigned topidx, idx;
++ unsigned topidx, mididx, idx;
+
+- if (unlikely(pfn >= MAX_DOMAIN_PAGES))
++ if (unlikely(pfn >= MAX_P2M_PFN))
+ return INVALID_P2M_ENTRY;
+
+ topidx = p2m_top_index(pfn);
++ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+- return p2m_top[topidx][idx];
++
++ return p2m_top[topidx][mididx][idx];
+ }
+ EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+-/* install a new p2m_top page */
+-bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
++static void *alloc_p2m_page(void)
+ {
+- unsigned topidx = p2m_top_index(pfn);
+- unsigned long **pfnp, *mfnp;
+- unsigned i;
++ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
++}
++
++static void free_p2m_page(void *p)
++{
++ free_page((unsigned long)p);
++}
++
++/*
++ * Fully allocate the p2m structure for a given pfn. We need to check
++ * that both the top and mid levels are allocated, and make sure the
++ * parallel mfn tree is kept in sync. We may race with other cpus, so
++ * the new pages are installed with cmpxchg; if we lose the race then
++ * simply free the page we allocated and use the one that's there.
++ */
++static bool alloc_p2m(unsigned long pfn)
++{
++ unsigned topidx, mididx;
++ unsigned long ***top_p, **mid;
++ unsigned long *top_mfn_p, *mid_mfn;
+
+- pfnp = &p2m_top[topidx];
+- mfnp = &p2m_top_mfn[topidx];
++ topidx = p2m_top_index(pfn);
++ mididx = p2m_mid_index(pfn);
+
+- for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
+- p[i] = INVALID_P2M_ENTRY;
++ top_p = &p2m_top[topidx];
++ mid = *top_p;
+
+- if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
+- *mfnp = virt_to_mfn(p);
+- return true;
++ if (mid == p2m_mid_missing) {
++ /* Mid level is missing, allocate a new one */
++ mid = alloc_p2m_page();
++ if (!mid)
++ return false;
++
++ p2m_mid_init(mid);
++
++ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
++ free_p2m_page(mid);
+ }
+
+- return false;
+-}
++ top_mfn_p = &p2m_top_mfn[topidx];
++ mid_mfn = mfn_to_virt(*top_mfn_p);
+
+-static void alloc_p2m(unsigned long pfn)
+-{
+- unsigned long *p;
++ if (mid_mfn == p2m_mid_missing_mfn) {
++ /* Separately check the mid mfn level */
++ unsigned long missing_mfn;
++ unsigned long mid_mfn_mfn;
++
++ mid_mfn = alloc_p2m_page();
++ if (!mid_mfn)
++ return false;
++
++ p2m_mid_mfn_init(mid_mfn);
++
++ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
++ mid_mfn_mfn = virt_to_mfn(mid_mfn);
++ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
++ free_p2m_page(mid_mfn);
++ }
++
++ if (p2m_top[topidx][mididx] == p2m_missing) {
++ /* p2m leaf page is missing */
++ unsigned long *p2m;
++
++ p2m = alloc_p2m_page();
++ if (!p2m)
++ return false;
+
+- p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
+- BUG_ON(p == NULL);
++ p2m_init(p2m);
+
+- if (!install_p2mtop_page(pfn, p))
+- free_page((unsigned long)p);
++ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
++ free_p2m_page(p2m);
++ else
++ mid_mfn[mididx] = virt_to_mfn(p2m);
++ }
++
++ return true;
+ }
+
+ /* Try to install p2m mapping; fail if intermediate bits missing */
+ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+ {
+- unsigned topidx, idx;
++ unsigned topidx, mididx, idx;
+
+- if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
++ if (unlikely(pfn >= MAX_P2M_PFN)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ topidx = p2m_top_index(pfn);
+- if (p2m_top[topidx] == p2m_missing) {
+- if (mfn == INVALID_P2M_ENTRY)
+- return true;
+- return false;
+- }
+-
++ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+- p2m_top[topidx][idx] = mfn;
++
++ if (p2m_top[topidx][mididx] == p2m_missing)
++ return mfn == INVALID_P2M_ENTRY;
++
++ p2m_top[topidx][mididx][idx] = mfn;
+
+ return true;
+ }
+
+-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
++bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+ {
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+- return;
++ return true;
+ }
+
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+- alloc_p2m(pfn);
++ if (!alloc_p2m(pfn))
++ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+- BUG();
++ return false;
+ }
++
++ return true;
+ }
+
+ unsigned long arbitrary_virt_to_mfn(void *vaddr)
+@@ -315,6 +536,7 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr)
+
+ return PFN_DOWN(maddr.maddr);
+ }
++EXPORT_SYMBOL_GPL(set_phys_to_machine);
+
+ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
+ {
+@@ -376,6 +598,34 @@ static bool xen_page_pinned(void *ptr)
+ return PagePinned(page);
+ }
+
++static bool xen_iomap_pte(pte_t pte)
++{
++ return pte_flags(pte) & _PAGE_IOMAP;
++}
++
++void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
++{
++ struct multicall_space mcs;
++ struct mmu_update *u;
++
++ mcs = xen_mc_entry(sizeof(*u));
++ u = mcs.args;
++
++ /* ptep might be kmapped when using 32-bit HIGHPTE */
++ u->ptr = arbitrary_virt_to_machine(ptep).maddr;
++ u->val = pte_val_ma(pteval);
++
++ MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
++
++ xen_mc_issue(PARAVIRT_LAZY_MMU);
++}
++EXPORT_SYMBOL_GPL(xen_set_domain_pte);
++
++static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
++{
++ xen_set_domain_pte(ptep, pteval, DOMID_IO);
++}
++
+ static void xen_extend_mmu_update(const struct mmu_update *update)
+ {
+ struct multicall_space mcs;
+@@ -452,6 +702,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval)
+ {
++ if (xen_iomap_pte(pteval)) {
++ xen_set_iomap_pte(ptep, pteval);
++ goto out;
++ }
++
+ ADD_STATS(set_pte_at, 1);
+ // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
+ ADD_STATS(set_pte_at_current, mm == current->mm);
+@@ -516,7 +771,34 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & PTE_FLAGS_MASK;
+- val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
++ unsigned long mfn = pfn_to_mfn(pfn);
++
++ /*
++ * If there's no mfn for the pfn, then just create an
++ * empty non-present pte. Unfortunately this loses
++ * information about the original pfn, so
++ * pte_mfn_to_pfn is asymmetric.
++ */
++ if (unlikely(mfn == INVALID_P2M_ENTRY)) {
++ mfn = 0;
++ flags = 0;
++ }
++
++ val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
++ }
++
++ return val;
++}
++
++static pteval_t iomap_pte(pteval_t val)
++{
++ if (val & _PAGE_PRESENT) {
++ unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
++ pteval_t flags = val & PTE_FLAGS_MASK;
++
++ /* We assume the pte frame number is an MFN, so
++ just use it as-is. */
++ val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+@@ -524,7 +806,18 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
+
+ pteval_t xen_pte_val(pte_t pte)
+ {
+- return pte_mfn_to_pfn(pte.pte);
++ pteval_t pteval = pte.pte;
++
++ /* If this is a WC pte, convert back from Xen WC to Linux WC */
++ if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
++ WARN_ON(!pat_enabled);
++ pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
++ }
++
++ if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
++ return pteval;
++
++ return pte_mfn_to_pfn(pteval);
+ }
+ PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
+
+@@ -534,9 +827,62 @@ pgdval_t xen_pgd_val(pgd_t pgd)
+ }
+ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
+
++/*
++ * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
++ * are reserved for now, to correspond to the Intel-reserved PAT
++ * types.
++ *
++ * We expect Linux's PAT set as follows:
++ *
++ * Idx PTE flags Linux Xen Default
++ * 0 WB WB WB
++ * 1 PWT WC WT WT
++ * 2 PCD UC- UC- UC-
++ * 3 PCD PWT UC UC UC
++ * 4 PAT WB WC WB
++ * 5 PAT PWT WC WP WT
++ * 6 PAT PCD UC- UC UC-
++ * 7 PAT PCD PWT UC UC UC
++ */
++
++void xen_set_pat(u64 pat)
++{
++ /* We expect Linux to use a PAT setting of
++ * UC UC- WC WB (ignoring the PAT flag) */
++ WARN_ON(pat != 0x0007010600070106ull);
++}
++
+ pte_t xen_make_pte(pteval_t pte)
+ {
+- pte = pte_pfn_to_mfn(pte);
++ phys_addr_t addr = (pte & PTE_PFN_MASK);
++
++ /* If Linux is trying to set a WC pte, then map to the Xen WC.
++ * If _PAGE_PAT is set, then it probably means it is really
++ * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
++ * things work out OK...
++ *
++ * (We should never see kernel mappings with _PAGE_PSE set,
++ * but we could see hugetlbfs mappings, I think).
++ */
++ if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
++ if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
++ pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
++ }
++
++ /*
++ * Unprivileged domains are allowed to do IOMAPpings for
++ * PCI passthrough, but not map ISA space. The ISA
++ * mappings are just dummy local mappings to keep other
++ * parts of the kernel happy.
++ */
++ if (unlikely(pte & _PAGE_IOMAP) &&
++ (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
++ pte = iomap_pte(pte);
++ } else {
++ pte &= ~_PAGE_IOMAP;
++ pte = pte_pfn_to_mfn(pte);
++ }
++
+ return native_make_pte(pte);
+ }
+ PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
+@@ -592,6 +938,11 @@ void xen_set_pud(pud_t *ptr, pud_t val)
+
+ void xen_set_pte(pte_t *ptep, pte_t pte)
+ {
++ if (xen_iomap_pte(pte)) {
++ xen_set_iomap_pte(ptep, pte);
++ return;
++ }
++
+ ADD_STATS(pte_update, 1);
+ // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
+ ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+@@ -608,6 +959,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte)
+ #ifdef CONFIG_X86_PAE
+ void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+ {
++ if (xen_iomap_pte(pte)) {
++ xen_set_iomap_pte(ptep, pte);
++ return;
++ }
++
+ set_64bit((u64 *)ptep, native_pte_val(pte));
+ }
+
+@@ -934,8 +1290,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
+ read-only, and can be pinned. */
+ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
+ {
+- vm_unmap_aliases();
+-
+ xen_mc_batch();
+
+ if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
+@@ -1219,7 +1573,7 @@ void xen_exit_mmap(struct mm_struct *mm)
+ spin_lock(&mm->page_table_lock);
+
+ /* pgd may not be pinned in the error exit path of execve */
+- if (xen_page_pinned(mm->pgd))
++ if (xen_page_pinned(mm->pgd) && !mm->context.has_foreign_mappings)
+ xen_pgd_unpin(mm);
+
+ spin_unlock(&mm->page_table_lock);
+@@ -1288,12 +1642,19 @@ static void xen_flush_tlb_single(unsigned long addr)
+ preempt_enable();
+ }
+
++/*
++ * Flush tlb on other cpus. Xen can do this via a single hypercall
++ * rather than explicit IPIs, which has the nice property of avoiding
++ * any cpus which don't actually have dirty tlbs. Unfortunately it
++ * doesn't give us an opportunity to kick out cpus which are in lazy
++ * tlb state, so we may end up reflushing some cpus unnecessarily.
++ */
+ static void xen_flush_tlb_others(const struct cpumask *cpus,
+ struct mm_struct *mm, unsigned long va)
+ {
+ struct {
+ struct mmuext_op op;
+- DECLARE_BITMAP(mask, NR_CPUS);
++ DECLARE_BITMAP(mask, num_processors);
+ } *args;
+ struct multicall_space mcs;
+
+@@ -1417,6 +1778,13 @@ static int xen_pgd_alloc(struct mm_struct *mm)
+ return ret;
+ }
+
++void xen_late_unpin_pgd(struct mm_struct *mm, pgd_t *pgd)
++{
++ if (xen_page_pinned(pgd))
++ __xen_pgd_unpin(mm, pgd);
++
++}
++
+ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+ {
+ #ifdef CONFIG_X86_64
+@@ -1448,10 +1816,17 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
+ #ifdef CONFIG_X86_32
+ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+ {
+- /* If there's an existing pte, then don't allow _PAGE_RW to be set */
+- if (pte_val_ma(*ptep) & _PAGE_PRESENT)
++ pte_t oldpte = *ptep;
++
++ if (pte_flags(oldpte) & _PAGE_PRESENT) {
++ /* Don't allow existing IO mappings to be overridden */
++ if (pte_flags(oldpte) & _PAGE_IOMAP)
++ pte = oldpte;
++
++ /* Don't allow _PAGE_RW to be set on existing pte */
+ pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
+ pte_val_ma(pte));
++ }
+
+ return pte;
+ }
+@@ -1517,7 +1892,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
+ if (PagePinned(virt_to_page(mm->pgd))) {
+ SetPagePinned(page);
+
+- vm_unmap_aliases();
+ if (!PageHighMem(page)) {
+ make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+ if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+@@ -1620,6 +1994,7 @@ static void *m2v(phys_addr_t maddr)
+ return __ka(m2p(maddr));
+ }
+
++/* Set the page permissions on identity-mapped pages */
+ static void set_page_prot(void *addr, pgprot_t prot)
+ {
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+@@ -1635,6 +2010,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+ unsigned ident_pte;
+ unsigned long pfn;
+
++ level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
++ PAGE_SIZE);
++
+ ident_pte = 0;
+ pfn = 0;
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+@@ -1645,7 +2023,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+ pte_page = m2v(pmd[pmdidx].pmd);
+ else {
+ /* Check for free pte pages */
+- if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
++ if (ident_pte == LEVEL1_IDENT_ENTRIES)
+ break;
+
+ pte_page = &level1_ident_pgt[ident_pte];
+@@ -1675,6 +2053,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+ }
+
++void __init xen_setup_machphys_mapping(void)
++{
++ struct xen_machphys_mapping mapping;
++ unsigned long machine_to_phys_nr_ents;
++
++ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
++ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
++ machine_to_phys_nr_ents = mapping.max_mfn + 1;
++ } else {
++ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
++ }
++ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
++}
++
+ #ifdef CONFIG_X86_64
+ static void convert_pfn_mfn(void *v)
+ {
+@@ -1760,12 +2152,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ return pgd;
+ }
+ #else /* !CONFIG_X86_64 */
+-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
++static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
+
+ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ unsigned long max_pfn)
+ {
+ pmd_t *kernel_pmd;
++ int i;
++
++ level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE);
+
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+ xen_start_info->nr_pt_frames * PAGE_SIZE +
+@@ -1777,6 +2172,20 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+ memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
++
++ /*
++ * When running a 32 bit domain 0 on a 64 bit hypervisor a
++ * pinned L3 (such as the initial pgd here) contains bits
++ * which are reserved in the PAE layout but not in the 64 bit
++ * layout. Unfortunately some versions of the hypervisor
++ * (incorrectly) validate compat mode guests against the PAE
++ * layout and hence will not allow such a pagetable to be
++ * pinned by the guest. Therefore we mask off only the PFN and
++ * Present bits of the supplied L3.
++ */
++ for (i = 0; i < PTRS_PER_PGD; i++)
++ swapper_pg_dir[i].pgd &= (PTE_PFN_MASK | _PAGE_PRESENT);
++
+ set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+ __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+@@ -1799,6 +2208,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+ }
+ #endif /* CONFIG_X86_64 */
+
++static unsigned char dummy_ioapic_mapping[PAGE_SIZE] __page_aligned_bss;
++
+ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+ {
+ pte_t pte;
+@@ -1828,9 +2239,26 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+ pte = pfn_pte(phys, prot);
+ break;
+
+- default:
++#ifdef CONFIG_X86_IO_APIC
++ case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
++ /*
++ * We just don't map the IO APIC - all access is via
++ * hypercalls. Keep the address in the pte for reference.
++ */
++ pte = pfn_pte(PFN_DOWN(__pa(dummy_ioapic_mapping)), PAGE_KERNEL);
++ break;
++#endif
++
++ case FIX_PARAVIRT_BOOTMAP:
++ /* This is an MFN, but it isn't an IO mapping from the
++ IO domain */
+ pte = mfn_pte(phys, prot);
+ break;
++
++ default:
++ /* By default, set_fixmap is used for hardware mappings */
++ pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
++ break;
+ }
+
+ __native_set_fixmap(idx, pte);
+@@ -1845,6 +2273,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
+ #endif
+ }
+
++__init void xen_ident_map_ISA(void)
++{
++ unsigned long pa;
++
++ /*
++ * If we're dom0, then linearly map the ISA machine addresses into
++ * the kernel's address space.
++ */
++ if (!xen_initial_domain())
++ return;
++
++ xen_raw_printk("Xen: setup ISA identity maps\n");
++
++ for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
++ pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
++
++ if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
++ BUG();
++ }
++
++ xen_flush_tlb();
++}
++
+ static __init void xen_post_allocator_init(void)
+ {
+ pv_mmu_ops.set_pte = xen_set_pte;
+@@ -1960,8 +2411,305 @@ void __init xen_init_mmu_ops(void)
+ x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
+ x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
+ pv_mmu_ops = xen_mmu_ops;
++
++ vmap_lazy_unmap = false;
+ }
+
++/* Protected by xen_reservation_lock. */
++#define MAX_CONTIG_ORDER 9 /* 2MB */
++static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
++
++#define VOID_PTE (mfn_pte(0, __pgprot(0)))
++static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
++ unsigned long *in_frames,
++ unsigned long *out_frames)
++{
++ int i;
++ struct multicall_space mcs;
++
++ xen_mc_batch();
++ for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
++ mcs = __xen_mc_entry(0);
++
++ if (in_frames)
++ in_frames[i] = virt_to_mfn(vaddr);
++
++ MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
++ set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
++
++ if (out_frames)
++ out_frames[i] = virt_to_pfn(vaddr);
++ }
++ xen_mc_issue(0);
++}
++
++/*
++ * Update the pfn-to-mfn mappings for a virtual address range, either to
++ * point to an array of mfns, or contiguously from a single starting
++ * mfn.
++ */
++static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
++ unsigned long *mfns,
++ unsigned long first_mfn)
++{
++ unsigned i, limit;
++ unsigned long mfn;
++
++ xen_mc_batch();
++
++ limit = 1u << order;
++ for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
++ struct multicall_space mcs;
++ unsigned flags;
++
++ mcs = __xen_mc_entry(0);
++ if (mfns)
++ mfn = mfns[i];
++ else
++ mfn = first_mfn + i;
++
++ if (i < (limit - 1))
++ flags = 0;
++ else {
++ if (order == 0)
++ flags = UVMF_INVLPG | UVMF_ALL;
++ else
++ flags = UVMF_TLB_FLUSH | UVMF_ALL;
++ }
++
++ MULTI_update_va_mapping(mcs.mc, vaddr,
++ mfn_pte(mfn, PAGE_KERNEL), flags);
++
++ set_phys_to_machine(virt_to_pfn(vaddr), mfn);
++ }
++
++ xen_mc_issue(0);
++}
++
++/*
++ * Perform the hypercall to exchange a region of our pfns to point to
++ * memory with the required contiguous alignment. Takes the pfns as
++ * input, and populates mfns as output.
++ *
++ * Returns a success code indicating whether the hypervisor was able to
++ * satisfy the request or not.
++ */
++static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
++ unsigned long *pfns_in,
++ unsigned long extents_out, unsigned int order_out,
++ unsigned long *mfns_out,
++ unsigned int address_bits)
++{
++ long rc;
++ int success;
++
++ struct xen_memory_exchange exchange = {
++ .in = {
++ .nr_extents = extents_in,
++ .extent_order = order_in,
++ .extent_start = pfns_in,
++ .domid = DOMID_SELF
++ },
++ .out = {
++ .nr_extents = extents_out,
++ .extent_order = order_out,
++ .extent_start = mfns_out,
++ .address_bits = address_bits,
++ .domid = DOMID_SELF
++ }
++ };
++
++ BUG_ON(extents_in << order_in != extents_out << order_out);
++
++ rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
++ success = (exchange.nr_exchanged == extents_in);
++
++ BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
++ BUG_ON(success && (rc != 0));
++
++ return success;
++}
++
++int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
++ unsigned int address_bits)
++{
++ unsigned long *in_frames = discontig_frames, out_frame;
++ unsigned long flags;
++ int success;
++
++ /*
++ * Currently an auto-translated guest will not perform I/O, nor will
++ * it require PAE page directories below 4GB. Therefore any calls to
++ * this function are redundant and can be ignored.
++ */
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return 0;
++
++ if (unlikely(order > MAX_CONTIG_ORDER))
++ return -ENOMEM;
++
++ memset((void *) vstart, 0, PAGE_SIZE << order);
++
++ spin_lock_irqsave(&xen_reservation_lock, flags);
++
++ /* 1. Zap current PTEs, remembering MFNs. */
++ xen_zap_pfn_range(vstart, order, in_frames, NULL);
++
++ /* 2. Get a new contiguous memory extent. */
++ out_frame = virt_to_pfn(vstart);
++ success = xen_exchange_memory(1UL << order, 0, in_frames,
++ 1, order, &out_frame,
++ address_bits);
++
++ /* 3. Map the new extent in place of old pages. */
++ if (success)
++ xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
++ else
++ xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
++
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
++
++ return success ? 0 : -ENOMEM;
++}
++EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
++
++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
++{
++ unsigned long *out_frames = discontig_frames, in_frame;
++ unsigned long flags;
++ int success;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return;
++
++ if (unlikely(order > MAX_CONTIG_ORDER))
++ return;
++
++ memset((void *) vstart, 0, PAGE_SIZE << order);
++
++ spin_lock_irqsave(&xen_reservation_lock, flags);
++
++ /* 1. Find start MFN of contiguous extent. */
++ in_frame = virt_to_mfn(vstart);
++
++ /* 2. Zap current PTEs. */
++ xen_zap_pfn_range(vstart, order, NULL, out_frames);
++
++ /* 3. Do the exchange for non-contiguous MFNs. */
++ success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
++ 0, out_frames, 0);
++
++ /* 4. Map new pages in place of old pages. */
++ if (success)
++ xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
++ else
++ xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
++
++ spin_unlock_irqrestore(&xen_reservation_lock, flags);
++}
++EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
++
++#define REMAP_BATCH_SIZE 16
++
++struct remap_data {
++ unsigned long mfn;
++ pgprot_t prot;
++ struct mmu_update *mmu_update;
++};
++
++static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
++ unsigned long addr, void *data)
++{
++ struct remap_data *rmd = data;
++ pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
++
++ rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
++ rmd->mmu_update->val = pte_val_ma(pte);
++ rmd->mmu_update++;
++
++ return 0;
++}
++
++int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
++ unsigned long addr,
++ unsigned long mfn, int nr,
++ pgprot_t prot, unsigned domid)
++{
++ struct remap_data rmd;
++ struct mmu_update mmu_update[REMAP_BATCH_SIZE];
++ int batch;
++ unsigned long range;
++ int err = 0;
++
++ prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
++
++ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
++
++ rmd.mfn = mfn;
++ rmd.prot = prot;
++
++ while (nr) {
++ batch = min(REMAP_BATCH_SIZE, nr);
++ range = (unsigned long)batch << PAGE_SHIFT;
++
++ rmd.mmu_update = mmu_update;
++ err = apply_to_page_range(vma->vm_mm, addr, range,
++ remap_area_mfn_pte_fn, &rmd);
++ if (err)
++ goto out;
++
++ err = -EFAULT;
++ if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
++ goto out;
++
++ nr -= batch;
++ addr += range;
++ }
++
++ err = 0;
++out:
++
++ flush_tlb_all();
++
++ return err;
++}
++EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
++
++#ifdef CONFIG_XEN_PVHVM
++static void xen_hvm_exit_mmap(struct mm_struct *mm)
++{
++ struct xen_hvm_pagetable_dying a;
++ int rc;
++
++ a.domid = DOMID_SELF;
++ a.gpa = __pa(mm->pgd);
++ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
++ WARN_ON_ONCE(rc < 0);
++}
++
++static int is_pagetable_dying_supported(void)
++{
++ struct xen_hvm_pagetable_dying a;
++ int rc = 0;
++
++ a.domid = DOMID_SELF;
++ a.gpa = 0x00;
++ rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
++ if (rc < 0) {
++ printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
++ return 0;
++ }
++ return 1;
++}
++
++void __init xen_hvm_init_mmu_ops(void)
++{
++ if (is_pagetable_dying_supported())
++ pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
++}
++#endif
++
+ #ifdef CONFIG_XEN_DEBUG_FS
+
+ static struct dentry *d_mmu_debug;
+diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
+index 5fe6bc7..537bb9a 100644
+--- a/arch/x86/xen/mmu.h
++++ b/arch/x86/xen/mmu.h
+@@ -12,7 +12,6 @@ enum pt_level {
+
+
+ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+-bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
+
+ void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+@@ -60,4 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+ unsigned long xen_read_cr2_direct(void);
+
+ extern void xen_init_mmu_ops(void);
++extern void xen_hvm_init_mmu_ops(void);
+ #endif /* _XEN_MMU_H */
+diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
+new file mode 100644
+index 0000000..4d55524
+--- /dev/null
++++ b/arch/x86/xen/pci-swiotlb-xen.c
+@@ -0,0 +1,52 @@
++/* Glue code to lib/swiotlb-xen.c */
++
++#include <linux/dma-mapping.h>
++#include <linux/swiotlb.h>
++
++#include <asm/xen/hypervisor.h>
++
++int xen_swiotlb __read_mostly;
++
++static struct dma_map_ops xen_swiotlb_dma_ops = {
++ .mapping_error = xen_swiotlb_dma_mapping_error,
++ .alloc_coherent = xen_swiotlb_alloc_coherent,
++ .free_coherent = xen_swiotlb_free_coherent,
++ .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
++ .sync_single_for_device = xen_swiotlb_sync_single_for_device,
++ .sync_single_range_for_cpu = xen_swiotlb_sync_single_range_for_cpu,
++ .sync_single_range_for_device = xen_swiotlb_sync_single_range_for_device,
++ .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
++ .sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
++ .map_sg = xen_swiotlb_map_sg_attrs,
++ .unmap_sg = xen_swiotlb_unmap_sg_attrs,
++ .map_page = xen_swiotlb_map_page,
++ .unmap_page = xen_swiotlb_unmap_page,
++ .dma_supported = xen_swiotlb_dma_supported,
++};
++
++/*
++ * pci_swiotlb_detect - set swiotlb to 1 if necessary
++ *
++ * This returns non-zero if we are forced to use swiotlb (by the boot
++ * option).
++ */
++int __init pci_xen_swiotlb_detect(void)
++{
++
++ if (xen_pv_domain() && (xen_initial_domain() || swiotlb))
++ xen_swiotlb = 1;
++
++ /* If we are running under Xen, we MUST disable the native SWIOTLB */
++ if (xen_pv_domain())
++ swiotlb = 0;
++
++ return xen_swiotlb;
++}
++
++void __init pci_xen_swiotlb_init(void)
++{
++ if (xen_swiotlb) {
++ xen_swiotlb_init(1);
++ dma_ops = &xen_swiotlb_dma_ops;
++ }
++}
+diff --git a/arch/x86/xen/pci.c b/arch/x86/xen/pci.c
+new file mode 100644
+index 0000000..8ca31f1
+--- /dev/null
++++ b/arch/x86/xen/pci.c
+@@ -0,0 +1,296 @@
++#include <linux/kernel.h>
++#include <linux/acpi.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
++#include <linux/slab.h>
++
++#include <asm/mpspec.h>
++#include <asm/io_apic.h>
++#include <asm/pci_x86.h>
++
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/pci.h>
++
++#include <xen/interface/xen.h>
++#include <xen/events.h>
++
++#include "xen-ops.h"
++
++int xen_register_pirq(u32 gsi, int triggering)
++{
++ int rc, irq;
++ struct physdev_map_pirq map_irq;
++ int shareable = 0;
++ char *name;
++
++ if (!xen_pv_domain())
++ return -1;
++
++ if (triggering == ACPI_EDGE_SENSITIVE) {
++ shareable = 0;
++ name = "ioapic-edge";
++ } else {
++ shareable = 1;
++ name = "ioapic-level";
++ }
++
++ irq = xen_allocate_pirq(gsi, shareable, name);
++
++ printk(KERN_DEBUG "xen: --> irq=%d\n", irq);
++
++ if (irq < 0)
++ goto out;
++
++ map_irq.domid = DOMID_SELF;
++ map_irq.type = MAP_PIRQ_TYPE_GSI;
++ map_irq.index = gsi;
++ map_irq.pirq = irq;
++
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
++ if (rc) {
++ printk(KERN_WARNING "xen map irq failed %d\n", rc);
++ return -1;
++ }
++
++out:
++ return irq;
++}
++
++int xen_register_gsi(u32 gsi, int triggering, int polarity)
++{
++ int rc, irq;
++ struct physdev_setup_gsi setup_gsi;
++
++ if (!xen_pv_domain())
++ return -1;
++
++ printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
++ gsi, triggering, polarity);
++
++ irq = xen_register_pirq(gsi, triggering);
++
++ setup_gsi.gsi = gsi;
++ setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
++ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
++
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
++ if (rc == -EEXIST)
++ printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
++ else if (rc) {
++ printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
++ gsi, rc);
++ }
++
++ return irq;
++}
++
++#ifdef CONFIG_ACPI
++#define BAD_MADT_ENTRY(entry, end) ( \
++ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
++ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
++
++
++static int __init
++xen_acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
++ const unsigned long end)
++{
++ struct acpi_madt_interrupt_override *intsrc = NULL;
++
++ intsrc = (struct acpi_madt_interrupt_override *)header;
++
++ if (BAD_MADT_ENTRY(intsrc, end))
++ return -EINVAL;
++
++ acpi_table_print_madt_entry(header);
++
++ if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
++ int gsi;
++ int trigger, polarity;
++
++ trigger = intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK;
++ polarity = intsrc->inti_flags & ACPI_MADT_POLARITY_MASK;
++
++ /* Command-line over-ride via acpi_sci= */
++ if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
++ trigger = acpi_sci_flags & ACPI_MADT_TRIGGER_MASK;
++
++ if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
++ polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
++
++ printk("xen: sci override: source_irq=%d global_irq=%d trigger=%x polarity=%x\n",
++ intsrc->source_irq, intsrc->global_irq,
++ trigger, polarity);
++
++ switch (polarity) {
++ case ACPI_MADT_POLARITY_CONFORMS:
++ case ACPI_MADT_POLARITY_ACTIVE_LOW:
++ polarity = ACPI_ACTIVE_LOW;
++ break;
++
++ case ACPI_MADT_POLARITY_ACTIVE_HIGH:
++ polarity = ACPI_ACTIVE_HIGH;
++ break;
++
++ default:
++ return 0;
++ }
++
++ switch (trigger) {
++ case ACPI_MADT_TRIGGER_CONFORMS:
++ case ACPI_MADT_TRIGGER_LEVEL:
++ trigger = ACPI_LEVEL_SENSITIVE;
++ break;
++
++ case ACPI_MADT_TRIGGER_EDGE:
++ trigger = ACPI_EDGE_SENSITIVE;
++ break;
++
++ default:
++ return 0;
++ }
++
++ gsi = xen_register_gsi(intsrc->global_irq,
++ trigger, polarity);
++ /*
++ * stash over-ride to indicate we've been here
++ * and for later update of acpi_gbl_FADT
++ */
++ acpi_sci_override_gsi = gsi;
++
++ printk("xen: acpi sci %d\n", gsi);
++ }
++
++ return 0;
++}
++
++static __init void xen_setup_acpi_sci(void)
++{
++ acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
++ xen_acpi_parse_int_src_ovr,
++ nr_irqs);
++}
++#else
++static __init void xen_setup_acpi_sci(void)
++{
++}
++#endif
++
++void __init xen_setup_pirqs(void)
++{
++ int irq;
++
++ if (0 == nr_ioapics) {
++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
++ xen_allocate_pirq(irq, 0, "xt-pic");
++ return;
++ }
++
++ /* Pre-allocate legacy irqs */
++ for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
++ int trigger, polarity;
++
++ if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
++ continue;
++
++ xen_register_pirq(irq,
++ trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
++ }
++
++ xen_setup_acpi_sci();
++}
++
++#ifdef CONFIG_PCI_MSI
++int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
++{
++ int irq, ret;
++ struct msi_desc *msidesc;
++
++ list_for_each_entry(msidesc, &dev->msi_list, list) {
++ irq = xen_create_msi_irq(dev, msidesc, type);
++ if (irq < 0)
++ return -1;
++
++ ret = set_irq_msi(irq, msidesc);
++ if (ret)
++ goto error;
++ }
++ return 0;
++
++error:
++ xen_destroy_irq(irq);
++ return ret;
++}
++#endif
++
++struct xen_device_domain_owner {
++ domid_t domain;
++ struct pci_dev *dev;
++ struct list_head list;
++};
++
++static DEFINE_SPINLOCK(dev_domain_list_spinlock);
++static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
++
++static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
++{
++ struct xen_device_domain_owner *owner;
++
++ list_for_each_entry(owner, &dev_domain_list, list) {
++ if (owner->dev == dev)
++ return owner;
++ }
++ return NULL;
++}
++
++int xen_find_device_domain_owner(struct pci_dev *dev)
++{
++ struct xen_device_domain_owner *owner;
++ int domain = -ENODEV;
++
++ spin_lock(&dev_domain_list_spinlock);
++ owner = find_device(dev);
++ if (owner)
++ domain = owner->domain;
++ spin_unlock(&dev_domain_list_spinlock);
++ return domain;
++}
++EXPORT_SYMBOL(xen_find_device_domain_owner);
++
++int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
++{
++ struct xen_device_domain_owner *owner;
++
++ owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
++ if (!owner)
++ return -ENODEV;
++
++ spin_lock(&dev_domain_list_spinlock);
++ if (find_device(dev)) {
++ spin_unlock(&dev_domain_list_spinlock);
++ kfree(owner);
++ return -EEXIST;
++ }
++ owner->domain = domain;
++ owner->dev = dev;
++ list_add_tail(&owner->list, &dev_domain_list);
++ spin_unlock(&dev_domain_list_spinlock);
++ return 0;
++}
++EXPORT_SYMBOL(xen_register_device_domain_owner);
++
++int xen_unregister_device_domain_owner(struct pci_dev *dev)
++{
++ struct xen_device_domain_owner *owner;
++
++ spin_lock(&dev_domain_list_spinlock);
++ owner = find_device(dev);
++ if (!owner) {
++ spin_unlock(&dev_domain_list_spinlock);
++ return -ENODEV;
++ }
++ list_del(&owner->list);
++ spin_unlock(&dev_domain_list_spinlock);
++ kfree(owner);
++ return 0;
++}
++EXPORT_SYMBOL(xen_unregister_device_domain_owner);
+diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
+new file mode 100644
+index 0000000..0f45638
+--- /dev/null
++++ b/arch/x86/xen/platform-pci-unplug.c
+@@ -0,0 +1,143 @@
++/******************************************************************************
++ * platform-pci-unplug.c
++ *
++ * Xen platform PCI device driver
++ * Copyright (c) 2010, Citrix
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms and conditions of the GNU General Public License,
++ * version 2, as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ * more details.
++ *
++ * You should have received a copy of the GNU General Public License along with
++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++#include <linux/init.h>
++#include <linux/io.h>
++#include <linux/module.h>
++
++#include <xen/platform_pci.h>
++
++#define XEN_PLATFORM_ERR_MAGIC -1
++#define XEN_PLATFORM_ERR_PROTOCOL -2
++#define XEN_PLATFORM_ERR_BLACKLIST -3
++
++/* store the value of xen_emul_unplug after the unplug is done */
++int xen_platform_pci_unplug;
++EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
++#ifdef CONFIG_XEN_PVHVM
++static int xen_emul_unplug;
++
++static int __init check_platform_magic(void)
++{
++ short magic;
++ char protocol;
++
++ magic = inw(XEN_IOPORT_MAGIC);
++ if (magic != XEN_IOPORT_MAGIC_VAL) {
++ printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
++ return XEN_PLATFORM_ERR_MAGIC;
++ }
++
++ protocol = inb(XEN_IOPORT_PROTOVER);
++
++ printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
++ protocol);
++
++ switch (protocol) {
++ case 1:
++ outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
++ outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
++ if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
++ printk(KERN_ERR "Xen Platform: blacklisted by host\n");
++ return XEN_PLATFORM_ERR_BLACKLIST;
++ }
++ break;
++ default:
++ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
++ return XEN_PLATFORM_ERR_PROTOCOL;
++ }
++
++ return 0;
++}
++
++void __init xen_unplug_emulated_devices(void)
++{
++ int r;
++
++ /* user explicitly requested no unplug */
++ if (xen_emul_unplug & XEN_UNPLUG_NEVER)
++ return;
++ /* check the version of the xen platform PCI device */
++ r = check_platform_magic();
++ /* If the version matches, enable the Xen platform PCI driver.
++ * Also enable the Xen platform PCI driver if the host does
++ * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
++ * but the user told us that unplugging is unnecessary. */
++ if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
++ (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
++ return;
++ /* Set the default value of xen_emul_unplug depending on whether or
++ * not the Xen PV frontends and the Xen platform PCI driver have
++ * been compiled for this kernel (modules or built-in are both OK). */
++ if (!xen_emul_unplug) {
++ if (xen_must_unplug_nics()) {
++ printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
++ "been compiled for this kernel: unplug emulated NICs.\n");
++ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
++ }
++ if (xen_must_unplug_disks()) {
++ printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
++ "been compiled for this kernel: unplug emulated disks.\n"
++ "You might have to change the root device\n"
++ "from /dev/hd[a-d] to /dev/xvd[a-d]\n"
++ "in your root= kernel command line option\n");
++ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
++ }
++ }
++ /* Now unplug the emulated devices */
++ if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
++ outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
++ xen_platform_pci_unplug = xen_emul_unplug;
++}
++
++static int __init parse_xen_emul_unplug(char *arg)
++{
++ char *p, *q;
++ int l;
++
++ for (p = arg; p; p = q) {
++ q = strchr(p, ',');
++ if (q) {
++ l = q - p;
++ q++;
++ } else {
++ l = strlen(p);
++ }
++ if (!strncmp(p, "all", l))
++ xen_emul_unplug |= XEN_UNPLUG_ALL;
++ else if (!strncmp(p, "ide-disks", l))
++ xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
++ else if (!strncmp(p, "aux-ide-disks", l))
++ xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
++ else if (!strncmp(p, "nics", l))
++ xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
++ else if (!strncmp(p, "unnecessary", l))
++ xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
++ else if (!strncmp(p, "never", l))
++ xen_emul_unplug |= XEN_UNPLUG_NEVER;
++ else
++ printk(KERN_WARNING "unrecognised option '%s' "
++ "in parameter 'xen_emul_unplug'\n", p);
++ }
++ return 0;
++}
++early_param("xen_emul_unplug", parse_xen_emul_unplug);
++#endif
+diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
+index ad0047f..915b0c3 100644
+--- a/arch/x86/xen/setup.c
++++ b/arch/x86/xen/setup.c
+@@ -10,6 +10,7 @@
+ #include <linux/pm.h>
+
+ #include <asm/elf.h>
++#include <asm/hpet.h>
+ #include <asm/vdso.h>
+ #include <asm/e820.h>
+ #include <asm/setup.h>
+@@ -19,7 +20,9 @@
+
+ #include <xen/page.h>
+ #include <xen/interface/callback.h>
++#include <xen/interface/memory.h>
+ #include <xen/interface/physdev.h>
++#include <xen/interface/memory.h>
+ #include <xen/features.h>
+
+ #include "xen-ops.h"
+@@ -32,25 +35,177 @@ extern void xen_sysenter_target(void);
+ extern void xen_syscall_target(void);
+ extern void xen_syscall32_target(void);
+
++/* Amount of extra memory space we add to the e820 ranges */
++phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
++
++/*
++ * The maximum amount of extra memory compared to the base size. The
++ * main scaling factor is the size of struct page. At extreme ratios
++ * of base:extra, all the base memory can be filled with page
++ * structures for the extra memory, leaving no space for anything
++ * else.
++ *
++ * 10x seems like a reasonable balance between scaling flexibility and
++ * leaving a practically usable system.
++ */
++#define EXTRA_MEM_RATIO (10)
++
++static __init void xen_add_extra_mem(unsigned long pages)
++{
++ u64 size = (u64)pages * PAGE_SIZE;
++ u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
++
++ if (!pages)
++ return;
++
++ e820_add_region(extra_start, size, E820_RAM);
++ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
++
++ reserve_early(extra_start, extra_start + size, "XEN EXTRA");
++
++ xen_extra_mem_size += size;
++
++ xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
++}
++
++static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
++ phys_addr_t end_addr)
++{
++ struct xen_memory_reservation reservation = {
++ .address_bits = 0,
++ .extent_order = 0,
++ .domid = DOMID_SELF
++ };
++ unsigned long start, end;
++ unsigned long len = 0;
++ unsigned long pfn;
++ int ret;
++
++ start = PFN_UP(start_addr);
++ end = PFN_DOWN(end_addr);
++
++ if (end <= start)
++ return 0;
++
++ printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ",
++ start, end);
++ for (pfn = start; pfn < end; pfn++) {
++ unsigned long mfn = pfn_to_mfn(pfn);
++
++ /* Make sure pfn exists to start with */
++ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
++ continue;
++
++ set_xen_guest_handle(reservation.extent_start, &mfn);
++ reservation.nr_extents = 1;
++
++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++ &reservation);
++ WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
++ start, end, ret);
++ if (ret == 1) {
++ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++ len++;
++ }
++ }
++ printk(KERN_CONT "%ld pages freed\n", len);
++
++ return len;
++}
++
++static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
++ const struct e820map *e820)
++{
++ phys_addr_t max_addr = PFN_PHYS(max_pfn);
++ phys_addr_t last_end = 0;
++ unsigned long released = 0;
++ int i;
++
++ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
++ phys_addr_t end = e820->map[i].addr;
++ end = min(max_addr, end);
++
++ released += xen_release_chunk(last_end, end);
++ last_end = e820->map[i].addr + e820->map[i].size;
++ }
++
++ if (last_end < max_addr)
++ released += xen_release_chunk(last_end, max_addr);
++
++ printk(KERN_INFO "released %ld pages of unused memory\n", released);
++ return released;
++}
+
+ /**
+ * machine_specific_memory_setup - Hook for machine specific memory setup.
+ **/
+-
+ char * __init xen_memory_setup(void)
+ {
++ static struct e820entry map[E820MAX] __initdata;
++
+ unsigned long max_pfn = xen_start_info->nr_pages;
++ unsigned long long mem_end;
++ int rc;
++ struct xen_memory_map memmap;
++ unsigned long extra_pages = 0;
++ unsigned long extra_limit;
++ int op;
++ int i;
+
+ max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
++ mem_end = PFN_PHYS(max_pfn);
++
++ memmap.nr_entries = E820MAX;
++ set_xen_guest_handle(memmap.buffer, map);
++
++ op = xen_initial_domain() ?
++ XENMEM_machine_memory_map :
++ XENMEM_memory_map;
++ rc = HYPERVISOR_memory_op(op, &memmap);
++ if (rc == -ENOSYS) {
++ memmap.nr_entries = 1;
++ map[0].addr = 0ULL;
++ map[0].size = mem_end;
++ /* 8MB slack (to balance backend allocations). */
++ map[0].size += 8ULL << 20;
++ map[0].type = E820_RAM;
++ rc = 0;
++ }
++ BUG_ON(rc);
+
+ e820.nr_map = 0;
++ xen_extra_mem_start = mem_end;
++ for (i = 0; i < memmap.nr_entries; i++) {
++ unsigned long long end = map[i].addr + map[i].size;
++
++ if (map[i].type == E820_RAM) {
++ if (map[i].addr < mem_end && end > mem_end) {
++ /* Truncate region to max_mem. */
++ u64 delta = end - mem_end;
++
++ map[i].size -= delta;
++ extra_pages += PFN_DOWN(delta);
++
++ end = mem_end;
++ }
++ }
+
+- e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
++ if (end > xen_extra_mem_start)
++ xen_extra_mem_start = end;
++
++ /* If region is non-RAM or below mem_end, add what remains */
++ if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
++ map[i].size > 0)
++ e820_add_region(map[i].addr, map[i].size, map[i].type);
++ }
+
+ /*
+ * Even though this is normal, usable memory under Xen, reserve
+ * ISA memory anyway because too many things think they can poke
+ * about in there.
++ *
++ * In a dom0 kernel, this region is identity mapped with the
++ * hardware ISA area, so it really is out of bounds.
+ */
+ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
+ E820_RESERVED);
+@@ -67,6 +222,29 @@ char * __init xen_memory_setup(void)
+
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
++ extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
++
++ /*
++ * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
++ * factor of the base size. On non-highmem systems, the base
++ * size is the full initial memory allocation; on highmem it
++ * is limited to the max size of lowmem, so that it doesn't
++ * get completely filled.
++ *
++ * In principle there could be a problem in lowmem systems if
++ * the initial memory is also very large with respect to
++ * lowmem, but we won't try to deal with that here.
++ */
++ extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
++ max_pfn + extra_pages);
++
++ if (extra_limit >= max_pfn)
++ extra_pages = extra_limit - max_pfn;
++ else
++ extra_pages = 0;
++
++ xen_add_extra_mem(extra_pages);
++
+ return "Xen";
+ }
+
+@@ -156,6 +334,8 @@ void __init xen_arch_setup(void)
+ struct physdev_set_iopl set_iopl;
+ int rc;
+
++ xen_panic_handler_init();
++
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
+
+@@ -182,13 +362,17 @@ void __init xen_arch_setup(void)
+ }
+ #endif
+
++ /*
++ * The Xen hypervisor uses the HPET to wake CPUs from deep C-states,
++ * so HPET usage in dom0 must be forbidden.
++ */
++ disable_hpet(NULL);
++
+ memcpy(boot_command_line, xen_start_info->cmd_line,
+ MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+ COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+ pm_idle = xen_idle;
+
+- paravirt_disable_iospace();
+-
+ fiddle_vdso();
+ }
+diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
+index 360f8d8..8a390dc 100644
+--- a/arch/x86/xen/smp.c
++++ b/arch/x86/xen/smp.c
+@@ -178,11 +178,18 @@ static void __init xen_smp_prepare_boot_cpu(void)
+ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+ {
+ unsigned cpu;
++ unsigned int i;
+
+ xen_init_lock_cpu(0);
+
+ smp_store_cpu_info(0);
+ cpu_data(0).x86_max_cores = 1;
++
++ for_each_possible_cpu(i) {
++ zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
++ zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
++ zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
++ }
+ set_cpu_sibling_map(0);
+
+ if (xen_smp_intr_init(0))
+@@ -299,6 +306,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
+ xen_setup_timer(cpu);
+ xen_init_lock_cpu(cpu);
+
++ cpumask_set_cpu(cpu, cpu_callout_mask);
++
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+
+ /* make sure interrupts start blocked */
+@@ -392,6 +401,8 @@ static void stop_self(void *v)
+ load_cr3(swapper_pg_dir);
+ /* should set up a minimal gdt */
+
++ set_cpu_online(cpu, false);
++
+ HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
+ BUG();
+ }
+diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
+index a9c6611..1d789d5 100644
+--- a/arch/x86/xen/suspend.c
++++ b/arch/x86/xen/suspend.c
+@@ -26,6 +26,18 @@ void xen_pre_suspend(void)
+ BUG();
+ }
+
++void xen_hvm_post_suspend(int suspend_cancelled)
++{
++ int cpu;
++ xen_hvm_init_shared_info();
++ xen_callback_vector();
++ if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
++ for_each_online_cpu(cpu) {
++ xen_setup_runstate_info(cpu);
++ }
++ }
++}
++
+ void xen_post_suspend(int suspend_cancelled)
+ {
+ xen_build_mfn_list_list();
+diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
+index 8e04980..30b7b44 100644
+--- a/arch/x86/xen/time.c
++++ b/arch/x86/xen/time.c
+@@ -19,6 +19,7 @@
+ #include <asm/xen/hypercall.h>
+
+ #include <xen/events.h>
++#include <xen/features.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/vcpu.h>
+
+@@ -155,7 +156,7 @@ static void do_stolen_accounting(void)
+ }
+
+ /* Get the TSC speed from Xen */
+-unsigned long xen_tsc_khz(void)
++static unsigned long xen_tsc_khz(void)
+ {
+ struct pvclock_vcpu_time_info *info =
+ &HYPERVISOR_shared_info->vcpu_info[0].time;
+@@ -190,7 +191,7 @@ static void xen_read_wallclock(struct timespec *ts)
+ put_cpu_var(xen_vcpu);
+ }
+
+-unsigned long xen_get_wallclock(void)
++static unsigned long xen_get_wallclock(void)
+ {
+ struct timespec ts;
+
+@@ -198,10 +199,24 @@ unsigned long xen_get_wallclock(void)
+ return ts.tv_sec;
+ }
+
+-int xen_set_wallclock(unsigned long now)
++static int xen_set_wallclock(unsigned long now)
+ {
++ struct xen_platform_op op;
++ int rc;
++
+ /* do nothing for domU */
+- return -1;
++ if (!xen_initial_domain())
++ return -1;
++
++ op.cmd = XENPF_settime;
++ op.u.settime.secs = now;
++ op.u.settime.nsecs = 0;
++ op.u.settime.system_time = xen_clocksource_read();
++
++ rc = HYPERVISOR_dom0_op(&op);
++ WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
++
++ return rc;
+ }
+
+ static struct clocksource xen_clocksource __read_mostly = {
+@@ -403,6 +418,8 @@ void xen_setup_timer(int cpu)
+
+ evt->cpumask = cpumask_of(cpu);
+ evt->irq = irq;
++
++ xen_setup_runstate_info(cpu);
+ }
+
+ void xen_teardown_timer(int cpu)
+@@ -433,7 +450,7 @@ void xen_timer_resume(void)
+ }
+ }
+
+-__init void xen_time_init(void)
++static __init void xen_time_init(void)
+ {
+ int cpu = smp_processor_id();
+
+@@ -457,3 +474,51 @@ __init void xen_time_init(void)
+ xen_setup_timer(cpu);
+ xen_setup_cpu_clockevents();
+ }
++
++static const struct pv_time_ops xen_time_ops __initdata = {
++ .sched_clock = xen_clocksource_read,
++};
++
++__init void xen_init_time_ops(void)
++{
++ pv_time_ops = xen_time_ops;
++
++ x86_init.timers.timer_init = xen_time_init;
++ x86_init.timers.setup_percpu_clockev = x86_init_noop;
++ x86_cpuinit.setup_percpu_clockev = x86_init_noop;
++
++ x86_platform.calibrate_tsc = xen_tsc_khz;
++ x86_platform.get_wallclock = xen_get_wallclock;
++ x86_platform.set_wallclock = xen_set_wallclock;
++}
++
++#ifdef CONFIG_XEN_PVHVM
++static void xen_hvm_setup_cpu_clockevents(void)
++{
++ int cpu = smp_processor_id();
++ xen_setup_runstate_info(cpu);
++ xen_setup_timer(cpu);
++ xen_setup_cpu_clockevents();
++}
++
++__init void xen_hvm_init_time_ops(void)
++{
++ /* A vector callback is needed, otherwise we cannot receive interrupts
++ * on any cpu other than 0 */
++ if (!xen_have_vector_callback && num_present_cpus() > 1)
++ return;
++ if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
++ printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
++ "disable pv timer\n");
++ return;
++ }
++
++ pv_time_ops = xen_time_ops;
++ x86_init.timers.setup_percpu_clockev = xen_time_init;
++ x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
++
++ x86_platform.calibrate_tsc = xen_tsc_khz;
++ x86_platform.get_wallclock = xen_get_wallclock;
++ x86_platform.set_wallclock = xen_set_wallclock;
++}
++#endif
+diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
+new file mode 100644
+index 0000000..1cd7f4d
+--- /dev/null
++++ b/arch/x86/xen/vga.c
+@@ -0,0 +1,67 @@
++#include <linux/screen_info.h>
++#include <linux/init.h>
++
++#include <asm/bootparam.h>
++#include <asm/setup.h>
++
++#include <xen/interface/xen.h>
++
++#include "xen-ops.h"
++
++void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
++{
++ struct screen_info *screen_info = &boot_params.screen_info;
++
++ /* This is drawn from a dump from vgacon:startup in
++ * standard Linux. */
++ screen_info->orig_video_mode = 3;
++ screen_info->orig_video_isVGA = 1;
++ screen_info->orig_video_lines = 25;
++ screen_info->orig_video_cols = 80;
++ screen_info->orig_video_ega_bx = 3;
++ screen_info->orig_video_points = 16;
++ screen_info->orig_y = screen_info->orig_video_lines - 1;
++
++ switch (info->video_type) {
++ case XEN_VGATYPE_TEXT_MODE_3:
++ if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
++ + sizeof(info->u.text_mode_3))
++ break;
++ screen_info->orig_video_lines = info->u.text_mode_3.rows;
++ screen_info->orig_video_cols = info->u.text_mode_3.columns;
++ screen_info->orig_x = info->u.text_mode_3.cursor_x;
++ screen_info->orig_y = info->u.text_mode_3.cursor_y;
++ screen_info->orig_video_points =
++ info->u.text_mode_3.font_height;
++ break;
++
++ case XEN_VGATYPE_VESA_LFB:
++ if (size < offsetof(struct dom0_vga_console_info,
++ u.vesa_lfb.gbl_caps))
++ break;
++ screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
++ screen_info->lfb_width = info->u.vesa_lfb.width;
++ screen_info->lfb_height = info->u.vesa_lfb.height;
++ screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
++ screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
++ screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
++ screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
++ screen_info->red_size = info->u.vesa_lfb.red_size;
++ screen_info->red_pos = info->u.vesa_lfb.red_pos;
++ screen_info->green_size = info->u.vesa_lfb.green_size;
++ screen_info->green_pos = info->u.vesa_lfb.green_pos;
++ screen_info->blue_size = info->u.vesa_lfb.blue_size;
++ screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
++ screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
++ screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
++ if (size >= offsetof(struct dom0_vga_console_info,
++ u.vesa_lfb.gbl_caps)
++ + sizeof(info->u.vesa_lfb.gbl_caps))
++ screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
++ if (size >= offsetof(struct dom0_vga_console_info,
++ u.vesa_lfb.mode_attrs)
++ + sizeof(info->u.vesa_lfb.mode_attrs))
++ screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
++ break;
++ }
++}
+diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
+index f9153a3..ebbee21 100644
+--- a/arch/x86/xen/xen-ops.h
++++ b/arch/x86/xen/xen-ops.h
+@@ -30,6 +30,10 @@ void xen_setup_machphys_mapping(void);
+ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+ void xen_ident_map_ISA(void);
+ void xen_reserve_top(void);
++void xen_ident_map_ISA(void);
++extern unsigned long xen_max_p2m_pfn;
++
++void xen_set_pat(u64);
+
+ char * __init xen_memory_setup(void);
+ void __init xen_arch_setup(void);
+@@ -38,6 +42,10 @@ void xen_enable_sysenter(void);
+ void xen_enable_syscall(void);
+ void xen_vcpu_restore(void);
+
++void xen_callback_vector(void);
++void xen_hvm_init_shared_info(void);
++void __init xen_unplug_emulated_devices(void);
++
+ void __init xen_build_dynamic_phys_to_machine(void);
+
+ void xen_init_irq_ops(void);
+@@ -46,11 +54,8 @@ void xen_setup_runstate_info(int cpu);
+ void xen_teardown_timer(int cpu);
+ cycle_t xen_clocksource_read(void);
+ void xen_setup_cpu_clockevents(void);
+-unsigned long xen_tsc_khz(void);
+-void __init xen_time_init(void);
+-unsigned long xen_get_wallclock(void);
+-int xen_set_wallclock(unsigned long time);
+-unsigned long long xen_sched_clock(void);
++void __init xen_init_time_ops(void);
++void __init xen_hvm_init_time_ops(void);
+
+ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
+
+@@ -82,6 +87,23 @@ static inline void xen_uninit_lock_cpu(int cpu)
+ }
+ #endif
+
++struct dom0_vga_console_info;
++
++#ifdef CONFIG_XEN_DOM0
++void xen_init_vga(const struct dom0_vga_console_info *, size_t size);
++#else
++static inline void xen_init_vga(const struct dom0_vga_console_info *info,
++ size_t size)
++{
++}
++#endif
++
++#ifdef CONFIG_XEN_DOM0
++void xen_init_apic(void);
++#else
++static inline void xen_init_apic(void) {}
++#endif
++
+ /* Declare an asm function, along with symbols needed to make it
+ inlineable */
+ #define DECL_ASM(ret, name, ...) \
+@@ -101,4 +123,6 @@ void xen_sysret32(void);
+ void xen_sysret64(void);
+ void xen_adjust_exception_frame(void);
+
++extern int xen_panic_handler_init(void);
++
+ #endif /* XEN_OPS_H */
+diff --git a/block/blk-core.c b/block/blk-core.c
+index 71da511..32d305c 100644
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -439,6 +439,7 @@ void blk_put_queue(struct request_queue *q)
+ {
+ kobject_put(&q->kobj);
+ }
++EXPORT_SYMBOL_GPL(blk_put_queue);
+
+ void blk_cleanup_queue(struct request_queue *q)
+ {
+@@ -612,6 +613,7 @@ int blk_get_queue(struct request_queue *q)
+
+ return 1;
+ }
++EXPORT_SYMBOL_GPL(blk_get_queue);
+
+ static inline void blk_free_request(struct request_queue *q, struct request *rq)
+ {
+diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
+index 7702118..1be123c 100644
+--- a/drivers/acpi/Makefile
++++ b/drivers/acpi/Makefile
+@@ -61,6 +61,7 @@ obj-$(CONFIG_ACPI_POWER_METER) += power_meter.o
+ # processor has its own "processor." module_param namespace
+ processor-y := processor_core.o processor_throttling.o
+ processor-y += processor_idle.o processor_thermal.o
++processor-y += processor_xen.o
+ processor-$(CONFIG_CPU_FREQ) += processor_perflib.o
+
+ obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
+diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
+index 28ccdbc..b0f9ed6 100644
+--- a/drivers/acpi/acpi_memhotplug.c
++++ b/drivers/acpi/acpi_memhotplug.c
+@@ -31,6 +31,7 @@
+ #include <linux/types.h>
+ #include <linux/memory_hotplug.h>
+ #include <acpi/acpi_drivers.h>
++#include <xen/acpi.h>
+
+ #define ACPI_MEMORY_DEVICE_CLASS "memory"
+ #define ACPI_MEMORY_DEVICE_HID "PNP0C80"
+@@ -70,21 +71,6 @@ static struct acpi_driver acpi_memory_device_driver = {
+ },
+ };
+
+-struct acpi_memory_info {
+- struct list_head list;
+- u64 start_addr; /* Memory Range start physical addr */
+- u64 length; /* Memory Range length */
+- unsigned short caching; /* memory cache attribute */
+- unsigned short write_protect; /* memory read/write attribute */
+- unsigned int enabled:1;
+-};
+-
+-struct acpi_memory_device {
+- struct acpi_device * device;
+- unsigned int state; /* State of the memory device */
+- struct list_head res_list;
+-};
+-
+ static int acpi_hotmem_initialized;
+
+ static acpi_status
+@@ -228,6 +214,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
+ return result;
+ }
+
++ if (xen_initial_domain())
++ return xen_hotadd_memory(mem_device);
++
+ node = acpi_get_node(mem_device->device->handle);
+ /*
+ * Tell the VM there is more memory here...
+diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c
+index cc22f9a..747d96f 100644
+--- a/drivers/acpi/acpica/hwsleep.c
++++ b/drivers/acpi/acpica/hwsleep.c
+@@ -47,6 +47,9 @@
+ #include "actables.h"
+ #include <linux/tboot.h>
+
++#include <xen/acpi.h>
++#include <asm/xen/hypervisor.h>
++
+ #define _COMPONENT ACPI_HARDWARE
+ ACPI_MODULE_NAME("hwsleep")
+
+@@ -346,6 +349,19 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state)
+ tboot_sleep(sleep_state, pm1a_control, pm1b_control);
+
+ /* Write #2: Write both SLP_TYP + SLP_EN */
++ if (xen_pv_acpi()) {
++ int err;
++
++ err = acpi_notify_hypervisor_state(sleep_state,
++ pm1a_control, pm1b_control);
++ if (err) {
++ ACPI_DEBUG_PRINT((ACPI_DB_INIT,
++ "Hypervisor failure [%d]\n", err));
++ return_ACPI_STATUS(AE_ERROR);
++ }
++
++ return_ACPI_STATUS(AE_OK);
++ }
+
+ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control);
+ if (ACPI_FAILURE(status)) {
+diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
+index ec742a4..492a899 100644
+--- a/drivers/acpi/processor_core.c
++++ b/drivers/acpi/processor_core.c
+@@ -58,6 +58,7 @@
+ #include <acpi/acpi_bus.h>
+ #include <acpi/acpi_drivers.h>
+ #include <acpi/processor.h>
++#include <xen/acpi.h>
+
+ #define PREFIX "ACPI: "
+
+@@ -81,11 +82,9 @@ MODULE_DESCRIPTION("ACPI Processor Driver");
+ MODULE_LICENSE("GPL");
+
+ static int acpi_processor_add(struct acpi_device *device);
+-static int acpi_processor_remove(struct acpi_device *device, int type);
+ #ifdef CONFIG_ACPI_PROCFS
+ static int acpi_processor_info_open_fs(struct inode *inode, struct file *file);
+ #endif
+-static void acpi_processor_notify(struct acpi_device *device, u32 event);
+ static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu);
+ static int acpi_processor_handle_eject(struct acpi_processor *pr);
+
+@@ -253,7 +252,7 @@ static int acpi_processor_errata_piix4(struct pci_dev *dev)
+ return 0;
+ }
+
+-static int acpi_processor_errata(struct acpi_processor *pr)
++int acpi_processor_errata(struct acpi_processor *pr)
+ {
+ int result = 0;
+ struct pci_dev *dev = NULL;
+@@ -284,7 +283,7 @@ static int acpi_processor_errata(struct acpi_processor *pr)
+ * _PDC is required for a BIOS-OS handshake for most of the newer
+ * ACPI processor features.
+ */
+-static int acpi_processor_set_pdc(struct acpi_processor *pr)
++int acpi_processor_set_pdc(struct acpi_processor *pr)
+ {
+ struct acpi_object_list *pdc_in = pr->pdc;
+ acpi_status status = AE_OK;
+@@ -353,7 +352,7 @@ static int acpi_processor_info_open_fs(struct inode *inode, struct file *file)
+ PDE(inode)->data);
+ }
+
+-static int acpi_processor_add_fs(struct acpi_device *device)
++int acpi_processor_add_fs(struct acpi_device *device)
+ {
+ struct proc_dir_entry *entry = NULL;
+
+@@ -392,7 +391,7 @@ static int acpi_processor_add_fs(struct acpi_device *device)
+ return -EIO;
+ return 0;
+ }
+-static int acpi_processor_remove_fs(struct acpi_device *device)
++int acpi_processor_remove_fs(struct acpi_device *device)
+ {
+
+ if (acpi_device_dir(device)) {
+@@ -408,15 +407,6 @@ static int acpi_processor_remove_fs(struct acpi_device *device)
+
+ return 0;
+ }
+-#else
+-static inline int acpi_processor_add_fs(struct acpi_device *device)
+-{
+- return 0;
+-}
+-static inline int acpi_processor_remove_fs(struct acpi_device *device)
+-{
+- return 0;
+-}
+ #endif
+
+ /* Use the acpiid in MADT to map cpus in case of SMP */
+@@ -711,7 +701,7 @@ static int acpi_processor_get_info(struct acpi_device *device)
+
+ static DEFINE_PER_CPU(void *, processor_device_array);
+
+-static void acpi_processor_notify(struct acpi_device *device, u32 event)
++void acpi_processor_notify(struct acpi_device *device, u32 event)
+ {
+ struct acpi_processor *pr = acpi_driver_data(device);
+ int saved;
+@@ -879,7 +869,7 @@ err_free_cpumask:
+ return result;
+ }
+
+-static int acpi_processor_remove(struct acpi_device *device, int type)
++int acpi_processor_remove(struct acpi_device *device, int type)
+ {
+ struct acpi_processor *pr = NULL;
+
+@@ -1154,7 +1144,11 @@ static int __init acpi_processor_init(void)
+ if (result < 0)
+ goto out_proc;
+
+- result = acpi_bus_register_driver(&acpi_processor_driver);
++ if (xen_initial_domain())
++ result = xen_acpi_processor_init();
++ else
++ result = acpi_bus_register_driver(&acpi_processor_driver);
++
+ if (result < 0)
+ goto out_cpuidle;
+
+@@ -1190,7 +1184,10 @@ static void __exit acpi_processor_exit(void)
+
+ acpi_processor_uninstall_hotplug_notify();
+
+- acpi_bus_unregister_driver(&acpi_processor_driver);
++ if (xen_initial_domain())
++ xen_acpi_processor_exit();
++ else
++ acpi_bus_unregister_driver(&acpi_processor_driver);
+
+ cpuidle_unregister_driver(&acpi_idle_driver);
+
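
The processor_core.c changes select the ACPI processor driver at init time: the initial domain registers the Xen-aware driver, everything else keeps the stock one, and module exit has to make the same choice. A small sketch of that symmetric init/exit selection follows, with the register/unregister calls reduced to stubs standing in for acpi_bus_register_driver() and xen_acpi_processor_init()/exit().

    #include <stdio.h>
    #include <stdbool.h>

    /* Stubs for the real registration calls. */
    static bool initial_domain(void) { return true; }
    static int register_xen_driver(void)       { puts("register xen driver");      return 0; }
    static int register_native_driver(void)    { puts("register native driver");   return 0; }
    static void unregister_xen_driver(void)    { puts("unregister xen driver"); }
    static void unregister_native_driver(void) { puts("unregister native driver"); }

    /* init and exit must agree on which driver was registered, so both
     * evaluate the same predicate, exactly as the patched init/exit do. */
    static int processor_init(void)
    {
        return initial_domain() ? register_xen_driver()
                                : register_native_driver();
    }

    static void processor_exit(void)
    {
        if (initial_domain())
            unregister_xen_driver();
        else
            unregister_native_driver();
    }

    int main(void)
    {
        if (processor_init())
            return 1;
        processor_exit();
        return 0;
    }
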
+diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
+index a6ad608..3c32e87 100644
+--- a/drivers/acpi/processor_idle.c
++++ b/drivers/acpi/processor_idle.c
+@@ -58,6 +58,7 @@
+
+ #include <acpi/acpi_bus.h>
+ #include <acpi/processor.h>
++#include <xen/acpi.h>
+ #include <asm/processor.h>
+
+ #define PREFIX "ACPI: "
+@@ -439,7 +440,8 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
+ cx.entry_method = ACPI_CSTATE_HALT;
+ snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
+ } else {
+- continue;
++ if (!xen_initial_domain())
++ continue;
+ }
+ if (cx.type == ACPI_STATE_C1 &&
+ (idle_halt || idle_nomwait)) {
+@@ -477,6 +479,9 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
+
+ cx.power = obj->integer.value;
+
++ /* cache control methods to notify xen*/
++ processor_cntl_xen_power_cache(pr->acpi_id, i, reg);
++
+ current_count++;
+ memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
+
+@@ -653,7 +658,7 @@ static int acpi_processor_power_verify(struct acpi_processor *pr)
+ return (working);
+ }
+
+-static int acpi_processor_get_power_info(struct acpi_processor *pr)
++int acpi_processor_get_power_info(struct acpi_processor *pr)
+ {
+ unsigned int i;
+ int result;
+@@ -1223,9 +1228,14 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
+ * platforms that only support C1.
+ */
+ if (pr->flags.power) {
+- acpi_processor_setup_cpuidle(pr);
+- if (cpuidle_register_device(&pr->power.dev))
+- return -EIO;
++ if (xen_initial_domain()) {
++ processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_INIT, PM_TYPE_IDLE);
++ } else {
++ acpi_processor_setup_cpuidle(pr);
++ if (cpuidle_register_device(&pr->power.dev))
++ return -EIO;
++ }
+ }
+ #ifdef CONFIG_ACPI_PROCFS
+ /* 'power' [R] */
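
processor_idle.c now caches each parsed _CST entry and, in the initial domain, hands the power information to Xen instead of registering a cpuidle device. The sketch below shows one plausible shape for such a per-ACPI-ID cache; the names power_cache and cache_cx_reg, the struct layout and the fixed array sizes are illustrative only and do not reflect the patch's actual processor_cntl_xen_power_cache() implementation.

    #include <stdio.h>

    #define MAX_ACPI_ID   64
    #define MAX_CX_STATES  8

    struct cx_reg {                       /* simplified register descriptor */
        unsigned char      space_id;
        unsigned long long address;
    };

    /* Hypothetical per-ACPI-ID cache of C-state entry methods, filled while
     * walking _CST and later pushed to the hypervisor in one notification. */
    static struct cx_reg power_cache[MAX_ACPI_ID][MAX_CX_STATES];

    static int cache_cx_reg(int acpi_id, int cx, const struct cx_reg *reg)
    {
        if (acpi_id < 0 || acpi_id >= MAX_ACPI_ID || cx < 0 || cx >= MAX_CX_STATES)
            return -1;
        power_cache[acpi_id][cx] = *reg;
        return 0;
    }

    int main(void)
    {
        struct cx_reg c2 = { .space_id = 1, .address = 0x414 };

        cache_cx_reg(0, 2, &c2);          /* as the _CST walk would do */
        printf("acpi_id 0, C2 entry method at %#llx\n",
               power_cache[0][2].address);
        return 0;
    }
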
+diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
+index 40d395e..7ba143d 100644
+--- a/drivers/acpi/processor_perflib.c
++++ b/drivers/acpi/processor_perflib.c
+@@ -332,7 +332,7 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr)
+ return result;
+ }
+
+-static int acpi_processor_get_performance_info(struct acpi_processor *pr)
++int acpi_processor_get_performance_info(struct acpi_processor *pr)
+ {
+ int result = 0;
+ acpi_status status = AE_OK;
+@@ -438,7 +438,7 @@ int acpi_processor_notify_smm(struct module *calling_module)
+
+ EXPORT_SYMBOL(acpi_processor_notify_smm);
+
+-static int acpi_processor_get_psd(struct acpi_processor *pr)
++int acpi_processor_get_psd(struct acpi_processor *pr)
+ {
+ int result = 0;
+ acpi_status status = AE_OK;
+diff --git a/drivers/acpi/processor_xen.c b/drivers/acpi/processor_xen.c
+new file mode 100644
+index 0000000..305398d
+--- /dev/null
++++ b/drivers/acpi/processor_xen.c
+@@ -0,0 +1,651 @@
++/*
++ * processor_xen.c - ACPI Processor Driver for xen
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or (at
++ * your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write to the Free Software Foundation, Inc.,
++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/pci.h>
++#include <linux/pm.h>
++#include <linux/cpufreq.h>
++#include <linux/cpu.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/dmi.h>
++#include <linux/moduleparam.h>
++#include <linux/cpuidle.h>
++#include <linux/acpi.h>
++
++#include <acpi/acpi_bus.h>
++#include <acpi/acpi_drivers.h>
++#include <acpi/processor.h>
++#include <xen/acpi.h>
++#include <xen/pcpu.h>
++
++#define PREFIX "ACPI: "
++
++#define ACPI_PROCESSOR_CLASS "processor"
++#define ACPI_PROCESSOR_DEVICE_NAME "Processor"
++#define ACPI_PROCESSOR_FILE_INFO "info"
++#define ACPI_PROCESSOR_FILE_THROTTLING "throttling"
++#define ACPI_PROCESSOR_FILE_LIMIT "limit"
++#define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80
++#define ACPI_PROCESSOR_NOTIFY_POWER 0x81
++#define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82
++
++#define _COMPONENT ACPI_PROCESSOR_COMPONENT
++ACPI_MODULE_NAME("processor_xen");
++
++static const struct acpi_device_id processor_device_ids[] = {
++ {ACPI_PROCESSOR_OBJECT_HID, 0},
++ {"ACPI0007", 0},
++ {"", 0},
++};
++
++/*
++ * Xen ACPI processor driver
++ */
++
++/* from processor_core.c */
++
++static int xen_acpi_processor_add(struct acpi_device *device);
++static void xen_acpi_processor_notify(struct acpi_device *device, u32 event);
++
++struct acpi_driver xen_acpi_processor_driver = {
++ .name = "processor",
++ .class = ACPI_PROCESSOR_CLASS,
++ .ids = processor_device_ids,
++ .ops = {
++ .add = xen_acpi_processor_add,
++ .remove = acpi_processor_remove,
++ .suspend = acpi_processor_suspend,
++ .resume = acpi_processor_resume,
++ .notify = xen_acpi_processor_notify,
++ },
++};
++
++static int is_processor_present(acpi_handle handle)
++{
++ acpi_status status;
++ unsigned long long sta = 0;
++
++
++ status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
++
++ if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT))
++ return 1;
++
++ /*
++ * _STA is mandatory for a processor that supports hot plug
++ */
++ if (status == AE_NOT_FOUND)
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "Processor does not support hot plug\n"));
++ else
++ ACPI_EXCEPTION((AE_INFO, status,
++ "Processor Device is not present"));
++ return 0;
++}
++
++static acpi_status
++xen_acpi_processor_hotadd_init(struct acpi_processor *pr, int *p_cpu)
++{
++ if (!is_processor_present(pr->handle))
++ return AE_ERROR;
++
++ if (processor_cntl_xen_notify(pr,
++ PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD))
++ return AE_ERROR;
++
++ return AE_OK;
++}
++
++static int xen_acpi_processor_get_info(struct acpi_device *device)
++{
++ acpi_status status = 0;
++ union acpi_object object = { 0 };
++ struct acpi_buffer buffer = { sizeof(union acpi_object), &object };
++ struct acpi_processor *pr;
++ int cpu_index, device_declaration = 0;
++ static int cpu0_initialized;
++
++ pr = acpi_driver_data(device);
++ if (!pr)
++ return -EINVAL;
++
++ if (num_online_cpus() > 1)
++ errata.smp = TRUE;
++
++ acpi_processor_errata(pr);
++
++ /*
++ * Check to see if we have bus mastering arbitration control. This
++ * is required for proper C3 usage (to maintain cache coherency).
++ */
++ if (acpi_gbl_FADT.pm2_control_block &&
++ acpi_gbl_FADT.pm2_control_length) {
++ pr->flags.bm_control = 1;
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "Bus mastering arbitration control present\n"
++ ));
++ } else
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "No bus mastering arbitration control\n"));
++
++ if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) {
++ /* Declared with "Processor" statement; match ProcessorID */
++ status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer);
++ if (ACPI_FAILURE(status)) {
++ printk(KERN_ERR PREFIX "Evaluating processor object\n");
++ return -ENODEV;
++ }
++
++ /*
++ * TBD: Synch processor ID (via LAPIC/LSAPIC structures) on SMP.
++ * >>> 'acpi_get_processor_id(acpi_id, &id)' in
++ * arch/xxx/acpi.c
++ */
++ pr->acpi_id = object.processor.proc_id;
++ } else {
++ /*
++ * Declared with "Device" statement; match _UID.
++ * Note that we don't handle string _UIDs yet.
++ */
++ unsigned long long value;
++ status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID,
++ NULL, &value);
++ if (ACPI_FAILURE(status)) {
++ printk(KERN_ERR PREFIX
++ "Evaluating processor _UID [%#x]\n", status);
++ return -ENODEV;
++ }
++ device_declaration = 1;
++ pr->acpi_id = value;
++ }
++
++ /* TBD: add Xen specific code to query cpu_index */
++ cpu_index = -1;
++
++ /* Handle UP system running SMP kernel, with no LAPIC in MADT */
++ if (!cpu0_initialized && (cpu_index == -1) &&
++ (num_online_cpus() == 1)) {
++ cpu_index = 0;
++ }
++
++ cpu0_initialized = 1;
++
++ pr->id = cpu_index;
++
++ /*
++ * Extra Processor objects may be enumerated on MP systems with
++ * less than the max # of CPUs, or Xen vCPU < pCPU.
++ * They should be ignored _iff they are physically not present.
++ *
++ */
++ if (xen_pcpu_index(pr->acpi_id, 1) == -1) {
++ if (ACPI_FAILURE
++ (xen_acpi_processor_hotadd_init(pr, &pr->id))) {
++ return -ENODEV;
++ }
++ }
++
++ /*
++ * On some boxes several processors use the same processor bus id.
++ * But they are located in different scopes. For example:
++ * \_SB.SCK0.CPU0
++ * \_SB.SCK1.CPU0
++ * Rename the processor device bus id. The new bus id will be
++ * generated in the following format:
++ * CPU+CPU ID.
++ */
++ sprintf(acpi_device_bid(device), "CPU%X", pr->id);
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id,
++ pr->acpi_id));
++
++ if (!object.processor.pblk_address)
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No PBLK (NULL address)\n"));
++ else if (object.processor.pblk_length != 6)
++ printk(KERN_ERR PREFIX "Invalid PBLK length [%d]\n",
++ object.processor.pblk_length);
++ else {
++ pr->throttling.address = object.processor.pblk_address;
++ pr->throttling.duty_offset = acpi_gbl_FADT.duty_offset;
++ pr->throttling.duty_width = acpi_gbl_FADT.duty_width;
++
++ pr->pblk = object.processor.pblk_address;
++
++ /*
++ * We don't care about error returns - we just try to mark
++ * these reserved so that nobody else is confused into thinking
++ * that this region might be unused..
++ *
++ * (In particular, allocating the IO range for Cardbus)
++ */
++ request_region(pr->throttling.address, 6, "ACPI CPU throttle");
++ }
++
++ /*
++ * If ACPI describes a slot number for this CPU, we can use it
++ * ensure we get the right value in the "physical id" field
++ * of /proc/cpuinfo
++ */
++ status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
++ if (ACPI_SUCCESS(status))
++ arch_fix_phys_package_id(pr->id, object.integer.value);
++
++ return 0;
++}
++
++static struct acpi_device *processor_device_array[XEN_MAX_ACPI_ID + 1];
++
++static int __cpuinit xen_acpi_processor_add(struct acpi_device *device)
++{
++ struct acpi_processor *pr = NULL;
++ int result = 0;
++ struct sys_device *sysdev;
++
++ pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL);
++ if (!pr)
++ return -ENOMEM;
++
++ if (!zalloc_cpumask_var(&pr->throttling.shared_cpu_map, GFP_KERNEL)) {
++ kfree(pr);
++ return -ENOMEM;
++ }
++
++ pr->handle = device->handle;
++ strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME);
++ strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS);
++ device->driver_data = pr;
++
++ result = xen_acpi_processor_get_info(device);
++ if (result) {
++ /* Processor is physically not present */
++ return 0;
++ }
++
++ /*
++ * Buggy BIOS check
++ * ACPI id of processors can be reported wrongly by the BIOS.
++ * Don't trust it blindly
++ */
++ if (pr->acpi_id > XEN_MAX_ACPI_ID ||
++ (processor_device_array[pr->acpi_id] != NULL &&
++ processor_device_array[pr->acpi_id] != device)) {
++ printk(KERN_WARNING "BIOS reported wrong ACPI id "
++ "for the processor\n");
++ result = -ENODEV;
++ goto err_free_cpumask;
++ }
++
++ processor_device_array[pr->acpi_id] = device;
++
++ if (pr->id != -1) {
++ per_cpu(processors, pr->id) = pr;
++
++ result = acpi_processor_add_fs(device);
++ if (result)
++ goto err_free_cpumask;
++
++ sysdev = get_cpu_sysdev(pr->id);
++ if (sysdev != NULL && sysfs_create_link(&device->dev.kobj,
++ &sysdev->kobj, "sysdev")) {
++ result = -EFAULT;
++ goto err_remove_fs;
++ }
++ }
++
++ /* _PDC call should be done before doing anything else (if reqd.). */
++ xen_arch_acpi_processor_init_pdc(pr);
++ acpi_processor_set_pdc(pr);
++ arch_acpi_processor_cleanup_pdc(pr);
++
++#ifdef CONFIG_CPU_FREQ
++ xen_acpi_processor_ppc_has_changed(pr);
++ result = xen_acpi_processor_get_performance(pr);
++ if (result)
++ goto err_remove_fs;
++#endif
++
++ if (pr->id != -1) {
++ acpi_processor_get_throttling_info(pr);
++ acpi_processor_get_limit_info(pr);
++ }
++
++ xen_acpi_processor_power_init(pr, device);
++
++ if (pr->id != -1) {
++ pr->cdev = thermal_cooling_device_register("Processor", device,
++ &processor_cooling_ops);
++ if (IS_ERR(pr->cdev)) {
++ result = PTR_ERR(pr->cdev);
++ goto err_power_exit;
++ }
++
++ dev_info(&device->dev, "registered as cooling_device%d\n",
++ pr->cdev->id);
++
++ result = sysfs_create_link(&device->dev.kobj,
++ &pr->cdev->device.kobj,
++ "thermal_cooling");
++ if (result) {
++ printk(KERN_ERR PREFIX "Create sysfs link\n");
++ goto err_thermal_unregister;
++ }
++ result = sysfs_create_link(&pr->cdev->device.kobj,
++ &device->dev.kobj,
++ "device");
++ if (result) {
++ printk(KERN_ERR PREFIX "Create sysfs link\n");
++ goto err_remove_sysfs;
++ }
++ }
++
++ return 0;
++
++err_remove_sysfs:
++ sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
++err_thermal_unregister:
++ thermal_cooling_device_unregister(pr->cdev);
++err_power_exit:
++ acpi_processor_power_exit(pr, device);
++err_remove_fs:
++ acpi_processor_remove_fs(device);
++err_free_cpumask:
++ free_cpumask_var(pr->throttling.shared_cpu_map);
++
++ return result;
++}
++
++static void xen_acpi_processor_notify(struct acpi_device *device, u32 event)
++{
++ struct acpi_processor *pr = acpi_driver_data(device);
++ int saved;
++
++ if (!pr)
++ return;
++
++ switch (event) {
++ case ACPI_PROCESSOR_NOTIFY_PERFORMANCE:
++ saved = pr->performance_platform_limit;
++ xen_acpi_processor_ppc_has_changed(pr);
++ if (saved == pr->performance_platform_limit)
++ break;
++ acpi_bus_generate_proc_event(device, event,
++ pr->performance_platform_limit);
++ acpi_bus_generate_netlink_event(device->pnp.device_class,
++ dev_name(&device->dev), event,
++ pr->performance_platform_limit);
++ break;
++ case ACPI_PROCESSOR_NOTIFY_POWER:
++ xen_acpi_processor_cst_has_changed(pr);
++ acpi_bus_generate_proc_event(device, event, 0);
++ acpi_bus_generate_netlink_event(device->pnp.device_class,
++ dev_name(&device->dev), event, 0);
++ break;
++ case ACPI_PROCESSOR_NOTIFY_THROTTLING:
++ acpi_processor_tstate_has_changed(pr);
++ acpi_bus_generate_proc_event(device, event, 0);
++ acpi_bus_generate_netlink_event(device->pnp.device_class,
++ dev_name(&device->dev), event, 0);
++ break;
++ default:
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "Unsupported event [0x%x]\n", event));
++ break;
++ }
++
++ return;
++}
++
++/* from processor_idle.c */
++
++static int xen_acpi_processor_get_power_info(struct acpi_processor *pr)
++{
++ int ret;
++ int invalid_pr_id = 0;
++
++ /*
++ * acpi_processor_get_power_info() needs a valid pr->id,
++ * so set pr->id = 0 temporarily
++ */
++ if (pr->id == -1) {
++ invalid_pr_id = 1;
++ pr->id = 0;
++ }
++
++ ret = acpi_processor_get_power_info(pr);
++
++ if (invalid_pr_id)
++ pr->id = -1;
++
++ return ret;
++}
++
++int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr)
++{
++ if (!pr)
++ return -EINVAL;
++
++ if (!pr->flags.power_setup_done)
++ return -ENODEV;
++
++ xen_acpi_processor_get_power_info(pr);
++
++ processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_CHANGE, PM_TYPE_IDLE);
++
++ return 0;
++}
++
++
++int __cpuinit xen_acpi_processor_power_init(struct acpi_processor *pr,
++ struct acpi_device *device)
++{
++ acpi_status status = 0;
++ unsigned int i;
++
++ if (!pr)
++ return -EINVAL;
++
++ if (acpi_gbl_FADT.cst_control) {
++ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
++ acpi_gbl_FADT.cst_control, 8);
++ if (ACPI_FAILURE(status)) {
++ ACPI_EXCEPTION((AE_INFO, status,
++ "Notifying BIOS of _CST ability failed"));
++ }
++ }
++
++ xen_acpi_processor_get_power_info(pr);
++
++ pr->flags.power_setup_done = 1;
++
++ if (pr->flags.power) {
++ processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_INIT, PM_TYPE_IDLE);
++
++ printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
++ for (i = 1; i <= pr->power.count; i++)
++ if (pr->power.states[i].valid)
++ printk(" C%d[C%d]", i,
++ pr->power.states[i].type);
++ printk(")\n");
++ }
++
++ return 0;
++}
++
++/* from processor_perflib.c */
++
++#ifdef CONFIG_CPU_FREQ
++static int xen_processor_notify_smm(void)
++{
++ acpi_status status;
++ static int is_done;
++
++ /* We only need to notify the BIOS once. */
++ /* Avoid double notification, which may lead to unexpected results. */
++ if (is_done)
++ return 0;
++
++ /* Can't write pstate_cnt to smi_cmd if either value is zero */
++ if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) {
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_cnt\n"));
++ return 0;
++ }
++
++ ACPI_DEBUG_PRINT((ACPI_DB_INFO,
++ "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n",
++ acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
++
++ status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
++ (u32) acpi_gbl_FADT.pstate_control, 8);
++ if (ACPI_FAILURE(status))
++ return status;
++
++ is_done = 1;
++
++ return 0;
++}
++
++static int xen_acpi_processor_get_platform_limit(struct acpi_processor *pr)
++{
++ acpi_status status = 0;
++ unsigned long long ppc = 0;
++
++ if (!pr)
++ return -EINVAL;
++
++ /*
++ * _PPC indicates the maximum state currently supported by the platform
++ * (e.g. 0 = states 0..n; 1 = states 1..n; etc.)
++ */
++ status = acpi_evaluate_integer(pr->handle, "_PPC", NULL, &ppc);
++
++ if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) {
++ ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PPC"));
++ return -ENODEV;
++ }
++
++ pr->performance_platform_limit = (int)ppc;
++
++ return 0;
++}
++
++int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr)
++{
++ int ret;
++
++ ret = xen_acpi_processor_get_platform_limit(pr);
++
++ if (ret < 0)
++ return ret;
++ else
++ return processor_cntl_xen_notify(pr,
++ PROCESSOR_PM_CHANGE, PM_TYPE_PERF);
++}
++
++/*
++ * The existing ACPI code parses performance states when the
++ * acpi-cpufreq driver is loaded, which we want to avoid because it
++ * would conflict with Xen's PM logic. So collect the raw
++ * performance information here, when the ACPI processor object is
++ * found and started.
++ */
++int xen_acpi_processor_get_performance(struct acpi_processor *pr)
++{
++ int ret;
++ struct acpi_processor_performance *perf;
++ struct acpi_psd_package *pdomain;
++
++ if (pr->performance)
++ return -EBUSY;
++
++ perf = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL);
++ if (!perf)
++ return -ENOMEM;
++
++ pr->performance = perf;
++ /* Get basic performance state information */
++ ret = acpi_processor_get_performance_info(pr);
++ if (ret < 0)
++ goto err_out;
++
++ /*
++ * Retrieve performance dependency information from the _PSD
++ * object here. The existing interface is not used because it
++ * builds its coordination bitmaps from Linux cpu ids, whereas we
++ * want to decouple ACPI processor objects from Linux cpu id
++ * logic: for example, even when Linux is configured as UP, all
++ * ACPI processor objects should still be parsed and passed to
++ * Xen. In that case it is preferable to identify processors by
++ * their ACPI ID instead.
++ */
++ pdomain = &pr->performance->domain_info;
++ pdomain->num_processors = 0;
++ ret = acpi_processor_get_psd(pr);
++ if (ret < 0) {
++ /*
++ * _PSD is optional - assume no coordination if absent (or
++ * broken), matching native kernels' behavior.
++ */
++ pdomain->num_entries = ACPI_PSD_REV0_ENTRIES;
++ pdomain->revision = ACPI_PSD_REV0_REVISION;
++ pdomain->domain = pr->acpi_id;
++ pdomain->coord_type = DOMAIN_COORD_TYPE_SW_ALL;
++ pdomain->num_processors = 1;
++ }
++
++ /* Some sanity check */
++ if ((pdomain->revision != ACPI_PSD_REV0_REVISION) ||
++ (pdomain->num_entries != ACPI_PSD_REV0_ENTRIES) ||
++ ((pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ALL) &&
++ (pdomain->coord_type != DOMAIN_COORD_TYPE_SW_ANY) &&
++ (pdomain->coord_type != DOMAIN_COORD_TYPE_HW_ALL))) {
++ ret = -EINVAL;
++ goto err_out;
++ }
++
++ /* Last step is to notify BIOS that xen exists */
++ xen_processor_notify_smm();
++
++ processor_cntl_xen_notify(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF);
++
++ return 0;
++err_out:
++ pr->performance = NULL;
++ kfree(perf);
++ return ret;
++}
++#endif /* CONFIG_CPU_FREQ */
++
++/* init and exit */
++
++int xen_acpi_processor_init(void)
++{
++ return acpi_bus_register_driver(&xen_acpi_processor_driver);
++}
++
++void xen_acpi_processor_exit(void)
++{
++ acpi_bus_unregister_driver(&xen_acpi_processor_driver);
++}
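
When _PSD is absent or broken, xen_acpi_processor_get_performance() above falls back to a single-processor SW_ALL coordination domain keyed by the ACPI ID. A compact standalone rendering of that fallback, with the ACPI_PSD_REV0_* and DOMAIN_COORD_TYPE_* constants replaced by illustrative enums, could look like this:

    #include <stdio.h>

    /* Illustrative stand-ins for the ACPI constants used by the patch. */
    enum { PSD_REV0_ENTRIES = 5, PSD_REV0_REVISION = 0 };
    enum coord { COORD_SW_ALL, COORD_SW_ANY, COORD_HW_ALL };

    struct psd_package {
        int        num_entries;
        int        revision;
        int        domain;
        enum coord coord_type;
        int        num_processors;
    };

    /* If _PSD is absent or broken, assume no cross-CPU coordination: each
     * processor forms its own one-member domain, identified by ACPI ID. */
    static void psd_default(struct psd_package *p, int acpi_id)
    {
        p->num_entries    = PSD_REV0_ENTRIES;
        p->revision       = PSD_REV0_REVISION;
        p->domain         = acpi_id;
        p->coord_type     = COORD_SW_ALL;
        p->num_processors = 1;
    }

    int main(void)
    {
        struct psd_package psd;

        psd_default(&psd, 3);
        printf("domain %d, %d cpu(s), coord type %d\n",
               psd.domain, psd.num_processors, psd.coord_type);
        return 0;
    }
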
+diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
+index 0458094..85a1308 100644
+--- a/drivers/acpi/sleep.c
++++ b/drivers/acpi/sleep.c
+@@ -19,6 +19,8 @@
+
+ #include <asm/io.h>
+
++#include <xen/acpi.h>
++
+ #include <acpi/acpi_bus.h>
+ #include <acpi/acpi_drivers.h>
+
+@@ -200,6 +202,21 @@ static int acpi_suspend_begin(suspend_state_t pm_state)
+ return error;
+ }
+
++static void do_suspend(void)
++{
++ if (!xen_pv_acpi()) {
++ do_suspend_lowlevel();
++ return;
++ }
++
++ /*
++ * Xen will save and restore CPU context, so
++ * we can skip that and just go straight to
++ * the suspend.
++ */
++ acpi_enter_sleep_state(ACPI_STATE_S3);
++}
++
+ /**
+ * acpi_suspend_enter - Actually enter a sleep state.
+ * @pm_state: ignored
+@@ -233,7 +250,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state)
+ break;
+
+ case ACPI_STATE_S3:
+- do_suspend_lowlevel();
++ do_suspend();
+ break;
+ }
+
+diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
+index 1d886e0..f4a2b10 100644
+--- a/drivers/block/Kconfig
++++ b/drivers/block/Kconfig
+@@ -462,6 +462,7 @@ config XEN_BLKDEV_FRONTEND
+ tristate "Xen virtual block device support"
+ depends on XEN
+ default y
++ select XEN_XENBUS_FRONTEND
+ help
+ This driver implements the front-end of the Xen virtual
+ block device driver. It communicates with a back-end driver
+diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
+index b8578bb..44059e6 100644
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -42,10 +42,12 @@
+ #include <linux/module.h>
+ #include <linux/scatterlist.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/grant_table.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
++#include <xen/platform_pci.h>
+
+ #include <xen/interface/grant_table.h>
+ #include <xen/interface/io/blkif.h>
+@@ -76,6 +78,7 @@ static const struct block_device_operations xlvbd_block_fops;
+ */
+ struct blkfront_info
+ {
++ struct mutex mutex;
+ struct xenbus_device *xbdev;
+ struct gendisk *gd;
+ int vdevice;
+@@ -85,6 +88,7 @@ struct blkfront_info
+ struct blkif_front_ring ring;
+ struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int evtchn, irq;
++ struct tasklet_struct tasklet;
+ struct request_queue *rq;
+ struct work_struct work;
+ struct gnttab_free_callback callback;
+@@ -93,14 +97,12 @@ struct blkfront_info
+ int feature_barrier;
+ int is_ready;
+
+- /**
+- * The number of people holding this device open. We won't allow a
+- * hot-unplug unless this is 0.
+- */
+- int users;
++ spinlock_t io_lock;
+ };
+
+-static DEFINE_SPINLOCK(blkif_io_lock);
++static unsigned int nr_minors;
++static unsigned long *minors;
++static DEFINE_SPINLOCK(minor_lock);
+
+ #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+@@ -116,6 +118,10 @@ static DEFINE_SPINLOCK(blkif_io_lock);
+ #define EXTENDED (1<<EXT_SHIFT)
+ #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
+ #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
++#define EMULATED_HD_DISK_MINOR_OFFSET (0)
++#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
++#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
++#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
+
+ #define DEV_NAME "xvd" /* name in /dev */
+
+@@ -136,6 +142,55 @@ static void add_id_to_freelist(struct blkfront_info *info,
+ info->shadow_free = id;
+ }
+
++static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
++{
++ unsigned int end = minor + nr;
++ int rc;
++
++ if (end > nr_minors) {
++ unsigned long *bitmap, *old;
++
++ bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
++ GFP_KERNEL);
++ if (bitmap == NULL)
++ return -ENOMEM;
++
++ spin_lock(&minor_lock);
++ if (end > nr_minors) {
++ old = minors;
++ memcpy(bitmap, minors,
++ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
++ minors = bitmap;
++ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
++ } else
++ old = bitmap;
++ spin_unlock(&minor_lock);
++ kfree(old);
++ }
++
++ spin_lock(&minor_lock);
++ if (find_next_bit(minors, end, minor) >= end) {
++ for (; minor < end; ++minor)
++ __set_bit(minor, minors);
++ rc = 0;
++ } else
++ rc = -EBUSY;
++ spin_unlock(&minor_lock);
++
++ return rc;
++}
++
++static void xlbd_release_minors(unsigned int minor, unsigned int nr)
++{
++ unsigned int end = minor + nr;
++
++ BUG_ON(end > nr_minors);
++ spin_lock(&minor_lock);
++ for (; minor < end; ++minor)
++ __clear_bit(minor, minors);
++ spin_unlock(&minor_lock);
++}
++
+ static void blkif_restart_queue_callback(void *arg)
+ {
+ struct blkfront_info *info = (struct blkfront_info *)arg;
+@@ -333,11 +388,12 @@ wait:
+ flush_requests(info);
+ }
+
+-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
++static int xlvbd_init_blk_queue(struct blkfront_info *info,
++ struct gendisk *gd, u16 sector_size)
+ {
+ struct request_queue *rq;
+
+- rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
++ rq = blk_init_queue(do_blkif_request, &info->io_lock);
+ if (rq == NULL)
+ return -1;
+
+@@ -370,20 +426,84 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+ static int xlvbd_barrier(struct blkfront_info *info)
+ {
+ int err;
++ const char *barrier;
++
++ switch (info->feature_barrier) {
++ case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
++ case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
++ case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
++ default: return -EINVAL;
++ }
+
+- err = blk_queue_ordered(info->rq,
+- info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE,
+- NULL);
++ err = blk_queue_ordered(info->rq, info->feature_barrier, NULL);
+
+ if (err)
+ return err;
+
+ printk(KERN_INFO "blkfront: %s: barriers %s\n",
+- info->gd->disk_name,
+- info->feature_barrier ? "enabled" : "disabled");
++ info->gd->disk_name, barrier);
+ return 0;
+ }
+
++static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
++{
++ int major;
++ major = BLKIF_MAJOR(vdevice);
++ *minor = BLKIF_MINOR(vdevice);
++ switch (major) {
++ case XEN_IDE0_MAJOR:
++ *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
++ *minor = ((*minor / 64) * PARTS_PER_DISK) +
++ EMULATED_HD_DISK_MINOR_OFFSET;
++ break;
++ case XEN_IDE1_MAJOR:
++ *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
++ *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
++ EMULATED_HD_DISK_MINOR_OFFSET;
++ break;
++ case XEN_SCSI_DISK0_MAJOR:
++ *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
++ *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
++ break;
++ case XEN_SCSI_DISK1_MAJOR:
++ case XEN_SCSI_DISK2_MAJOR:
++ case XEN_SCSI_DISK3_MAJOR:
++ case XEN_SCSI_DISK4_MAJOR:
++ case XEN_SCSI_DISK5_MAJOR:
++ case XEN_SCSI_DISK6_MAJOR:
++ case XEN_SCSI_DISK7_MAJOR:
++ *offset = (*minor / PARTS_PER_DISK) +
++ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
++ EMULATED_SD_DISK_NAME_OFFSET;
++ *minor = *minor +
++ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
++ EMULATED_SD_DISK_MINOR_OFFSET;
++ break;
++ case XEN_SCSI_DISK8_MAJOR:
++ case XEN_SCSI_DISK9_MAJOR:
++ case XEN_SCSI_DISK10_MAJOR:
++ case XEN_SCSI_DISK11_MAJOR:
++ case XEN_SCSI_DISK12_MAJOR:
++ case XEN_SCSI_DISK13_MAJOR:
++ case XEN_SCSI_DISK14_MAJOR:
++ case XEN_SCSI_DISK15_MAJOR:
++ *offset = (*minor / PARTS_PER_DISK) +
++ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
++ EMULATED_SD_DISK_NAME_OFFSET;
++ *minor = *minor +
++ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
++ EMULATED_SD_DISK_MINOR_OFFSET;
++ break;
++ case XENVBD_MAJOR:
++ *offset = *minor / PARTS_PER_DISK;
++ break;
++ default:
++ printk(KERN_WARNING "blkfront: your disk configuration is "
++ "incorrect, please use an xvd device instead\n");
++ return -ENODEV;
++ }
++ return 0;
++}
+
+ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ struct blkfront_info *info,
+@@ -391,7 +511,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ {
+ struct gendisk *gd;
+ int nr_minors = 1;
+- int err = -ENODEV;
++ int err;
+ unsigned int offset;
+ int minor;
+ int nr_parts;
+@@ -406,21 +526,33 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ }
+
+ if (!VDEV_IS_EXTENDED(info->vdevice)) {
+- minor = BLKIF_MINOR(info->vdevice);
+- nr_parts = PARTS_PER_DISK;
++ err = xen_translate_vdev(info->vdevice, &minor, &offset);
++ if (err)
++ return err;
++ nr_parts = PARTS_PER_DISK;
+ } else {
+ minor = BLKIF_MINOR_EXT(info->vdevice);
+ nr_parts = PARTS_PER_EXT_DISK;
++ offset = minor / nr_parts;
++ if (xen_hvm_domain() && minor >= EMULATED_HD_DISK_MINOR_OFFSET) {
++ printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
++ "emulated IDE and SCSI disks; ignoring", info->vdevice);
++ return -ENODEV;
++ }
+ }
++ err = -ENODEV;
+
+ if ((minor % nr_parts) == 0)
+ nr_minors = nr_parts;
+
+- gd = alloc_disk(nr_minors);
+- if (gd == NULL)
++ err = xlbd_reserve_minors(minor, nr_minors);
++ if (err)
+ goto out;
++ err = -ENODEV;
+
+- offset = minor / nr_parts;
++ gd = alloc_disk(nr_minors);
++ if (gd == NULL)
++ goto release;
+
+ if (nr_minors > 1) {
+ if (offset < 26)
+@@ -447,16 +579,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ gd->driverfs_dev = &(info->xbdev->dev);
+ set_capacity(gd, capacity);
+
+- if (xlvbd_init_blk_queue(gd, sector_size)) {
++ if (xlvbd_init_blk_queue(info, gd, sector_size)) {
+ del_gendisk(gd);
+- goto out;
++ goto release;
+ }
+
+ info->rq = gd->queue;
+ info->gd = gd;
+
+- if (info->feature_barrier)
+- xlvbd_barrier(info);
++ xlvbd_barrier(info);
+
+ if (vdisk_info & VDISK_READONLY)
+ set_disk_ro(gd, 1);
+@@ -469,10 +600,45 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+
+ return 0;
+
++ release:
++ xlbd_release_minors(minor, nr_minors);
+ out:
+ return err;
+ }
+
++static void xlvbd_release_gendisk(struct blkfront_info *info)
++{
++ unsigned int minor, nr_minors;
++ unsigned long flags;
++
++ if (info->rq == NULL)
++ return;
++
++ spin_lock_irqsave(&info->io_lock, flags);
++
++ /* No more blkif_request(). */
++ blk_stop_queue(info->rq);
++
++ /* No more gnttab callback work. */
++ gnttab_cancel_free_callback(&info->callback);
++ spin_unlock_irqrestore(&info->io_lock, flags);
++
++ /* Flush gnttab callback work. Must be done with no locks held. */
++ flush_scheduled_work();
++
++ del_gendisk(info->gd);
++
++ minor = info->gd->first_minor;
++ nr_minors = info->gd->minors;
++ xlbd_release_minors(minor, nr_minors);
++
++ blk_cleanup_queue(info->rq);
++ info->rq = NULL;
++
++ put_disk(info->gd);
++ info->gd = NULL;
++}
++
+ static void kick_pending_request_queues(struct blkfront_info *info)
+ {
+ if (!RING_FULL(&info->ring)) {
+@@ -487,16 +653,16 @@ static void blkif_restart_queue(struct work_struct *work)
+ {
+ struct blkfront_info *info = container_of(work, struct blkfront_info, work);
+
+- spin_lock_irq(&blkif_io_lock);
++ spin_lock_irq(&info->io_lock);
+ if (info->connected == BLKIF_STATE_CONNECTED)
+ kick_pending_request_queues(info);
+- spin_unlock_irq(&blkif_io_lock);
++ spin_unlock_irq(&info->io_lock);
+ }
+
+ static void blkif_free(struct blkfront_info *info, int suspend)
+ {
+ /* Prevent new requests being issued until we fix things up. */
+- spin_lock_irq(&blkif_io_lock);
++ spin_lock_irq(&info->io_lock);
+ info->connected = suspend ?
+ BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+ /* No more blkif_request(). */
+@@ -504,7 +670,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
+ blk_stop_queue(info->rq);
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&info->callback);
+- spin_unlock_irq(&blkif_io_lock);
++ spin_unlock_irq(&info->io_lock);
+
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_scheduled_work();
+@@ -529,21 +695,20 @@ static void blkif_completion(struct blk_shadow *s)
+ gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+ }
+
+-static irqreturn_t blkif_interrupt(int irq, void *dev_id)
++static void
++blkif_do_interrupt(unsigned long data)
+ {
++ struct blkfront_info *info = (struct blkfront_info *)data;
+ struct request *req;
+ struct blkif_response *bret;
+ RING_IDX i, rp;
+ unsigned long flags;
+- struct blkfront_info *info = (struct blkfront_info *)dev_id;
+ int error;
+
+- spin_lock_irqsave(&blkif_io_lock, flags);
++ spin_lock_irqsave(&info->io_lock, flags);
+
+- if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+- spin_unlock_irqrestore(&blkif_io_lock, flags);
+- return IRQ_HANDLED;
+- }
++ if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
++ goto out;
+
+ again:
+ rp = info->ring.sring->rsp_prod;
+@@ -567,7 +732,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
+ printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
+ info->gd->disk_name);
+ error = -EOPNOTSUPP;
+- info->feature_barrier = 0;
++ info->feature_barrier = QUEUE_ORDERED_NONE;
+ xlvbd_barrier(info);
+ }
+ /* fall through */
+@@ -596,7 +761,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
+
+ kick_pending_request_queues(info);
+
+- spin_unlock_irqrestore(&blkif_io_lock, flags);
++out:
++ spin_unlock_irqrestore(&info->io_lock, flags);
++}
++
++
++static irqreturn_t
++blkif_interrupt(int irq, void *dev_id)
++{
++ struct blkfront_info *info = (struct blkfront_info *)dev_id;
++
++ tasklet_schedule(&info->tasklet);
+
+ return IRQ_HANDLED;
+ }
+@@ -650,7 +825,7 @@ fail:
+
+
+ /* Common code used when first setting up, and when resuming. */
+-static int talk_to_backend(struct xenbus_device *dev,
++static int talk_to_blkback(struct xenbus_device *dev,
+ struct blkfront_info *info)
+ {
+ const char *message = NULL;
+@@ -710,7 +885,6 @@ again:
+ return err;
+ }
+
+-
+ /**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+@@ -736,16 +910,48 @@ static int blkfront_probe(struct xenbus_device *dev,
+ }
+ }
+
++ if (xen_hvm_domain()) {
++ char *type;
++ int len;
++ /* no unplug has been done: do not hook devices != xen vbds */
++ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
++ int major;
++
++ if (!VDEV_IS_EXTENDED(vdevice))
++ major = BLKIF_MAJOR(vdevice);
++ else
++ major = XENVBD_MAJOR;
++
++ if (major != XENVBD_MAJOR) {
++ printk(KERN_INFO
++ "%s: HVM does not support vbd %d as xen block device\n",
++ __FUNCTION__, vdevice);
++ return -ENODEV;
++ }
++ }
++ /* do not create a PV cdrom device if we are an HVM guest */
++ type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
++ if (IS_ERR(type))
++ return -ENODEV;
++ if (strncmp(type, "cdrom", 5) == 0) {
++ kfree(type);
++ return -ENODEV;
++ }
++ kfree(type);
++ }
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+ return -ENOMEM;
+ }
+
++ mutex_init(&info->mutex);
+ info->xbdev = dev;
+ info->vdevice = vdevice;
+ info->connected = BLKIF_STATE_DISCONNECTED;
+ INIT_WORK(&info->work, blkif_restart_queue);
++ spin_lock_init(&info->io_lock);
++ tasklet_init(&info->tasklet, blkif_do_interrupt, (unsigned long)info);
+
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.id = i+1;
+@@ -755,7 +961,7 @@ static int blkfront_probe(struct xenbus_device *dev,
+ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+ dev_set_drvdata(&dev->dev, info);
+
+- err = talk_to_backend(dev, info);
++ err = talk_to_blkback(dev, info);
+ if (err) {
+ kfree(info);
+ dev_set_drvdata(&dev->dev, NULL);
+@@ -819,7 +1025,7 @@ static int blkif_recover(struct blkfront_info *info)
+
+ xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+- spin_lock_irq(&blkif_io_lock);
++ spin_lock_irq(&info->io_lock);
+
+ /* Now safe for us to use the shared ring */
+ info->connected = BLKIF_STATE_CONNECTED;
+@@ -830,7 +1036,7 @@ static int blkif_recover(struct blkfront_info *info)
+ /* Kick any other new requests queued since we resumed */
+ kick_pending_request_queues(info);
+
+- spin_unlock_irq(&blkif_io_lock);
++ spin_unlock_irq(&info->io_lock);
+
+ return 0;
+ }
+@@ -850,13 +1056,50 @@ static int blkfront_resume(struct xenbus_device *dev)
+
+ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+- err = talk_to_backend(dev, info);
++ err = talk_to_blkback(dev, info);
+ if (info->connected == BLKIF_STATE_SUSPENDED && !err)
+ err = blkif_recover(info);
+
+ return err;
+ }
+
++static void
++blkfront_closing(struct blkfront_info *info)
++{
++ struct xenbus_device *xbdev = info->xbdev;
++ struct block_device *bdev = NULL;
++
++ mutex_lock(&info->mutex);
++
++ if (xbdev->state == XenbusStateClosing) {
++ mutex_unlock(&info->mutex);
++ return;
++ }
++
++ if (info->gd)
++ bdev = bdget_disk(info->gd, 0);
++
++ mutex_unlock(&info->mutex);
++
++ if (!bdev) {
++ xenbus_frontend_closed(xbdev);
++ return;
++ }
++
++ mutex_lock(&bdev->bd_mutex);
++
++ if (bdev->bd_openers) {
++ xenbus_dev_error(xbdev, -EBUSY,
++ "Device in use; refusing to close");
++ xenbus_switch_state(xbdev, XenbusStateClosing);
++ } else {
++ xlvbd_release_gendisk(info);
++ xenbus_frontend_closed(xbdev);
++ }
++
++ mutex_unlock(&bdev->bd_mutex);
++ bdput(bdev);
++}
+
+ /*
+ * Invoked when the backend is finally 'ready' (and has told produced
+@@ -868,11 +1111,31 @@ static void blkfront_connect(struct blkfront_info *info)
+ unsigned long sector_size;
+ unsigned int binfo;
+ int err;
+-
+- if ((info->connected == BLKIF_STATE_CONNECTED) ||
+- (info->connected == BLKIF_STATE_SUSPENDED) )
++ int barrier;
++
++ switch (info->connected) {
++ case BLKIF_STATE_CONNECTED:
++ /*
++ * Potentially, the back-end may be signalling
++ * a capacity change; update the capacity.
++ */
++ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
++ "sectors", "%Lu", &sectors);
++ if (XENBUS_EXIST_ERR(err))
++ return;
++ printk(KERN_INFO "Setting capacity to %Lu\n",
++ sectors);
++ set_capacity(info->gd, sectors);
++ revalidate_disk(info->gd);
++
++ /* fall through */
++ case BLKIF_STATE_SUSPENDED:
+ return;
+
++ default:
++ break;
++ }
++
+ dev_dbg(&info->xbdev->dev, "%s:%s.\n",
+ __func__, info->xbdev->otherend);
+
+@@ -889,10 +1152,26 @@ static void blkfront_connect(struct blkfront_info *info)
+ }
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+- "feature-barrier", "%lu", &info->feature_barrier,
++ "feature-barrier", "%lu", &barrier,
+ NULL);
++
++ /*
++ * If there's no "feature-barrier" defined, then it means
++ * we're dealing with a very old backend which writes
++ * synchronously; draining will do what needs to get done.
++ *
++ * If there are barriers, then we can do full queued writes
++ * with tagged barriers.
++ *
++ * If barriers are not supported, then there's not much we can
++ * do, so just set ordering to NONE.
++ */
+ if (err)
+- info->feature_barrier = 0;
++ info->feature_barrier = QUEUE_ORDERED_DRAIN;
++ else if (barrier)
++ info->feature_barrier = QUEUE_ORDERED_TAG;
++ else
++ info->feature_barrier = QUEUE_ORDERED_NONE;
+
+ err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
+ if (err) {
+@@ -904,10 +1183,10 @@ static void blkfront_connect(struct blkfront_info *info)
+ xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+ /* Kick pending requests. */
+- spin_lock_irq(&blkif_io_lock);
++ spin_lock_irq(&info->io_lock);
+ info->connected = BLKIF_STATE_CONNECTED;
+ kick_pending_request_queues(info);
+- spin_unlock_irq(&blkif_io_lock);
++ spin_unlock_irq(&info->io_lock);
+
+ add_disk(info->gd);
+
+@@ -915,57 +1194,21 @@ static void blkfront_connect(struct blkfront_info *info)
+ }
+
+ /**
+- * Handle the change of state of the backend to Closing. We must delete our
+- * device-layer structures now, to ensure that writes are flushed through to
+- * the backend. Once is this done, we can switch to Closed in
+- * acknowledgement.
+- */
+-static void blkfront_closing(struct xenbus_device *dev)
+-{
+- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+- unsigned long flags;
+-
+- dev_dbg(&dev->dev, "blkfront_closing: %s removed\n", dev->nodename);
+-
+- if (info->rq == NULL)
+- goto out;
+-
+- spin_lock_irqsave(&blkif_io_lock, flags);
+-
+- /* No more blkif_request(). */
+- blk_stop_queue(info->rq);
+-
+- /* No more gnttab callback work. */
+- gnttab_cancel_free_callback(&info->callback);
+- spin_unlock_irqrestore(&blkif_io_lock, flags);
+-
+- /* Flush gnttab callback work. Must be done with no locks held. */
+- flush_scheduled_work();
+-
+- blk_cleanup_queue(info->rq);
+- info->rq = NULL;
+-
+- del_gendisk(info->gd);
+-
+- out:
+- xenbus_frontend_closed(dev);
+-}
+-
+-/**
+ * Callback received when the backend's state changes.
+ */
+-static void backend_changed(struct xenbus_device *dev,
++static void blkback_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+ {
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+- struct block_device *bd;
+
+- dev_dbg(&dev->dev, "blkfront:backend_changed.\n");
++ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitWait:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+@@ -975,35 +1218,56 @@ static void backend_changed(struct xenbus_device *dev,
+ break;
+
+ case XenbusStateClosing:
+- if (info->gd == NULL) {
+- xenbus_frontend_closed(dev);
+- break;
+- }
+- bd = bdget_disk(info->gd, 0);
+- if (bd == NULL)
+- xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
+-
+- mutex_lock(&bd->bd_mutex);
+- if (info->users > 0)
+- xenbus_dev_error(dev, -EBUSY,
+- "Device in use; refusing to close");
+- else
+- blkfront_closing(dev);
+- mutex_unlock(&bd->bd_mutex);
+- bdput(bd);
++ blkfront_closing(info);
+ break;
+ }
+ }
+
+-static int blkfront_remove(struct xenbus_device *dev)
++static int blkfront_remove(struct xenbus_device *xbdev)
+ {
+- struct blkfront_info *info = dev_get_drvdata(&dev->dev);
++ struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
++ struct block_device *bdev = NULL;
++ struct gendisk *disk;
+
+- dev_dbg(&dev->dev, "blkfront_remove: %s removed\n", dev->nodename);
++ dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
+
+ blkif_free(info, 0);
+
+- kfree(info);
++ mutex_lock(&info->mutex);
++
++ disk = info->gd;
++ if (disk)
++ bdev = bdget_disk(disk, 0);
++
++ info->xbdev = NULL;
++ mutex_unlock(&info->mutex);
++
++ if (!bdev) {
++ kfree(info);
++ return 0;
++ }
++
++ /*
++ * The xbdev was removed before we reached the Closed
++ * state. See if it's safe to remove the disk. If the bdev
++ * isn't closed yet, we let release take care of it.
++ */
++
++ mutex_lock(&bdev->bd_mutex);
++ info = disk->private_data;
++
++ dev_warn(disk_to_dev(disk),
++ "%s was hot-unplugged, %d stale handles\n",
++ xbdev->nodename, bdev->bd_openers);
++
++ if (info && !bdev->bd_openers) {
++ xlvbd_release_gendisk(info);
++ disk->private_data = NULL;
++ kfree(info);
++ }
++
++ mutex_unlock(&bdev->bd_mutex);
++ bdput(bdev);
+
+ return 0;
+ }
+@@ -1012,30 +1276,68 @@ static int blkfront_is_ready(struct xenbus_device *dev)
+ {
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+
+- return info->is_ready;
++ return info->is_ready && info->xbdev;
+ }
+
+ static int blkif_open(struct block_device *bdev, fmode_t mode)
+ {
+- struct blkfront_info *info = bdev->bd_disk->private_data;
+- info->users++;
+- return 0;
++ struct gendisk *disk = bdev->bd_disk;
++ struct blkfront_info *info;
++ int err = 0;
++
++ info = disk->private_data;
++ if (!info)
++ /* xbdev gone */
++ return -ERESTARTSYS;
++
++ mutex_lock(&info->mutex);
++
++ if (!info->gd)
++ /* xbdev is closed */
++ err = -ERESTARTSYS;
++
++ mutex_unlock(&info->mutex);
++
++ return err;
+ }
+
+ static int blkif_release(struct gendisk *disk, fmode_t mode)
+ {
+ struct blkfront_info *info = disk->private_data;
+- info->users--;
+- if (info->users == 0) {
+- /* Check whether we have been instructed to close. We will
+- have ignored this request initially, as the device was
+- still mounted. */
+- struct xenbus_device *dev = info->xbdev;
+- enum xenbus_state state = xenbus_read_driver_state(dev->otherend);
+-
+- if (state == XenbusStateClosing && info->is_ready)
+- blkfront_closing(dev);
++ struct block_device *bdev;
++ struct xenbus_device *xbdev;
++
++ bdev = bdget_disk(disk, 0);
++ bdput(bdev);
++
++ if (bdev->bd_openers)
++ return 0;
++
++ /*
++ * Check if we have been instructed to close. We will have
++ * deferred this request, because the bdev was still open.
++ */
++
++ mutex_lock(&info->mutex);
++ xbdev = info->xbdev;
++
++ if (xbdev && xbdev->state == XenbusStateClosing) {
++ /* pending switch to state closed */
++ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
++ xlvbd_release_gendisk(info);
++ xenbus_frontend_closed(info->xbdev);
+ }
++
++ mutex_unlock(&info->mutex);
++
++ if (!xbdev) {
++ /* sudden device removal */
++ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
++ xlvbd_release_gendisk(info);
++ disk->private_data = NULL;
++ kfree(info);
++ }
++
+ return 0;
+ }
+
+@@ -1061,7 +1363,7 @@ static struct xenbus_driver blkfront = {
+ .probe = blkfront_probe,
+ .remove = blkfront_remove,
+ .resume = blkfront_resume,
+- .otherend_changed = backend_changed,
++ .otherend_changed = blkback_changed,
+ .is_ready = blkfront_is_ready,
+ };
+
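
A central piece of the blkfront rework above is the replacement of the old open-count bookkeeping with a bitmap of reserved minor numbers (xlbd_reserve_minors()/xlbd_release_minors()), so that two frontends can never claim overlapping minors. The standalone sketch below reproduces the reserve/release logic with a fixed-size array instead of the kernel's kzalloc-grown bitmap and minor_lock spinlock.

    #include <stdio.h>

    #define MAX_MINORS 256

    static unsigned char minors[MAX_MINORS];   /* 1 = reserved */

    /* Claim [minor, minor + nr): fail if any minor in the range is taken
     * (the kernel version returns -EBUSY and grows the bitmap on demand). */
    static int reserve_minors(unsigned minor, unsigned nr)
    {
        unsigned i;

        if (minor + nr > MAX_MINORS)
            return -1;
        for (i = minor; i < minor + nr; i++)
            if (minors[i])
                return -1;
        for (i = minor; i < minor + nr; i++)
            minors[i] = 1;
        return 0;
    }

    static void release_minors(unsigned minor, unsigned nr)
    {
        unsigned i;

        for (i = minor; i < minor + nr; i++)
            minors[i] = 0;
    }

    int main(void)
    {
        printf("reserve 0..15: %d\n", reserve_minors(0, 16));  /* succeeds */
        printf("reserve 8..11: %d\n", reserve_minors(8, 4));   /* busy */
        release_minors(0, 16);
        printf("reserve 8..11: %d\n", reserve_minors(8, 4));   /* succeeds */
        return 0;
    }
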
+diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
+index c496c8a..4064d95 100644
+--- a/drivers/char/agp/amd64-agp.c
++++ b/drivers/char/agp/amd64-agp.c
+@@ -18,6 +18,8 @@
+ #include <asm/k8.h>
+ #include <asm/gart.h>
+ #include "agp.h"
++#include <xen/page.h>
++#include <asm/xen/page.h>
+
+ /* NVIDIA K8 registers */
+ #define NVIDIA_X86_64_0_APBASE 0x10
+@@ -78,8 +80,21 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
+ }
+
+ for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
++ phys_addr_t phys = page_to_phys(mem->pages[i]);
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ page_to_pfn(mem->pages[i])));
++ if (phys != xen_phys) {
++ printk(KERN_ERR "Fixing up GART: (0x%lx->0x%lx)." \
++ " CODE UNTESTED!\n",
++ (unsigned long)phys,
++ (unsigned long)xen_phys);
++ WARN_ON_ONCE(phys != xen_phys);
++ phys = xen_phys;
++ }
++ }
+ tmp = agp_bridge->driver->mask_memory(agp_bridge,
+- page_to_phys(mem->pages[i]),
++ phys,
+ mask_type);
+
+ BUG_ON(tmp & 0xffffff0000000ffcULL);
+@@ -181,6 +196,20 @@ static int amd_8151_configure(void)
+ unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
+ int i;
+
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ virt_to_pfn(agp_bridge->gatt_table_real)));
++ /* Future thoughts: Perhaps use the gatt_table_bus that
++ * agp_generic_create_gatt_table has set up instead of
++ * doing the virt_to_phys once more? */
++ if (gatt_bus != xen_phys) {
++ printk(KERN_ERR "Fixing up GATT: (0x%lx->0x%lx)." \
++ " CODE UNTESTED!\n", gatt_bus,
++ (unsigned long)xen_phys);
++ WARN_ON_ONCE(gatt_bus != xen_phys);
++ gatt_bus = xen_phys;
++ }
++ }
+ /* Configure AGP regs in each x86-64 host bridge. */
+ for (i = 0; i < num_k8_northbridges; i++) {
+ agp_bridge->gart_bus_addr =
+diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c
+index a56ca08..30fc4b6 100644
+--- a/drivers/char/agp/backend.c
++++ b/drivers/char/agp/backend.c
+@@ -38,6 +38,8 @@
+ #include <linux/vmalloc.h>
+ #include <asm/io.h>
+ #include "agp.h"
++#include <xen/page.h>
++#include <asm/xen/page.h>
+
+ /* Due to XFree86 brain-damage, we can't go to 1.0 until they
+ * fix some real stupidity. It's only by chance we can bump
+@@ -160,8 +162,13 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge)
+ }
+ } else {
+ bridge->scratch_page_dma = page_to_phys(page);
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ page_to_pfn(page)));
++ if (bridge->scratch_page_dma != xen_phys)
++ bridge->scratch_page_dma = xen_phys;
++ }
+ }
+-
+ bridge->scratch_page = bridge->driver->mask_memory(bridge,
+ bridge->scratch_page_dma, 0);
+ }
+diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
+index c505439..2434c91 100644
+--- a/drivers/char/agp/generic.c
++++ b/drivers/char/agp/generic.c
+@@ -42,6 +42,8 @@
+ #include <asm/cacheflush.h>
+ #include <asm/pgtable.h>
+ #include "agp.h"
++#include <xen/page.h>
++#include <asm/xen/page.h>
+
+ __u32 *agp_gatt_table;
+ int agp_memory_reserved;
+@@ -1002,6 +1004,14 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
+ return -ENOMEM;
+ }
+ bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real);
++ /* KRW: virt_to_phys under Xen is not safe. */
++ if (xen_pv_domain()) {
++ /* Use back-door to get the "real" PFN. */
++ phys_addr_t pfn = virt_to_pfn(bridge->gatt_table_real);
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(pfn));
++ if (bridge->gatt_bus_addr != xen_phys)
++ bridge->gatt_bus_addr = xen_phys;
++ }
+
+ /* AK: bogus, should encode addresses > 4GB */
+ for (i = 0; i < num_entries; i++) {
+@@ -1141,8 +1151,17 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type)
+ }
+
+ for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
++ phys_addr_t phys = page_to_phys(mem->pages[i]);
++
++ /* HACK: Via a back-door we get the bus address. */
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ page_to_pfn(mem->pages[i])));
++ if (phys != xen_phys)
++ phys = xen_phys;
++ }
+ writel(bridge->driver->mask_memory(bridge,
+- page_to_phys(mem->pages[i]),
++ phys,
+ mask_type),
+ bridge->gatt_table+j);
+ }
+@@ -1235,7 +1254,16 @@ int agp_generic_alloc_pages(struct agp_bridge_data *bridge, struct agp_memory *m
+ int i, ret = -ENOMEM;
+
+ for (i = 0; i < num_pages; i++) {
+- page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
++ if (xen_pv_domain()) {
++ void *addr;
++ dma_addr_t _d;
++
++ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
++ if (!addr)
++ goto out;
++ page = virt_to_page(addr);
++ } else
++ page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
+ /* agp_free_memory() needs gart address */
+ if (page == NULL)
+ goto out;
+@@ -1263,7 +1291,17 @@ struct page *agp_generic_alloc_page(struct agp_bridge_data *bridge)
+ {
+ struct page * page;
+
+- page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
++ if (xen_pv_domain()) {
++ void *addr;
++ dma_addr_t _d;
++
++ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
++ if (!addr)
++ return NULL;
++ page = virt_to_page(addr);
++ } else
++ page = alloc_page(GFP_KERNEL | GFP_DMA32 | __GFP_ZERO);
++
+ if (page == NULL)
+ return NULL;
+
+@@ -1294,7 +1332,12 @@ void agp_generic_destroy_pages(struct agp_memory *mem)
+ unmap_page_from_agp(page);
+ #endif
+ put_page(page);
+- __free_page(page);
++ if (xen_pv_domain()) {
++ void *addr = page_address(page);
++ dma_free_coherent(NULL, PAGE_SIZE, addr,
++ virt_to_bus(addr));
++ } else
++ __free_page(page);
+ atomic_dec(&agp_bridge->current_memory_agp);
+ mem->pages[i] = NULL;
+ }
+@@ -1311,7 +1354,12 @@ void agp_generic_destroy_page(struct page *page, int flags)
+
+ if (flags & AGP_PAGE_DESTROY_FREE) {
+ put_page(page);
+- __free_page(page);
++ if (xen_pv_domain()) {
++ void *addr = page_address(page);
++ dma_free_coherent(NULL, PAGE_SIZE, addr,
++ virt_to_bus(addr));
++ } else
++ __free_page(page);
+ atomic_dec(&agp_bridge->current_memory_agp);
+ }
+ }
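
The AGP changes all apply the same rule: in a Xen PV domain, page_to_phys() returns a pseudo-physical address, so whatever is programmed into the GART/GATT has to be translated through the PFN-to-MFN map first. A toy version of that fixup is sketched below, with the p2m lookup reduced to a static table and xen_pv() as a stand-in for xen_pv_domain().

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    /* Toy p2m table; in a real PV guest the pfn-to-mfn map comes from Xen. */
    static const uint64_t p2m[] = { 0x1000, 0x52, 0x77 };

    static bool xen_pv(void) { return true; }

    static uint64_t pfn_to_mfn(uint64_t pfn)
    {
        return p2m[pfn];                 /* assume pfn is in range here */
    }

    /* Bus address to program into the GART for a given pfn: the machine
     * frame under Xen PV, the pseudo-physical frame otherwise. */
    static uint64_t gart_bus_addr(uint64_t pfn)
    {
        uint64_t phys = pfn << PAGE_SHIFT;

        if (xen_pv()) {
            uint64_t xen_phys = pfn_to_mfn(pfn) << PAGE_SHIFT;

            if (phys != xen_phys)
                phys = xen_phys;         /* fix up, as each hunk above does */
        }
        return phys;
    }

    int main(void)
    {
        printf("pfn 1 -> bus address %#llx\n",
               (unsigned long long)gart_bus_addr(1));
        return 0;
    }
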
+diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
+index b8e0219..7a62c3c 100644
+--- a/drivers/char/agp/intel-agp.c
++++ b/drivers/char/agp/intel-agp.c
+@@ -10,14 +10,20 @@
+ #include <linux/agp_backend.h>
+ #include <asm/smp.h>
+ #include "agp.h"
++#include <xen/page.h>
++#include <asm/xen/page.h>
+
+ /*
+ * If we have Intel graphics, we're not going to have anything other than
+ * an Intel IOMMU. So make the correct use of the PCI DMA API contingent
+ * on the Intel IOMMU support (CONFIG_DMAR).
+ * Only newer chipsets need to bother with this, of course.
++ *
++ * Xen guests accessing graphics hardware also need proper translation
++ * between pseudo-physical addresses and real machine addresses, which
++ * is also achieved by using the DMA API.
+ */
+-#ifdef CONFIG_DMAR
++#if defined(CONFIG_DMAR) || defined(CONFIG_XEN)
+ #define USE_PCI_DMA_API 1
+ #endif
+
+@@ -296,8 +302,20 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem,
+ int i, j;
+
+ for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
++ phys_addr_t phys = page_to_phys(mem->pages[i]);
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ page_to_pfn(mem->pages[i])));
++ if (xen_phys != phys) {
++ printk(KERN_ERR "Compile kernel with " \
++ "CONFIG_DMAR to get rid of this " \
++ "warning!\n");
++ WARN_ON_ONCE(xen_phys != phys);
++ /* Fixup: */
++ phys = xen_phys;
++ }
++ }
+ writel(agp_bridge->driver->mask_memory(agp_bridge,
+- page_to_phys(mem->pages[i]), mask_type),
++ phys, mask_type),
+ intel_private.gtt+j);
+ }
+
+@@ -395,15 +413,19 @@ static void intel_i810_agp_enable(struct agp_bridge_data *bridge, u32 mode)
+ /* Exists to support ARGB cursors */
+ static struct page *i8xx_alloc_pages(void)
+ {
++ void *addr;
++ dma_addr_t _d;
+ struct page *page;
+
+- page = alloc_pages(GFP_KERNEL | GFP_DMA32, 2);
+- if (page == NULL)
++ addr = dma_alloc_coherent(NULL, 4 * PAGE_SIZE, &_d, GFP_KERNEL);
++ if (addr == NULL)
+ return NULL;
+
++ page = virt_to_page(addr);
++
+ if (set_pages_uc(page, 4) < 0) {
+ set_pages_wb(page, 4);
+- __free_pages(page, 2);
++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, _d);
+ return NULL;
+ }
+ get_page(page);
+@@ -413,12 +435,17 @@ static struct page *i8xx_alloc_pages(void)
+
+ static void i8xx_destroy_pages(struct page *page)
+ {
++ void *addr;
++
+ if (page == NULL)
+ return;
+
+ set_pages_wb(page, 4);
+ put_page(page);
+- __free_pages(page, 2);
++
++ addr = page_address(page);
++
++ dma_free_coherent(NULL, 4 * PAGE_SIZE, addr, virt_to_bus(addr));
+ atomic_dec(&agp_bridge->current_memory_agp);
+ }
+
+@@ -478,8 +505,16 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start,
+ if (!mem->is_flushed)
+ global_cache_flush();
+ for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
++ phys_addr_t phys = page_to_phys(mem->pages[i]);
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ page_to_pfn(mem->pages[i])));
++ /* Fixup: */
++ if (xen_phys != phys)
++ phys = xen_phys;
++ }
+ writel(agp_bridge->driver->mask_memory(agp_bridge,
+- page_to_phys(mem->pages[i]), mask_type),
++ phys, mask_type),
+ intel_private.registers+I810_PTE_BASE+(j*4));
+ }
+ readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
+@@ -552,6 +587,12 @@ static struct agp_memory *alloc_agpphysmem_i8xx(size_t pg_count, int type)
+ new->num_scratch_pages = pg_count;
+ new->type = AGP_PHYS_MEMORY;
+ new->physical = page_to_phys(new->pages[0]);
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ page_to_pfn(new->pages[0])));
++ if (xen_phys != new->physical)
++ new->physical = xen_phys;
++ }
+ return new;
+ }
+
+@@ -992,8 +1033,16 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start,
+ global_cache_flush();
+
+ for (i = 0, j = pg_start; i < mem->page_count; i++, j++) {
++ phys_addr_t phys = page_to_phys(mem->pages[i]);
++ if (xen_pv_domain()) {
++ phys_addr_t xen_phys = PFN_PHYS(pfn_to_mfn(
++ page_to_pfn(mem->pages[i])));
++ /* Fixup: */
++ if (xen_phys != phys)
++ phys = xen_phys;
++ }
+ writel(agp_bridge->driver->mask_memory(agp_bridge,
+- page_to_phys(mem->pages[i]), mask_type),
++ phys, mask_type),
+ intel_private.registers+I810_PTE_BASE+(j*4));
+ }
+ readl(intel_private.registers+I810_PTE_BASE+((j-1)*4));
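/*
 * Self-contained illustration (userspace, not part of the patch) of why
 * the hunks above rewrite the GTT entries with PFN_PHYS(pfn_to_mfn(...)):
 * a PV guest's pseudo-physical frame numbers generally differ from the
 * machine frames the graphics hardware actually addresses.  The p2m table
 * below is a made-up example.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PFN_PHYS(pfn)	((uint64_t)(pfn) << PAGE_SHIFT)

static const uint64_t p2m[] = { 0x12345, 0x12346, 0x00042 };	/* toy table */

static uint64_t pfn_to_mfn(uint64_t pfn)
{
	return p2m[pfn];
}

int main(void)
{
	uint64_t pfn;

	for (pfn = 0; pfn < 3; pfn++) {
		uint64_t pseudo = PFN_PHYS(pfn);		/* page_to_phys() analogue */
		uint64_t machine = PFN_PHYS(pfn_to_mfn(pfn));	/* what the GTT must contain */

		printf("pfn %llu: pseudo-phys %#llx -> machine %#llx%s\n",
		       (unsigned long long)pfn,
		       (unsigned long long)pseudo,
		       (unsigned long long)machine,
		       pseudo == machine ? "" : " (fixup needed)");
	}
	return 0;
}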
+diff --git a/drivers/char/hvc_xen.c b/drivers/char/hvc_xen.c
+index a6ee32b..5be0dd3 100644
+--- a/drivers/char/hvc_xen.c
++++ b/drivers/char/hvc_xen.c
+@@ -25,6 +25,8 @@
+ #include <linux/types.h>
+
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/page.h>
+ #include <xen/events.h>
+ #include <xen/interface/io/console.h>
+@@ -76,7 +78,7 @@ static int __write_console(const char *data, int len)
+ return sent;
+ }
+
+-static int write_console(uint32_t vtermno, const char *data, int len)
++static int domU_write_console(uint32_t vtermno, const char *data, int len)
+ {
+ int ret = len;
+
+@@ -99,7 +101,7 @@ static int write_console(uint32_t vtermno, const char *data, int len)
+ return ret;
+ }
+
+-static int read_console(uint32_t vtermno, char *buf, int len)
++static int domU_read_console(uint32_t vtermno, char *buf, int len)
+ {
+ struct xencons_interface *intf = xencons_interface();
+ XENCONS_RING_IDX cons, prod;
+@@ -120,28 +122,63 @@ static int read_console(uint32_t vtermno, char *buf, int len)
+ return recv;
+ }
+
+-static struct hv_ops hvc_ops = {
+- .get_chars = read_console,
+- .put_chars = write_console,
++static struct hv_ops domU_hvc_ops = {
++ .get_chars = domU_read_console,
++ .put_chars = domU_write_console,
++ .notifier_add = notifier_add_irq,
++ .notifier_del = notifier_del_irq,
++ .notifier_hangup = notifier_hangup_irq,
++};
++
++static int dom0_read_console(uint32_t vtermno, char *buf, int len)
++{
++ return HYPERVISOR_console_io(CONSOLEIO_read, len, buf);
++}
++
++/*
++ * Either for a dom0 to write to the system console, or a domU with a
++ * debug version of Xen
++ */
++static int dom0_write_console(uint32_t vtermno, const char *str, int len)
++{
++ int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
++ if (rc < 0)
++ return 0;
++
++ return len;
++}
++
++static struct hv_ops dom0_hvc_ops = {
++ .get_chars = dom0_read_console,
++ .put_chars = dom0_write_console,
+ .notifier_add = notifier_add_irq,
+ .notifier_del = notifier_del_irq,
+ .notifier_hangup = notifier_hangup_irq,
+ };
+
+-static int __init xen_init(void)
++static int __init xen_hvc_init(void)
+ {
+ struct hvc_struct *hp;
++ struct hv_ops *ops;
+
+- if (!xen_pv_domain() ||
+- xen_initial_domain() ||
+- !xen_start_info->console.domU.evtchn)
++ if (!xen_pv_domain())
+ return -ENODEV;
+
+- xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
++ if (xen_initial_domain()) {
++ ops = &dom0_hvc_ops;
++ xencons_irq = bind_virq_to_irq(VIRQ_CONSOLE, 0);
++ } else {
++ if (!xen_start_info->console.domU.evtchn)
++ return -ENODEV;
++
++ ops = &domU_hvc_ops;
++ xencons_irq = bind_evtchn_to_irq(xen_start_info->console.domU.evtchn);
++ }
++
+ if (xencons_irq < 0)
+ xencons_irq = 0; /* NO_IRQ */
+
+- hp = hvc_alloc(HVC_COOKIE, xencons_irq, &hvc_ops, 256);
++ hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256);
+ if (IS_ERR(hp))
+ return PTR_ERR(hp);
+
+@@ -158,7 +195,7 @@ void xen_console_resume(void)
+ rebind_evtchn_irq(xen_start_info->console.domU.evtchn, xencons_irq);
+ }
+
+-static void __exit xen_fini(void)
++static void __exit xen_hvc_fini(void)
+ {
+ if (hvc)
+ hvc_remove(hvc);
+@@ -166,29 +203,24 @@ static void __exit xen_fini(void)
+
+ static int xen_cons_init(void)
+ {
++ struct hv_ops *ops;
++
+ if (!xen_pv_domain())
+ return 0;
+
+- hvc_instantiate(HVC_COOKIE, 0, &hvc_ops);
++ ops = &domU_hvc_ops;
++ if (xen_initial_domain())
++ ops = &dom0_hvc_ops;
++
++ hvc_instantiate(HVC_COOKIE, 0, ops);
++
+ return 0;
+ }
+
+-module_init(xen_init);
+-module_exit(xen_fini);
++module_init(xen_hvc_init);
++module_exit(xen_hvc_fini);
+ console_initcall(xen_cons_init);
+
+-static void raw_console_write(const char *str, int len)
+-{
+- while(len > 0) {
+- int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);
+- if (rc <= 0)
+- break;
+-
+- str += rc;
+- len -= rc;
+- }
+-}
+-
+ #ifdef CONFIG_EARLY_PRINTK
+ static void xenboot_write_console(struct console *console, const char *string,
+ unsigned len)
+@@ -196,19 +228,22 @@ static void xenboot_write_console(struct console *console, const char *string,
+ unsigned int linelen, off = 0;
+ const char *pos;
+
+- raw_console_write(string, len);
++ dom0_write_console(0, string, len);
++
++ if (xen_initial_domain())
++ return;
+
+- write_console(0, "(early) ", 8);
++ domU_write_console(0, "(early) ", 8);
+ while (off < len && NULL != (pos = strchr(string+off, '\n'))) {
+ linelen = pos-string+off;
+ if (off + linelen > len)
+ break;
+- write_console(0, string+off, linelen);
+- write_console(0, "\r\n", 2);
++ domU_write_console(0, string+off, linelen);
++ domU_write_console(0, "\r\n", 2);
+ off += linelen + 1;
+ }
+ if (off < len)
+- write_console(0, string+off, len-off);
++ domU_write_console(0, string+off, len-off);
+ }
+
+ struct console xenboot_console = {
+@@ -220,7 +255,7 @@ struct console xenboot_console = {
+
+ void xen_raw_console_write(const char *str)
+ {
+- raw_console_write(str, strlen(str));
++ dom0_write_console(0, str, strlen(str));
+ }
+
+ void xen_raw_printk(const char *fmt, ...)
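/*
 * Sketch, not part of the patch: a partial-write-safe dom0 console writer
 * equivalent to the raw_console_write() loop removed above, built on the
 * same hypercall that dom0_write_console() uses.  CONSOLEIO_write may
 * consume fewer bytes than requested, so keep issuing the hypercall until
 * everything is written or an error comes back.
 */
#include <asm/xen/hypercall.h>
#include <xen/interface/xen.h>		/* CONSOLEIO_write */

static void dom0_console_write_all(const char *str, int len)
{
	while (len > 0) {
		int rc = HYPERVISOR_console_io(CONSOLEIO_write, len, (char *)str);

		if (rc <= 0)
			break;
		str += rc;
		len -= rc;
	}
}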
+diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
+index 0e27d98..f5e2572 100644
+--- a/drivers/gpu/drm/drm_drv.c
++++ b/drivers/gpu/drm/drm_drv.c
+@@ -201,7 +201,7 @@ int drm_lastclose(struct drm_device * dev)
+ }
+ if (drm_core_check_feature(dev, DRIVER_SG) && dev->sg &&
+ !drm_core_check_feature(dev, DRIVER_MODESET)) {
+- drm_sg_cleanup(dev->sg);
++ drm_sg_cleanup(dev, dev->sg);
+ dev->sg = NULL;
+ }
+
+diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
+index 8bf3770..dde5f66 100644
+--- a/drivers/gpu/drm/drm_gem.c
++++ b/drivers/gpu/drm/drm_gem.c
+@@ -539,7 +539,7 @@ int drm_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTEXPAND;
+ vma->vm_ops = obj->dev->driver->gem_vm_ops;
+ vma->vm_private_data = map->handle;
+- vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
++ vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
+
+ /* Take a ref for this mapping of the object, so that the fault
+ * handler can dereference the mmap offset's pointer to the object.
+diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c
+index c7823c8..95ffb8a 100644
+--- a/drivers/gpu/drm/drm_scatter.c
++++ b/drivers/gpu/drm/drm_scatter.c
+@@ -32,20 +32,73 @@
+ */
+
+ #include <linux/vmalloc.h>
++#include <linux/mm.h>
+ #include "drmP.h"
+
+ #define DEBUG_SCATTER 0
+
+-static inline void *drm_vmalloc_dma(unsigned long size)
++static void *drm_vmalloc_dma(struct drm_device *drmdev, unsigned long size)
+ {
+ #if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
+ return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL | _PAGE_NO_CACHE);
+ #else
+- return vmalloc_32(size);
++ struct device *dev = &drmdev->pdev->dev;
++ struct page **pages;
++ void *addr;
++ const int npages = PFN_UP(size);
++ int i;
++
++ pages = kmalloc(npages * sizeof(*pages), GFP_KERNEL);
++ if (!pages)
++ goto fail;
++
++ for (i = 0; i < npages; i++) {
++ dma_addr_t phys;
++ void *addr;
++ addr = dma_alloc_coherent(dev, PAGE_SIZE, &phys, GFP_KERNEL);
++ if (addr == NULL)
++ goto out_free_pages;
++
++ pages[i] = virt_to_page(addr);
++ }
++
++ addr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL);
++
++ kfree(pages);
++
++ return addr;
++
++out_free_pages:
++ while (i > 0) {
++ void *addr = page_address(pages[--i]);
++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
++ }
++
++ kfree(pages);
++
++fail:
++ return NULL;
++#endif
++}
++
++static void drm_vfree_dma(struct drm_device *drmdev, void *addr, int npages,
++ struct page **pages)
++{
++#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE)
++ vfree(addr);
++#else
++ struct device *dev = &drmdev->pdev->dev;
++ int i;
++
++ for (i = 0; i < npages; i++) {
++ void *addr = page_address(pages[i]);
++ dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
++ }
++ vunmap(addr);
+ #endif
+ }
+
+-void drm_sg_cleanup(struct drm_sg_mem * entry)
++void drm_sg_cleanup(struct drm_device *drmdev, struct drm_sg_mem * entry)
+ {
+ struct page *page;
+ int i;
+@@ -56,7 +109,7 @@ void drm_sg_cleanup(struct drm_sg_mem * entry)
+ ClearPageReserved(page);
+ }
+
+- vfree(entry->virtual);
++ drm_vfree_dma(drmdev, entry->virtual, entry->pages, entry->pagelist);
+
+ kfree(entry->busaddr);
+ kfree(entry->pagelist);
+@@ -107,7 +160,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
+ }
+ memset((void *)entry->busaddr, 0, pages * sizeof(*entry->busaddr));
+
+- entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT);
++ entry->virtual = drm_vmalloc_dma(dev, pages << PAGE_SHIFT);
+ if (!entry->virtual) {
+ kfree(entry->busaddr);
+ kfree(entry->pagelist);
+@@ -180,7 +233,7 @@ int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request)
+ return 0;
+
+ failed:
+- drm_sg_cleanup(entry);
++ drm_sg_cleanup(dev, entry);
+ return -ENOMEM;
+ }
+ EXPORT_SYMBOL(drm_sg_alloc);
+@@ -212,7 +265,7 @@ int drm_sg_free(struct drm_device *dev, void *data,
+
+ DRM_DEBUG("virtual = %p\n", entry->virtual);
+
+- drm_sg_cleanup(entry);
++ drm_sg_cleanup(dev, entry);
+
+ return 0;
+ }
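/*
 * Sketch of the allocation scheme drm_vmalloc_dma() switches to above
 * (hypothetical helper name, not part of the patch): allocate every page
 * individually through the DMA API so that a Xen PV domain gets pages the
 * device can actually address, then stitch them into one kernel mapping
 * with vmap().  This variant also unwinds when vmap() itself fails.
 */
#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/io.h>		/* virt_to_bus(), as used by the patch */

static void *drm_dma_vmalloc(struct device *dev, unsigned long size)
{
	const int npages = PFN_UP(size);
	struct page **pages;
	void *vaddr;
	int i;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	for (i = 0; i < npages; i++) {
		dma_addr_t dma;
		void *addr = dma_alloc_coherent(dev, PAGE_SIZE, &dma, GFP_KERNEL);

		if (!addr)
			goto unwind;
		pages[i] = virt_to_page(addr);
	}

	vaddr = vmap(pages, npages, VM_MAP | VM_IOREMAP, PAGE_KERNEL);
	if (!vaddr)
		goto unwind;

	kfree(pages);
	return vaddr;

unwind:
	while (i-- > 0) {
		void *addr = page_address(pages[i]);

		dma_free_coherent(dev, PAGE_SIZE, addr, virt_to_bus(addr));
	}
	kfree(pages);
	return NULL;
}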
+diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
+index 1c040d0..e3555bf 100644
+--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
++++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
+@@ -87,6 +87,9 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ bool is_iomem;
+ unsigned long address = (unsigned long)vmf->virtual_address;
+ int retval = VM_FAULT_NOPAGE;
++ bool vm_io = (vma->vm_flags & VM_IO) && VM_IO;
++ bool pte_iomap = (pgprot_val(vma->vm_page_prot) & _PAGE_IOMAP)
++ && _PAGE_IOMAP;
+
+ /*
+ * Work around locking order reversal in fault / nopfn
+@@ -158,11 +161,30 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ if (is_iomem) {
+ vma->vm_page_prot = ttm_io_prot(bo->mem.placement,
+ vma->vm_page_prot);
++ if (!vm_io || !pte_iomap) {
++ vma->vm_flags |= VM_IO;
++ pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
++ }
+ } else {
+ ttm = bo->ttm;
+ vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ?
+ vm_get_page_prot(vma->vm_flags) :
+ ttm_io_prot(bo->mem.placement, vma->vm_page_prot);
++ /*
++ * During PCI suspend the graphic cards purge their VRAM and
++ * move their graphic objects to the TT. They also unmap all
++		 * of the objects, meaning that when a user application is
++ * unfrozen it will re-fault and call here.
++ *
++ * What this means is that the VMA for the graphic object might
++ * have been set for VRAM TTM but now it is with the TT
++ * (normal RAM) meaning that the vma->vm_flags could be
++		 * inappropriate (say, VM_IO on TT - no good).
++ */
++ if (vm_io || pte_iomap) {
++ vma->vm_flags &= ~VM_IO;
++ pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP;
++ }
+ }
+
+ /*
+@@ -239,6 +261,7 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
+ {
+ struct ttm_bo_driver *driver;
+ struct ttm_buffer_object *bo;
++ struct ttm_mem_type_manager *man;
+ int ret;
+
+ read_lock(&bdev->vm_lock);
+@@ -271,7 +294,11 @@ int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
+ */
+
+ vma->vm_private_data = bo;
+- vma->vm_flags |= VM_RESERVED | VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
++ vma->vm_flags |= VM_RESERVED | VM_MIXEDMAP | VM_DONTEXPAND;
++ man = &bdev->man[bo->mem.mem_type];
++ if (man->flags & TTM_MEMTYPE_FLAG_NEEDS_IOREMAP)
++ vma->vm_flags |= VM_IO;
++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ return 0;
+ out_unref:
+ ttm_bo_unref(&bo);
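/*
 * Sketch, not part of the patch: the flag juggling performed in the fault
 * handler above, expressed as one helper (hypothetical name).  VM_IO and
 * the Xen-specific _PAGE_IOMAP PTE bit have to track whether the buffer
 * object currently lives in iomem (VRAM) or in system RAM (TT), because a
 * suspend/resume cycle can move it between the two.
 */
#include <linux/mm.h>
#include <asm/pgtable.h>	/* pgprot_val(), _PAGE_IOMAP (pvops x86) */

static void ttm_vma_sync_io_state(struct vm_area_struct *vma, bool is_iomem)
{
	if (is_iomem) {
		vma->vm_flags |= VM_IO;
		pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
	} else {
		vma->vm_flags &= ~VM_IO;
		pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP;
	}
}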
+diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
+index 3d5b8b0..8b05e38 100644
+--- a/drivers/gpu/drm/ttm/ttm_tt.c
++++ b/drivers/gpu/drm/ttm/ttm_tt.c
+@@ -38,7 +38,8 @@
+ #include "ttm/ttm_module.h"
+ #include "ttm/ttm_bo_driver.h"
+ #include "ttm/ttm_placement.h"
+-
++#include <linux/dma-mapping.h>
++#include <xen/xen.h>
+ static int ttm_tt_swapin(struct ttm_tt *ttm);
+
+ /**
+@@ -84,6 +85,16 @@ static struct page *ttm_tt_alloc_page(unsigned page_flags)
+ else
+ gfp_flags |= __GFP_HIGHMEM;
+
++ if ((page_flags & TTM_PAGE_FLAG_DMA32) && xen_pv_domain())
++ {
++ void *addr;
++ dma_addr_t _d;
++
++ addr = dma_alloc_coherent(NULL, PAGE_SIZE, &_d, GFP_KERNEL);
++ if (addr == NULL)
++ return NULL;
++ return virt_to_page(addr);
++ }
+ return alloc_page(gfp_flags);
+ }
+
+@@ -286,6 +297,7 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm)
+ int i;
+ struct page *cur_page;
+ struct ttm_backend *be = ttm->be;
++ void *addr;
+
+ if (be)
+ be->func->clear(be);
+@@ -300,7 +312,16 @@ static void ttm_tt_free_alloced_pages(struct ttm_tt *ttm)
+ "Leaking pages.\n");
+ ttm_mem_global_free_page(ttm->glob->mem_glob,
+ cur_page);
+- __free_page(cur_page);
++
++ if ((ttm->page_flags & TTM_PAGE_FLAG_DMA32) &&
++ xen_pv_domain()) {
++ addr = page_address(cur_page);
++ WARN_ON(!addr);
++ if (addr)
++ dma_free_coherent(NULL, PAGE_SIZE, addr,
++ virt_to_bus(addr));
++ } else
++ __free_page(cur_page);
+ }
+ }
+ ttm->state = tt_unpopulated;
+diff --git a/drivers/input/xen-kbdfront.c b/drivers/input/xen-kbdfront.c
+index b115726..80a072e 100644
+--- a/drivers/input/xen-kbdfront.c
++++ b/drivers/input/xen-kbdfront.c
+@@ -21,7 +21,10 @@
+ #include <linux/errno.h>
+ #include <linux/module.h>
+ #include <linux/input.h>
++
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+ #include <xen/interface/io/fbif.h>
+@@ -272,6 +275,8 @@ static void xenkbd_backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+@@ -335,7 +340,7 @@ static struct xenbus_driver xenkbd_driver = {
+
+ static int __init xenkbd_init(void)
+ {
+- if (!xen_domain())
++ if (!xen_domain() || xen_hvm_domain())
+ return -ENODEV;
+
+ /* Nothing to do if running in dom0. */
+diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
+index b2f71f7..b7feb84 100644
+--- a/drivers/net/Kconfig
++++ b/drivers/net/Kconfig
+@@ -2787,6 +2787,7 @@ source "drivers/s390/net/Kconfig"
+ config XEN_NETDEV_FRONTEND
+ tristate "Xen network device frontend driver"
+ depends on XEN
++ select XEN_XENBUS_FRONTEND
+ default y
+ help
+ The network device frontend driver allows the kernel to
+diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
+index 1a11d95..3f71199 100644
+--- a/drivers/net/xen-netfront.c
++++ b/drivers/net/xen-netfront.c
+@@ -42,6 +42,7 @@
+ #include <linux/mm.h>
+ #include <net/ip.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+@@ -53,19 +54,36 @@
+
+ static const struct ethtool_ops xennet_ethtool_ops;
+
++static int use_smartpoll = 0;
++module_param(use_smartpoll, int, 0600);
++MODULE_PARM_DESC (use_smartpoll, "Use smartpoll mechanism if available");
++
+ struct netfront_cb {
+ struct page *page;
+ unsigned offset;
+ };
+
++#define MICRO_SECOND 1000000UL
++#define NANO_SECOND 1000000000UL
++#define DEFAULT_SMART_POLL_FREQ 1000UL
++
++struct netfront_smart_poll {
++ struct hrtimer timer;
++ struct net_device *netdev;
++ unsigned int smart_poll_freq;
++ unsigned int feature_smart_poll;
++ unsigned int active;
++ unsigned long counter;
++};
++
+ #define NETFRONT_SKB_CB(skb) ((struct netfront_cb *)((skb)->cb))
+
+ #define RX_COPY_THRESHOLD 256
+
+ #define GRANT_INVALID_REF 0
+
+-#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
+-#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
++#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
++#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
+ #define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+
+ struct netfront_info {
+@@ -104,7 +122,7 @@ struct netfront_info {
+
+ /* Receive-ring batched refills. */
+ #define RX_MIN_TARGET 8
+-#define RX_DFL_MIN_TARGET 64
++#define RX_DFL_MIN_TARGET 80
+ #define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+ unsigned rx_min_target, rx_max_target, rx_target;
+ struct sk_buff_head rx_batch;
+@@ -118,6 +136,8 @@ struct netfront_info {
+ unsigned long rx_pfn_array[NET_RX_RING_SIZE];
+ struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
+ struct mmu_update rx_mmu[NET_RX_RING_SIZE];
++
++ struct netfront_smart_poll smart_poll;
+ };
+
+ struct netfront_rx_info {
+@@ -337,15 +357,17 @@ static int xennet_open(struct net_device *dev)
+ return 0;
+ }
+
+-static void xennet_tx_buf_gc(struct net_device *dev)
++static int xennet_tx_buf_gc(struct net_device *dev)
+ {
+ RING_IDX cons, prod;
++ RING_IDX cons_begin, cons_end;
+ unsigned short id;
+ struct netfront_info *np = netdev_priv(dev);
+ struct sk_buff *skb;
+
+ BUG_ON(!netif_carrier_ok(dev));
+
++ cons_begin = np->tx.rsp_cons;
+ do {
+ prod = np->tx.sring->rsp_prod;
+ rmb(); /* Ensure we see responses up to 'rp'. */
+@@ -390,7 +412,11 @@ static void xennet_tx_buf_gc(struct net_device *dev)
+ mb(); /* update shared area */
+ } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
+
++ cons_end = np->tx.rsp_cons;
++
+ xennet_maybe_wake_tx(dev);
++
++ return (cons_begin == cons_end);
+ }
+
+ static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
+@@ -1267,6 +1293,14 @@ static void xennet_disconnect_backend(struct netfront_info *info)
+ info->rx.sring = NULL;
+ }
+
++static int netfront_suspend(struct xenbus_device *dev, pm_message_t state)
++{
++ struct netfront_info *info = dev_get_drvdata(&dev->dev);
++ struct hrtimer *timer = &info->smart_poll.timer;
++ hrtimer_cancel(timer);
++ return 0;
++}
++
+ /**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart. We tear down our netif structure and recreate it, but
+@@ -1305,6 +1339,59 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+ return 0;
+ }
+
++static enum hrtimer_restart smart_poll_function(struct hrtimer *timer)
++{
++ struct netfront_smart_poll *psmart_poll;
++ struct net_device *dev;
++ struct netfront_info *np;
++ unsigned long flags;
++ unsigned int tx_active = 0, rx_active = 0;
++
++ psmart_poll = container_of(timer, struct netfront_smart_poll, timer);
++ dev = psmart_poll->netdev;
++ np = netdev_priv(dev);
++
++ spin_lock_irqsave(&np->tx_lock, flags);
++
++ if (!np->rx.sring)
++ goto end;
++
++ np->smart_poll.counter++;
++
++ if (likely(netif_carrier_ok(dev))) {
++ tx_active = !(xennet_tx_buf_gc(dev));
++ /* Under tx_lock: protects access to rx shared-ring indexes. */
++ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) {
++ rx_active = 1;
++ napi_schedule(&np->napi);
++ }
++ }
++
++ np->smart_poll.active |= (tx_active || rx_active);
++ if (np->smart_poll.counter %
++ (np->smart_poll.smart_poll_freq / 10) == 0) {
++ if (!np->smart_poll.active) {
++ np->rx.sring->private.netif.smartpoll_active = 0;
++ goto end;
++ }
++ np->smart_poll.active = 0;
++ }
++
++ if (np->rx.sring->private.netif.smartpoll_active) {
++ if ( hrtimer_start(timer,
++ ktime_set(0, NANO_SECOND/psmart_poll->smart_poll_freq),
++ HRTIMER_MODE_REL) ) {
++			printk(KERN_DEBUG "Failed to start hrtimer, "
++			       "using interrupt mode for this packet\n");
++ np->rx.sring->private.netif.smartpoll_active = 0;
++ }
++ }
++
++end:
++ spin_unlock_irqrestore(&np->tx_lock, flags);
++ return HRTIMER_NORESTART;
++}
++
+ static irqreturn_t xennet_interrupt(int irq, void *dev_id)
+ {
+ struct net_device *dev = dev_id;
+@@ -1320,6 +1407,16 @@ static irqreturn_t xennet_interrupt(int irq, void *dev_id)
+ napi_schedule(&np->napi);
+ }
+
++ if (np->smart_poll.feature_smart_poll) {
++ if ( hrtimer_start(&np->smart_poll.timer,
++ ktime_set(0,NANO_SECOND/np->smart_poll.smart_poll_freq),
++ HRTIMER_MODE_REL) ) {
++			printk(KERN_DEBUG "Failed to start hrtimer, "
++			       "using interrupt mode for this packet\n");
++ np->rx.sring->private.netif.smartpoll_active = 0;
++ }
++ }
++
+ spin_unlock_irqrestore(&np->tx_lock, flags);
+
+ return IRQ_HANDLED;
+@@ -1393,7 +1490,7 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
+ }
+
+ /* Common code used when first setting up, and when resuming. */
+-static int talk_to_backend(struct xenbus_device *dev,
++static int talk_to_netback(struct xenbus_device *dev,
+ struct netfront_info *info)
+ {
+ const char *message;
+@@ -1456,6 +1553,12 @@ again:
+ goto abort_transaction;
+ }
+
++ err = xenbus_printf(xbt, dev->nodename, "feature-smart-poll", "%d", use_smartpoll);
++ if (err) {
++ message = "writing feature-smart-poll";
++ goto abort_transaction;
++ }
++
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+@@ -1543,7 +1646,26 @@ static int xennet_connect(struct net_device *dev)
+ return -ENODEV;
+ }
+
+- err = talk_to_backend(np->xbdev, np);
++ np->smart_poll.feature_smart_poll = 0;
++ if (use_smartpoll) {
++ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
++ "feature-smart-poll", "%u",
++ &np->smart_poll.feature_smart_poll);
++ if (err != 1)
++ np->smart_poll.feature_smart_poll = 0;
++ }
++
++ hrtimer_init(&np->smart_poll.timer, CLOCK_MONOTONIC,
++ HRTIMER_MODE_REL);
++ if (np->smart_poll.feature_smart_poll) {
++ np->smart_poll.timer.function = smart_poll_function;
++ np->smart_poll.netdev = dev;
++ np->smart_poll.smart_poll_freq = DEFAULT_SMART_POLL_FREQ;
++ np->smart_poll.active = 0;
++ np->smart_poll.counter = 0;
++ }
++
++ err = talk_to_netback(np->xbdev, np);
+ if (err)
+ return err;
+
+@@ -1597,7 +1719,7 @@ static int xennet_connect(struct net_device *dev)
+ /**
+ * Callback received when the backend's state changes.
+ */
+-static void backend_changed(struct xenbus_device *dev,
++static void netback_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+ {
+ struct netfront_info *np = dev_get_drvdata(&dev->dev);
+@@ -1608,6 +1730,8 @@ static void backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateConnected:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+@@ -1628,12 +1752,30 @@ static void backend_changed(struct xenbus_device *dev,
+ }
+ }
+
++static int xennet_get_coalesce(struct net_device *netdev,
++ struct ethtool_coalesce *ec)
++{
++ struct netfront_info *np = netdev_priv(netdev);
++ ec->rx_coalesce_usecs = MICRO_SECOND / np->smart_poll.smart_poll_freq;
++ return 0;
++}
++
++static int xennet_set_coalesce(struct net_device *netdev,
++ struct ethtool_coalesce *ec)
++{
++ struct netfront_info *np = netdev_priv(netdev);
++ np->smart_poll.smart_poll_freq = MICRO_SECOND / ec->rx_coalesce_usecs;
++ return 0;
++}
++
+ static const struct ethtool_ops xennet_ethtool_ops =
+ {
+ .set_tx_csum = ethtool_op_set_tx_csum,
+ .set_sg = xennet_set_sg,
+ .set_tso = xennet_set_tso,
+ .get_link = ethtool_op_get_link,
++ .get_coalesce = xennet_get_coalesce,
++ .set_coalesce = xennet_set_coalesce,
+ };
+
+ #ifdef CONFIG_SYSFS
+@@ -1798,8 +1940,9 @@ static struct xenbus_driver netfront_driver = {
+ .ids = netfront_ids,
+ .probe = netfront_probe,
+ .remove = __devexit_p(xennet_remove),
++ .suspend = netfront_suspend,
+ .resume = netfront_resume,
+- .otherend_changed = backend_changed,
++ .otherend_changed = netback_changed,
+ };
+
+ static int __init netif_init(void)
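/*
 * Self-contained illustration (userspace, not part of the patch) of the
 * smart-poll arithmetic above: with the default frequency of 1000 polls
 * per second the hrtimer fires every 1 ms, ring activity is re-evaluated
 * every smart_poll_freq/10 ticks (100 ms), and the same frequency maps to
 * the ethtool rx_coalesce_usecs value.
 */
#include <stdio.h>

#define NANO_SECOND		1000000000UL
#define MICRO_SECOND		1000000UL
#define DEFAULT_SMART_POLL_FREQ	1000UL

int main(void)
{
	unsigned long freq = DEFAULT_SMART_POLL_FREQ;

	printf("hrtimer period      : %lu ns\n", NANO_SECOND / freq);	/* 1000000 ns = 1 ms */
	printf("activity check every: %lu ticks\n", freq / 10);		/* 100 ticks = 100 ms */
	printf("rx_coalesce_usecs   : %lu us\n", MICRO_SECOND / freq);	/* 1000 us */
	return 0;
}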
+diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
+index fdc864f..7802fcd 100644
+--- a/drivers/pci/Kconfig
++++ b/drivers/pci/Kconfig
+@@ -51,6 +51,16 @@ config PCI_STUB
+
+ When in doubt, say N.
+
++config XEN_PCIDEV_FRONTEND
++ tristate "Xen PCI Frontend"
++ depends on XEN && PCI && X86
++ select HOTPLUG
++ select XEN_XENBUS_FRONTEND
++ default y
++ help
++ The PCI device frontend driver allows the kernel to import arbitrary
++ PCI devices from a PCI backend to support PCI driver domains.
++
+ config HT_IRQ
+ bool "Interrupts on hypertransport devices"
+ default y
+diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
+index 4a7f11d..b70aa4d 100644
+--- a/drivers/pci/Makefile
++++ b/drivers/pci/Makefile
+@@ -31,6 +31,8 @@ obj-$(CONFIG_HT_IRQ) += htirq.o
+ # Build Intel IOMMU support
+ obj-$(CONFIG_DMAR) += dmar.o iova.o intel-iommu.o
+
++# Build Xen IOMMU support
++obj-$(CONFIG_PCI_XEN) += xen-iommu.o
+ obj-$(CONFIG_INTR_REMAP) += dmar.o intr_remapping.o
+
+ obj-$(CONFIG_PCI_IOV) += iov.o
+@@ -60,6 +62,8 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
+
+ obj-$(CONFIG_PCI_STUB) += pci-stub.o
+
++obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
++
+ ifeq ($(CONFIG_PCI_DEBUG),y)
+ EXTRA_CFLAGS += -DDEBUG
+ endif
+diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
+index cef28a7..1940183 100644
+--- a/drivers/pci/bus.c
++++ b/drivers/pci/bus.c
+@@ -249,6 +249,7 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
+ up_read(&pci_bus_sem);
+ }
+
++EXPORT_SYMBOL_GPL(pci_walk_bus);
+ EXPORT_SYMBOL(pci_bus_alloc_resource);
+ EXPORT_SYMBOL_GPL(pci_bus_add_device);
+ EXPORT_SYMBOL(pci_bus_add_devices);
+diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
+index 5753036..8e6e6d1 100644
+--- a/drivers/pci/dmar.c
++++ b/drivers/pci/dmar.c
+@@ -673,10 +673,13 @@ void __init detect_intel_iommu(void)
+ "x2apic and Intr-remapping.\n");
+ #endif
+ #ifdef CONFIG_DMAR
+- if (ret && !no_iommu && !iommu_detected && !swiotlb &&
+- !dmar_disabled)
++ if (ret && !no_iommu && !iommu_detected && !dmar_disabled)
+ iommu_detected = 1;
+ #endif
++#ifdef CONFIG_X86
++ if (ret)
++ x86_init.iommu.iommu_init = intel_iommu_init;
++#endif
+ }
+ early_acpi_os_unmap_memory(dmar_tbl, dmar_tbl_size);
+ dmar_tbl = NULL;
+diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
+index ba83495..1506d4a 100644
+--- a/drivers/pci/intel-iommu.c
++++ b/drivers/pci/intel-iommu.c
+@@ -3278,7 +3278,7 @@ int __init intel_iommu_init(void)
+ * Check the need for DMA-remapping initialization now.
+ * Above initialization will also be used by Interrupt-remapping.
+ */
+- if (no_iommu || swiotlb || dmar_disabled)
++ if (no_iommu || dmar_disabled)
+ return -ENODEV;
+
+ iommu_init_mempool();
+@@ -3299,7 +3299,9 @@ int __init intel_iommu_init(void)
+ "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
+
+ init_timer(&unmap_timer);
+- force_iommu = 1;
++#ifdef CONFIG_SWIOTLB
++ swiotlb = 0;
++#endif
+ dma_ops = &intel_dma_ops;
+
+ init_iommu_sysfs();
+diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
+index e03fe98..f9db891 100644
+--- a/drivers/pci/iov.c
++++ b/drivers/pci/iov.c
+@@ -706,6 +706,21 @@ irqreturn_t pci_sriov_migration(struct pci_dev *dev)
+ }
+ EXPORT_SYMBOL_GPL(pci_sriov_migration);
+
++/**
++ * pci_num_vf - return number of VFs associated with a PF device_release_driver
++ * @dev: the PCI device
++ *
++ * Returns number of VFs, or 0 if SR-IOV is not enabled.
++ */
++int pci_num_vf(struct pci_dev *dev)
++{
++ if (!dev || !dev->is_physfn)
++ return 0;
++ else
++ return dev->sriov->nr_virtfn;
++}
++EXPORT_SYMBOL_GPL(pci_num_vf);
++
+ static int ats_alloc_one(struct pci_dev *dev, int ps)
+ {
+ int pos;
+diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
+index 0fb1d05..c7e8a69 100644
+--- a/drivers/pci/msi.c
++++ b/drivers/pci/msi.c
+@@ -19,6 +19,9 @@
+ #include <linux/errno.h>
+ #include <linux/io.h>
+
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
+ #include "pci.h"
+ #include "msi.h"
+
+@@ -391,6 +394,20 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
+
+ void pci_restore_msi_state(struct pci_dev *dev)
+ {
++ if (xen_initial_domain()) {
++ struct physdev_restore_msi physdev;
++
++ if (!dev->msi_enabled && !dev->msix_enabled)
++ return;
++
++ pci_intx_for_msi(dev, 0);
++
++ physdev.bus = dev->bus->number;
++ physdev.devfn = dev->devfn;
++ HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &physdev);
++
++ return;
++ }
+ __pci_restore_msi_state(dev);
+ __pci_restore_msix_state(dev);
+ }
+diff --git a/drivers/pci/xen-iommu.c b/drivers/pci/xen-iommu.c
+new file mode 100644
+index 0000000..ac6bcdb
+--- /dev/null
++++ b/drivers/pci/xen-iommu.c
+@@ -0,0 +1,271 @@
++#include <linux/types.h>
++#include <linux/mm.h>
++#include <linux/string.h>
++#include <linux/pci.h>
++#include <linux/module.h>
++#include <linux/version.h>
++#include <linux/scatterlist.h>
++#include <linux/io.h>
++#include <linux/bug.h>
++
++#include <xen/interface/xen.h>
++#include <xen/grant_table.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++#include <asm/iommu.h>
++#include <asm/swiotlb.h>
++#include <asm/tlbflush.h>
++
++#define IOMMU_BUG_ON(test) \
++do { \
++ if (unlikely(test)) { \
++ printk(KERN_ALERT "Fatal DMA error! " \
++ "Please use 'swiotlb=force'\n"); \
++ BUG(); \
++ } \
++} while (0)
++
++/* Print address range with message */
++#define PAR(msg, addr, size) \
++do { \
++ printk(msg "[%#llx - %#llx]\n", \
++ (unsigned long long)addr, \
++ (unsigned long long)addr + size); \
++} while (0)
++
++static inline int address_needs_mapping(struct device *hwdev,
++ dma_addr_t addr)
++{
++ dma_addr_t mask = DMA_BIT_MASK(32);
++ int ret;
++
++ /* If the device has a mask, use it, otherwise default to 32 bits */
++ if (hwdev)
++ mask = *hwdev->dma_mask;
++
++ ret = (addr & ~mask) != 0;
++
++ if (ret) {
++ printk(KERN_ERR "dma address needs mapping\n");
++ printk(KERN_ERR "mask: %#llx\n address: [%#llx]\n", mask, addr);
++ }
++ return ret;
++}
++
++static int check_pages_physically_contiguous(unsigned long pfn,
++ unsigned int offset,
++ size_t length)
++{
++ unsigned long next_mfn;
++ int i;
++ int nr_pages;
++
++ next_mfn = pfn_to_mfn(pfn);
++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++
++ for (i = 1; i < nr_pages; i++) {
++ if (pfn_to_mfn(++pfn) != ++next_mfn)
++ return 0;
++ }
++ return 1;
++}
++
++static int range_straddles_page_boundary(phys_addr_t p, size_t size)
++{
++ unsigned long pfn = PFN_DOWN(p);
++ unsigned int offset = p & ~PAGE_MASK;
++
++ if (offset + size <= PAGE_SIZE)
++ return 0;
++ if (check_pages_physically_contiguous(pfn, offset, size))
++ return 0;
++ return 1;
++}
++
++static inline void xen_dma_unmap_page(struct page *page)
++{
++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_unmap_page here
++ * to deal with foreign pages. We'll need similar logic here at
++ * some point.
++ */
++}
++
++/* Gets dma address of a page */
++static inline dma_addr_t xen_dma_map_page(struct page *page)
++{
++ /* Xen TODO: 2.6.18 xen calls __gnttab_dma_map_page here to deal
++ * with foreign pages. We'll need similar logic here at some
++ * point.
++ */
++ return ((dma_addr_t)pfn_to_mfn(page_to_pfn(page))) << PAGE_SHIFT;
++}
++
++static int xen_map_sg(struct device *hwdev, struct scatterlist *sg,
++ int nents,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{
++ struct scatterlist *s;
++ struct page *page;
++ int i, rc;
++
++ BUG_ON(direction == DMA_NONE);
++ WARN_ON(nents == 0 || sg[0].length == 0);
++
++ for_each_sg(sg, s, nents, i) {
++ BUG_ON(!sg_page(s));
++ page = sg_page(s);
++ s->dma_address = xen_dma_map_page(page) + s->offset;
++ s->dma_length = s->length;
++ IOMMU_BUG_ON(range_straddles_page_boundary(
++ page_to_phys(page), s->length));
++ }
++
++ rc = nents;
++
++ flush_write_buffers();
++ return rc;
++}
++
++static void xen_unmap_sg(struct device *hwdev, struct scatterlist *sg,
++ int nents,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{
++ struct scatterlist *s;
++ struct page *page;
++ int i;
++
++ for_each_sg(sg, s, nents, i) {
++ page = pfn_to_page(mfn_to_pfn(PFN_DOWN(s->dma_address)));
++ xen_dma_unmap_page(page);
++ }
++}
++
++static void *xen_alloc_coherent(struct device *dev, size_t size,
++ dma_addr_t *dma_handle, gfp_t gfp)
++{
++ void *ret;
++ unsigned int order = get_order(size);
++ unsigned long vstart;
++ u64 mask;
++
++ /* ignore region specifiers */
++ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
++
++ if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
++ return ret;
++
++ if (dev == NULL || (dev->coherent_dma_mask < DMA_BIT_MASK(32)))
++ gfp |= GFP_DMA;
++
++ vstart = __get_free_pages(gfp, order);
++ ret = (void *)vstart;
++
++ if (dev != NULL && dev->coherent_dma_mask)
++ mask = dev->coherent_dma_mask;
++ else
++ mask = DMA_BIT_MASK(32);
++
++ if (ret != NULL) {
++ if (xen_create_contiguous_region(vstart, order,
++ fls64(mask)) != 0) {
++ free_pages(vstart, order);
++ return NULL;
++ }
++ memset(ret, 0, size);
++ *dma_handle = virt_to_machine(ret).maddr;
++ }
++ return ret;
++}
++
++static void xen_free_coherent(struct device *dev, size_t size,
++ void *vaddr, dma_addr_t dma_addr)
++{
++ int order = get_order(size);
++
++ if (dma_release_from_coherent(dev, order, vaddr))
++ return;
++
++ xen_destroy_contiguous_region((unsigned long)vaddr, order);
++ free_pages((unsigned long)vaddr, order);
++}
++
++static dma_addr_t xen_map_page(struct device *dev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{
++ dma_addr_t dma;
++
++ BUG_ON(direction == DMA_NONE);
++
++ WARN_ON(size == 0);
++
++ dma = xen_dma_map_page(page) + offset;
++
++ IOMMU_BUG_ON(address_needs_mapping(dev, dma));
++ flush_write_buffers();
++ return dma;
++}
++
++static void xen_unmap_page(struct device *dev, dma_addr_t dma_addr,
++ size_t size,
++ enum dma_data_direction direction,
++ struct dma_attrs *attrs)
++{
++ BUG_ON(direction == DMA_NONE);
++ xen_dma_unmap_page(pfn_to_page(mfn_to_pfn(PFN_DOWN(dma_addr))));
++}
++
++static struct dma_map_ops xen_dma_ops = {
++ .dma_supported = NULL,
++
++ .alloc_coherent = xen_alloc_coherent,
++ .free_coherent = xen_free_coherent,
++
++ .map_page = xen_map_page,
++ .unmap_page = xen_unmap_page,
++
++ .map_sg = xen_map_sg,
++ .unmap_sg = xen_unmap_sg,
++
++ .mapping_error = NULL,
++
++ .is_phys = 0,
++};
++
++static struct dma_map_ops xen_swiotlb_dma_ops = {
++ .dma_supported = swiotlb_dma_supported,
++
++ .alloc_coherent = xen_alloc_coherent,
++ .free_coherent = xen_free_coherent,
++
++ .map_page = swiotlb_map_page,
++ .unmap_page = swiotlb_unmap_page,
++
++ .map_sg = swiotlb_map_sg_attrs,
++ .unmap_sg = swiotlb_unmap_sg_attrs,
++
++ .mapping_error = swiotlb_dma_mapping_error,
++
++ .is_phys = 0,
++};
++
++void __init xen_iommu_init(void)
++{
++ if (!xen_pv_domain())
++ return;
++
++ printk(KERN_INFO "Xen: Initializing Xen DMA ops\n");
++
++ force_iommu = 0;
++ dma_ops = &xen_dma_ops;
++
++ if (swiotlb) {
++ printk(KERN_INFO "Xen: Enabling DMA fallback to swiotlb\n");
++ dma_ops = &xen_swiotlb_dma_ops;
++ }
++}
++
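/*
 * Self-contained illustration (userspace, not part of the patch) of the
 * range_straddles_page_boundary() check implemented above: a buffer is
 * only safe for a single DMA mapping if it fits inside one page or if the
 * machine frames backing its pfns happen to be contiguous.  The p2m table
 * is a made-up example.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

/* toy p2m: pfns 0 and 1 map to adjacent mfns, pfn 2 does not */
static const unsigned long p2m[] = { 100, 101, 500 };

static unsigned long pfn_to_mfn(unsigned long pfn)
{
	return p2m[pfn];
}

static int pages_physically_contiguous(unsigned long pfn,
				       unsigned int offset, size_t length)
{
	unsigned long next_mfn = pfn_to_mfn(pfn);
	int nr_pages = (offset + length + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int i;

	for (i = 1; i < nr_pages; i++)
		if (pfn_to_mfn(++pfn) != ++next_mfn)
			return 0;
	return 1;
}

static int range_straddles_page_boundary(uint64_t p, size_t size)
{
	unsigned long pfn = PFN_DOWN(p);
	unsigned int offset = p & ~PAGE_MASK;

	if (offset + size <= PAGE_SIZE)
		return 0;
	return !pages_physically_contiguous(pfn, offset, size);
}

int main(void)
{
	printf("%d\n", range_straddles_page_boundary(0x0800, 0x0400)); /* one page     -> 0 */
	printf("%d\n", range_straddles_page_boundary(0x0800, 0x1000)); /* mfns 100,101 -> 0 */
	printf("%d\n", range_straddles_page_boundary(0x1800, 0x1000)); /* mfns 101,500 -> 1 */
	return 0;
}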
+diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
+new file mode 100644
+index 0000000..76d0bdd
+--- /dev/null
++++ b/drivers/pci/xen-pcifront.c
+@@ -0,0 +1,1157 @@
++/*
++ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/mm.h>
++#include <xen/xenbus.h>
++#include <xen/events.h>
++#include <xen/grant_table.h>
++#include <xen/page.h>
++#include <linux/spinlock.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
++#include <xen/xenbus.h>
++#include <xen/interface/io/pciif.h>
++#include <asm/xen/pci.h>
++#include <linux/interrupt.h>
++#include <asm/atomic.h>
++#include <linux/workqueue.h>
++#include <linux/bitops.h>
++#include <linux/time.h>
++
++
++#ifndef __init_refok
++#define __init_refok
++#endif
++
++#define INVALID_GRANT_REF (0)
++#define INVALID_EVTCHN (-1)
++
++
++struct pci_bus_entry {
++ struct list_head list;
++ struct pci_bus *bus;
++};
++
++#define _PDEVB_op_active (0)
++#define PDEVB_op_active (1 << (_PDEVB_op_active))
++
++struct pcifront_device {
++ struct xenbus_device *xdev;
++ struct list_head root_buses;
++
++ int evtchn;
++ int gnt_ref;
++
++ int irq;
++
++ /* Lock this when doing any operations in sh_info */
++ spinlock_t sh_info_lock;
++ struct xen_pci_sharedinfo *sh_info;
++ struct work_struct op_work;
++ unsigned long flags;
++
++};
++
++struct pcifront_sd {
++ int domain;
++ struct pcifront_device *pdev;
++};
++
++static inline struct pcifront_device *
++pcifront_get_pdev(struct pcifront_sd *sd)
++{
++ return sd->pdev;
++}
++
++static inline void pcifront_init_sd(struct pcifront_sd *sd,
++ unsigned int domain, unsigned int bus,
++ struct pcifront_device *pdev)
++{
++ sd->domain = domain;
++ sd->pdev = pdev;
++}
++
++static inline void pcifront_setup_root_resources(struct pci_bus *bus,
++ struct pcifront_sd *sd)
++{
++}
++
++
++DEFINE_SPINLOCK(pcifront_dev_lock);
++static struct pcifront_device *pcifront_dev;
++
++static int verbose_request;
++module_param(verbose_request, int, 0644);
++
++static int errno_to_pcibios_err(int errno)
++{
++ switch (errno) {
++ case XEN_PCI_ERR_success:
++ return PCIBIOS_SUCCESSFUL;
++
++ case XEN_PCI_ERR_dev_not_found:
++ return PCIBIOS_DEVICE_NOT_FOUND;
++
++ case XEN_PCI_ERR_invalid_offset:
++ case XEN_PCI_ERR_op_failed:
++ return PCIBIOS_BAD_REGISTER_NUMBER;
++
++ case XEN_PCI_ERR_not_implemented:
++ return PCIBIOS_FUNC_NOT_SUPPORTED;
++
++ case XEN_PCI_ERR_access_denied:
++ return PCIBIOS_SET_FAILED;
++ }
++ return errno;
++}
++
++static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev)
++{
++ if (test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
++ && !test_and_set_bit(_PDEVB_op_active, &pdev->flags)) {
++ dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n");
++ schedule_work(&pdev->op_work);
++ }
++}
++
++static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
++{
++ int err = 0;
++ struct xen_pci_op *active_op = &pdev->sh_info->op;
++ unsigned long irq_flags;
++ evtchn_port_t port = pdev->evtchn;
++ unsigned irq = pdev->irq;
++ s64 ns, ns_timeout;
++ struct timeval tv;
++
++ spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
++
++ memcpy(active_op, op, sizeof(struct xen_pci_op));
++
++ /* Go */
++ wmb();
++ set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
++ notify_remote_via_evtchn(port);
++
++ /*
++ * We set a poll timeout of 3 seconds but give up on return after
++ * 2 seconds. It is better to time out too late rather than too early
++ * (in the latter case we end up continually re-executing poll() with a
++ * timeout in the past). 1s difference gives plenty of slack for error.
++ */
++ do_gettimeofday(&tv);
++ ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
++
++ xen_clear_irq_pending(irq);
++
++ while (test_bit(_XEN_PCIF_active,
++ (unsigned long *)&pdev->sh_info->flags)) {
++ xen_poll_irq_timeout(irq, jiffies + 3*HZ);
++ xen_clear_irq_pending(irq);
++ do_gettimeofday(&tv);
++ ns = timeval_to_ns(&tv);
++ if (ns > ns_timeout) {
++ dev_err(&pdev->xdev->dev,
++ "pciback not responding!!!\n");
++ clear_bit(_XEN_PCIF_active,
++ (unsigned long *)&pdev->sh_info->flags);
++ err = XEN_PCI_ERR_dev_not_found;
++ goto out;
++ }
++ }
++
++ /*
++ * We might lose backend service request since we
++ * reuse same evtchn with pci_conf backend response. So re-schedule
++ * aer pcifront service.
++ */
++ if (test_bit(_XEN_PCIB_active,
++ (unsigned long *)&pdev->sh_info->flags)) {
++ dev_err(&pdev->xdev->dev,
++ "schedule aer pcifront service\n");
++ schedule_pcifront_aer_op(pdev);
++ }
++
++ memcpy(op, active_op, sizeof(struct xen_pci_op));
++
++ err = op->err;
++out:
++ spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
++ return err;
++}
++
++/* Access to this function is spinlocked in drivers/pci/access.c */
++static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
++ int where, int size, u32 *val)
++{
++ int err = 0;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_conf_read,
++ .domain = pci_domain_nr(bus),
++ .bus = bus->number,
++ .devfn = devfn,
++ .offset = where,
++ .size = size,
++ };
++ struct pcifront_sd *sd = bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ if (verbose_request)
++ dev_info(&pdev->xdev->dev,
++ "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n",
++ pci_domain_nr(bus), bus->number, PCI_SLOT(devfn),
++ PCI_FUNC(devfn), where, size);
++
++ err = do_pci_op(pdev, &op);
++
++ if (likely(!err)) {
++ if (verbose_request)
++ dev_info(&pdev->xdev->dev, "read got back value %x\n",
++ op.value);
++
++ *val = op.value;
++ } else if (err == -ENODEV) {
++ /* No device here, pretend that it just returned 0 */
++ err = 0;
++ *val = 0;
++ }
++
++ return errno_to_pcibios_err(err);
++}
++
++/* Access to this function is spinlocked in drivers/pci/access.c */
++static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
++ int where, int size, u32 val)
++{
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_conf_write,
++ .domain = pci_domain_nr(bus),
++ .bus = bus->number,
++ .devfn = devfn,
++ .offset = where,
++ .size = size,
++ .value = val,
++ };
++ struct pcifront_sd *sd = bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ if (verbose_request)
++ dev_info(&pdev->xdev->dev,
++ "write dev=%04x:%02x:%02x.%01x - "
++ "offset %x size %d val %x\n",
++ pci_domain_nr(bus), bus->number,
++ PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
++
++ return errno_to_pcibios_err(do_pci_op(pdev, &op));
++}
++
++struct pci_ops pcifront_bus_ops = {
++ .read = pcifront_bus_read,
++ .write = pcifront_bus_write,
++};
++
++#ifdef CONFIG_PCI_MSI
++static int pci_frontend_enable_msix(struct pci_dev *dev,
++ int **vector, int nvec)
++{
++ int err;
++ int i;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_enable_msix,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ .value = nvec,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++ struct msi_desc *entry;
++
++ if (nvec > SH_INFO_MAX_VEC) {
++		dev_err(&dev->dev, "too many vectors for pci frontend: %x."
++ " Increase SH_INFO_MAX_VEC.\n", nvec);
++ return -EINVAL;
++ }
++
++ i = 0;
++ list_for_each_entry(entry, &dev->msi_list, list) {
++ op.msix_entries[i].entry = entry->msi_attrib.entry_nr;
++ /* Vector is useless at this point. */
++ op.msix_entries[i].vector = -1;
++ i++;
++ }
++
++ err = do_pci_op(pdev, &op);
++
++ if (likely(!err)) {
++ if (likely(!op.value)) {
++ /* we get the result */
++ for (i = 0; i < nvec; i++)
++ *(*vector+i) = op.msix_entries[i].vector;
++ return 0;
++ } else {
++ printk(KERN_DEBUG "enable msix get value %x\n",
++ op.value);
++ return op.value;
++ }
++ } else {
++ dev_err(&dev->dev, "enable msix get err %x\n", err);
++ return err;
++ }
++}
++
++static void pci_frontend_disable_msix(struct pci_dev *dev)
++{
++ int err;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_disable_msix,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ err = do_pci_op(pdev, &op);
++
++	/* What should we do on error? */
++ if (err)
++ dev_err(&dev->dev, "pci_disable_msix get err %x\n", err);
++}
++
++static int pci_frontend_enable_msi(struct pci_dev *dev, int **vector)
++{
++ int err;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_enable_msi,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ err = do_pci_op(pdev, &op);
++ if (likely(!err)) {
++ *(*vector) = op.value;
++ } else {
++ dev_err(&dev->dev, "pci frontend enable msi failed for dev "
++			"%x:%x\n", op.bus, op.devfn);
++ err = -EINVAL;
++ }
++ return err;
++}
++
++static void pci_frontend_disable_msi(struct pci_dev *dev)
++{
++ int err;
++ struct xen_pci_op op = {
++ .cmd = XEN_PCI_OP_disable_msi,
++ .domain = pci_domain_nr(dev->bus),
++ .bus = dev->bus->number,
++ .devfn = dev->devfn,
++ };
++ struct pcifront_sd *sd = dev->bus->sysdata;
++ struct pcifront_device *pdev = pcifront_get_pdev(sd);
++
++ err = do_pci_op(pdev, &op);
++ if (err == XEN_PCI_ERR_dev_not_found) {
++ /* XXX No response from backend, what shall we do? */
++		printk(KERN_DEBUG "got no response from backend for disable MSI\n");
++ return;
++ }
++ if (err)
++		/* how can pciback notify us of a failure? */
++		printk(KERN_DEBUG "got fake response from backend\n");
++}
++
++static struct xen_pci_frontend_ops pci_frontend_ops = {
++ .enable_msi = pci_frontend_enable_msi,
++ .disable_msi = pci_frontend_disable_msi,
++ .enable_msix = pci_frontend_enable_msix,
++ .disable_msix = pci_frontend_disable_msix,
++};
++
++static void pci_frontend_registrar(int enable)
++{
++ if (enable)
++ xen_pci_frontend = &pci_frontend_ops;
++ else
++ xen_pci_frontend = NULL;
++};
++#else
++static inline void pci_frontend_registrar(int enable) { };
++#endif /* CONFIG_PCI_MSI */
++
++/* Claim resources for the PCI frontend as-is, backend won't allow changes */
++static int pcifront_claim_resource(struct pci_dev *dev, void *data)
++{
++ struct pcifront_device *pdev = data;
++ int i;
++ struct resource *r;
++
++ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
++ r = &dev->resource[i];
++
++ if (!r->parent && r->start && r->flags) {
++ dev_info(&pdev->xdev->dev, "claiming resource %s/%d\n",
++ pci_name(dev), i);
++ if (pci_claim_resource(dev, i)) {
++ dev_err(&pdev->xdev->dev, "Could not claim "
++ "resource %s/%d! Device offline. Try "
++ "giving less than 4GB to domain.\n",
++ pci_name(dev), i);
++ }
++ }
++ }
++
++ return 0;
++}
++
++int __devinit pcifront_scan_bus(struct pcifront_device *pdev,
++ unsigned int domain, unsigned int bus,
++ struct pci_bus *b)
++{
++ struct pci_dev *d;
++ unsigned int devfn;
++ int err;
++
++ /* Scan the bus for functions and add.
++ * We omit handling of PCI bridge attachment because pciback prevents
++ * bridges from being exported.
++ */
++ for (devfn = 0; devfn < 0x100; devfn++) {
++ d = pci_get_slot(b, devfn);
++ if (d) {
++ /* Device is already known. */
++ pci_dev_put(d);
++ continue;
++ }
++
++ d = pci_scan_single_device(b, devfn);
++ if (d)
++ dev_info(&pdev->xdev->dev, "New device on "
++ "%04x:%02x:%02x.%02x found.\n", domain, bus,
++ PCI_SLOT(devfn), PCI_FUNC(devfn));
++ }
++
++ return 0;
++}
++
++int __devinit pcifront_scan_root(struct pcifront_device *pdev,
++ unsigned int domain, unsigned int bus)
++{
++ struct pci_bus *b;
++ struct pcifront_sd *sd = NULL;
++ struct pci_bus_entry *bus_entry = NULL;
++ int err = 0;
++
++#ifndef CONFIG_PCI_DOMAINS
++ if (domain != 0) {
++ dev_err(&pdev->xdev->dev,
++ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
++ dev_err(&pdev->xdev->dev,
++ "Please compile with CONFIG_PCI_DOMAINS\n");
++ err = -EINVAL;
++ goto err_out;
++ }
++#endif
++
++ dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
++ domain, bus);
++
++ bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
++ sd = kmalloc(sizeof(*sd), GFP_KERNEL);
++ if (!bus_entry || !sd) {
++ err = -ENOMEM;
++ goto err_out;
++ }
++ pcifront_init_sd(sd, domain, bus, pdev);
++
++ b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
++ &pcifront_bus_ops, sd);
++ if (!b) {
++ dev_err(&pdev->xdev->dev,
++ "Error creating PCI Frontend Bus!\n");
++ err = -ENOMEM;
++ goto err_out;
++ }
++
++ pcifront_setup_root_resources(b, sd);
++ bus_entry->bus = b;
++
++ list_add(&bus_entry->list, &pdev->root_buses);
++
++	/* pci_scan_bus_parented skips devices which do not have
++	 * devfn == 0. The pcifront_scan_bus enumerates all devfns. */
++ err = pcifront_scan_bus(pdev, domain, bus, b);
++
++ /* Claim resources before going "live" with our devices */
++ pci_walk_bus(b, pcifront_claim_resource, pdev);
++
++ /* Create SysFS and notify udev of the devices. Aka: "going live" */
++ pci_bus_add_devices(b);
++
++ return err;
++
++err_out:
++ kfree(bus_entry);
++ kfree(sd);
++
++ return err;
++}
++
++int __devinit pcifront_rescan_root(struct pcifront_device *pdev,
++ unsigned int domain, unsigned int bus)
++{
++ int err;
++ struct pci_bus *b;
++
++#ifndef CONFIG_PCI_DOMAINS
++ if (domain != 0) {
++ dev_err(&pdev->xdev->dev,
++ "PCI Root in non-zero PCI Domain! domain=%d\n", domain);
++ dev_err(&pdev->xdev->dev,
++ "Please compile with CONFIG_PCI_DOMAINS\n");
++ return -EINVAL;
++ }
++#endif
++
++ dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
++ domain, bus);
++
++ b = pci_find_bus(domain, bus);
++ if (!b)
++ /* If the bus is unknown, create it. */
++ return pcifront_scan_root(pdev, domain, bus);
++
++ err = pcifront_scan_bus(pdev, domain, bus, b);
++
++ /* Claim resources before going "live" with our devices */
++ pci_walk_bus(b, pcifront_claim_resource, pdev);
++
++ /* Create SysFS and notify udev of the devices. Aka: "going live" */
++ pci_bus_add_devices(b);
++
++ return err;
++}
++
++static void free_root_bus_devs(struct pci_bus *bus)
++{
++ struct pci_dev *dev;
++
++ while (!list_empty(&bus->devices)) {
++ dev = container_of(bus->devices.next, struct pci_dev,
++ bus_list);
++ dev_dbg(&dev->dev, "removing device\n");
++ pci_remove_bus_device(dev);
++ }
++}
++
++void pcifront_free_roots(struct pcifront_device *pdev)
++{
++ struct pci_bus_entry *bus_entry, *t;
++
++ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
++
++ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
++ list_del(&bus_entry->list);
++
++ free_root_bus_devs(bus_entry->bus);
++
++ kfree(bus_entry->bus->sysdata);
++
++ device_unregister(bus_entry->bus->bridge);
++ pci_remove_bus(bus_entry->bus);
++
++ kfree(bus_entry);
++ }
++}
++
++static pci_ers_result_t pcifront_common_process(int cmd,
++ struct pcifront_device *pdev,
++ pci_channel_state_t state)
++{
++ pci_ers_result_t result;
++ struct pci_driver *pdrv;
++ int bus = pdev->sh_info->aer_op.bus;
++ int devfn = pdev->sh_info->aer_op.devfn;
++ struct pci_dev *pcidev;
++ int flag = 0;
++
++ dev_dbg(&pdev->xdev->dev,
++ "pcifront AER process: cmd %x (bus:%x, devfn%x)",
++ cmd, bus, devfn);
++ result = PCI_ERS_RESULT_NONE;
++
++ pcidev = pci_get_bus_and_slot(bus, devfn);
++ if (!pcidev || !pcidev->driver) {
++		dev_err(&pdev->xdev->dev,
++ "device or driver is NULL\n");
++ return result;
++ }
++ pdrv = pcidev->driver;
++
++ if (get_driver(&pdrv->driver)) {
++ if (pdrv->err_handler && pdrv->err_handler->error_detected) {
++ dev_dbg(&pcidev->dev,
++ "trying to call AER service\n");
++ if (pcidev) {
++ flag = 1;
++ switch (cmd) {
++ case XEN_PCI_OP_aer_detected:
++ result = pdrv->err_handler->
++ error_detected(pcidev, state);
++ break;
++ case XEN_PCI_OP_aer_mmio:
++ result = pdrv->err_handler->
++ mmio_enabled(pcidev);
++ break;
++ case XEN_PCI_OP_aer_slotreset:
++ result = pdrv->err_handler->
++ slot_reset(pcidev);
++ break;
++ case XEN_PCI_OP_aer_resume:
++ pdrv->err_handler->resume(pcidev);
++ break;
++ default:
++ dev_err(&pdev->xdev->dev,
++ "bad request in aer recovery "
++ "operation!\n");
++
++ }
++ }
++ }
++ put_driver(&pdrv->driver);
++ }
++ if (!flag)
++ result = PCI_ERS_RESULT_NONE;
++
++ return result;
++}
++
++
++void pcifront_do_aer(struct work_struct *data)
++{
++ struct pcifront_device *pdev =
++ container_of(data, struct pcifront_device, op_work);
++ int cmd = pdev->sh_info->aer_op.cmd;
++ pci_channel_state_t state =
++ (pci_channel_state_t)pdev->sh_info->aer_op.err;
++
++	/* If a pci_conf op is in progress, we have to wait until it is
++	 * done before servicing the AER op. */
++ dev_dbg(&pdev->xdev->dev,
++ "pcifront service aer bus %x devfn %x\n",
++ pdev->sh_info->aer_op.bus, pdev->sh_info->aer_op.devfn);
++
++ pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state);
++
++ wmb();
++ clear_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags);
++ notify_remote_via_evtchn(pdev->evtchn);
++
++	/* in case an AER request was lost in the short window above */
++ smp_mb__before_clear_bit();
++ clear_bit(_PDEVB_op_active, &pdev->flags);
++ smp_mb__after_clear_bit();
++
++ schedule_pcifront_aer_op(pdev);
++
++}
++
++irqreturn_t pcifront_handler_aer(int irq, void *dev)
++{
++ struct pcifront_device *pdev = dev;
++ schedule_pcifront_aer_op(pdev);
++ return IRQ_HANDLED;
++}
++int pcifront_connect(struct pcifront_device *pdev)
++{
++ int err = 0;
++
++ spin_lock(&pcifront_dev_lock);
++
++ if (!pcifront_dev) {
++ dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
++ pcifront_dev = pdev;
++ } else {
++ dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
++ err = -EEXIST;
++ }
++
++ spin_unlock(&pcifront_dev_lock);
++
++ return err;
++}
++
++void pcifront_disconnect(struct pcifront_device *pdev)
++{
++ spin_lock(&pcifront_dev_lock);
++
++ if (pdev == pcifront_dev) {
++ dev_info(&pdev->xdev->dev,
++ "Disconnecting PCI Frontend Buses\n");
++ pcifront_dev = NULL;
++ }
++
++ spin_unlock(&pcifront_dev_lock);
++}
++static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
++{
++ struct pcifront_device *pdev;
++
++ pdev = kzalloc(sizeof(struct pcifront_device), GFP_KERNEL);
++ if (pdev == NULL)
++ goto out;
++
++ pdev->sh_info =
++ (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
++ if (pdev->sh_info == NULL) {
++ kfree(pdev);
++ pdev = NULL;
++ goto out;
++ }
++ pdev->sh_info->flags = 0;
++
++	/* Flag for registering the PV AER handler */
++ set_bit(_XEN_PCIB_AERHANDLER, (void *)&pdev->sh_info->flags);
++
++ dev_set_drvdata(&xdev->dev, pdev);
++ pdev->xdev = xdev;
++
++ INIT_LIST_HEAD(&pdev->root_buses);
++
++ spin_lock_init(&pdev->sh_info_lock);
++
++ pdev->evtchn = INVALID_EVTCHN;
++ pdev->gnt_ref = INVALID_GRANT_REF;
++ pdev->irq = -1;
++
++ INIT_WORK(&pdev->op_work, pcifront_do_aer);
++
++ dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
++ pdev, pdev->sh_info);
++out:
++ return pdev;
++}
++
++static void free_pdev(struct pcifront_device *pdev)
++{
++ dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
++
++ pcifront_free_roots(pdev);
++
++	/* For the PCIe AER error handling work */
++ flush_scheduled_work();
++ unbind_from_irqhandler(pdev->irq, pdev);
++
++ if (pdev->evtchn != INVALID_EVTCHN)
++ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
++
++ if (pdev->gnt_ref != INVALID_GRANT_REF)
++ gnttab_end_foreign_access(pdev->gnt_ref, 0 /* r/w page */,
++ (unsigned long)pdev->sh_info);
++
++ dev_set_drvdata(&pdev->xdev->dev, NULL);
++ kfree(pdev);
++}
++
++static int pcifront_publish_info(struct pcifront_device *pdev)
++{
++ int err = 0;
++ struct xenbus_transaction trans;
++
++ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
++ if (err < 0)
++ goto out;
++
++ pdev->gnt_ref = err;
++
++ err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
++ if (err)
++ goto out;
++
++ err = bind_evtchn_to_irqhandler(pdev->evtchn, pcifront_handler_aer,
++ 0, "pcifront", pdev);
++ if (err < 0) {
++ xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
++ xenbus_dev_fatal(pdev->xdev, err, "Failed to bind evtchn to "
++ "irqhandler.\n");
++ return err;
++ }
++ pdev->irq = err;
++
++do_publish:
++ err = xenbus_transaction_start(&trans);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error writing configuration for backend "
++ "(start transaction)");
++ goto out;
++ }
++
++ err = xenbus_printf(trans, pdev->xdev->nodename,
++ "pci-op-ref", "%u", pdev->gnt_ref);
++ if (!err)
++ err = xenbus_printf(trans, pdev->xdev->nodename,
++ "event-channel", "%u", pdev->evtchn);
++ if (!err)
++ err = xenbus_printf(trans, pdev->xdev->nodename,
++ "magic", XEN_PCI_MAGIC);
++
++ if (err) {
++ xenbus_transaction_end(trans, 1);
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error writing configuration for backend");
++ goto out;
++ } else {
++ err = xenbus_transaction_end(trans, 0);
++ if (err == -EAGAIN)
++ goto do_publish;
++ else if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error completing transaction "
++ "for backend");
++ goto out;
++ }
++ }
++
++ xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
++
++ dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
++
++out:
++ return err;
++}
++
++static int __devinit pcifront_try_connect(struct pcifront_device *pdev)
++{
++ int err = -EFAULT;
++ int i, num_roots, len;
++ char str[64];
++ unsigned int domain, bus;
++
++
++ /* Only connect once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitialised)
++ goto out;
++
++ err = pcifront_connect(pdev);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error connecting PCI Frontend");
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++ "root_num", "%d", &num_roots);
++ if (err == -ENOENT) {
++ xenbus_dev_error(pdev->xdev, err,
++ "No PCI Roots found, trying 0000:00");
++ err = pcifront_scan_root(pdev, 0, 0);
++ num_roots = 0;
++ } else if (err != 1) {
++ if (err == 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of PCI roots");
++ goto out;
++ }
++
++ for (i = 0; i < num_roots; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++ "%x:%x", &domain, &bus);
++ if (err != 2) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading PCI root %d", i);
++ goto out;
++ }
++
++ err = pcifront_scan_root(pdev, domain, bus);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error scanning PCI root %04x:%02x",
++ domain, bus);
++ goto out;
++ }
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++
++out:
++ return err;
++}
++
++static int pcifront_try_disconnect(struct pcifront_device *pdev)
++{
++ int err = 0;
++ enum xenbus_state prev_state;
++
++
++ prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
++
++ if (prev_state >= XenbusStateClosing)
++ goto out;
++
++ if (prev_state == XenbusStateConnected) {
++ pcifront_free_roots(pdev);
++ pcifront_disconnect(pdev);
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateClosed);
++
++out:
++
++ return err;
++}
++
++static int __devinit pcifront_attach_devices(struct pcifront_device *pdev)
++{
++ int err = -EFAULT;
++ int i, num_roots, len;
++ unsigned int domain, bus;
++ char str[64];
++
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateReconfiguring)
++ goto out;
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
++ "root_num", "%d", &num_roots);
++ if (err == -ENOENT) {
++ xenbus_dev_error(pdev->xdev, err,
++ "No PCI Roots found, trying 0000:00");
++ err = pcifront_rescan_root(pdev, 0, 0);
++ num_roots = 0;
++ } else if (err != 1) {
++ if (err == 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of PCI roots");
++ goto out;
++ }
++
++ for (i = 0; i < num_roots; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++ "%x:%x", &domain, &bus);
++ if (err != 2) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading PCI root %d", i);
++ goto out;
++ }
++
++ err = pcifront_rescan_root(pdev, domain, bus);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error scanning PCI root %04x:%02x",
++ domain, bus);
++ goto out;
++ }
++ }
++
++ xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++
++out:
++ return err;
++}
++
++static int pcifront_detach_devices(struct pcifront_device *pdev)
++{
++ int err = 0;
++ int i, num_devs;
++ unsigned int domain, bus, slot, func;
++ struct pci_bus *pci_bus;
++ struct pci_dev *pci_dev;
++ char str[64];
++
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateConnected)
++ goto out;
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of PCI devices");
++ goto out;
++ }
++
++ /* Find devices being detached and remove them. */
++ for (i = 0; i < num_devs; i++) {
++ int l, state;
++ l = snprintf(str, sizeof(str), "state-%d", i);
++ if (unlikely(l >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, "%d",
++ &state);
++ if (err != 1)
++ state = XenbusStateUnknown;
++
++ if (state != XenbusStateClosing)
++ continue;
++
++ /* Remove device. */
++ l = snprintf(str, sizeof(str), "vdev-%d", i);
++ if (unlikely(l >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
++ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
++ if (err != 4) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading PCI device %d", i);
++ goto out;
++ }
++
++ pci_bus = pci_find_bus(domain, bus);
++ if (!pci_bus) {
++ dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n",
++ domain, bus);
++ continue;
++ }
++ pci_dev = pci_get_slot(pci_bus, PCI_DEVFN(slot, func));
++ if (!pci_dev) {
++ dev_dbg(&pdev->xdev->dev,
++ "Cannot get PCI device %04x:%02x:%02x.%02x\n",
++ domain, bus, slot, func);
++ continue;
++ }
++ pci_remove_bus_device(pci_dev);
++ pci_dev_put(pci_dev);
++
++ dev_dbg(&pdev->xdev->dev,
++ "PCI device %04x:%02x:%02x.%02x removed.\n",
++ domain, bus, slot, func);
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
++
++out:
++ return err;
++}
++
++static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
++ enum xenbus_state be_state)
++{
++ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
++
++ switch (be_state) {
++ case XenbusStateUnknown:
++ case XenbusStateInitialising:
++ case XenbusStateInitWait:
++ case XenbusStateInitialised:
++ case XenbusStateClosed:
++ break;
++
++ case XenbusStateConnected:
++ pcifront_try_connect(pdev);
++ break;
++
++ case XenbusStateClosing:
++ dev_warn(&xdev->dev, "backend going away!\n");
++ pcifront_try_disconnect(pdev);
++ break;
++
++ case XenbusStateReconfiguring:
++ pcifront_detach_devices(pdev);
++ break;
++
++ case XenbusStateReconfigured:
++ pcifront_attach_devices(pdev);
++ break;
++ }
++}
++
++static int pcifront_xenbus_probe(struct xenbus_device *xdev,
++ const struct xenbus_device_id *id)
++{
++ int err = 0;
++ struct pcifront_device *pdev = alloc_pdev(xdev);
++
++ if (pdev == NULL) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(xdev, err,
++ "Error allocating pcifront_device struct");
++ goto out;
++ }
++
++ err = pcifront_publish_info(pdev);
++
++out:
++ return err;
++}
++
++static int pcifront_xenbus_remove(struct xenbus_device *xdev)
++{
++ struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
++
++ if (pdev)
++ free_pdev(pdev);
++
++ return 0;
++}
++
++static const struct xenbus_device_id xenpci_ids[] = {
++ {"pci"},
++ {""},
++};
++
++static struct xenbus_driver xenbus_pcifront_driver = {
++ .name = "pcifront",
++ .owner = THIS_MODULE,
++ .ids = xenpci_ids,
++ .probe = pcifront_xenbus_probe,
++ .remove = pcifront_xenbus_remove,
++ .otherend_changed = pcifront_backend_changed,
++};
++
++static int __init pcifront_init(void)
++{
++ if (!xen_domain())
++ return -ENODEV;
++
++ pci_frontend_registrar(1 /* enable */);
++
++ return xenbus_register_frontend(&xenbus_pcifront_driver);
++}
++
++static void __exit pcifront_cleanup(void)
++{
++ xenbus_unregister_driver(&xenbus_pcifront_driver);
++ pci_frontend_registrar(0 /* disable */);
++}
++module_init(pcifront_init);
++module_exit(pcifront_cleanup);
++
++MODULE_DESCRIPTION("Xen PCI passthrough frontend.");
++MODULE_LICENSE("GPL");
++MODULE_ALIAS("xen:pci");
+diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
+index 188e1ba..efac9e3 100644
+--- a/drivers/video/Kconfig
++++ b/drivers/video/Kconfig
+@@ -2063,6 +2063,7 @@ config XEN_FBDEV_FRONTEND
+ select FB_SYS_IMAGEBLIT
+ select FB_SYS_FOPS
+ select FB_DEFERRED_IO
++ select XEN_XENBUS_FRONTEND
+ default y
+ help
+ This driver implements the front-end of the Xen virtual
+diff --git a/drivers/video/broadsheetfb.c b/drivers/video/broadsheetfb.c
+index 509cb92..df9ccb9 100644
+--- a/drivers/video/broadsheetfb.c
++++ b/drivers/video/broadsheetfb.c
+@@ -470,7 +470,7 @@ static int __devinit broadsheetfb_probe(struct platform_device *dev)
+ par->read_reg = broadsheet_read_reg;
+ init_waitqueue_head(&par->waitq);
+
+- info->flags = FBINFO_FLAG_DEFAULT;
++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ info->fbdefio = &broadsheetfb_defio;
+ fb_deferred_io_init(info);
+diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
+index c27ab1e..94414fc 100644
+--- a/drivers/video/fb_defio.c
++++ b/drivers/video/fb_defio.c
+@@ -144,7 +144,9 @@ static const struct address_space_operations fb_deferred_io_aops = {
+ static int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
+ {
+ vma->vm_ops = &fb_deferred_io_vm_ops;
+- vma->vm_flags |= ( VM_IO | VM_RESERVED | VM_DONTEXPAND );
++ vma->vm_flags |= ( VM_RESERVED | VM_DONTEXPAND );
++ if (!(info->flags & FBINFO_VIRTFB))
++ vma->vm_flags |= VM_IO;
+ vma->vm_private_data = info;
+ return 0;
+ }
+diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
+index 99bbd28..057433a 100644
+--- a/drivers/video/fbmem.c
++++ b/drivers/video/fbmem.c
+@@ -1362,6 +1362,7 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
+ vma->vm_pgoff = off >> PAGE_SHIFT;
+ /* This is an IO map - tell maydump to skip this VMA */
+ vma->vm_flags |= VM_IO | VM_RESERVED;
++ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+ fb_pgprotect(file, vma, off);
+ if (io_remap_pfn_range(vma, vma->vm_start, off >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start, vma->vm_page_prot))
+diff --git a/drivers/video/hecubafb.c b/drivers/video/hecubafb.c
+index 0b4bffb..f9d77ad 100644
+--- a/drivers/video/hecubafb.c
++++ b/drivers/video/hecubafb.c
+@@ -253,7 +253,7 @@ static int __devinit hecubafb_probe(struct platform_device *dev)
+ par->send_command = apollo_send_command;
+ par->send_data = apollo_send_data;
+
+- info->flags = FBINFO_FLAG_DEFAULT;
++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ info->fbdefio = &hecubafb_defio;
+ fb_deferred_io_init(info);
+diff --git a/drivers/video/metronomefb.c b/drivers/video/metronomefb.c
+index df1f757..661bfd2 100644
+--- a/drivers/video/metronomefb.c
++++ b/drivers/video/metronomefb.c
+@@ -700,7 +700,7 @@ static int __devinit metronomefb_probe(struct platform_device *dev)
+ if (retval < 0)
+ goto err_free_irq;
+
+- info->flags = FBINFO_FLAG_DEFAULT;
++ info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ info->fbdefio = &metronomefb_defio;
+ fb_deferred_io_init(info);
+diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
+index 54cd916..dc72563 100644
+--- a/drivers/video/xen-fbfront.c
++++ b/drivers/video/xen-fbfront.c
+@@ -25,7 +25,10 @@
+ #include <linux/module.h>
+ #include <linux/vmalloc.h>
+ #include <linux/mm.h>
++
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+ #include <xen/interface/io/fbif.h>
+@@ -440,7 +443,7 @@ static int __devinit xenfb_probe(struct xenbus_device *dev,
+ fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
+ fb_info->fix.accel = FB_ACCEL_NONE;
+
+- fb_info->flags = FBINFO_FLAG_DEFAULT;
++ fb_info->flags = FBINFO_FLAG_DEFAULT | FBINFO_VIRTFB;
+
+ ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
+ if (ret < 0) {
+@@ -627,6 +630,8 @@ static void xenfb_backend_changed(struct xenbus_device *dev,
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitialised:
++ case XenbusStateReconfiguring:
++ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ case XenbusStateClosed:
+ break;
+@@ -680,7 +685,7 @@ static struct xenbus_driver xenfb_driver = {
+
+ static int __init xenfb_init(void)
+ {
+- if (!xen_domain())
++ if (!xen_domain() || xen_hvm_domain())
+ return -ENODEV;
+
+ /* Nothing to do if running in dom0. */
+diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
+index 3711b88..4fcb4c5 100644
+--- a/drivers/watchdog/Kconfig
++++ b/drivers/watchdog/Kconfig
+@@ -975,6 +975,16 @@ config WATCHDOG_RIO
+
+ # XTENSA Architecture
+
++# Xen Architecture
++
++config XEN_WDT
++ tristate "Xen Watchdog support"
++ depends on XEN
++ help
++ Say Y here to support the hypervisor watchdog capability provided
++ by Xen 4.0 and newer. The watchdog timeout period is normally one
++ minute but can be changed with a boot-time parameter.
++
+ #
+ # ISA-based Watchdog Cards
+ #
+diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
+index 699199b..2f6739a 100644
+--- a/drivers/watchdog/Makefile
++++ b/drivers/watchdog/Makefile
+@@ -141,6 +141,9 @@ obj-$(CONFIG_WATCHDOG_CP1XXX) += cpwd.o
+
+ # XTENSA Architecture
+
++# Xen
++obj-$(CONFIG_XEN_WDT) += xen_wdt.o
++
+ # Architecture Independant
+ obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o
+ obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o
+diff --git a/drivers/watchdog/xen_wdt.c b/drivers/watchdog/xen_wdt.c
+new file mode 100644
+index 0000000..bcfaafb
+--- /dev/null
++++ b/drivers/watchdog/xen_wdt.c
+@@ -0,0 +1,359 @@
++/*
++ * Xen Watchdog Driver
++ *
++ * (c) Copyright 2010 Novell, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ */
++
++#define DRV_NAME "wdt"
++#define DRV_VERSION "0.01"
++#define PFX DRV_NAME ": "
++
++#include <linux/bug.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/hrtimer.h>
++#include <linux/kernel.h>
++#include <linux/ktime.h>
++#include <linux/init.h>
++#include <linux/miscdevice.h>
++#include <linux/module.h>
++#include <linux/moduleparam.h>
++#include <linux/platform_device.h>
++#include <linux/spinlock.h>
++#include <linux/uaccess.h>
++#include <linux/watchdog.h>
++#include <xen/xen.h>
++#include <asm/xen/hypercall.h>
++#include <xen/interface/sched.h>
++
++static struct platform_device *platform_device;
++static DEFINE_SPINLOCK(wdt_lock);
++static struct sched_watchdog wdt;
++static __kernel_time_t wdt_expires;
++static bool is_active, expect_release;
++
++#define WATCHDOG_TIMEOUT 60 /* in seconds */
++static unsigned int timeout = WATCHDOG_TIMEOUT;
++module_param(timeout, uint, S_IRUGO);
++MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds "
++ "(default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
++
++static bool nowayout = WATCHDOG_NOWAYOUT;
++module_param(nowayout, bool, S_IRUGO);
++MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
++ "(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
++
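++/*
++ * Record the configured timeout in the hypercall argument and return the
++ * monotonic-clock second at which the watchdog is expected to fire.
++ */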
++static inline __kernel_time_t set_timeout(void)
++{
++ wdt.timeout = timeout;
++ return ktime_to_timespec(ktime_get()).tv_sec + timeout;
++}
++
++static int xen_wdt_start(void)
++{
++ __kernel_time_t expires;
++ int err;
++
++ spin_lock(&wdt_lock);
++
++ expires = set_timeout();
++ if (!wdt.id)
++ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
++ else
++ err = -EBUSY;
++ if (err > 0) {
++ wdt.id = err;
++ wdt_expires = expires;
++ err = 0;
++ } else
++ BUG_ON(!err);
++
++ spin_unlock(&wdt_lock);
++
++ return err;
++}
++
++static int xen_wdt_stop(void)
++{
++ int err = 0;
++
++ spin_lock(&wdt_lock);
++
++ wdt.timeout = 0;
++ if (wdt.id)
++ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
++ if (!err)
++ wdt.id = 0;
++
++ spin_unlock(&wdt_lock);
++
++ return err;
++}
++
++static int xen_wdt_kick(void)
++{
++ __kernel_time_t expires;
++ int err;
++
++ spin_lock(&wdt_lock);
++
++ expires = set_timeout();
++ if (wdt.id)
++ err = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wdt);
++ else
++ err = -ENXIO;
++ if (!err)
++ wdt_expires = expires;
++
++ spin_unlock(&wdt_lock);
++
++ return err;
++}
++
++static int xen_wdt_open(struct inode *inode, struct file *file)
++{
++ int err;
++
++ /* /dev/watchdog can only be opened once */
++ if (xchg(&is_active, true))
++ return -EBUSY;
++
++ err = xen_wdt_start();
++ if (err == -EBUSY)
++ err = xen_wdt_kick();
++ return err ?: nonseekable_open(inode, file);
++}
++
++static int xen_wdt_release(struct inode *inode, struct file *file)
++{
++ if (expect_release)
++ xen_wdt_stop();
++ else {
++ printk(KERN_CRIT PFX
++ "unexpected close, not stopping watchdog!\n");
++ xen_wdt_kick();
++ }
++ is_active = false;
++ expect_release = false;
++ return 0;
++}
++
++static ssize_t xen_wdt_write(struct file *file, const char __user *data,
++ size_t len, loff_t *ppos)
++{
++ /* See if we got the magic character 'V' and reload the timer */
++ if (len) {
++ if (!nowayout) {
++ size_t i;
++
++ /* in case it was set long ago */
++ expect_release = false;
++
++ /* scan to see whether or not we got the magic
++ character */
++ for (i = 0; i != len; i++) {
++ char c;
++ if (get_user(c, data + i))
++ return -EFAULT;
++ if (c == 'V')
++ expect_release = true;
++ }
++ }
++
++ /* someone wrote to us, we should reload the timer */
++ xen_wdt_kick();
++ }
++ return len;
++}
++
++static long xen_wdt_ioctl(struct file *file, unsigned int cmd,
++ unsigned long arg)
++{
++ int new_options, retval = -EINVAL;
++ int new_timeout;
++ int __user *argp = (void __user *)arg;
++ static const struct watchdog_info ident = {
++ .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE,
++ .firmware_version = 0,
++ .identity = DRV_NAME,
++ };
++
++ switch (cmd) {
++ case WDIOC_GETSUPPORT:
++ return copy_to_user(argp, &ident, sizeof(ident)) ? -EFAULT : 0;
++
++ case WDIOC_GETSTATUS:
++ case WDIOC_GETBOOTSTATUS:
++ return put_user(0, argp);
++
++ case WDIOC_SETOPTIONS:
++ if (get_user(new_options, argp))
++ return -EFAULT;
++
++ if (new_options & WDIOS_DISABLECARD)
++ retval = xen_wdt_stop();
++ if (new_options & WDIOS_ENABLECARD) {
++ retval = xen_wdt_start();
++ if (retval == -EBUSY)
++ retval = xen_wdt_kick();
++ }
++ return retval;
++
++ case WDIOC_KEEPALIVE:
++ xen_wdt_kick();
++ return 0;
++
++ case WDIOC_SETTIMEOUT:
++ if (get_user(new_timeout, argp))
++ return -EFAULT;
++ if (!new_timeout)
++ return -EINVAL;
++ timeout = new_timeout;
++ xen_wdt_kick();
++ /* fall through */
++ case WDIOC_GETTIMEOUT:
++ return put_user(timeout, argp);
++
++ case WDIOC_GETTIMELEFT:
++ retval = wdt_expires - ktime_to_timespec(ktime_get()).tv_sec;
++ return put_user(retval, argp);
++ }
++
++ return -ENOTTY;
++}
++
++static const struct file_operations xen_wdt_fops = {
++ .owner = THIS_MODULE,
++ .llseek = no_llseek,
++ .write = xen_wdt_write,
++ .unlocked_ioctl = xen_wdt_ioctl,
++ .open = xen_wdt_open,
++ .release = xen_wdt_release,
++};
++
++static struct miscdevice xen_wdt_miscdev = {
++ .minor = WATCHDOG_MINOR,
++ .name = "watchdog",
++ .fops = &xen_wdt_fops,
++};
++
++static int __devinit xen_wdt_probe(struct platform_device *dev)
++{
++ struct sched_watchdog wd = { .id = ~0 };
++ int ret = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);
++
++ switch (ret) {
++ case -EINVAL:
++ if (!timeout) {
++ timeout = WATCHDOG_TIMEOUT;
++ printk(KERN_INFO PFX
++ "timeout value invalid, using %d\n", timeout);
++ }
++
++ ret = misc_register(&xen_wdt_miscdev);
++ if (ret) {
++ printk(KERN_ERR PFX
++ "cannot register miscdev on minor=%d (%d)\n",
++ WATCHDOG_MINOR, ret);
++ break;
++ }
++
++ printk(KERN_INFO PFX
++ "initialized (timeout=%ds, nowayout=%d)\n",
++ timeout, nowayout);
++ break;
++
++ case -ENOSYS:
++ printk(KERN_INFO PFX "not supported\n");
++ ret = -ENODEV;
++ break;
++
++ default:
++ printk(KERN_INFO PFX "bogus return value %d\n", ret);
++ break;
++ }
++
++ return ret;
++}
++
++static int __devexit xen_wdt_remove(struct platform_device *dev)
++{
++ /* Stop the timer before we leave */
++ if (!nowayout)
++ xen_wdt_stop();
++
++ misc_deregister(&xen_wdt_miscdev);
++
++ return 0;
++}
++
++static void xen_wdt_shutdown(struct platform_device *dev)
++{
++ xen_wdt_stop();
++}
++
++static int xen_wdt_suspend(struct platform_device *dev, pm_message_t state)
++{
++ return xen_wdt_stop();
++}
++
++static int xen_wdt_resume(struct platform_device *dev)
++{
++ return xen_wdt_start();
++}
++
++static struct platform_driver xen_wdt_driver = {
++ .probe = xen_wdt_probe,
++ .remove = __devexit_p(xen_wdt_remove),
++ .shutdown = xen_wdt_shutdown,
++ .suspend = xen_wdt_suspend,
++ .resume = xen_wdt_resume,
++ .driver = {
++ .owner = THIS_MODULE,
++ .name = DRV_NAME,
++ },
++};
++
++static int __init xen_wdt_init_module(void)
++{
++ int err;
++
++ if (!xen_domain())
++ return -ENODEV;
++
++ printk(KERN_INFO PFX "Xen WatchDog Timer Driver v%s\n", DRV_VERSION);
++
++ err = platform_driver_register(&xen_wdt_driver);
++ if (err)
++ return err;
++
++ platform_device = platform_device_register_simple(DRV_NAME,
++ -1, NULL, 0);
++ if (IS_ERR(platform_device)) {
++ err = PTR_ERR(platform_device);
++ platform_driver_unregister(&xen_wdt_driver);
++ }
++
++ return err;
++}
++
++static void __exit xen_wdt_cleanup_module(void)
++{
++ platform_device_unregister(platform_device);
++ platform_driver_unregister(&xen_wdt_driver);
++ printk(KERN_INFO PFX "module unloaded\n");
++}
++
++module_init(xen_wdt_init_module);
++module_exit(xen_wdt_cleanup_module);
++
++MODULE_AUTHOR("Jen Beulich <jbeulich@novell.com>");
++MODULE_DESCRIPTION("Xen WatchDog Timer Driver");
++MODULE_VERSION(DRV_VERSION);
++MODULE_LICENSE("GPL");
++MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
+diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
+index cab100a..fa9982e 100644
+--- a/drivers/xen/Kconfig
++++ b/drivers/xen/Kconfig
+@@ -28,6 +28,110 @@ config XEN_DEV_EVTCHN
+ firing.
+ If in doubt, say yes.
+
++config XEN_BACKEND
++ bool "Backend driver support"
++ depends on XEN_DOM0
++ default y
++ help
++ Support for backend device drivers that provide I/O services
++ to other virtual machines.
++
++config XEN_NETDEV_BACKEND
++ tristate "Xen backend network device"
++ depends on XEN_BACKEND && NET
++ help
++ Implement the network backend driver, which passes packets
++ from the guest domain's frontend drivers to the network.
++
++config XEN_BLKDEV_BACKEND
++ tristate "Block-device backend driver"
++ depends on XEN_BACKEND && BLOCK
++ help
++ The block-device backend driver allows the kernel to export its
++ block devices to other guests via a high-performance shared-memory
++ interface.
++
++config XEN_BLKDEV_TAP
++ tristate "Block-device tap backend driver"
++ depends on XEN_BACKEND && BLOCK
++ help
++ The block tap driver is an alternative to the block back driver
++ and allows VM block requests to be redirected to userspace through
++ a device interface. The tap allows user-space development of
++ high-performance block backends, where disk images may be implemented
++ as files, in memory, or on other hosts across the network. This
++ driver can safely coexist with the existing blockback driver.
++
++config XEN_BLKBACK_PAGEMAP
++ tristate
++ depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP != n
++ default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP
++
++config XEN_PCIDEV_BACKEND
++ tristate "PCI-device backend driver"
++ depends on PCI && XEN_BACKEND
++ default XEN_BACKEND
++ help
++ The PCI device backend driver allows the kernel to export arbitrary
++ PCI devices to other guests. If you select this to be a module, you
++ will need to make sure no other driver has bound to the device(s)
++ you want to make visible to other guests.
++
++choice
++ prompt "PCI Backend Mode"
++ depends on XEN_PCIDEV_BACKEND
++ default XEN_PCIDEV_BACKEND_VPCI if !IA64
++ default XEN_PCIDEV_BACKEND_CONTROLLER if IA64
++
++config XEN_PCIDEV_BACKEND_VPCI
++ bool "Virtual PCI"
++ ---help---
++ This PCI Backend hides the true PCI topology and makes the frontend
++ think there is a single PCI bus with only the exported devices on it.
++ For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
++ second device at 02:1a.1 will be re-assigned to 00:01.1.
++
++config XEN_PCIDEV_BACKEND_PASS
++ bool "Passthrough"
++ ---help---
++ This PCI Backend provides a real view of the PCI topology to the
++ frontend (for example, a device at 06:01.b will still appear at
++ 06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
++ PCI devices to its driver domains. This may be required for drivers
++	  which depend on finding their hardware in certain bus/slot
++ locations.
++
++config XEN_PCIDEV_BACKEND_SLOT
++ bool "Slot"
++ ---help---
++ This PCI Backend hides the true PCI topology and makes the frontend
++ think there is a single PCI bus with only the exported devices on it.
++	  Unlike the virtual PCI backend, each exported function is presented
++	  as its own slot.
++ For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
++ second device at 02:1a.1 will be re-assigned to 00:01.0.
++
++config XEN_PCIDEV_BACKEND_CONTROLLER
++ bool "Controller"
++ depends on IA64
++ ---help---
++ This PCI backend virtualizes the PCI bus topology by providing a
++ virtual bus per PCI root device. Devices which are physically under
++ the same root bus will appear on the same virtual bus. For systems
++ with complex I/O addressing, this is the only backend which supports
++ extended I/O port spaces and MMIO translation offsets. This backend
++ also supports slot virtualization. For example, a device at
++ 0000:01:02.1 will be re-assigned to 0000:00:00.0. A second device
++ at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
++ re-assigned to 0000:00:01.0. A third device at 0000:16:05.0 (under
++ a different PCI root bus) will be re-assigned to 0000:01:00.0.
++
++endchoice
++
++config XEN_PCIDEV_BE_DEBUG
++ bool "PCI Backend Debugging"
++ depends on XEN_PCIDEV_BACKEND
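++	/*
++	 * Publish the grant reference and event channel to the backend via
++	 * xenstore; the transaction is retried if it ends with -EAGAIN.
++	 */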
++
+ config XENFS
+ tristate "Xen filesystem"
+ depends on XEN
+@@ -60,4 +164,37 @@ config XEN_SYS_HYPERVISOR
+ Create entries under /sys/hypervisor describing the Xen
+ hypervisor environment. When running native or in another
+ virtual environment, /sys/hypervisor will still be present,
+- but will have no xen contents.
+\ No newline at end of file
++ but will have no xen contents.
++
++config XEN_MCE
++ def_bool y
++ depends on XEN_DOM0 && X86_64 && X86_MCE_INTEL
++
++config XEN_XENBUS_FRONTEND
++ tristate
++
++config XEN_GNTDEV
++ tristate "userspace grant access device driver"
++ depends on XEN
++ select MMU_NOTIFIER
++ help
++	  Allows userspace processes to use grants.
++
++config XEN_S3
++ def_bool y
++ depends on XEN_DOM0 && ACPI
++
++config ACPI_PROCESSOR_XEN
++ tristate
++ depends on XEN_DOM0 && ACPI_PROCESSOR && CPU_FREQ
++ default y
++
++config XEN_PLATFORM_PCI
++ tristate "xen platform pci device driver"
++ depends on XEN_PVHVM
++ default m
++ help
++ Driver for the Xen PCI Platform device: it is responsible for
++ initializing xenbus and grant_table when running in a Xen HVM
++ domain. As a consequence this driver is required to run any Xen PV
++ frontend on Xen HVM.
+diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
+index 7c28434..ef1ea63 100644
+--- a/drivers/xen/Makefile
++++ b/drivers/xen/Makefile
+@@ -1,12 +1,27 @@
+-obj-y += grant-table.o features.o events.o manage.o
++obj-y += grant-table.o features.o events.o manage.o biomerge.o pcpu.o
+ obj-y += xenbus/
+
+ nostackp := $(call cc-option, -fno-stack-protector)
+ CFLAGS_features.o := $(nostackp)
+
+-obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
+-obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
+-obj-$(CONFIG_XEN_BALLOON) += balloon.o
+-obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
+-obj-$(CONFIG_XENFS) += xenfs/
+-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+\ No newline at end of file
++obj-$(CONFIG_PCI) += pci.o
++obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
++obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
++obj-$(CONFIG_XEN_BALLOON) += balloon.o
++obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o
++obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
++obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
++obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
++obj-$(CONFIG_XENFS) += xenfs/
++obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
++obj-$(CONFIG_XEN_MCE) += mce.o
++
++obj-$(CONFIG_XEN_S3) += acpi.o
++obj-$(CONFIG_ACPI_PROCESSOR_XEN) += acpi_processor.o
++obj-$(CONFIG_ACPI_HOTPLUG_MEMORY) += xen_acpi_memhotplug.o
++obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o
++
++xen-evtchn-y := evtchn.o
++xen-gntdev-y := gntdev.o
+diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c
+new file mode 100644
+index 0000000..e6d3d0e
+--- /dev/null
++++ b/drivers/xen/acpi.c
+@@ -0,0 +1,23 @@
++#include <xen/acpi.h>
++
++#include <xen/interface/platform.h>
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
++int acpi_notify_hypervisor_state(u8 sleep_state,
++ u32 pm1a_cnt, u32 pm1b_cnt)
++{
++ struct xen_platform_op op = {
++ .cmd = XENPF_enter_acpi_sleep,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u = {
++ .enter_acpi_sleep = {
++ .pm1a_cnt_val = (u16)pm1a_cnt,
++ .pm1b_cnt_val = (u16)pm1b_cnt,
++ .sleep_state = sleep_state,
++ },
++ },
++ };
++
++ return HYPERVISOR_dom0_op(&op);
++}
+diff --git a/drivers/xen/acpi_processor.c b/drivers/xen/acpi_processor.c
+new file mode 100644
+index 0000000..e83b615
+--- /dev/null
++++ b/drivers/xen/acpi_processor.c
+@@ -0,0 +1,417 @@
++/*
++ * acpi_processor.c - interface to notify Xen on acpi processor object
++ * info parsing
++ *
++ * Copyright (C) 2008, Intel corporation
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or (at
++ * your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write to the Free Software Foundation, Inc.,
++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/acpi.h>
++#include <linux/pm.h>
++#include <linux/cpu.h>
++
++#include <linux/cpufreq.h>
++#include <acpi/processor.h>
++#include <xen/acpi.h>
++#include <xen/pcpu.h>
++
++#include <asm/xen/hypercall.h>
++#include <asm/xen/hypervisor.h>
++
++static int xen_hotplug_notifier(struct acpi_processor *pr, int event);
++
++static struct processor_cntl_xen_ops xen_ops = {
++ .hotplug = xen_hotplug_notifier,
++};
++
++static struct acpi_power_register *power_registers[XEN_MAX_ACPI_ID + 1];
++
++int processor_cntl_xen_power_cache(int cpu, int cx,
++ struct acpi_power_register *reg)
++{
++ struct acpi_power_register *buf;
++
++ if (cpu < 0 || cpu > XEN_MAX_ACPI_ID ||
++ cx < 1 || cx > ACPI_PROCESSOR_MAX_POWER) {
++ return -EINVAL;
++ }
++
++ if (power_registers[cpu] == NULL) {
++ buf = kzalloc(ACPI_PROCESSOR_MAX_POWER *
++ sizeof(struct xen_processor_cx), GFP_KERNEL);
++ if (buf == NULL)
++ return -ENOMEM;
++
++ power_registers[cpu] = buf;
++ }
++
++ memcpy(power_registers[cpu]+cx-1, reg, sizeof(*reg));
++
++ return 0;
++}
++EXPORT_SYMBOL(processor_cntl_xen_power_cache);
++
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
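++/*
++ * Evaluate the processor's _MAT object and return the enabled local
++ * APIC ID, or a negative value on failure.
++ */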
++static int xen_get_apic_id(acpi_handle handle)
++{
++ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
++ union acpi_object *obj;
++ struct acpi_madt_local_apic *lapic;
++ u8 physid;
++
++ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
++ return -EINVAL;
++
++ if (!buffer.length || !buffer.pointer)
++ return -EINVAL;
++
++ obj = buffer.pointer;
++ if (obj->type != ACPI_TYPE_BUFFER ||
++ obj->buffer.length < sizeof(*lapic)) {
++ kfree(buffer.pointer);
++ return -EINVAL;
++ }
++
++ lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
++
++ if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
++ !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
++ kfree(buffer.pointer);
++ return -EINVAL;
++ }
++
++ physid = lapic->id;
++ kfree(buffer.pointer);
++ buffer.length = ACPI_ALLOCATE_BUFFER;
++ buffer.pointer = NULL;
++
++ return physid;
++}
++#else
++static int xen_get_apic_id(acpi_handle handle)
++{
++ return -1;
++}
++#endif
++
++int processor_cntl_xen_notify(struct acpi_processor *pr, int event, int type)
++{
++ int ret = -EINVAL;
++
++ switch (event) {
++ case PROCESSOR_PM_INIT:
++ case PROCESSOR_PM_CHANGE:
++ if ((type >= PM_TYPE_MAX) ||
++ !xen_ops.pm_ops[type])
++ break;
++
++ ret = xen_ops.pm_ops[type](pr, event);
++ break;
++ case PROCESSOR_HOTPLUG:
++ {
++ int apic_id;
++
++ apic_id = xen_get_apic_id(pr->handle);
++ if (apic_id < 0)
++ break;
++ if (xen_ops.hotplug)
++ ret = xen_ops.hotplug(pr, type);
++ xen_pcpu_hotplug(type, apic_id);
++ break;
++ }
++ default:
++		printk(KERN_ERR "Unsupported processor event %d.\n", event);
++ break;
++ }
++
++ return ret;
++}
++EXPORT_SYMBOL(processor_cntl_xen_notify);
++
++static inline void xen_convert_pct_reg(struct xen_pct_register *xpct,
++ struct acpi_pct_register *apct)
++{
++ xpct->descriptor = apct->descriptor;
++ xpct->length = apct->length;
++ xpct->space_id = apct->space_id;
++ xpct->bit_width = apct->bit_width;
++ xpct->bit_offset = apct->bit_offset;
++ xpct->reserved = apct->reserved;
++ xpct->address = apct->address;
++}
++
++static inline void xen_convert_pss_states(struct xen_processor_px *xpss,
++ struct acpi_processor_px *apss, int state_count)
++{
++ int i;
++ for (i = 0; i < state_count; i++) {
++ xpss->core_frequency = apss->core_frequency;
++ xpss->power = apss->power;
++ xpss->transition_latency = apss->transition_latency;
++ xpss->bus_master_latency = apss->bus_master_latency;
++ xpss->control = apss->control;
++ xpss->status = apss->status;
++ xpss++;
++ apss++;
++ }
++}
++
++static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd,
++ struct acpi_psd_package *apsd)
++{
++ xpsd->num_entries = apsd->num_entries;
++ xpsd->revision = apsd->revision;
++ xpsd->domain = apsd->domain;
++ xpsd->coord_type = apsd->coord_type;
++ xpsd->num_processors = apsd->num_processors;
++}
++
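++/*
++ * Convert the cached ACPI C-state information for this processor into
++ * Xen's representation and pass it down via XENPF_set_processor_pminfo.
++ */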
++static int xen_cx_notifier(struct acpi_processor *pr, int action)
++{
++ int ret, count = 0, i;
++ xen_platform_op_t op = {
++ .cmd = XENPF_set_processor_pminfo,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.set_pminfo.id = pr->acpi_id,
++ .u.set_pminfo.type = XEN_PM_CX,
++ };
++ struct xen_processor_cx *data, *buf;
++ struct acpi_processor_cx *cx;
++ struct acpi_power_register *reg;
++
++ if (action == PROCESSOR_PM_CHANGE)
++ return -EINVAL;
++
++ if (power_registers[pr->acpi_id] == NULL) {
++ printk(KERN_WARNING "No C state info for acpi processor %d\n",
++ pr->acpi_id);
++ return -EINVAL;
++ }
++
++ /* Convert to Xen defined structure and hypercall */
++ buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
++ GFP_KERNEL);
++ if (!buf)
++ return -ENOMEM;
++
++ data = buf;
++ for (i = 1; i <= pr->power.count; i++) {
++ cx = &pr->power.states[i];
++ reg = power_registers[pr->acpi_id]+i-1;
++ /* Skip invalid cstate entry */
++ if (!cx->valid)
++ continue;
++
++ data->type = cx->type;
++ data->latency = cx->latency;
++ data->power = cx->power;
++ data->reg.space_id = reg->space_id;
++ data->reg.bit_width = reg->bit_width;
++ data->reg.bit_offset = reg->bit_offset;
++ data->reg.access_size = reg->access_size;
++ data->reg.address = reg->address;
++
++ /* Get dependency relationships, _CSD is not supported yet */
++ data->dpcnt = 0;
++ set_xen_guest_handle(data->dp, NULL);
++
++ data++;
++ count++;
++ }
++
++ if (!count) {
++ printk(KERN_ERR "No available Cx info for cpu %d\n",
++ pr->acpi_id);
++ kfree(buf);
++ return -EINVAL;
++ }
++
++ op.u.set_pminfo.power.count = count;
++ op.u.set_pminfo.power.flags.bm_control = pr->flags.bm_control;
++ op.u.set_pminfo.power.flags.bm_check = pr->flags.bm_check;
++ op.u.set_pminfo.power.flags.has_cst = pr->flags.has_cst;
++ op.u.set_pminfo.power.flags.power_setup_done =
++ pr->flags.power_setup_done;
++
++ set_xen_guest_handle(op.u.set_pminfo.power.states, buf);
++ ret = HYPERVISOR_dom0_op(&op);
++ kfree(buf);
++ return ret;
++}
++
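++/*
++ * Forward the ACPI P-state data (_PPC/_PCT/_PSS/_PSD) for this processor
++ * to the hypervisor via XENPF_set_processor_pminfo.
++ */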
++static int xen_px_notifier(struct acpi_processor *pr, int action)
++{
++ int ret = -EINVAL;
++ xen_platform_op_t op = {
++ .cmd = XENPF_set_processor_pminfo,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.set_pminfo.id = pr->acpi_id,
++ .u.set_pminfo.type = XEN_PM_PX,
++ };
++ struct xen_processor_performance *perf;
++ struct xen_processor_px *states = NULL;
++ struct acpi_processor_performance *px;
++ struct acpi_psd_package *pdomain;
++
++ if (!pr)
++ return -EINVAL;
++
++ perf = &op.u.set_pminfo.perf;
++ px = pr->performance;
++
++ switch (action) {
++ case PROCESSOR_PM_CHANGE:
++ /* ppc dynamic handle */
++ perf->flags = XEN_PX_PPC;
++ perf->platform_limit = pr->performance_platform_limit;
++
++ ret = HYPERVISOR_dom0_op(&op);
++ break;
++
++ case PROCESSOR_PM_INIT:
++ /* px normal init */
++ perf->flags = XEN_PX_PPC |
++ XEN_PX_PCT |
++ XEN_PX_PSS |
++ XEN_PX_PSD;
++
++ /* ppc */
++ perf->platform_limit = pr->performance_platform_limit;
++
++ /* pct */
++ xen_convert_pct_reg(&perf->control_register,
++ &px->control_register);
++ xen_convert_pct_reg(&perf->status_register,
++ &px->status_register);
++
++ /* pss */
++ perf->state_count = px->state_count;
++ states = kzalloc(px->state_count*sizeof(xen_processor_px_t),
++ GFP_KERNEL);
++ if (!states)
++ return -ENOMEM;
++ xen_convert_pss_states(states, px->states, px->state_count);
++ set_xen_guest_handle(perf->states, states);
++
++ /* psd */
++ pdomain = &px->domain_info;
++ xen_convert_psd_pack(&perf->domain_info, pdomain);
++ if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
++ perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
++ perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
++ else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
++ perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
++ else {
++ ret = -ENODEV;
++ kfree(states);
++ break;
++ }
++
++ ret = HYPERVISOR_dom0_op(&op);
++ kfree(states);
++ break;
++
++ default:
++ break;
++ }
++
++ return ret;
++}
++
++static int xen_tx_notifier(struct acpi_processor *pr, int action)
++{
++ return -EINVAL;
++}
++
++#ifdef CONFIG_ACPI_HOTPLUG_CPU
++static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
++{
++ int ret = -EINVAL;
++	int apic_id;
++ unsigned long long pxm;
++ acpi_status status = 0;
++
++ xen_platform_op_t op = {
++ .interface_version = XENPF_INTERFACE_VERSION,
++ };
++
++ apic_id = xen_get_apic_id(pr->handle);
++ if (apic_id < 0) {
++ printk(KERN_WARNING "Can't get apic_id for acpi_id %x\n",
++ pr->acpi_id);
++ return -1;
++ }
++
++ status = acpi_evaluate_integer(pr->handle, "_PXM",
++ NULL, &pxm);
++ if (ACPI_FAILURE(status)) {
++ printk(KERN_WARNING "can't get pxm for acpi_id %x\n",
++ pr->acpi_id);
++ return -1;
++ }
++
++ switch (event) {
++ case HOTPLUG_TYPE_ADD:
++ op.cmd = XENPF_cpu_hotadd;
++ op.u.cpu_add.apic_id = apic_id;
++ op.u.cpu_add.acpi_id = pr->acpi_id;
++ op.u.cpu_add.pxm = pxm;
++ ret = HYPERVISOR_dom0_op(&op);
++ break;
++ case HOTPLUG_TYPE_REMOVE:
++		printk(KERN_WARNING "Xen does not support CPU hot-remove\n");
++ ret = -ENOSYS;
++ break;
++ }
++
++ return ret;
++}
++#else
++static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
++{
++ return -ENOSYS;
++}
++#endif
++
++static int __init xen_acpi_processor_extcntl_init(void)
++{
++ unsigned int pmbits;
++
++ /* Only xen dom0 is allowed to handle ACPI processor info */
++ if (!xen_initial_domain())
++ return 0;
++
++ pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
++
++ if (pmbits & XEN_PROCESSOR_PM_CX)
++ xen_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
++ if (pmbits & XEN_PROCESSOR_PM_PX)
++ xen_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
++ if (pmbits & XEN_PROCESSOR_PM_TX)
++ xen_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
++
++ return 0;
++}
++
++subsys_initcall(xen_acpi_processor_extcntl_init);
++MODULE_LICENSE("GPL");
+diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
+index 4204336..ce198b4 100644
+--- a/drivers/xen/balloon.c
++++ b/drivers/xen/balloon.c
+@@ -43,22 +43,26 @@
+ #include <linux/mutex.h>
+ #include <linux/list.h>
+ #include <linux/sysdev.h>
++#include <linux/swap.h>
+
+ #include <asm/page.h>
+ #include <asm/pgalloc.h>
+ #include <asm/pgtable.h>
+ #include <asm/uaccess.h>
+ #include <asm/tlb.h>
++#include <asm/e820.h>
+
+ #include <asm/xen/hypervisor.h>
+ #include <asm/xen/hypercall.h>
++
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/memory.h>
+ #include <xen/xenbus.h>
+ #include <xen/features.h>
+ #include <xen/page.h>
+
+-#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
++#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT+balloon_order-10))
+
+ #define BALLOON_CLASS_NAME "xen_memory"
+
+@@ -82,14 +86,15 @@ static struct sys_device balloon_sysdev;
+
+ static int register_balloon(struct sys_device *sysdev);
+
++static struct balloon_stats balloon_stats;
++
+ /*
+- * Protects atomic reservation decrease/increase against concurrent increases.
+- * Also protects non-atomic updates of current_pages and driver_pages, and
+- * balloon lists.
++ * Work in pages of this order. Can be either 0 for normal pages
++ * or 9 for hugepages.
+ */
+-static DEFINE_SPINLOCK(balloon_lock);
+-
+-static struct balloon_stats balloon_stats;
++static int balloon_order;
++static unsigned long balloon_npages;
++static unsigned long discontig_frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
+ /* We increase/decrease in batches which fit in a page */
+ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+@@ -118,12 +123,43 @@ static struct timer_list balloon_timer;
+ static void scrub_page(struct page *page)
+ {
+ #ifdef CONFIG_XEN_SCRUB_PAGES
+- clear_highpage(page);
++ int i;
++
++ for (i = 0; i < balloon_npages; i++)
++ clear_highpage(page++);
+ #endif
+ }
+
++static void free_discontig_frame(void)
++{
++ int rc;
++ struct xen_memory_reservation reservation = {
++ .address_bits = 0,
++ .domid = DOMID_SELF,
++ .nr_extents = balloon_npages,
++ .extent_order = 0
++ };
++
++ set_xen_guest_handle(reservation.extent_start, discontig_frame_list);
++ rc = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++ BUG_ON(rc != balloon_npages);
++}
++
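++/*
++ * Compact frame_list in place, skipping entries that were zeroed after a
++ * discontiguous free, and return the resulting number of entries.
++ */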
++static unsigned long shrink_frame(unsigned long nr_pages)
++{
++ unsigned long i, j;
++
++ for (i = 0, j = 0; i < nr_pages; i++, j++) {
++ if (frame_list[i] == 0)
++ j++;
++ if (i != j)
++ frame_list[i] = frame_list[j];
++ }
++ return i;
++}
++
+ /* balloon_append: add the given page to the balloon. */
+-static void balloon_append(struct page *page)
++static void __balloon_append(struct page *page)
+ {
+ /* Lowmem is re-populated first, so highmem pages go at list tail. */
+ if (PageHighMem(page)) {
+@@ -134,7 +170,11 @@ static void balloon_append(struct page *page)
+ list_add(&page->lru, &ballooned_pages);
+ balloon_stats.balloon_low++;
+ }
++}
+
++static void balloon_append(struct page *page)
++{
++ __balloon_append(page);
+ totalram_pages--;
+ }
+
+@@ -195,20 +235,17 @@ static unsigned long current_target(void)
+
+ static int increase_reservation(unsigned long nr_pages)
+ {
+- unsigned long pfn, i, flags;
++ unsigned long pfn, mfn, i, j;
+ struct page *page;
+ long rc;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+- .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ if (nr_pages > ARRAY_SIZE(frame_list))
+ nr_pages = ARRAY_SIZE(frame_list);
+
+- spin_lock_irqsave(&balloon_lock, flags);
+-
+ page = balloon_first_page();
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(page == NULL);
+@@ -218,6 +255,8 @@ static int increase_reservation(unsigned long nr_pages)
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
++ reservation.extent_order = balloon_order;
++
+ rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+ if (rc < 0)
+ goto out;
+@@ -227,19 +266,22 @@ static int increase_reservation(unsigned long nr_pages)
+ BUG_ON(page == NULL);
+
+ pfn = page_to_pfn(page);
++ mfn = frame_list[i];
+ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+ phys_to_machine_mapping_valid(pfn));
+
+- set_phys_to_machine(pfn, frame_list[i]);
+-
+- /* Link back into the page tables if not highmem. */
+- if (pfn < max_low_pfn) {
+- int ret;
+- ret = HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- mfn_pte(frame_list[i], PAGE_KERNEL),
+- 0);
+- BUG_ON(ret);
++ for (j = 0; j < balloon_npages; j++, pfn++, mfn++) {
++ set_phys_to_machine(pfn, mfn);
++
++ /* Link back into the page tables if not highmem. */
++ if (pfn < max_low_pfn) {
++ int ret;
++ ret = HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(pfn << PAGE_SHIFT),
++ mfn_pte(mfn, PAGE_KERNEL),
++ 0);
++ BUG_ON(ret);
++ }
+ }
+
+ /* Relinquish the page back to the allocator. */
+@@ -251,20 +293,18 @@ static int increase_reservation(unsigned long nr_pages)
+ balloon_stats.current_pages += rc;
+
+ out:
+- spin_unlock_irqrestore(&balloon_lock, flags);
+-
+ return rc < 0 ? rc : rc != nr_pages;
+ }
+
+ static int decrease_reservation(unsigned long nr_pages)
+ {
+- unsigned long pfn, i, flags;
+- struct page *page;
++ unsigned long pfn, lpfn, mfn, i, j;
++ struct page *page = NULL;
+ int need_sleep = 0;
+- int ret;
++	int discontig = 0, discontig_free;
++ int ret;
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+- .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+@@ -272,7 +312,7 @@ static int decrease_reservation(unsigned long nr_pages)
+ nr_pages = ARRAY_SIZE(frame_list);
+
+ for (i = 0; i < nr_pages; i++) {
+- if ((page = alloc_page(GFP_BALLOON)) == NULL) {
++ if ((page = alloc_pages(GFP_BALLOON, balloon_order)) == NULL) {
+ nr_pages = i;
+ need_sleep = 1;
+ break;
+@@ -282,38 +322,49 @@ static int decrease_reservation(unsigned long nr_pages)
+ frame_list[i] = pfn_to_mfn(pfn);
+
+ scrub_page(page);
+-
+- if (!PageHighMem(page)) {
+- ret = HYPERVISOR_update_va_mapping(
+- (unsigned long)__va(pfn << PAGE_SHIFT),
+- __pte_ma(0), 0);
+- BUG_ON(ret);
+- }
+-
+ }
+
+ /* Ensure that ballooned highmem pages don't have kmaps. */
+ kmap_flush_unused();
+ flush_tlb_all();
+
+- spin_lock_irqsave(&balloon_lock, flags);
+-
+ /* No more mappings: invalidate P2M and add to balloon. */
+ for (i = 0; i < nr_pages; i++) {
+- pfn = mfn_to_pfn(frame_list[i]);
+- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++ mfn = frame_list[i];
++ lpfn = pfn = mfn_to_pfn(mfn);
+ balloon_append(pfn_to_page(pfn));
++ discontig_free = 0;
++ for (j = 0; j < balloon_npages; j++, lpfn++, mfn++) {
++ if ((discontig_frame_list[j] = pfn_to_mfn(lpfn)) != mfn)
++ discontig_free = 1;
++
++ set_phys_to_machine(lpfn, INVALID_P2M_ENTRY);
++ page = pfn_to_page(lpfn);
++
++ if (!PageHighMem(page)) {
++ ret = HYPERVISOR_update_va_mapping(
++ (unsigned long)__va(lpfn << PAGE_SHIFT),
++ __pte_ma(0), 0);
++ BUG_ON(ret);
++ }
++ }
++ if (discontig_free) {
++ free_discontig_frame();
++ frame_list[i] = 0;
++ discontig = 1;
++ }
+ }
++ balloon_stats.current_pages -= nr_pages;
++
++ if (discontig)
++ nr_pages = shrink_frame(nr_pages);
+
+ set_xen_guest_handle(reservation.extent_start, frame_list);
+ reservation.nr_extents = nr_pages;
++ reservation.extent_order = balloon_order;
+ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+ BUG_ON(ret != nr_pages);
+
+- balloon_stats.current_pages -= nr_pages;
+-
+- spin_unlock_irqrestore(&balloon_lock, flags);
+-
+ return need_sleep;
+ }
+
+@@ -379,7 +430,7 @@ static void watch_target(struct xenbus_watch *watch,
+ /* The given memory/target value is in KiB, so it needs converting to
+ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+ */
+- balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
++ balloon_set_new_target(new_target >> ((PAGE_SHIFT - 10) + balloon_order));
+ }
+
+ static int balloon_init_watcher(struct notifier_block *notifier,
+@@ -399,15 +450,18 @@ static struct notifier_block xenstore_notifier;
+
+ static int __init balloon_init(void)
+ {
+- unsigned long pfn;
++ unsigned long pfn, extra_pfn_end;
+ struct page *page;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+- pr_info("xen_balloon: Initialising balloon driver.\n");
++ pr_info("xen_balloon: Initialising balloon driver with page order %d.\n",
++ balloon_order);
++
++ balloon_npages = 1 << balloon_order;
+
+- balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
++ balloon_stats.current_pages = (min(xen_start_info->nr_pages, max_pfn)) >> balloon_order;
+ balloon_stats.target_pages = balloon_stats.current_pages;
+ balloon_stats.balloon_low = 0;
+ balloon_stats.balloon_high = 0;
+@@ -420,10 +474,15 @@ static int __init balloon_init(void)
+ register_balloon(&balloon_sysdev);
+
+ /* Initialise the balloon with excess memory space. */
+- for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
++ extra_pfn_end = min(e820_end_of_ram_pfn(),
++ (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size));
++ for (pfn = PFN_UP(xen_extra_mem_start);
++ pfn < extra_pfn_end;
++ pfn += balloon_npages) {
+ page = pfn_to_page(pfn);
+- if (!PageReserved(page))
+- balloon_append(page);
++ /* totalram_pages doesn't include the boot-time
++ balloon extension, so don't subtract from it. */
++ __balloon_append(page);
+ }
+
+ target_watch.callback = watch_target;
+@@ -444,6 +503,121 @@ static void balloon_exit(void)
+
+ module_exit(balloon_exit);
+
++static int __init balloon_parse_huge(char *s)
++{
++ balloon_order = 9;
++ return 1;
++}
++
++__setup("balloon_hugepages", balloon_parse_huge);
++
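++/*
++ * Clear the kernel mapping and P2M entry for one page and hand its
++ * machine frame back to the hypervisor.
++ */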
++static int dealloc_pte_fn(pte_t *pte, struct page *pmd_page,
++ unsigned long addr, void *data)
++{
++ unsigned long mfn = pte_mfn(*pte);
++ int ret;
++ struct xen_memory_reservation reservation = {
++ .nr_extents = 1,
++ .extent_order = 0,
++ .domid = DOMID_SELF
++ };
++
++ set_xen_guest_handle(reservation.extent_start, &mfn);
++ set_pte_at(&init_mm, addr, pte, __pte_ma(0));
++ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY);
++
++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
++ BUG_ON(ret != 1);
++
++ return 0;
++}
++
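++/*
++ * Allocate nr_pages pages, return their machine frames to the hypervisor
++ * and hand back an array of the now-empty pseudo-physical pages for use
++ * by backend drivers.
++ */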
++struct page **alloc_empty_pages_and_pagevec(int nr_pages)
++{
++ struct page *page, **pagevec;
++ int npages;
++ int i, j, ret;
++
++ /* Round up to next number of balloon_order pages */
++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
++
++ pagevec = kmalloc(sizeof(page) * nr_pages << balloon_order, GFP_KERNEL);
++ if (pagevec == NULL)
++ return NULL;
++
++ for (i = 0; i < nr_pages; i++) {
++ void *v;
++
++ page = alloc_pages(GFP_KERNEL|__GFP_COLD, balloon_order);
++ if (page == NULL)
++ goto err;
++
++ scrub_page(page);
++
++ mutex_lock(&balloon_mutex);
++
++ v = page_address(page);
++
++ ret = apply_to_page_range(&init_mm, (unsigned long)v,
++ PAGE_SIZE << balloon_order,
++ dealloc_pte_fn, NULL);
++
++ if (ret != 0) {
++ mutex_unlock(&balloon_mutex);
++ //balloon_free_page(page); /* tries to use free_cold_page */
++ __free_page(page);
++ goto err;
++ }
++ for (j = 0; j < balloon_npages; j++)
++ pagevec[(i<<balloon_order)+j] = page++;
++
++ totalram_pages = balloon_stats.current_pages -= balloon_npages;
++
++ mutex_unlock(&balloon_mutex);
++ }
++
++ out:
++ schedule_work(&balloon_worker);
++ flush_tlb_all();
++ return pagevec;
++
++ err:
++ mutex_lock(&balloon_mutex);
++ while (--i >= 0)
++ balloon_append(pagevec[i << balloon_order]);
++ mutex_unlock(&balloon_mutex);
++ kfree(pagevec);
++ pagevec = NULL;
++ goto out;
++}
++EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
++
++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
++{
++ struct page *page;
++ int i;
++ int npages;
++
++ if (pagevec == NULL)
++ return;
++
++ /* Round up to next number of balloon_order pages */
++ npages = (nr_pages + (balloon_npages-1)) >> balloon_order;
++
++ mutex_lock(&balloon_mutex);
++ for (i = 0; i < nr_pages; i++) {
++ page = pagevec[i << balloon_order];
++ BUG_ON(page_count(page) != 1);
++ balloon_append(page);
++ }
++ mutex_unlock(&balloon_mutex);
++
++ kfree(pagevec);
++
++ schedule_work(&balloon_worker);
++}
++EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
++
+ #define BALLOON_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct sys_device *dev, \
+ struct sysdev_attribute *attr, \
+@@ -477,7 +651,7 @@ static ssize_t store_target_kb(struct sys_device *dev,
+
+ target_bytes = simple_strtoull(buf, &endchar, 0) * 1024;
+
+- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
+
+ return count;
+ }
+@@ -491,7 +665,7 @@ static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr
+ {
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)balloon_stats.target_pages
+- << PAGE_SHIFT);
++ << (PAGE_SHIFT + balloon_order));
+ }
+
+ static ssize_t store_target(struct sys_device *dev,
+@@ -507,7 +681,7 @@ static ssize_t store_target(struct sys_device *dev,
+
+ target_bytes = memparse(buf, &endchar);
+
+- balloon_set_new_target(target_bytes >> PAGE_SHIFT);
++ balloon_set_new_target(target_bytes >> (PAGE_SHIFT + balloon_order));
+
+ return count;
+ }
+diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
+new file mode 100644
+index 0000000..d40f534
+--- /dev/null
++++ b/drivers/xen/biomerge.c
+@@ -0,0 +1,14 @@
++#include <linux/bio.h>
++#include <asm/io.h>
++#include <xen/page.h>
++
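++/*
++ * Under Xen, pseudo-physically contiguous pages are not necessarily
++ * machine contiguous, so only allow bio_vecs to merge when the backing
++ * MFNs are identical or adjacent.
++ */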
++bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
++ const struct bio_vec *vec2)
++{
++ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
++ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
++
++ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
++ ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
++}
++
+diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile
+new file mode 100644
+index 0000000..dee55ba
+--- /dev/null
++++ b/drivers/xen/blkback/Makefile
+@@ -0,0 +1,4 @@
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
++obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
++
++xen-blkback-y := blkback.o xenbus.o interface.o vbd.o
+diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c
+new file mode 100644
+index 0000000..45f6eb2
+--- /dev/null
++++ b/drivers/xen/blkback/blkback-pagemap.c
+@@ -0,0 +1,109 @@
++#include <linux/module.h>
++#include "blkback-pagemap.h"
++
++static int blkback_pagemap_size;
++static struct blkback_pagemap *blkback_pagemap;
++
++static inline int
++blkback_pagemap_entry_clear(struct blkback_pagemap *map)
++{
++ static struct blkback_pagemap zero;
++ return !memcmp(map, &zero, sizeof(zero));
++}
++
++int
++blkback_pagemap_init(int pages)
++{
++ blkback_pagemap = kzalloc(pages * sizeof(struct blkback_pagemap),
++ GFP_KERNEL);
++ if (!blkback_pagemap)
++ return -ENOMEM;
++
++ blkback_pagemap_size = pages;
++ return 0;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_init);
++
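++/*
++ * Record the (domid, busid, gref) triple for a granted page; the pagemap
++ * index is stored in page->private so it can be recovered later by
++ * blkback_pagemap_clear()/blkback_pagemap_read().
++ */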
++void
++blkback_pagemap_set(int idx, struct page *page,
++ domid_t domid, busid_t busid, grant_ref_t gref)
++{
++ struct blkback_pagemap *entry;
++
++ BUG_ON(!blkback_pagemap);
++ BUG_ON(idx >= blkback_pagemap_size);
++
++ set_page_private(page, idx);
++
++ entry = blkback_pagemap + idx;
++ if (!blkback_pagemap_entry_clear(entry)) {
++ printk("overwriting pagemap %d: d %u b %u g %u\n",
++ idx, entry->domid, entry->busid, entry->gref);
++ BUG();
++ }
++
++ entry->page = page;
++ entry->domid = domid;
++ entry->busid = busid;
++ entry->gref = gref;
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_set);
++
++void
++blkback_pagemap_clear(struct page *page)
++{
++ int idx;
++ struct blkback_pagemap *entry;
++
++ idx = (int)page_private(page);
++
++ BUG_ON(!blkback_pagemap);
++ BUG_ON(idx >= blkback_pagemap_size);
++
++ entry = blkback_pagemap + idx;
++ if (blkback_pagemap_entry_clear(entry)) {
++ printk("clearing empty pagemap %d\n", idx);
++ BUG();
++ }
++
++ memset(entry, 0, sizeof(*entry));
++}
++EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
++
++struct blkback_pagemap
++blkback_pagemap_read(struct page *page)
++{
++ int idx;
++ struct blkback_pagemap *entry;
++
++ idx = (int)page_private(page);
++
++ BUG_ON(!blkback_pagemap);
++ BUG_ON(idx >= blkback_pagemap_size);
++
++ entry = blkback_pagemap + idx;
++ if (blkback_pagemap_entry_clear(entry)) {
++ printk("reading empty pagemap %d\n", idx);
++ BUG();
++ }
++
++ return *entry;
++}
++EXPORT_SYMBOL(blkback_pagemap_read);
++
++MODULE_LICENSE("Dual BSD/GPL");
++
++int
++blkback_pagemap_contains_page(struct page *page)
++{
++ struct blkback_pagemap *entry;
++ int idx = (int)page_private(page);
++
++ if (idx < 0 || idx >= blkback_pagemap_size)
++ return 0;
++
++ entry = blkback_pagemap + idx;
++
++ return (entry->page == page);
++}
++EXPORT_SYMBOL(blkback_pagemap_contains_page);
+diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h
+new file mode 100644
+index 0000000..7f97d15
+--- /dev/null
++++ b/drivers/xen/blkback/blkback-pagemap.h
+@@ -0,0 +1,36 @@
++#ifndef _BLKBACK_PAGEMAP_H_
++#define _BLKBACK_PAGEMAP_H_
++
++#include <linux/mm.h>
++#include <xen/interface/xen.h>
++#include <xen/interface/grant_table.h>
++
++typedef unsigned int busid_t;
++
++struct blkback_pagemap {
++ struct page *page;
++ domid_t domid;
++ busid_t busid;
++ grant_ref_t gref;
++};
++
++#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
++
++int blkback_pagemap_init(int);
++void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
++void blkback_pagemap_clear(struct page *);
++struct blkback_pagemap blkback_pagemap_read(struct page *);
++int blkback_pagemap_contains_page(struct page *page);
++
++#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++static inline int blkback_pagemap_init(int pages) { return 0; }
++static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
++ busid_t bus, grant_ref_t gnt) {}
++static inline void blkback_pagemap_clear(struct page *page) {}
++#define blkback_pagemap_read(_page) ({ BUG(); (struct blkback_pagemap){0}; })
++static inline int blkback_pagemap_contains_page(struct page *page) { return 0; }
++
++#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
++
++#endif
+diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c
+new file mode 100644
+index 0000000..0bef445
+--- /dev/null
++++ b/drivers/xen/blkback/blkback.c
+@@ -0,0 +1,675 @@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/main.c
++ *
++ * Back-end of the driver for virtual block devices. This portion of the
++ * driver exports a 'unified' block-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A
++ * reference front-end implementation can be found in:
++ * arch/xen/drivers/blkif/frontend
++ *
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Copyright (c) 2005, Christopher Clark
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/kthread.h>
++#include <linux/list.h>
++#include <linux/delay.h>
++#include <linux/freezer.h>
++
++#include <xen/balloon.h>
++#include <xen/events.h>
++#include <xen/page.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include "common.h"
++
++/*
++ * These are rather arbitrary. They are fairly large because adjacent requests
++ * pulled from a communication ring are quite likely to end up being part of
++ * the same scatter/gather request at the disc.
++ *
++ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
++ *
++ * This will increase the chances of being able to write whole tracks.
++ * 64 should be enough to keep us competitive with Linux.
++ */
++static int blkif_reqs = 64;
++module_param_named(reqs, blkif_reqs, int, 0);
++MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
++
++/* Run-time switchable: /sys/module/blkback/parameters/ */
++static unsigned int log_stats = 0;
++static unsigned int debug_lvl = 0;
++module_param(log_stats, int, 0644);
++module_param(debug_lvl, int, 0644);
++
++/*
++ * Each outstanding request that we've passed to the lower device layers has a
++ * 'pending_req' allocated to it. Each buffer_head that completes decrements
++ * the pendcnt towards zero. When it hits zero, the specified domain has a
++ * response queued for it, with the saved 'id' passed back.
++ */
++typedef struct {
++ blkif_t *blkif;
++ u64 id;
++ int nr_pages;
++ atomic_t pendcnt;
++ unsigned short operation;
++ int status;
++ struct list_head free_list;
++} pending_req_t;
++
++static pending_req_t *pending_reqs;
++static struct list_head pending_free;
++static DEFINE_SPINLOCK(pending_free_lock);
++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
++
++#define BLKBACK_INVALID_HANDLE (~0)
++
++static struct page **pending_pages;
++static grant_handle_t *pending_grant_handles;
++
++static inline int vaddr_pagenr(pending_req_t *req, int seg)
++{
++ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++}
++
++#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
++
++static inline unsigned long vaddr(pending_req_t *req, int seg)
++{
++ unsigned long pfn = page_to_pfn(pending_page(req, seg));
++ return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++#define pending_handle(_req, _seg) \
++ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
++
++
++static int do_block_io_op(blkif_t *blkif);
++static void dispatch_rw_block_io(blkif_t *blkif,
++ struct blkif_request *req,
++ pending_req_t *pending_req);
++static void make_response(blkif_t *blkif, u64 id,
++ unsigned short op, int st);
++
++/******************************************************************
++ * misc small helpers
++ */
++static pending_req_t* alloc_req(void)
++{
++ pending_req_t *req = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pending_free_lock, flags);
++ if (!list_empty(&pending_free)) {
++ req = list_entry(pending_free.next, pending_req_t, free_list);
++ list_del(&req->free_list);
++ }
++ spin_unlock_irqrestore(&pending_free_lock, flags);
++ return req;
++}
++
++static void free_req(pending_req_t *req)
++{
++ unsigned long flags;
++ int was_empty;
++
++ spin_lock_irqsave(&pending_free_lock, flags);
++ was_empty = list_empty(&pending_free);
++ list_add(&req->free_list, &pending_free);
++ spin_unlock_irqrestore(&pending_free_lock, flags);
++ if (was_empty)
++ wake_up(&pending_free_wq);
++}
++
++static void unplug_queue(blkif_t *blkif)
++{
++ if (blkif->plug == NULL)
++ return;
++ if (blkif->plug->unplug_fn)
++ blkif->plug->unplug_fn(blkif->plug);
++ blk_put_queue(blkif->plug);
++ blkif->plug = NULL;
++}
++
++static void plug_queue(blkif_t *blkif, struct block_device *bdev)
++{
++ struct request_queue *q = bdev_get_queue(bdev);
++
++ if (q == blkif->plug)
++ return;
++ unplug_queue(blkif);
++ blk_get_queue(q);
++ blkif->plug = q;
++}
++
++static void fast_flush_area(pending_req_t *req)
++{
++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ unsigned int i, invcount = 0;
++ grant_handle_t handle;
++ int ret;
++
++ for (i = 0; i < req->nr_pages; i++) {
++ handle = pending_handle(req, i);
++ if (handle == BLKBACK_INVALID_HANDLE)
++ continue;
++ blkback_pagemap_clear(pending_page(req, i));
++ gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
++ GNTMAP_host_map, handle);
++ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
++ invcount++;
++ }
++
++ ret = HYPERVISOR_grant_table_op(
++ GNTTABOP_unmap_grant_ref, unmap, invcount);
++ BUG_ON(ret);
++}
++
++/******************************************************************
++ * SCHEDULER FUNCTIONS
++ */
++
++static void print_stats(blkif_t *blkif)
++{
++ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
++ current->comm, blkif->st_oo_req,
++ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
++ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
++ blkif->st_rd_req = 0;
++ blkif->st_wr_req = 0;
++ blkif->st_oo_req = 0;
++}
++
++int blkif_schedule(void *arg)
++{
++ blkif_t *blkif = arg;
++ struct vbd *vbd = &blkif->vbd;
++
++ blkif_get(blkif);
++
++ if (debug_lvl)
++ printk(KERN_DEBUG "%s: started\n", current->comm);
++
++ while (!kthread_should_stop()) {
++ if (try_to_freeze())
++ continue;
++ if (unlikely(vbd->size != vbd_size(vbd)))
++ vbd_resize(blkif);
++
++ wait_event_interruptible(
++ blkif->wq,
++ blkif->waiting_reqs || kthread_should_stop());
++ wait_event_interruptible(
++ pending_free_wq,
++ !list_empty(&pending_free) || kthread_should_stop());
++
++ blkif->waiting_reqs = 0;
++ smp_mb(); /* clear flag *before* checking for work */
++
++ if (do_block_io_op(blkif))
++ blkif->waiting_reqs = 1;
++ unplug_queue(blkif);
++
++ if (log_stats && time_after(jiffies, blkif->st_print))
++ print_stats(blkif);
++ }
++
++ if (log_stats)
++ print_stats(blkif);
++ if (debug_lvl)
++ printk(KERN_DEBUG "%s: exiting\n", current->comm);
++
++ blkif->xenblkd = NULL;
++ blkif_put(blkif);
++
++ return 0;
++}
++
++/******************************************************************
++ * COMPLETION CALLBACK -- Called as bh->b_end_io()
++ */
++
++static void __end_block_io_op(pending_req_t *pending_req, int error)
++{
++ /* An error fails the entire request. */
++ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
++ (error == -EOPNOTSUPP)) {
++ DPRINTK("blkback: write barrier op failed, not supported\n");
++ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
++ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
++ } else if (error) {
++ DPRINTK("Buffer not up-to-date at end of operation, "
++ "error=%d\n", error);
++ pending_req->status = BLKIF_RSP_ERROR;
++ }
++
++ if (atomic_dec_and_test(&pending_req->pendcnt)) {
++ fast_flush_area(pending_req);
++ make_response(pending_req->blkif, pending_req->id,
++ pending_req->operation, pending_req->status);
++ blkif_put(pending_req->blkif);
++ free_req(pending_req);
++ }
++}
++
++static void end_block_io_op(struct bio *bio, int error)
++{
++ __end_block_io_op(bio->bi_private, error);
++ bio_put(bio);
++}
++
++
++/******************************************************************************
++ * NOTIFICATION FROM GUEST OS.
++ */
++
++static void blkif_notify_work(blkif_t *blkif)
++{
++ blkif->waiting_reqs = 1;
++ wake_up(&blkif->wq);
++}
++
++irqreturn_t blkif_be_int(int irq, void *dev_id)
++{
++ blkif_notify_work(dev_id);
++ return IRQ_HANDLED;
++}
++
++
++
++/******************************************************************
++ * DOWNWARD CALLS -- These interface with the block-device layer proper.
++ */
++
++static int do_block_io_op(blkif_t *blkif)
++{
++ union blkif_back_rings *blk_rings = &blkif->blk_rings;
++ struct blkif_request req;
++ pending_req_t *pending_req;
++ RING_IDX rc, rp;
++ int more_to_do = 0;
++
++ rc = blk_rings->common.req_cons;
++ rp = blk_rings->common.sring->req_prod;
++ rmb(); /* Ensure we see queued requests up to 'rp'. */
++
++ while (rc != rp) {
++
++ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
++ break;
++
++ if (kthread_should_stop()) {
++ more_to_do = 1;
++ break;
++ }
++
++ pending_req = alloc_req();
++ if (NULL == pending_req) {
++ blkif->st_oo_req++;
++ more_to_do = 1;
++ break;
++ }
++
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
++ break;
++ case BLKIF_PROTOCOL_X86_32:
++ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
++ break;
++ case BLKIF_PROTOCOL_X86_64:
++ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
++ break;
++ default:
++ BUG();
++ }
++ blk_rings->common.req_cons = ++rc; /* before make_response() */
++
++ /* Apply all sanity checks to /private copy/ of request. */
++ barrier();
++
++ switch (req.operation) {
++ case BLKIF_OP_READ:
++ blkif->st_rd_req++;
++ dispatch_rw_block_io(blkif, &req, pending_req);
++ break;
++ case BLKIF_OP_WRITE_BARRIER:
++ blkif->st_br_req++;
++ /* fall through */
++ case BLKIF_OP_WRITE:
++ blkif->st_wr_req++;
++ dispatch_rw_block_io(blkif, &req, pending_req);
++ break;
++ default:
++ /* A good sign something is wrong: sleep for a while to
++ * avoid excessive CPU consumption by a bad guest. */
++ msleep(1);
++ DPRINTK("error: unknown block io operation [%d]\n",
++ req.operation);
++ make_response(blkif, req.id, req.operation,
++ BLKIF_RSP_ERROR);
++ free_req(pending_req);
++ break;
++ }
++
++ /* Yield point for this unbounded loop. */
++ cond_resched();
++ }
++
++ return more_to_do;
++}
++
++static void dispatch_rw_block_io(blkif_t *blkif,
++ struct blkif_request *req,
++ pending_req_t *pending_req)
++{
++ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ struct phys_req preq;
++ struct {
++ unsigned long buf; unsigned int nsec;
++ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ unsigned int nseg;
++ struct bio *bio = NULL;
++ int ret, i;
++ int operation;
++
++ switch (req->operation) {
++ case BLKIF_OP_READ:
++ operation = READ;
++ break;
++ case BLKIF_OP_WRITE:
++ operation = WRITE;
++ break;
++ case BLKIF_OP_WRITE_BARRIER:
++ operation = WRITE_BARRIER;
++ break;
++ default:
++ operation = 0; /* make gcc happy */
++ BUG();
++ }
++
++ /* Check that number of segments is sane. */
++ nseg = req->nr_segments;
++ if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
++ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
++ DPRINTK("Bad number of segments in request (%d)\n", nseg);
++ goto fail_response;
++ }
++
++ preq.dev = req->handle;
++ preq.sector_number = req->sector_number;
++ preq.nr_sects = 0;
++
++ pending_req->blkif = blkif;
++ pending_req->id = req->id;
++ pending_req->operation = req->operation;
++ pending_req->status = BLKIF_RSP_OKAY;
++ pending_req->nr_pages = nseg;
++
++ for (i = 0; i < nseg; i++) {
++ uint32_t flags;
++
++ seg[i].nsec = req->seg[i].last_sect -
++ req->seg[i].first_sect + 1;
++
++ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
++ (req->seg[i].last_sect < req->seg[i].first_sect))
++ goto fail_response;
++ preq.nr_sects += seg[i].nsec;
++
++ flags = GNTMAP_host_map;
++ if (operation != READ)
++ flags |= GNTMAP_readonly;
++ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
++ req->seg[i].gref, blkif->domid);
++ }
++
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
++ BUG_ON(ret);
++
++ for (i = 0; i < nseg; i++) {
++ if (unlikely(map[i].status != 0)) {
++ DPRINTK("invalid buffer -- could not remap it\n");
++ map[i].handle = BLKBACK_INVALID_HANDLE;
++ ret |= 1;
++ continue;
++ }
++
++ set_phys_to_machine(
++ page_to_pfn(pending_page(pending_req, i)),
++ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
++ seg[i].buf = map[i].dev_bus_addr |
++ (req->seg[i].first_sect << 9);
++ blkback_pagemap_set(vaddr_pagenr(pending_req, i),
++ pending_page(pending_req, i),
++ blkif->domid, req->handle,
++ req->seg[i].gref);
++ pending_handle(pending_req, i) = map[i].handle;
++ }
++
++ if (ret)
++ goto fail_flush;
++
++ if (vbd_translate(&preq, blkif, operation) != 0) {
++ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
++ operation == READ ? "read" : "write",
++ preq.sector_number,
++ preq.sector_number + preq.nr_sects, preq.dev);
++ goto fail_flush;
++ }
++
++ plug_queue(blkif, preq.bdev);
++ atomic_set(&pending_req->pendcnt, 1);
++ blkif_get(blkif);
++
++ for (i = 0; i < nseg; i++) {
++ if (((int)preq.sector_number|(int)seg[i].nsec) &
++ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
++ DPRINTK("Misaligned I/O request from domain %d",
++ blkif->domid);
++ goto fail_put_bio;
++ }
++
++ while ((bio == NULL) ||
++ (bio_add_page(bio,
++ pending_page(pending_req, i),
++ seg[i].nsec << 9,
++ seg[i].buf & ~PAGE_MASK) == 0)) {
++ if (bio) {
++ atomic_inc(&pending_req->pendcnt);
++ submit_bio(operation, bio);
++ }
++
++ bio = bio_alloc(GFP_KERNEL, nseg-i);
++ if (unlikely(bio == NULL))
++ goto fail_put_bio;
++
++ bio->bi_bdev = preq.bdev;
++ bio->bi_private = pending_req;
++ bio->bi_end_io = end_block_io_op;
++ bio->bi_sector = preq.sector_number;
++ }
++
++ preq.sector_number += seg[i].nsec;
++ }
++
++ if (!bio) {
++ BUG_ON(operation != WRITE_BARRIER);
++ bio = bio_alloc(GFP_KERNEL, 0);
++ if (unlikely(bio == NULL))
++ goto fail_put_bio;
++
++ bio->bi_bdev = preq.bdev;
++ bio->bi_private = pending_req;
++ bio->bi_end_io = end_block_io_op;
++ bio->bi_sector = -1;
++ }
++
++ submit_bio(operation, bio);
++
++ if (operation == READ)
++ blkif->st_rd_sect += preq.nr_sects;
++ else if (operation == WRITE || operation == WRITE_BARRIER)
++ blkif->st_wr_sect += preq.nr_sects;
++
++ return;
++
++ fail_flush:
++ fast_flush_area(pending_req);
++ fail_response:
++ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
++ free_req(pending_req);
++ msleep(1); /* back off a bit */
++ return;
++
++ fail_put_bio:
++ __end_block_io_op(pending_req, -EINVAL);
++ if (bio)
++ bio_put(bio);
++ unplug_queue(blkif);
++ msleep(1); /* back off a bit */
++ return;
++}
++
++
++
++/******************************************************************
++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
++ */
++
++
++static void make_response(blkif_t *blkif, u64 id,
++ unsigned short op, int st)
++{
++ struct blkif_response resp;
++ unsigned long flags;
++ union blkif_back_rings *blk_rings = &blkif->blk_rings;
++ int more_to_do = 0;
++ int notify;
++
++ resp.id = id;
++ resp.operation = op;
++ resp.status = st;
++
++ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
++ /* Place on the response ring for the relevant domain. */
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ case BLKIF_PROTOCOL_X86_32:
++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ case BLKIF_PROTOCOL_X86_64:
++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
++ &resp, sizeof(resp));
++ break;
++ default:
++ BUG();
++ }
++ blk_rings->common.rsp_prod_pvt++;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
++ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
++ /*
++ * Tail check for pending requests. Allows frontend to avoid
++ * notifications if requests are already in flight (lower
++ * overheads and promotes batching).
++ */
++ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
++
++ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
++ more_to_do = 1;
++ }
++
++ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
++
++ if (more_to_do)
++ blkif_notify_work(blkif);
++ if (notify)
++ notify_remote_via_irq(blkif->irq);
++}
++
++static int __init blkif_init(void)
++{
++ int i, mmap_pages;
++ int rc = 0;
++
++ if (!xen_pv_domain())
++ return -ENODEV;
++
++ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
++ blkif_reqs, GFP_KERNEL);
++ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
++ mmap_pages, GFP_KERNEL);
++ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
++
++ if (blkback_pagemap_init(mmap_pages))
++ goto out_of_memory;
++
++ if (!pending_reqs || !pending_grant_handles || !pending_pages) {
++ rc = -ENOMEM;
++ goto out_of_memory;
++ }
++
++ for (i = 0; i < mmap_pages; i++)
++ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
++
++ rc = blkif_interface_init();
++ if (rc)
++ goto failed_init;
++
++	memset(pending_reqs, 0, blkif_reqs * sizeof(pending_reqs[0]));
++ INIT_LIST_HEAD(&pending_free);
++
++ for (i = 0; i < blkif_reqs; i++)
++ list_add_tail(&pending_reqs[i].free_list, &pending_free);
++
++ rc = blkif_xenbus_init();
++ if (rc)
++ goto failed_init;
++
++ return 0;
++
++ out_of_memory:
++ printk(KERN_ERR "%s: out of memory\n", __func__);
++ failed_init:
++ kfree(pending_reqs);
++ kfree(pending_grant_handles);
++ free_empty_pages_and_pagevec(pending_pages, mmap_pages);
++ return rc;
++}
++
++module_init(blkif_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h
+new file mode 100644
+index 0000000..531ba81
+--- /dev/null
++++ b/drivers/xen/blkback/common.h
+@@ -0,0 +1,143 @@
++/*
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __BLKIF__BACKEND__COMMON_H__
++#define __BLKIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/vmalloc.h>
++#include <linux/wait.h>
++#include <asm/io.h>
++#include <asm/setup.h>
++#include <asm/pgalloc.h>
++#include <asm/hypervisor.h>
++#include <xen/blkif.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
++#include "blkback-pagemap.h"
++
++
++#define DPRINTK(_f, _a...) \
++ pr_debug("(file=%s, line=%d) " _f, \
++ __FILE__ , __LINE__ , ## _a )
++
++struct vbd {
++ blkif_vdev_t handle; /* what the domain refers to this vbd as */
++ unsigned char readonly; /* Non-zero -> read-only */
++ unsigned char type; /* VDISK_xxx */
++ u32 pdevice; /* phys device that this vbd maps to */
++ struct block_device *bdev;
++ sector_t size; /* Cached size parameter */
++};
++
++struct backend_info;
++
++typedef struct blkif_st {
++ /* Unique identifier for this interface. */
++ domid_t domid;
++ unsigned int handle;
++ /* Physical parameters of the comms window. */
++ unsigned int irq;
++ /* Comms information. */
++ enum blkif_protocol blk_protocol;
++ union blkif_back_rings blk_rings;
++ struct vm_struct *blk_ring_area;
++ /* The VBD attached to this interface. */
++ struct vbd vbd;
++ /* Back pointer to the backend_info. */
++ struct backend_info *be;
++ /* Private fields. */
++ spinlock_t blk_ring_lock;
++ atomic_t refcnt;
++
++ wait_queue_head_t wq;
++ struct task_struct *xenblkd;
++ unsigned int waiting_reqs;
++ struct request_queue *plug;
++
++ /* statistics */
++ unsigned long st_print;
++ int st_rd_req;
++ int st_wr_req;
++ int st_oo_req;
++ int st_br_req;
++ int st_rd_sect;
++ int st_wr_sect;
++
++ wait_queue_head_t waiting_to_free;
++
++ grant_handle_t shmem_handle;
++ grant_ref_t shmem_ref;
++} blkif_t;
++
++blkif_t *blkif_alloc(domid_t domid);
++void blkif_disconnect(blkif_t *blkif);
++void blkif_free(blkif_t *blkif);
++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn);
++void vbd_resize(blkif_t *blkif);
++
++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
++#define blkif_put(_b) \
++ do { \
++ if (atomic_dec_and_test(&(_b)->refcnt)) \
++ wake_up(&(_b)->waiting_to_free);\
++ } while (0)
++
++/* Create a vbd. */
++int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
++ unsigned minor, int readonly, int cdrom);
++void vbd_free(struct vbd *vbd);
++
++unsigned long long vbd_size(struct vbd *vbd);
++unsigned int vbd_info(struct vbd *vbd);
++unsigned long vbd_secsize(struct vbd *vbd);
++
++struct phys_req {
++ unsigned short dev;
++ unsigned short nr_sects;
++ struct block_device *bdev;
++ blkif_sector_t sector_number;
++};
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
++
++int blkif_interface_init(void);
++
++int blkif_xenbus_init(void);
++
++irqreturn_t blkif_be_int(int irq, void *dev_id);
++int blkif_schedule(void *arg);
++
++int blkback_barrier(struct xenbus_transaction xbt,
++ struct backend_info *be, int state);
++
++struct xenbus_device *blkback_xenbus(struct backend_info *be);
++
++#endif /* __BLKIF__BACKEND__COMMON_H__ */
+diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c
+new file mode 100644
+index 0000000..e397a41
+--- /dev/null
++++ b/drivers/xen/blkback/interface.c
+@@ -0,0 +1,186 @@
++/******************************************************************************
++ * arch/xen/drivers/blkif/backend/interface.c
++ *
++ * Block-device interface management.
++ *
++ * Copyright (c) 2004, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <xen/events.h>
++#include <xen/grant_table.h>
++#include <linux/kthread.h>
++
++static struct kmem_cache *blkif_cachep;
++
++blkif_t *blkif_alloc(domid_t domid)
++{
++ blkif_t *blkif;
++
++ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL);
++ if (!blkif)
++ return ERR_PTR(-ENOMEM);
++
++ memset(blkif, 0, sizeof(*blkif));
++ blkif->domid = domid;
++ spin_lock_init(&blkif->blk_ring_lock);
++ atomic_set(&blkif->refcnt, 1);
++ init_waitqueue_head(&blkif->wq);
++ blkif->st_print = jiffies;
++ init_waitqueue_head(&blkif->waiting_to_free);
++
++ return blkif;
++}
++
++static int map_frontend_page(blkif_t *blkif, unsigned long shared_page)
++{
++ struct gnttab_map_grant_ref op;
++
++ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr,
++ GNTMAP_host_map, shared_page, blkif->domid);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
++
++ if (op.status) {
++ DPRINTK(" Grant table operation failure !\n");
++ return op.status;
++ }
++
++ blkif->shmem_ref = shared_page;
++ blkif->shmem_handle = op.handle;
++
++ return 0;
++}
++
++static void unmap_frontend_page(blkif_t *blkif)
++{
++ struct gnttab_unmap_grant_ref op;
++
++ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr,
++ GNTMAP_host_map, blkif->shmem_handle);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
++}
++
++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn)
++{
++ int err;
++
++ /* Already connected through? */
++ if (blkif->irq)
++ return 0;
++
++ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL )
++ return -ENOMEM;
++
++ err = map_frontend_page(blkif, shared_page);
++ if (err) {
++ free_vm_area(blkif->blk_ring_area);
++ return err;
++ }
++
++ switch (blkif->blk_protocol) {
++ case BLKIF_PROTOCOL_NATIVE:
++ {
++ struct blkif_sring *sring;
++ sring = (struct blkif_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
++ break;
++ }
++ case BLKIF_PROTOCOL_X86_32:
++ {
++ struct blkif_x86_32_sring *sring_x86_32;
++ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
++ break;
++ }
++ case BLKIF_PROTOCOL_X86_64:
++ {
++ struct blkif_x86_64_sring *sring_x86_64;
++ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr;
++ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
++ break;
++ }
++ default:
++ BUG();
++ }
++
++ err = bind_interdomain_evtchn_to_irqhandler(
++ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
++ if (err < 0)
++ {
++ unmap_frontend_page(blkif);
++ free_vm_area(blkif->blk_ring_area);
++ blkif->blk_rings.common.sring = NULL;
++ return err;
++ }
++ blkif->irq = err;
++
++ return 0;
++}
++
++void blkif_disconnect(blkif_t *blkif)
++{
++ if (blkif->xenblkd) {
++ kthread_stop(blkif->xenblkd);
++ blkif->xenblkd = NULL;
++ }
++
++ atomic_dec(&blkif->refcnt);
++ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
++ atomic_inc(&blkif->refcnt);
++
++ if (blkif->irq) {
++ unbind_from_irqhandler(blkif->irq, blkif);
++ blkif->irq = 0;
++ }
++
++ if (blkif->blk_rings.common.sring) {
++ unmap_frontend_page(blkif);
++ free_vm_area(blkif->blk_ring_area);
++ blkif->blk_rings.common.sring = NULL;
++ }
++}
++
++void blkif_free(blkif_t *blkif)
++{
++ if (!atomic_dec_and_test(&blkif->refcnt))
++ BUG();
++ kmem_cache_free(blkif_cachep, blkif);
++}
++
++int __init blkif_interface_init(void)
++{
++ blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
++ 0, 0, NULL);
++ if (!blkif_cachep)
++ return -ENOMEM;
++
++ return 0;
++}
+diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c
+new file mode 100644
+index 0000000..943ec23
+--- /dev/null
++++ b/drivers/xen/blkback/vbd.c
+@@ -0,0 +1,161 @@
++/******************************************************************************
++ * blkback/vbd.c
++ *
++ * Routines for managing virtual block devices (VBDs).
++ *
++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++
++#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
++ (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
++
++unsigned long long vbd_size(struct vbd *vbd)
++{
++ return vbd_sz(vbd);
++}
++
++unsigned int vbd_info(struct vbd *vbd)
++{
++ return vbd->type | (vbd->readonly?VDISK_READONLY:0);
++}
++
++unsigned long vbd_secsize(struct vbd *vbd)
++{
++ return bdev_logical_block_size(vbd->bdev);
++}
++
++int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
++ unsigned minor, int readonly, int cdrom)
++{
++ struct vbd *vbd;
++ struct block_device *bdev;
++
++ vbd = &blkif->vbd;
++ vbd->handle = handle;
++ vbd->readonly = readonly;
++ vbd->type = 0;
++
++ vbd->pdevice = MKDEV(major, minor);
++
++ bdev = open_by_devnum(vbd->pdevice,
++ vbd->readonly ? FMODE_READ : FMODE_WRITE);
++
++ if (IS_ERR(bdev)) {
++		DPRINTK("vbd_create: device %08x could not be opened.\n",
++ vbd->pdevice);
++ return -ENOENT;
++ }
++
++ vbd->bdev = bdev;
++ vbd->size = vbd_size(vbd);
++
++ if (vbd->bdev->bd_disk == NULL) {
++		DPRINTK("vbd_create: device %08x doesn't exist.\n",
++ vbd->pdevice);
++ vbd_free(vbd);
++ return -ENOENT;
++ }
++
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
++ vbd->type |= VDISK_CDROM;
++ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
++ vbd->type |= VDISK_REMOVABLE;
++
++ DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
++ handle, blkif->domid);
++ return 0;
++}
++
++void vbd_free(struct vbd *vbd)
++{
++ if (vbd->bdev)
++ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
++ vbd->bdev = NULL;
++}
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
++{
++ struct vbd *vbd = &blkif->vbd;
++ int rc = -EACCES;
++
++ if ((operation != READ) && vbd->readonly)
++ goto out;
++
++ if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
++ goto out;
++
++ req->dev = vbd->pdevice;
++ req->bdev = vbd->bdev;
++ rc = 0;
++
++ out:
++ return rc;
++}
++
++void vbd_resize(blkif_t *blkif)
++{
++ struct vbd *vbd = &blkif->vbd;
++ struct xenbus_transaction xbt;
++ int err;
++ struct xenbus_device *dev = blkback_xenbus(blkif->be);
++ unsigned long long new_size = vbd_size(vbd);
++
++ printk(KERN_INFO "VBD Resize: new size %Lu\n", new_size);
++ vbd->size = new_size;
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ printk(KERN_WARNING "Error starting transaction");
++ return;
++ }
++ err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
++ vbd_size(vbd));
++ if (err) {
++ printk(KERN_WARNING "Error writing new size");
++ goto abort;
++ }
++ /*
++ * Write the current state; we will use this to synchronize
++ * the front-end. If the current state is "connected" the
++ * front-end will get the new size information online.
++ */
++ err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
++ if (err) {
++ printk(KERN_WARNING "Error writing the state");
++ goto abort;
++ }
++
++ err = xenbus_transaction_end(xbt, 0);
++ if (err == -EAGAIN)
++ goto again;
++ if (err)
++ printk(KERN_WARNING "Error ending transaction");
++abort:
++ xenbus_transaction_end(xbt, 1);
++}
+diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c
+new file mode 100644
+index 0000000..a0534fc
+--- /dev/null
++++ b/drivers/xen/blkback/xenbus.c
+@@ -0,0 +1,553 @@
++/* Xenbus code for blkif backend
++ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include "common.h"
++
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++ pr_debug("blkback/xenbus (%s:%d) " fmt ".\n", \
++ __FUNCTION__, __LINE__, ##args)
++
++struct backend_info
++{
++ struct xenbus_device *dev;
++ blkif_t *blkif;
++ struct xenbus_watch backend_watch;
++ unsigned major;
++ unsigned minor;
++ char *mode;
++};
++
++static void connect(struct backend_info *);
++static int connect_ring(struct backend_info *);
++static void backend_changed(struct xenbus_watch *, const char **,
++ unsigned int);
++
++struct xenbus_device *blkback_xenbus(struct backend_info *be)
++{
++ return be->dev;
++}
++
++static int blkback_name(blkif_t *blkif, char *buf)
++{
++ char *devpath, *devname;
++ struct xenbus_device *dev = blkif->be->dev;
++
++ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
++ if (IS_ERR(devpath))
++ return PTR_ERR(devpath);
++
++ if ((devname = strstr(devpath, "/dev/")) != NULL)
++ devname += strlen("/dev/");
++ else
++ devname = devpath;
++
++ snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
++ kfree(devpath);
++
++ return 0;
++}
++
++static void update_blkif_status(blkif_t *blkif)
++{
++ int err;
++ char name[TASK_COMM_LEN];
++
++ /* Not ready to connect? */
++ if (!blkif->irq || !blkif->vbd.bdev)
++ return;
++
++ /* Already connected? */
++ if (blkif->be->dev->state == XenbusStateConnected)
++ return;
++
++ /* Attempt to connect: exit if we fail to. */
++ connect(blkif->be);
++ if (blkif->be->dev->state != XenbusStateConnected)
++ return;
++
++ err = blkback_name(blkif, name);
++ if (err) {
++ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
++ return;
++ }
++
++ err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
++ if (err) {
++ xenbus_dev_error(blkif->be->dev, err, "block flush");
++ return;
++ }
++ invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
++
++ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
++ if (IS_ERR(blkif->xenblkd)) {
++ err = PTR_ERR(blkif->xenblkd);
++ blkif->xenblkd = NULL;
++ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
++ }
++}
++
++
++/****************************************************************
++ * sysfs interface for VBD I/O requests
++ */
++
++#define VBD_SHOW(name, format, args...) \
++ static ssize_t show_##name(struct device *_dev, \
++ struct device_attribute *attr, \
++ char *buf) \
++ { \
++ struct xenbus_device *dev = to_xenbus_device(_dev); \
++ struct backend_info *be = dev_get_drvdata(&dev->dev); \
++ \
++ return sprintf(buf, format, ##args); \
++ } \
++ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
++
++VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req);
++VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req);
++VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req);
++VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req);
++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
++
++static struct attribute *vbdstat_attrs[] = {
++ &dev_attr_oo_req.attr,
++ &dev_attr_rd_req.attr,
++ &dev_attr_wr_req.attr,
++ &dev_attr_br_req.attr,
++ &dev_attr_rd_sect.attr,
++ &dev_attr_wr_sect.attr,
++ NULL
++};
++
++static struct attribute_group vbdstat_group = {
++ .name = "statistics",
++ .attrs = vbdstat_attrs,
++};
++
++VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
++VBD_SHOW(mode, "%s\n", be->mode);
++
++int xenvbd_sysfs_addif(struct xenbus_device *dev)
++{
++ int error;
++
++ error = device_create_file(&dev->dev, &dev_attr_physical_device);
++ if (error)
++ goto fail1;
++
++ error = device_create_file(&dev->dev, &dev_attr_mode);
++ if (error)
++ goto fail2;
++
++ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
++ if (error)
++ goto fail3;
++
++ return 0;
++
++fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
++fail2: device_remove_file(&dev->dev, &dev_attr_mode);
++fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
++ return error;
++}
++
++void xenvbd_sysfs_delif(struct xenbus_device *dev)
++{
++ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
++ device_remove_file(&dev->dev, &dev_attr_mode);
++ device_remove_file(&dev->dev, &dev_attr_physical_device);
++}
++
++static int blkback_remove(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ DPRINTK("");
++
++ if (be->major || be->minor)
++ xenvbd_sysfs_delif(dev);
++
++ if (be->backend_watch.node) {
++ unregister_xenbus_watch(&be->backend_watch);
++ kfree(be->backend_watch.node);
++ be->backend_watch.node = NULL;
++ }
++
++ if (be->blkif) {
++ blkif_disconnect(be->blkif);
++ vbd_free(&be->blkif->vbd);
++ blkif_free(be->blkif);
++ be->blkif = NULL;
++ }
++
++ kfree(be);
++ dev_set_drvdata(&dev->dev, NULL);
++ return 0;
++}
++
++int blkback_barrier(struct xenbus_transaction xbt,
++ struct backend_info *be, int state)
++{
++ struct xenbus_device *dev = be->dev;
++ int err;
++
++ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
++ "%d", state);
++ if (err)
++ xenbus_dev_fatal(dev, err, "writing feature-barrier");
++
++ return err;
++}
++
++/**
++ * Entry point to this code when a new device is created. Allocate the basic
++ * structures, and watch the store waiting for the hotplug scripts to tell us
++ * the device's physical major and minor numbers. Switch to InitWait.
++ */
++static int blkback_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
++{
++ int err;
++ struct backend_info *be = kzalloc(sizeof(struct backend_info),
++ GFP_KERNEL);
++ if (!be) {
++ xenbus_dev_fatal(dev, -ENOMEM,
++ "allocating backend structure");
++ return -ENOMEM;
++ }
++ be->dev = dev;
++ dev_set_drvdata(&dev->dev, be);
++
++ be->blkif = blkif_alloc(dev->otherend_id);
++ if (IS_ERR(be->blkif)) {
++ err = PTR_ERR(be->blkif);
++ be->blkif = NULL;
++ xenbus_dev_fatal(dev, err, "creating block interface");
++ goto fail;
++ }
++
++ /* setup back pointer */
++ be->blkif->be = be;
++
++ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
++ "%s/%s", dev->nodename, "physical-device");
++ if (err)
++ goto fail;
++
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto fail;
++
++ return 0;
++
++fail:
++ DPRINTK("failed");
++ blkback_remove(dev);
++ return err;
++}
++
++
++/**
++ * Callback received when the hotplug scripts have placed the physical-device
++ * node. Read it and the mode node, and create a vbd. If the frontend is
++ * ready, connect.
++ */
++static void backend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ int err;
++ unsigned major;
++ unsigned minor;
++ struct backend_info *be
++ = container_of(watch, struct backend_info, backend_watch);
++ struct xenbus_device *dev = be->dev;
++ int cdrom = 0;
++ char *device_type;
++
++ DPRINTK("");
++
++ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
++ &major, &minor);
++ if (XENBUS_EXIST_ERR(err)) {
++ /* Since this watch will fire once immediately after it is
++ registered, we expect this. Ignore it, and wait for the
++ hotplug scripts. */
++ return;
++ }
++ if (err != 2) {
++ xenbus_dev_fatal(dev, err, "reading physical-device");
++ return;
++ }
++
++ if ((be->major || be->minor) &&
++ ((be->major != major) || (be->minor != minor))) {
++ printk(KERN_WARNING
++ "blkback: changing physical device (from %x:%x to "
++ "%x:%x) not supported.\n", be->major, be->minor,
++ major, minor);
++ return;
++ }
++
++ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
++ if (IS_ERR(be->mode)) {
++ err = PTR_ERR(be->mode);
++ be->mode = NULL;
++ xenbus_dev_fatal(dev, err, "reading mode");
++ return;
++ }
++
++ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
++ if (!IS_ERR(device_type)) {
++ cdrom = strcmp(device_type, "cdrom") == 0;
++ kfree(device_type);
++ }
++
++ if (be->major == 0 && be->minor == 0) {
++ /* Front end dir is a number, which is used as the handle. */
++
++ char *p = strrchr(dev->otherend, '/') + 1;
++ long handle = simple_strtoul(p, NULL, 0);
++
++ be->major = major;
++ be->minor = minor;
++
++ err = vbd_create(be->blkif, handle, major, minor,
++ (NULL == strchr(be->mode, 'w')), cdrom);
++ if (err) {
++ be->major = be->minor = 0;
++ xenbus_dev_fatal(dev, err, "creating vbd structure");
++ return;
++ }
++
++ err = xenvbd_sysfs_addif(dev);
++ if (err) {
++ vbd_free(&be->blkif->vbd);
++ be->major = be->minor = 0;
++ xenbus_dev_fatal(dev, err, "creating sysfs entries");
++ return;
++ }
++
++ /* We're potentially connected now */
++ update_blkif_status(be->blkif);
++ }
++}
++
++
++/**
++ * Callback received when the frontend's state changes.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++ int err;
++
++ DPRINTK("%s", xenbus_strstate(frontend_state));
++
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
++
++ case XenbusStateInitialised:
++ case XenbusStateConnected:
++ /* Ensure we connect even when two watches fire in
++		   close succession and we miss the intermediate value
++ of frontend_state. */
++ if (dev->state == XenbusStateConnected)
++ break;
++
++ err = connect_ring(be);
++ if (err)
++ break;
++ update_blkif_status(be->blkif);
++ break;
++
++ case XenbusStateClosing:
++ blkif_disconnect(be->blkif);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
++
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
++}
++
++
++/* ** Connection ** */
++
++
++/**
++ * Write the physical details regarding the block device to the store, and
++ * switch to Connected state.
++ */
++static void connect(struct backend_info *be)
++{
++ struct xenbus_transaction xbt;
++ int err;
++ struct xenbus_device *dev = be->dev;
++
++ DPRINTK("%s", dev->otherend);
++
++ /* Supply the information about the device the frontend needs */
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "starting transaction");
++ return;
++ }
++
++ err = blkback_barrier(xbt, be, 1);
++ if (err)
++ goto abort;
++
++ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
++ vbd_size(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/sectors",
++ dev->nodename);
++ goto abort;
++ }
++
++ /* FIXME: use a typename instead */
++ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
++ vbd_info(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/info",
++ dev->nodename);
++ goto abort;
++ }
++ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
++ vbd_secsize(&be->blkif->vbd));
++ if (err) {
++ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
++ dev->nodename);
++ goto abort;
++ }
++
++ err = xenbus_transaction_end(xbt, 0);
++ if (err == -EAGAIN)
++ goto again;
++ if (err)
++ xenbus_dev_fatal(dev, err, "ending transaction");
++
++ err = xenbus_switch_state(dev, XenbusStateConnected);
++ if (err)
++ xenbus_dev_fatal(dev, err, "switching to Connected state",
++ dev->nodename);
++
++ return;
++ abort:
++ xenbus_transaction_end(xbt, 1);
++}
++
++
++static int connect_ring(struct backend_info *be)
++{
++ struct xenbus_device *dev = be->dev;
++ unsigned long ring_ref;
++ unsigned int evtchn;
++ char protocol[64] = "";
++ int err;
++
++ DPRINTK("%s", dev->otherend);
++
++ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
++ "event-channel", "%u", &evtchn, NULL);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "reading %s/ring-ref and event-channel",
++ dev->otherend);
++ return err;
++ }
++
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++ "%63s", protocol, NULL);
++ if (err)
++ strcpy(protocol, "unspecified, assuming native");
++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++ else {
++ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
++ return -1;
++ }
++ printk(KERN_INFO
++ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
++ ring_ref, evtchn, be->blkif->blk_protocol, protocol);
++
++ /* Map the shared frame, irq etc. */
++ err = blkif_map(be->blkif, ring_ref, evtchn);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
++ ring_ref, evtchn);
++ return err;
++ }
++
++ return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++static const struct xenbus_device_id blkback_ids[] = {
++ { "vbd" },
++ { "" }
++};
++
++
++static struct xenbus_driver blkback = {
++ .name = "vbd",
++ .owner = THIS_MODULE,
++ .ids = blkback_ids,
++ .probe = blkback_probe,
++ .remove = blkback_remove,
++ .otherend_changed = frontend_changed
++};
++
++
++int blkif_xenbus_init(void)
++{
++ return xenbus_register_backend(&blkback);
++}
+diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile
+new file mode 100644
+index 0000000..822b4e4
+--- /dev/null
++++ b/drivers/xen/blktap/Makefile
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
++
++blktap-objs := control.o ring.o device.o request.o sysfs.o
+diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
+new file mode 100644
+index 0000000..a29b509
+--- /dev/null
++++ b/drivers/xen/blktap/blktap.h
+@@ -0,0 +1,199 @@
++#ifndef _BLKTAP_H_
++#define _BLKTAP_H_
++
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <linux/init.h>
++#include <linux/scatterlist.h>
++#include <xen/blkif.h>
++#include <xen/grant_table.h>
++
++extern int blktap_debug_level;
++extern int blktap_ring_major;
++extern int blktap_device_major;
++
++#define BTPRINTK(level, tag, force, _f, _a...) \
++ do { \
++ if (blktap_debug_level > level && \
++ (force || printk_ratelimit())) \
++ printk(tag "%s: " _f, __func__, ##_a); \
++ } while (0)
++
++#define BTDBG(_f, _a...) BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
++#define BTINFO(_f, _a...) BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
++#define BTWARN(_f, _a...) BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
++#define BTERR(_f, _a...) BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
++
++#define MAX_BLKTAP_DEVICE 1024
++
++#define BLKTAP_CONTROL 1
++#define BLKTAP_DEVICE 4
++#define BLKTAP_DEVICE_CLOSED 5
++#define BLKTAP_SHUTDOWN_REQUESTED 8
++
++/* blktap IOCTLs: */
++#define BLKTAP2_IOCTL_KICK_FE 1
++#define BLKTAP2_IOCTL_ALLOC_TAP 200
++#define BLKTAP2_IOCTL_FREE_TAP 201
++#define BLKTAP2_IOCTL_CREATE_DEVICE 202
++#define BLKTAP2_IOCTL_REMOVE_DEVICE 207
++
++#define BLKTAP2_MAX_MESSAGE_LEN 256
++
++#define BLKTAP2_RING_MESSAGE_CLOSE 3
++
++#define BLKTAP_REQUEST_FREE 0
++#define BLKTAP_REQUEST_PENDING 1
++
++/*
++ * The maximum number of requests that can be outstanding at any time
++ * is determined by
++ *
++ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST]
++ *
++ * where mmap_alloc < MAX_DYNAMIC_MEM.
++ *
++ * TODO:
++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
++ * sysfs.
++ */
++#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
++#define MAX_DYNAMIC_MEM BLK_RING_SIZE
++#define MAX_PENDING_REQS BLK_RING_SIZE
++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
++#define MMAP_VADDR(_start, _req, _seg) \
++ (_start + \
++ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
++ ((_seg) * PAGE_SIZE))
++
++struct grant_handle_pair {
++ grant_handle_t kernel;
++ grant_handle_t user;
++};
++#define INVALID_GRANT_HANDLE 0xFFFF
++
++struct blktap_handle {
++ unsigned int ring;
++ unsigned int device;
++ unsigned int minor;
++};
++
++struct blktap_params {
++ char name[BLKTAP2_MAX_MESSAGE_LEN];
++ unsigned long long capacity;
++ unsigned long sector_size;
++};
++
++struct blktap_device {
++ spinlock_t lock;
++ struct gendisk *gd;
++};
++
++struct blktap_ring {
++ struct task_struct *task;
++
++ struct vm_area_struct *vma;
++ struct blkif_front_ring ring;
++ struct vm_foreign_map foreign_map;
++ unsigned long ring_vstart;
++ unsigned long user_vstart;
++
++ wait_queue_head_t poll_wait;
++
++ dev_t devno;
++ struct device *dev;
++};
++
++struct blktap_statistics {
++ unsigned long st_print;
++ int st_rd_req;
++ int st_wr_req;
++ int st_oo_req;
++ int st_rd_sect;
++ int st_wr_sect;
++ s64 st_rd_cnt;
++ s64 st_rd_sum_usecs;
++ s64 st_rd_max_usecs;
++ s64 st_wr_cnt;
++ s64 st_wr_sum_usecs;
++ s64 st_wr_max_usecs;
++};
++
++struct blktap_request {
++ struct request *rq;
++ uint16_t usr_idx;
++
++ uint8_t status;
++ atomic_t pendcnt;
++ uint8_t nr_pages;
++ unsigned short operation;
++
++ struct timeval time;
++ struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++ struct list_head free_list;
++};
++
++struct blktap {
++ int minor;
++ unsigned long dev_inuse;
++
++ struct blktap_ring ring;
++ struct blktap_device device;
++
++ int pending_cnt;
++ struct blktap_request *pending_requests[MAX_PENDING_REQS];
++ struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++
++ wait_queue_head_t remove_wait;
++ struct work_struct remove_work;
++ char name[BLKTAP2_MAX_MESSAGE_LEN];
++
++ struct blktap_statistics stats;
++};
++
++extern struct mutex blktap_lock;
++extern struct blktap **blktaps;
++extern int blktap_max_minor;
++
++int blktap_control_destroy_tap(struct blktap *);
++size_t blktap_control_debug(struct blktap *, char *, size_t);
++
++int blktap_ring_init(void);
++void blktap_ring_exit(void);
++size_t blktap_ring_debug(struct blktap *, char *, size_t);
++int blktap_ring_create(struct blktap *);
++int blktap_ring_destroy(struct blktap *);
++void blktap_ring_kick_user(struct blktap *);
++void blktap_ring_kick_all(void);
++
++int blktap_sysfs_init(void);
++void blktap_sysfs_exit(void);
++int blktap_sysfs_create(struct blktap *);
++void blktap_sysfs_destroy(struct blktap *);
++
++int blktap_device_init(void);
++void blktap_device_exit(void);
++size_t blktap_device_debug(struct blktap *, char *, size_t);
++int blktap_device_create(struct blktap *, struct blktap_params *);
++int blktap_device_destroy(struct blktap *);
++void blktap_device_destroy_sync(struct blktap *);
++int blktap_device_run_queue(struct blktap *);
++void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
++
++int blktap_request_pool_init(void);
++void blktap_request_pool_free(void);
++int blktap_request_pool_grow(void);
++int blktap_request_pool_shrink(void);
++struct blktap_request *blktap_request_allocate(struct blktap *);
++void blktap_request_free(struct blktap *, struct blktap_request *);
++struct page *request_to_page(struct blktap_request *, int);
++
++static inline unsigned long
++request_to_kaddr(struct blktap_request *req, int seg)
++{
++ unsigned long pfn = page_to_pfn(request_to_page(req, seg));
++ return (unsigned long)pfn_to_kaddr(pfn);
++}
++
++#endif
+diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c
+new file mode 100644
+index 0000000..ef54fa1
+--- /dev/null
++++ b/drivers/xen/blktap/control.c
+@@ -0,0 +1,271 @@
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/miscdevice.h>
++
++#include <asm/uaccess.h>
++
++#include "blktap.h"
++
++DEFINE_MUTEX(blktap_lock);
++
++struct blktap **blktaps;
++int blktap_max_minor;
++
++static struct blktap *
++blktap_control_get_minor(void)
++{
++ int minor;
++ struct blktap *tap;
++
++ tap = kmalloc(sizeof(*tap), GFP_KERNEL);
++ if (unlikely(!tap))
++ return NULL;
++
++ memset(tap, 0, sizeof(*tap));
++ sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++ mutex_lock(&blktap_lock);
++
++ for (minor = 0; minor < blktap_max_minor; minor++)
++ if (!blktaps[minor])
++ break;
++
++ if (minor == MAX_BLKTAP_DEVICE)
++ goto fail;
++
++ if (minor == blktap_max_minor) {
++ void *p;
++ int n;
++
++ n = min(2 * blktap_max_minor, MAX_BLKTAP_DEVICE);
++ p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
++ if (!p)
++ goto fail;
++
++ blktaps = p;
++ minor = blktap_max_minor;
++ blktap_max_minor = n;
++
++ memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
++ }
++
++ tap->minor = minor;
++ blktaps[minor] = tap;
++
++ __module_get(THIS_MODULE);
++out:
++ mutex_unlock(&blktap_lock);
++ return tap;
++
++fail:
++ mutex_unlock(&blktap_lock);
++ kfree(tap);
++ tap = NULL;
++ goto out;
++}
++
++static void
++blktap_control_put_minor(struct blktap* tap)
++{
++ blktaps[tap->minor] = NULL;
++ kfree(tap);
++
++ module_put(THIS_MODULE);
++}
++
++static struct blktap*
++blktap_control_create_tap(void)
++{
++ struct blktap *tap;
++ int err;
++
++ tap = blktap_control_get_minor();
++ if (!tap)
++ return NULL;
++
++ err = blktap_ring_create(tap);
++ if (err)
++ goto fail_tap;
++
++ err = blktap_sysfs_create(tap);
++ if (err)
++ goto fail_ring;
++
++ return tap;
++
++fail_ring:
++ blktap_ring_destroy(tap);
++fail_tap:
++ blktap_control_put_minor(tap);
++
++ return NULL;
++}
++
++int
++blktap_control_destroy_tap(struct blktap *tap)
++{
++ int err;
++
++ err = blktap_ring_destroy(tap);
++ if (err)
++ return err;
++
++ blktap_sysfs_destroy(tap);
++
++ blktap_control_put_minor(tap);
++
++ return 0;
++}
++
++static int
++blktap_control_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg)
++{
++ struct blktap *tap;
++
++ switch (cmd) {
++ case BLKTAP2_IOCTL_ALLOC_TAP: {
++ struct blktap_handle h;
++ void __user *ptr = (void __user*)arg;
++
++ tap = blktap_control_create_tap();
++ if (!tap)
++ return -ENOMEM;
++
++ h.ring = blktap_ring_major;
++ h.device = blktap_device_major;
++ h.minor = tap->minor;
++
++ if (copy_to_user(ptr, &h, sizeof(h))) {
++ blktap_control_destroy_tap(tap);
++ return -EFAULT;
++ }
++
++ return 0;
++ }
++
++ case BLKTAP2_IOCTL_FREE_TAP: {
++ int minor = arg;
++
++ if (minor > MAX_BLKTAP_DEVICE)
++ return -EINVAL;
++
++ tap = blktaps[minor];
++ if (!tap)
++ return -ENODEV;
++
++ return blktap_control_destroy_tap(tap);
++ }
++ }
++
++ return -ENOIOCTLCMD;
++}
++
++static struct file_operations blktap_control_file_operations = {
++ .owner = THIS_MODULE,
++ .ioctl = blktap_control_ioctl,
++};
++
++static struct miscdevice blktap_misc = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "blktap-control",
++ .fops = &blktap_control_file_operations,
++};
++
++size_t
++blktap_control_debug(struct blktap *tap, char *buf, size_t size)
++{
++ char *s = buf, *end = buf + size;
++
++ s += snprintf(s, end - s,
++ "tap %u:%u name:'%s' flags:%#08lx\n",
++ MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
++ tap->name, tap->dev_inuse);
++
++ return s - buf;
++}
++
++static int __init
++blktap_control_init(void)
++{
++ int err;
++
++ err = misc_register(&blktap_misc);
++ if (err) {
++ blktap_misc.minor = MISC_DYNAMIC_MINOR;
++ BTERR("misc_register failed for control device");
++ return err;
++ }
++
++ blktap_max_minor = min(64, MAX_BLKTAP_DEVICE);
++ blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
++ if (!blktaps) {
++ BTERR("failed to allocate blktap minor map");
++ return -ENOMEM;
++ }
++
++ return 0;
++}
++
++static void
++blktap_control_exit(void)
++{
++ if (blktaps) {
++ kfree(blktaps);
++ blktaps = NULL;
++ }
++
++ if (blktap_misc.minor != MISC_DYNAMIC_MINOR) {
++ misc_deregister(&blktap_misc);
++ blktap_misc.minor = MISC_DYNAMIC_MINOR;
++ }
++}
++
++static void
++blktap_exit(void)
++{
++ blktap_control_exit();
++ blktap_ring_exit();
++ blktap_sysfs_exit();
++ blktap_device_exit();
++ blktap_request_pool_free();
++}
++
++static int __init
++blktap_init(void)
++{
++ int err;
++
++ if (!xen_pv_domain())
++ return -ENODEV;
++
++ err = blktap_request_pool_init();
++ if (err)
++ return err;
++
++ err = blktap_device_init();
++ if (err)
++ goto fail;
++
++ err = blktap_ring_init();
++ if (err)
++ goto fail;
++
++ err = blktap_sysfs_init();
++ if (err)
++ goto fail;
++
++ err = blktap_control_init();
++ if (err)
++ goto fail;
++
++ return 0;
++
++fail:
++ blktap_exit();
++ return err;
++}
++
++module_init(blktap_init);
++module_exit(blktap_exit);
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
+new file mode 100644
+index 0000000..e4fc23e
+--- /dev/null
++++ b/drivers/xen/blktap/device.c
+@@ -0,0 +1,941 @@
++#include <linux/fs.h>
++#include <linux/blkdev.h>
++#include <linux/cdrom.h>
++#include <linux/hdreg.h>
++#include <linux/module.h>
++#include <asm/tlbflush.h>
++
++#include <scsi/scsi.h>
++#include <scsi/scsi_ioctl.h>
++
++#include <xen/xenbus.h>
++#include <xen/interface/io/blkif.h>
++
++#include <asm/xen/page.h>
++#include <asm/xen/hypercall.h>
++
++#include "blktap.h"
++
++#include "../blkback/blkback-pagemap.h"
++
++struct blktap_grant_table {
++ int cnt;
++ struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++};
++
++int blktap_device_major;
++
++#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
++
++static int
++blktap_device_open(struct block_device *bdev, fmode_t mode)
++{
++ struct gendisk *disk = bdev->bd_disk;
++ struct blktap_device *tapdev = disk->private_data;
++
++ if (!tapdev)
++ return -ENXIO;
++
++ /* NB. we may have bounced a bd_mutex trylock taken by tapdisk.
++ * When failing for any reason other than a missing tapdev, make
++ * sure to kick tapdisk out of its destroy-wait state again. */
++
++ return 0;
++}
++
++static int
++blktap_device_release(struct gendisk *disk, fmode_t mode)
++{
++ struct blktap_device *tapdev = disk->private_data;
++ struct block_device *bdev = bdget_disk(disk, 0);
++ struct blktap *tap = dev_to_blktap(tapdev);
++
++ bdput(bdev);
++
++ if (!bdev->bd_openers) {
++ set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse);
++ blktap_ring_kick_user(tap);
++ }
++
++ return 0;
++}
++
++static int
++blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
++{
++ /* We don't have real geometry info, but let's at least return
++ values consistent with the size of the device */
++ sector_t nsect = get_capacity(bd->bd_disk);
++ sector_t cylinders = nsect;
++
++ hg->heads = 0xff;
++ hg->sectors = 0x3f;
++ sector_div(cylinders, hg->heads * hg->sectors);
++ hg->cylinders = cylinders;
++ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
++ hg->cylinders = 0xffff;
++ return 0;
++}
++
++static int
++blktap_device_ioctl(struct block_device *bd, fmode_t mode,
++ unsigned command, unsigned long argument)
++{
++ int i;
++
++ switch (command) {
++ case CDROMMULTISESSION:
++ BTDBG("FIXME: support multisession CDs later\n");
++ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
++ if (put_user(0, (char __user *)(argument + i)))
++ return -EFAULT;
++ return 0;
++
++ case SCSI_IOCTL_GET_IDLUN:
++ if (!access_ok(VERIFY_WRITE, argument,
++ sizeof(struct scsi_idlun)))
++ return -EFAULT;
++
++ /* return 0 for now. */
++ __put_user(0, &((struct scsi_idlun __user *)argument)->dev_id);
++ __put_user(0,
++ &((struct scsi_idlun __user *)argument)->host_unique_id);
++ return 0;
++
++ default:
++ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
++ command);*/
++ return -EINVAL; /* same return as native Linux */
++ }
++
++ return 0;
++}
++
++static struct block_device_operations blktap_device_file_operations = {
++ .owner = THIS_MODULE,
++ .open = blktap_device_open,
++ .release = blktap_device_release,
++ .ioctl = blktap_device_ioctl,
++ .getgeo = blktap_device_getgeo
++};
++
++static int
++blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++ unsigned long addr, void *data)
++{
++ pte_t *pte = (pte_t *)data;
++
++ BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte));
++ set_pte(ptep, *pte);
++ return 0;
++}
++
++static int
++blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
++{
++ return apply_to_page_range(mm, address,
++ PAGE_SIZE, blktap_map_uaddr_fn, &pte);
++}
++
++static int
++blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
++ unsigned long addr, void *data)
++{
++ struct mm_struct *mm = (struct mm_struct *)data;
++
++ BTDBG("ptep %p\n", ptep);
++ pte_clear(mm, addr, ptep);
++ return 0;
++}
++
++static int
++blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
++{
++ return apply_to_page_range(mm, address,
++ PAGE_SIZE, blktap_umap_uaddr_fn, mm);
++}
++
++static inline void
++flush_tlb_kernel_page(unsigned long kvaddr)
++{
++ flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
++}
++
++static void
++blktap_device_end_dequeued_request(struct blktap_device *dev,
++ struct request *req, int error)
++{
++ unsigned long flags;
++ int ret;
++
++ //spin_lock_irq(&dev->lock);
++ spin_lock_irqsave(dev->gd->queue->queue_lock, flags);
++ ret = __blk_end_request(req, error, blk_rq_bytes(req));
++ spin_unlock_irqrestore(dev->gd->queue->queue_lock, flags);
++ //spin_unlock_irq(&dev->lock);
++
++ BUG_ON(ret);
++}
++
++static void
++blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
++{
++ uint64_t ptep;
++ int ret, usr_idx;
++ unsigned int i, cnt;
++ struct page **map, *page;
++ struct blktap_ring *ring;
++ struct grant_handle_pair *khandle;
++ unsigned long kvaddr, uvaddr, offset;
++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
++
++ cnt = 0;
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ map = ring->foreign_map.map;
++
++ if (!ring->vma)
++ return;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ zap_page_range(ring->vma,
++ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
++ request->nr_pages << PAGE_SHIFT, NULL);
++
++ for (i = 0; i < request->nr_pages; i++) {
++ kvaddr = request_to_kaddr(request, i);
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++ khandle = request->handles + i;
++
++ if (khandle->kernel != INVALID_GRANT_HANDLE) {
++ gnttab_set_unmap_op(&unmap[cnt], kvaddr,
++ GNTMAP_host_map, khandle->kernel);
++ cnt++;
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++
++ if (khandle->user != INVALID_GRANT_HANDLE) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++ if (create_lookup_pte_addr(ring->vma->vm_mm,
++ uvaddr, &ptep) != 0) {
++ BTERR("Couldn't get a pte addr!\n");
++ return;
++ }
++
++ gnttab_set_unmap_op(&unmap[cnt], ptep,
++ GNTMAP_host_map
++ | GNTMAP_application_map
++ | GNTMAP_contains_pte,
++ khandle->user);
++ cnt++;
++ }
++
++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++
++ BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
++ "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
++ "0x%08lx, handle: %u\n", offset, map[offset], request,
++ usr_idx, i, kvaddr, khandle->kernel, uvaddr,
++ khandle->user);
++
++ page = map[offset];
++ if (page && blkback_pagemap_contains_page(page))
++ set_page_private(page, 0);
++
++ map[offset] = NULL;
++
++ khandle->kernel = INVALID_GRANT_HANDLE;
++ khandle->user = INVALID_GRANT_HANDLE;
++ }
++
++ if (cnt) {
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ unmap, cnt);
++ BUG_ON(ret);
++ }
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ zap_page_range(ring->vma,
++ MMAP_VADDR(ring->user_vstart, usr_idx, 0),
++ request->nr_pages << PAGE_SHIFT, NULL);
++}
++
++static void
++blktap_unmap(struct blktap *tap, struct blktap_request *request)
++{
++ int i, usr_idx;
++ unsigned long kvaddr;
++
++ usr_idx = request->usr_idx;
++
++ for (i = 0; i < request->nr_pages; i++) {
++ kvaddr = request_to_kaddr(request, i);
++ BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
++ "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
++ kvaddr, request->handles[i].kernel,
++ MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
++ request->handles[i].user);
++
++ if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
++ blktap_umap_uaddr(current->mm, kvaddr);
++ flush_tlb_kernel_page(kvaddr);
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++ }
++
++ blktap_device_fast_flush(tap, request);
++}
++
++void
++blktap_device_end_request(struct blktap *tap,
++ struct blktap_request *request,
++ int error)
++{
++ struct blktap_device *tapdev = &tap->device;
++ struct request *rq = request->rq;
++
++ blktap_unmap(tap, request);
++
++ spin_lock_irq(&tapdev->lock);
++ __blk_end_request(rq, error, blk_rq_bytes(rq));
++ spin_unlock_irq(&tapdev->lock);
++
++ blktap_request_free(tap, request);
++}
++
++static int
++blktap_prep_foreign(struct blktap *tap,
++ struct blktap_request *request,
++ struct blkif_request *blkif_req,
++ unsigned int seg, struct page *page,
++ struct blktap_grant_table *table)
++{
++ uint64_t ptep;
++ uint32_t flags;
++#ifdef BLKTAP_CHAINED_BLKTAP
++ struct page *tap_page;
++#endif
++ struct blktap_ring *ring;
++ struct blkback_pagemap map;
++ unsigned long uvaddr, kvaddr;
++
++ ring = &tap->ring;
++ map = blkback_pagemap_read(page);
++ blkif_req->seg[seg].gref = map.gref;
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
++ kvaddr = request_to_kaddr(request, seg);
++ flags = GNTMAP_host_map |
++ (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
++
++ gnttab_set_map_op(&table->grants[table->cnt],
++ kvaddr, flags, map.gref, map.domid);
++ table->cnt++;
++
++
++#ifdef BLKTAP_CHAINED_BLKTAP
++ /* enable chained tap devices */
++ tap_page = request_to_page(request, seg);
++ set_page_private(tap_page, page_private(page));
++ SetPageBlkback(tap_page);
++#endif
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return 0;
++
++ if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
++ BTERR("couldn't get a pte addr!\n");
++ return -1;
++ }
++
++ flags |= GNTMAP_application_map | GNTMAP_contains_pte;
++ gnttab_set_map_op(&table->grants[table->cnt],
++ ptep, flags, map.gref, map.domid);
++ table->cnt++;
++
++ return 0;
++}
++
++static int
++blktap_map_foreign(struct blktap *tap,
++ struct blktap_request *request,
++ struct blkif_request *blkif_req,
++ struct blktap_grant_table *table)
++{
++ struct page *page;
++ int i, grant, err, usr_idx;
++ struct blktap_ring *ring;
++ unsigned long uvaddr, foreign_mfn;
++
++ if (!table->cnt)
++ return 0;
++
++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ table->grants, table->cnt);
++ BUG_ON(err);
++
++ grant = 0;
++ usr_idx = request->usr_idx;
++ ring = &tap->ring;
++
++ for (i = 0; i < request->nr_pages; i++) {
++ if (!blkif_req->seg[i].gref)
++ continue;
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++
++ if (unlikely(table->grants[grant].status)) {
++ BTERR("invalid kernel buffer: could not remap it\n");
++ err |= 1;
++ table->grants[grant].handle = INVALID_GRANT_HANDLE;
++ }
++
++ request->handles[i].kernel = table->grants[grant].handle;
++ foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
++ grant++;
++
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ goto done;
++
++ if (unlikely(table->grants[grant].status)) {
++ BTERR("invalid user buffer: could not remap it\n");
++ err |= 1;
++ table->grants[grant].handle = INVALID_GRANT_HANDLE;
++ }
++
++ request->handles[i].user = table->grants[grant].handle;
++ grant++;
++
++ done:
++ if (err)
++ continue;
++
++ page = request_to_page(request, i);
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ set_phys_to_machine(page_to_pfn(page),
++ FOREIGN_FRAME(foreign_mfn));
++ else if (vm_insert_page(ring->vma, uvaddr, page))
++ err |= 1;
++
++ BTDBG("pending_req: %p, seg: %d, page: %p, "
++ "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
++ "uhandle: %u\n", request, i, page,
++ pfn_to_kaddr(page_to_pfn(page)),
++ request->handles[i].kernel,
++ uvaddr, request->handles[i].user);
++ }
++
++ return err;
++}
++
++static void
++blktap_map(struct blktap *tap,
++ struct blktap_request *request,
++ unsigned int seg, struct page *page)
++{
++ pte_t pte;
++ int usr_idx;
++ struct blktap_ring *ring;
++ unsigned long uvaddr, kvaddr;
++
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
++ kvaddr = request_to_kaddr(request, seg);
++
++ pte = mk_pte(page, ring->vma->vm_page_prot);
++ blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
++ flush_tlb_page(ring->vma, uvaddr);
++ blktap_map_uaddr(ring->vma->vm_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
++ flush_tlb_kernel_page(kvaddr);
++
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
++ request->handles[seg].kernel = INVALID_GRANT_HANDLE;
++ request->handles[seg].user = INVALID_GRANT_HANDLE;
++
++ BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
++ "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
++ uvaddr);
++}
++
++static int
++blktap_device_process_request(struct blktap *tap,
++ struct blktap_request *request,
++ struct request *req)
++{
++ struct page *page;
++ int i, usr_idx, err;
++ struct blktap_ring *ring;
++ struct scatterlist *sg;
++ struct blktap_grant_table table;
++ unsigned int fsect, lsect, nr_sects;
++ unsigned long offset, uvaddr;
++ struct blkif_request blkif_req, *target;
++
++ err = -1;
++ memset(&table, 0, sizeof(table));
++
++ ring = &tap->ring;
++ usr_idx = request->usr_idx;
++ blkif_req.id = usr_idx;
++ blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req);
++ blkif_req.handle = 0;
++ blkif_req.operation = rq_data_dir(req) ?
++ BLKIF_OP_WRITE : BLKIF_OP_READ;
++
++ request->rq = req;
++ request->operation = blkif_req.operation;
++ request->status = BLKTAP_REQUEST_PENDING;
++ do_gettimeofday(&request->time);
++
++ nr_sects = 0;
++ request->nr_pages = 0;
++ blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
++ BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ for (i = 0; i < blkif_req.nr_segments; ++i) {
++ sg = tap->sg + i;
++ fsect = sg->offset >> 9;
++ lsect = fsect + (sg->length >> 9) - 1;
++ nr_sects += sg->length >> 9;
++
++ blkif_req.seg[i] =
++ (struct blkif_request_segment) {
++ .gref = 0,
++ .first_sect = fsect,
++ .last_sect = lsect };
++
++ if (blkback_pagemap_contains_page(sg_page(sg))) {
++ /* foreign page -- use xen */
++ if (blktap_prep_foreign(tap,
++ request,
++ &blkif_req,
++ i,
++ sg_page(sg),
++ &table))
++ goto out;
++ } else {
++ /* do it the old fashioned way */
++ blktap_map(tap,
++ request,
++ i,
++ sg_page(sg));
++ }
++
++ uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
++ offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
++ page = request_to_page(request, i);
++ ring->foreign_map.map[offset] = page;
++ SetPageReserved(page);
++
++ BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
++ uvaddr, page, page_to_pfn(page));
++ BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
++ "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
++ offset, request, i,
++ page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
++
++ request->nr_pages++;
++ }
++
++ if (blktap_map_foreign(tap, request, &blkif_req, &table))
++ goto out;
++
++ /* Finally, write the request message to the user ring. */
++ target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
++ memcpy(target, &blkif_req, sizeof(blkif_req));
++ target->id = request->usr_idx;
++ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
++ ring->ring.req_prod_pvt++;
++
++ if (rq_data_dir(req)) {
++ tap->stats.st_wr_sect += nr_sects;
++ tap->stats.st_wr_req++;
++ } else {
++ tap->stats.st_rd_sect += nr_sects;
++ tap->stats.st_rd_req++;
++ }
++
++ err = 0;
++
++out:
++ if (err)
++ blktap_device_fast_flush(tap, request);
++ return err;
++}
++
++/*
++ * called from tapdisk context
++ */
++int
++blktap_device_run_queue(struct blktap *tap)
++{
++ int err, rv;
++ struct request_queue *rq;
++ struct request *req;
++ struct blktap_ring *ring;
++ struct blktap_device *dev;
++ struct blktap_request *request;
++
++ ring = &tap->ring;
++ dev = &tap->device;
++ rq = dev->gd->queue;
++
++ BTDBG("running queue for %d\n", tap->minor);
++ spin_lock_irq(&dev->lock);
++ queue_flag_clear(QUEUE_FLAG_STOPPED, rq);
++
++ while ((req = blk_peek_request(rq)) != NULL) {
++ if (!blk_fs_request(req)) {
++ blk_start_request(req);
++ __blk_end_request_cur(req, -EOPNOTSUPP);
++ continue;
++ }
++
++ if (blk_barrier_rq(req) && !blk_rq_bytes(req)) {
++ blk_start_request(req);
++ __blk_end_request_cur(req, 0);
++ continue;
++ }
++
++ if (RING_FULL(&ring->ring)) {
++ wait:
++ /* Avoid pointless unplugs. */
++ blk_stop_queue(rq);
++ break;
++ }
++
++ request = blktap_request_allocate(tap);
++ if (!request) {
++ tap->stats.st_oo_req++;
++ goto wait;
++ }
++
++ BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
++ "buffer:%p [%s], pending: %p\n", req, tap->minor,
++ req->cmd, (unsigned long long)blk_rq_pos(req),
++ blk_rq_cur_sectors(req),
++ blk_rq_sectors(req), req->buffer,
++ rq_data_dir(req) ? "write" : "read", request);
++
++ blk_start_request(req);
++
++ spin_unlock_irq(&dev->lock);
++
++ err = blktap_device_process_request(tap, request, req);
++ if (err) {
++ blktap_device_end_dequeued_request(dev, req, -EIO);
++ blktap_request_free(tap, request);
++ }
++
++ spin_lock_irq(&dev->lock);
++ }
++
++ spin_unlock_irq(&dev->lock);
++
++ rv = ring->ring.req_prod_pvt -
++ ring->ring.sring->req_prod;
++
++ RING_PUSH_REQUESTS(&ring->ring);
++
++ return rv;
++}
++
++static void
++blktap_device_do_request(struct request_queue *rq)
++{
++ struct blktap_device *tapdev = rq->queuedata;
++ struct blktap *tap = dev_to_blktap(tapdev);
++
++ blktap_ring_kick_user(tap);
++}
++
++static void
++blktap_device_configure(struct blktap *tap,
++ struct blktap_params *params)
++{
++ struct request_queue *rq;
++ struct blktap_device *dev = &tap->device;
++
++ dev = &tap->device;
++ rq = dev->gd->queue;
++
++ spin_lock_irq(&dev->lock);
++
++ set_capacity(dev->gd, params->capacity);
++
++ /* Hard sector size and max sectors impersonate the equiv. hardware. */
++ blk_queue_logical_block_size(rq, params->sector_size);
++ blk_queue_max_sectors(rq, 512);
++
++ /* Each segment in a request is up to an aligned page in size. */
++ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
++ blk_queue_max_segment_size(rq, PAGE_SIZE);
++
++ /* Ensure a merged request will fit in a single I/O ring slot. */
++ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
++
++ /* Make sure buffer addresses are sector-aligned. */
++ blk_queue_dma_alignment(rq, 511);
++
++ /* We are reordering, but cacheless. */
++ blk_queue_ordered(rq, QUEUE_ORDERED_DRAIN, NULL);
++
++ spin_unlock_irq(&dev->lock);
++}
++
++static int
++blktap_device_validate_params(struct blktap *tap,
++ struct blktap_params *params)
++{
++ struct device *dev = tap->ring.dev;
++ int sector_order, name_sz;
++
++ sector_order = ffs(params->sector_size) - 1;
++
++ if (sector_order < 9 ||
++ sector_order > 12 ||
++ params->sector_size != 1U<<sector_order)
++ goto fail;
++
++ if (!params->capacity ||
++ (params->capacity > ULLONG_MAX >> sector_order))
++ goto fail;
++
++ name_sz = min(sizeof(params->name), sizeof(tap->name));
++ if (strnlen(params->name, name_sz) >= name_sz)
++ goto fail;
++
++ return 0;
++
++fail:
++ params->name[sizeof(params->name) - 1] = 0;
++ dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
++ params->capacity, params->sector_size, params->name);
++ return -EINVAL;
++}
++
++int
++blktap_device_destroy(struct blktap *tap)
++{
++ struct blktap_device *tapdev = &tap->device;
++ struct block_device *bdev;
++ struct gendisk *gd;
++ int err;
++
++ gd = tapdev->gd;
++ if (!gd)
++ return 0;
++
++ bdev = bdget_disk(gd, 0);
++
++ err = !mutex_trylock(&bdev->bd_mutex);
++ if (err) {
++ /* NB. avoid a deadlock. the last opener syncs the
++ * bdev holding bd_mutex. */
++ err = -EBUSY;
++ goto out_nolock;
++ }
++
++ if (bdev->bd_openers) {
++ err = -EBUSY;
++ goto out;
++ }
++
++ del_gendisk(gd);
++ gd->private_data = NULL;
++
++ blk_cleanup_queue(gd->queue);
++
++ put_disk(gd);
++ tapdev->gd = NULL;
++
++ clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++ err = 0;
++out:
++ mutex_unlock(&bdev->bd_mutex);
++out_nolock:
++ bdput(bdev);
++
++ return err;
++}
++
++static void
++blktap_device_fail_queue(struct blktap *tap)
++{
++ struct blktap_device *tapdev = &tap->device;
++ struct request_queue *q = tapdev->gd->queue;
++
++ spin_lock_irq(&tapdev->lock);
++ queue_flag_clear(QUEUE_FLAG_STOPPED, q);
++
++ do {
++ struct request *rq = blk_fetch_request(q);
++ if (!rq)
++ break;
++
++ __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
++ } while (1);
++
++ spin_unlock_irq(&tapdev->lock);
++}
++
++static int
++blktap_device_try_destroy(struct blktap *tap)
++{
++ int err;
++
++ err = blktap_device_destroy(tap);
++ if (err)
++ blktap_device_fail_queue(tap);
++
++ return err;
++}
++
++void
++blktap_device_destroy_sync(struct blktap *tap)
++{
++ wait_event(tap->ring.poll_wait,
++ !blktap_device_try_destroy(tap));
++}
++
++int
++blktap_device_create(struct blktap *tap, struct blktap_params *params)
++{
++ int minor, err;
++ struct gendisk *gd;
++ struct request_queue *rq;
++ struct blktap_device *tapdev;
++
++ gd = NULL;
++ rq = NULL;
++ tapdev = &tap->device;
++ minor = tap->minor;
++
++ if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ return -EEXIST;
++
++ if (blktap_device_validate_params(tap, params))
++ return -EINVAL;
++
++ gd = alloc_disk(1);
++ if (!gd) {
++ err = -ENOMEM;
++ goto fail;
++ }
++
++ if (minor < 26) {
++ sprintf(gd->disk_name, "td%c", 'a' + minor % 26);
++ } else if (minor < (26 + 1) * 26) {
++ sprintf(gd->disk_name, "td%c%c",
++ 'a' + minor / 26 - 1,'a' + minor % 26);
++ } else {
++ const unsigned int m1 = (minor / 26 - 1) / 26 - 1;
++ const unsigned int m2 = (minor / 26 - 1) % 26;
++ const unsigned int m3 = minor % 26;
++ sprintf(gd->disk_name, "td%c%c%c",
++ 'a' + m1, 'a' + m2, 'a' + m3);
++ }
++
++ gd->major = blktap_device_major;
++ gd->first_minor = minor;
++ gd->fops = &blktap_device_file_operations;
++ gd->private_data = tapdev;
++
++ spin_lock_init(&tapdev->lock);
++ rq = blk_init_queue(blktap_device_do_request, &tapdev->lock);
++ if (!rq) {
++ err = -ENOMEM;
++ goto fail;
++ }
++ elevator_init(rq, "noop");
++
++ gd->queue = rq;
++ rq->queuedata = tapdev;
++ tapdev->gd = gd;
++
++ blktap_device_configure(tap, params);
++ add_disk(gd);
++
++ if (params->name[0])
++ strncpy(tap->name, params->name, sizeof(tap->name)-1);
++
++ set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
++
++ dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n",
++ queue_logical_block_size(rq), get_capacity(gd));
++
++ return 0;
++
++fail:
++ if (gd)
++ del_gendisk(gd);
++ if (rq)
++ blk_cleanup_queue(rq);
++
++ return err;
++}
++
++size_t
++blktap_device_debug(struct blktap *tap, char *buf, size_t size)
++{
++ struct gendisk *disk = tap->device.gd;
++ struct request_queue *q;
++ struct block_device *bdev;
++ char *s = buf, *end = buf + size;
++
++ if (!disk)
++ return 0;
++
++ q = disk->queue;
++
++ s += snprintf(s, end - s,
++ "disk capacity:%llu sector size:%u\n",
++ get_capacity(disk), queue_logical_block_size(q));
++
++ s += snprintf(s, end - s,
++ "queue flags:%#lx plugged:%d stopped:%d empty:%d\n",
++ q->queue_flags,
++ blk_queue_plugged(q), blk_queue_stopped(q),
++ elv_queue_empty(q));
++
++ bdev = bdget_disk(disk, 0);
++ if (bdev) {
++ s += snprintf(s, end - s,
++ "bdev openers:%d closed:%d\n",
++ bdev->bd_openers,
++ test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse));
++ bdput(bdev);
++ }
++
++ return s - buf;
++}
++
++int __init
++blktap_device_init()
++{
++ int major;
++
++ /* Dynamically allocate a major for this device */
++ major = register_blkdev(0, "tapdev");
++ if (major < 0) {
++ BTERR("Couldn't register blktap device\n");
++ return -ENOMEM;
++ }
++
++ blktap_device_major = major;
++ BTINFO("blktap device major %d\n", major);
++
++ return 0;
++}
++
++void
++blktap_device_exit(void)
++{
++ if (blktap_device_major)
++ unregister_blkdev(blktap_device_major, "tapdev");
++}
+diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
+new file mode 100644
+index 0000000..eee7100
+--- /dev/null
++++ b/drivers/xen/blktap/request.c
+@@ -0,0 +1,297 @@
++#include <linux/spinlock.h>
++#include <xen/balloon.h>
++#include <linux/sched.h>
++
++#include "blktap.h"
++
++#define MAX_BUCKETS 8
++#define BUCKET_SIZE MAX_PENDING_REQS
++
++#define BLKTAP_POOL_CLOSING 1
++
++struct blktap_request_bucket;
++
++struct blktap_request_handle {
++ int slot;
++ uint8_t inuse;
++ struct blktap_request request;
++ struct blktap_request_bucket *bucket;
++};
++
++struct blktap_request_bucket {
++ atomic_t reqs_in_use;
++ struct blktap_request_handle handles[BUCKET_SIZE];
++ struct page **foreign_pages;
++};
++
++struct blktap_request_pool {
++ spinlock_t lock;
++ uint8_t status;
++ struct list_head free_list;
++ atomic_t reqs_in_use;
++ wait_queue_head_t wait_queue;
++ struct blktap_request_bucket *buckets[MAX_BUCKETS];
++};
++
++static struct blktap_request_pool pool;
++
++static inline struct blktap_request_handle *
++blktap_request_to_handle(struct blktap_request *req)
++{
++ return container_of(req, struct blktap_request_handle, request);
++}
++
++static void
++blktap_request_pool_init_request(struct blktap_request *request)
++{
++ int i;
++
++ request->usr_idx = -1;
++ request->nr_pages = 0;
++ request->status = BLKTAP_REQUEST_FREE;
++ INIT_LIST_HEAD(&request->free_list);
++ for (i = 0; i < ARRAY_SIZE(request->handles); i++) {
++ request->handles[i].user = INVALID_GRANT_HANDLE;
++ request->handles[i].kernel = INVALID_GRANT_HANDLE;
++ }
++}
++
++static int
++blktap_request_pool_allocate_bucket(void)
++{
++ int i, idx;
++ unsigned long flags;
++ struct blktap_request *request;
++ struct blktap_request_handle *handle;
++ struct blktap_request_bucket *bucket;
++
++ bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
++ if (!bucket)
++ goto fail;
++
++ bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
++ if (!bucket->foreign_pages)
++ goto fail;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ idx = -1;
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ if (!pool.buckets[i]) {
++ idx = i;
++ pool.buckets[idx] = bucket;
++ break;
++ }
++ }
++
++ if (idx == -1) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ goto fail;
++ }
++
++ for (i = 0; i < BUCKET_SIZE; i++) {
++ handle = bucket->handles + i;
++ request = &handle->request;
++
++ handle->slot = i;
++ handle->inuse = 0;
++ handle->bucket = bucket;
++
++ blktap_request_pool_init_request(request);
++ list_add_tail(&request->free_list, &pool.free_list);
++ }
++
++ spin_unlock_irqrestore(&pool.lock, flags);
++
++ return 0;
++
++fail:
++ if (bucket && bucket->foreign_pages)
++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++ kfree(bucket);
++ return -ENOMEM;
++}
++
++static void
++blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
++{
++ if (!bucket)
++ return;
++
++ BTDBG("freeing bucket %p\n", bucket);
++
++ free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
++ kfree(bucket);
++}
++
++struct page *
++request_to_page(struct blktap_request *req, int seg)
++{
++ struct blktap_request_handle *handle = blktap_request_to_handle(req);
++ int idx = handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
++ return handle->bucket->foreign_pages[idx];
++}
++
++int
++blktap_request_pool_shrink(void)
++{
++ int i, err;
++ unsigned long flags;
++ struct blktap_request_bucket *bucket;
++
++ err = -EAGAIN;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ /* always keep at least one bucket */
++ for (i = 1; i < MAX_BUCKETS; i++) {
++ bucket = pool.buckets[i];
++ if (!bucket)
++ continue;
++
++ if (atomic_read(&bucket->reqs_in_use))
++ continue;
++
++ blktap_request_pool_free_bucket(bucket);
++ pool.buckets[i] = NULL;
++ err = 0;
++ break;
++ }
++
++ spin_unlock_irqrestore(&pool.lock, flags);
++
++ return err;
++}
++
++int
++blktap_request_pool_grow(void)
++{
++ return blktap_request_pool_allocate_bucket();
++}
++
++struct blktap_request *
++blktap_request_allocate(struct blktap *tap)
++{
++ int i;
++ uint16_t usr_idx;
++ unsigned long flags;
++ struct blktap_request *request;
++
++ usr_idx = -1;
++ request = NULL;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ if (pool.status == BLKTAP_POOL_CLOSING)
++ goto out;
++
++ for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++)
++ if (!tap->pending_requests[i]) {
++ usr_idx = i;
++ break;
++ }
++
++ if (usr_idx == (uint16_t)-1)
++ goto out;
++
++ if (!list_empty(&pool.free_list)) {
++ request = list_entry(pool.free_list.next,
++ struct blktap_request, free_list);
++ list_del(&request->free_list);
++ }
++
++ if (request) {
++ struct blktap_request_handle *handle;
++
++ atomic_inc(&pool.reqs_in_use);
++
++ handle = blktap_request_to_handle(request);
++ atomic_inc(&handle->bucket->reqs_in_use);
++ handle->inuse = 1;
++
++ request->usr_idx = usr_idx;
++
++ tap->pending_requests[usr_idx] = request;
++ tap->pending_cnt++;
++ }
++
++out:
++ spin_unlock_irqrestore(&pool.lock, flags);
++ return request;
++}
++
++void
++blktap_request_free(struct blktap *tap, struct blktap_request *request)
++{
++ int free;
++ unsigned long flags;
++ struct blktap_request_handle *handle;
++
++ BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
++ handle = blktap_request_to_handle(request);
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ handle->inuse = 0;
++ tap->pending_requests[request->usr_idx] = NULL;
++ blktap_request_pool_init_request(request);
++ list_add(&request->free_list, &pool.free_list);
++ atomic_dec(&handle->bucket->reqs_in_use);
++ free = atomic_dec_and_test(&pool.reqs_in_use);
++ tap->pending_cnt--;
++
++ spin_unlock_irqrestore(&pool.lock, flags);
++
++ if (free)
++ wake_up(&pool.wait_queue);
++
++ blktap_ring_kick_all();
++}
++
++void
++blktap_request_pool_free(void)
++{
++ int i;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pool.lock, flags);
++
++ pool.status = BLKTAP_POOL_CLOSING;
++ while (atomic_read(&pool.reqs_in_use)) {
++ spin_unlock_irqrestore(&pool.lock, flags);
++ wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
++ spin_lock_irqsave(&pool.lock, flags);
++ }
++
++ for (i = 0; i < MAX_BUCKETS; i++) {
++ blktap_request_pool_free_bucket(pool.buckets[i]);
++ pool.buckets[i] = NULL;
++ }
++
++ spin_unlock_irqrestore(&pool.lock, flags);
++}
++
++int __init
++blktap_request_pool_init(void)
++{
++ int i, err;
++
++ memset(&pool, 0, sizeof(pool));
++
++ spin_lock_init(&pool.lock);
++ INIT_LIST_HEAD(&pool.free_list);
++ atomic_set(&pool.reqs_in_use, 0);
++ init_waitqueue_head(&pool.wait_queue);
++
++ for (i = 0; i < 2; i++) {
++ err = blktap_request_pool_allocate_bucket();
++ if (err)
++ goto fail;
++ }
++
++ return 0;
++
++fail:
++ blktap_request_pool_free();
++ return err;
++}
+diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
+new file mode 100644
+index 0000000..057e97f
+--- /dev/null
++++ b/drivers/xen/blktap/ring.c
+@@ -0,0 +1,545 @@
++#include <linux/device.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/poll.h>
++
++#include <asm/xen/page.h>
++#include <asm/xen/hypercall.h>
++
++#include "blktap.h"
++
++#ifdef CONFIG_XEN_BLKDEV_BACKEND
++#include "../blkback/blkback-pagemap.h"
++#else
++#define blkback_pagemap_contains_page(page) 0
++#endif
++
++int blktap_ring_major;
++static struct cdev blktap_ring_cdev;
++
++static DECLARE_WAIT_QUEUE_HEAD(blktap_poll_wait);
++
++static inline struct blktap *
++vma_to_blktap(struct vm_area_struct *vma)
++{
++ struct vm_foreign_map *m = vma->vm_private_data;
++ struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
++ return container_of(r, struct blktap, ring);
++}
++
++ /*
++ * BLKTAP - immediately before the mmap area,
++ * we have a bunch of pages reserved for shared memory rings.
++ */
++#define RING_PAGES 1
++
++static void
++blktap_ring_read_response(struct blktap *tap,
++ const struct blkif_response *rsp)
++{
++ struct blktap_ring *ring = &tap->ring;
++ struct blktap_request *request;
++ int usr_idx, err;
++
++ request = NULL;
++
++ usr_idx = rsp->id;
++ if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) {
++ err = -ERANGE;
++ goto invalid;
++ }
++
++ request = tap->pending_requests[usr_idx];
++
++ if (!request) {
++ err = -ESRCH;
++ goto invalid;
++ }
++
++ if (rsp->operation != request->operation) {
++ err = -EINVAL;
++ goto invalid;
++ }
++
++ dev_dbg(ring->dev,
++ "request %d [%p] response: %d\n",
++ request->usr_idx, request, rsp->status);
++
++ err = rsp->status == BLKIF_RSP_OKAY ? 0 : -EIO;
++end_request:
++ blktap_device_end_request(tap, request, err);
++ return;
++
++invalid:
++ dev_warn(ring->dev,
++ "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
++ usr_idx, rsp->status,
++ rsp->operation, request ? request->operation : -1,
++ err);
++ if (request)
++ goto end_request;
++}
++
++static void
++blktap_read_ring(struct blktap *tap)
++{
++ struct blktap_ring *ring = &tap->ring;
++ struct blkif_response rsp;
++ RING_IDX rc, rp;
++
++ down_read(&current->mm->mmap_sem);
++ if (!ring->vma) {
++ up_read(&current->mm->mmap_sem);
++ return;
++ }
++
++ /* for each outstanding message on the ring */
++ rp = ring->ring.sring->rsp_prod;
++ rmb();
++
++ for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
++ memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp));
++ blktap_ring_read_response(tap, &rsp);
++ }
++
++ ring->ring.rsp_cons = rc;
++
++ up_read(&current->mm->mmap_sem);
++}
++
++static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ return VM_FAULT_SIGBUS;
++}
++
++static pte_t
++blktap_ring_clear_pte(struct vm_area_struct *vma,
++ unsigned long uvaddr,
++ pte_t *ptep, int is_fullmm)
++{
++ pte_t copy;
++ struct blktap *tap;
++ unsigned long kvaddr;
++ struct page **map, *page;
++ struct blktap_ring *ring;
++ struct blktap_request *request;
++ struct grant_handle_pair *khandle;
++ struct gnttab_unmap_grant_ref unmap[2];
++ int offset, seg, usr_idx, count = 0;
++
++ tap = vma_to_blktap(vma);
++ ring = &tap->ring;
++ map = ring->foreign_map.map;
++ BUG_ON(!map); /* TODO Should this be changed to if statement? */
++
++ /*
++ * Zap entry if the address is before the start of the grant
++ * mapped region.
++ */
++ if (uvaddr < ring->user_vstart)
++ return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
++ ptep, is_fullmm);
++
++ offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
++ usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
++
++ offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
++ page = map[offset];
++ if (page && blkback_pagemap_contains_page(page))
++ set_page_private(page, 0);
++ map[offset] = NULL;
++
++ request = tap->pending_requests[usr_idx];
++ kvaddr = request_to_kaddr(request, seg);
++ khandle = request->handles + seg;
++
++ if (khandle->kernel != INVALID_GRANT_HANDLE) {
++ gnttab_set_unmap_op(&unmap[count], kvaddr,
++ GNTMAP_host_map, khandle->kernel);
++ count++;
++
++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
++ INVALID_P2M_ENTRY);
++ }
++
++ if (khandle->user != INVALID_GRANT_HANDLE) {
++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
++
++ copy = *ptep;
++ gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr,
++ GNTMAP_host_map
++ | GNTMAP_application_map
++ | GNTMAP_contains_pte,
++ khandle->user);
++ count++;
++ } else
++ copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
++ is_fullmm);
++
++ if (count)
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ unmap, count))
++ BUG();
++
++ khandle->kernel = INVALID_GRANT_HANDLE;
++ khandle->user = INVALID_GRANT_HANDLE;
++
++ return copy;
++}
++
++static void
++blktap_ring_fail_pending(struct blktap *tap)
++{
++ struct blktap_request *request;
++ int usr_idx;
++
++ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
++ request = tap->pending_requests[usr_idx];
++ if (!request)
++ continue;
++
++ blktap_device_end_request(tap, request, -EIO);
++ }
++}
++
++static void
++blktap_ring_vm_close(struct vm_area_struct *vma)
++{
++ struct blktap *tap = vma_to_blktap(vma);
++ struct blktap_ring *ring = &tap->ring;
++ struct page *page = virt_to_page(ring->ring.sring);
++
++ blktap_ring_fail_pending(tap);
++
++ kfree(ring->foreign_map.map);
++ ring->foreign_map.map = NULL;
++
++ zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
++ ClearPageReserved(page);
++ __free_page(page);
++
++ ring->vma = NULL;
++
++ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ blktap_control_destroy_tap(tap);
++}
++
++static struct vm_operations_struct blktap_ring_vm_operations = {
++ .close = blktap_ring_vm_close,
++ .fault = blktap_ring_fault,
++ .zap_pte = blktap_ring_clear_pte,
++};
++
++static int
++blktap_ring_open(struct inode *inode, struct file *filp)
++{
++ struct blktap *tap = NULL;
++ int minor;
++
++ minor = iminor(inode);
++
++ if (minor < blktap_max_minor)
++ tap = blktaps[minor];
++
++ if (!tap)
++ return -ENXIO;
++
++ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ return -ENXIO;
++
++ if (tap->ring.task)
++ return -EBUSY;
++
++ filp->private_data = tap;
++ tap->ring.task = current;
++
++ return 0;
++}
++
++static int
++blktap_ring_release(struct inode *inode, struct file *filp)
++{
++ struct blktap *tap = filp->private_data;
++
++ blktap_device_destroy_sync(tap);
++
++ tap->ring.task = NULL;
++
++ if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ blktap_control_destroy_tap(tap);
++
++ return 0;
++}
++
++/* Note on mmap:
++ * We need to map pages to user space in a way that will allow the block
++ * subsystem set up direct IO to them. This couldn't be done before, because
++ * there isn't really a sane way to translate a user virtual address down to a
++ * physical address when the page belongs to another domain.
++ *
++ * My first approach was to map the page in to kernel memory, add an entry
++ * for it in the physical frame list (using alloc_lomem_region as in blkback)
++ * and then attempt to map that page up to user space. This is disallowed
++ * by xen though, which realizes that we don't really own the machine frame
++ * underlying the physical page.
++ *
++ * The new approach is to provide explicit support for this in xen linux.
++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
++ * mapped from other vms. vma->vm_private_data is set up as a mapping
++ * from pages to actual page structs. There is a new clause in get_user_pages
++ * that does the right thing for this sort of mapping.
++ */
++static int
++blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++ struct blktap *tap = filp->private_data;
++ struct blktap_ring *ring = &tap->ring;
++ struct blkif_sring *sring;
++ struct page *page;
++ int size, err;
++ struct page **map;
++
++ map = NULL;
++ sring = NULL;
++
++ if (ring->vma)
++ return -EBUSY;
++
++ size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++ if (size != (MMAP_PAGES + RING_PAGES)) {
++ BTERR("you _must_ map exactly %lu pages!\n",
++ MMAP_PAGES + RING_PAGES);
++ return -EAGAIN;
++ }
++
++ /* allocate the shared ring */
++ page = alloc_page(GFP_KERNEL|__GFP_ZERO);
++ if (!page)
++ goto fail;
++
++ SetPageReserved(page);
++
++ err = vm_insert_page(vma, vma->vm_start, page);
++ if (err)
++ goto fail;
++
++ sring = page_address(page);
++ SHARED_RING_INIT(sring);
++ FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
++
++ ring->ring_vstart = vma->vm_start;
++ ring->user_vstart = ring->ring_vstart + PAGE_SIZE;
++
++ /* allocate the foreign map */
++ map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
++ if (!map)
++ goto fail;
++
++ /* Mark this VM as containing foreign pages, and set up mappings. */
++ ring->foreign_map.map = map;
++ vma->vm_private_data = &ring->foreign_map;
++ vma->vm_flags |= VM_FOREIGN;
++ vma->vm_flags |= VM_DONTCOPY;
++ vma->vm_flags |= VM_RESERVED;
++ vma->vm_ops = &blktap_ring_vm_operations;
++
++#ifdef CONFIG_X86
++ vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
++ ring->vma = vma;
++ return 0;
++
++fail:
++ if (page) {
++ zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
++ ClearPageReserved(page);
++ __free_page(page);
++ }
++
++ if (map)
++ kfree(map);
++
++ return -ENOMEM;
++}
++
++static int
++blktap_ring_ioctl(struct inode *inode, struct file *filp,
++ unsigned int cmd, unsigned long arg)
++{
++ struct blktap *tap = filp->private_data;
++ struct blktap_ring *ring = &tap->ring;
++
++ BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
++
++ if (!ring->vma || ring->vma->vm_mm != current->mm)
++ return -EACCES;
++
++ switch(cmd) {
++ case BLKTAP2_IOCTL_KICK_FE:
++
++ blktap_read_ring(tap);
++ return 0;
++
++ case BLKTAP2_IOCTL_CREATE_DEVICE: {
++ struct blktap_params params;
++ void __user *ptr = (void *)arg;
++
++ if (!arg)
++ return -EINVAL;
++
++ if (copy_from_user(&params, ptr, sizeof(params)))
++ return -EFAULT;
++
++ return blktap_device_create(tap, &params);
++ }
++
++ case BLKTAP2_IOCTL_REMOVE_DEVICE:
++
++ return blktap_device_destroy(tap);
++ }
++
++ return -ENOIOCTLCMD;
++}
++
++static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
++{
++ struct blktap *tap = filp->private_data;
++ struct blktap_ring *ring = &tap->ring;
++ int work = 0;
++
++ poll_wait(filp, &blktap_poll_wait, wait);
++ poll_wait(filp, &ring->poll_wait, wait);
++
++ down_read(&current->mm->mmap_sem);
++ if (ring->vma && tap->device.gd)
++ work = blktap_device_run_queue(tap);
++ up_read(&current->mm->mmap_sem);
++
++ if (work ||
++ ring->ring.sring->private.tapif_user.msg ||
++ test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
++ return POLLIN | POLLRDNORM;
++
++ return 0;
++}
++
++static struct file_operations blktap_ring_file_operations = {
++ .owner = THIS_MODULE,
++ .open = blktap_ring_open,
++ .release = blktap_ring_release,
++ .ioctl = blktap_ring_ioctl,
++ .mmap = blktap_ring_mmap,
++ .poll = blktap_ring_poll,
++};
++
++void
++blktap_ring_kick_user(struct blktap *tap)
++{
++ wake_up(&tap->ring.poll_wait);
++}
++
++void
++blktap_ring_kick_all(void)
++{
++ wake_up(&blktap_poll_wait);
++}
++
++int
++blktap_ring_destroy(struct blktap *tap)
++{
++ struct blktap_ring *ring = &tap->ring;
++
++ if (ring->task || ring->vma)
++ return -EBUSY;
++
++ return 0;
++}
++
++int
++blktap_ring_create(struct blktap *tap)
++{
++ struct blktap_ring *ring = &tap->ring;
++
++ init_waitqueue_head(&ring->poll_wait);
++ ring->devno = MKDEV(blktap_ring_major, tap->minor);
++
++ return 0;
++}
++
++size_t
++blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
++{
++ char *s = buf, *end = buf + size;
++ int usr_idx;
++
++ s += snprintf(s, end - s,
++ "begin pending:%d\n", tap->pending_cnt);
++
++ for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
++ struct blktap_request *request;
++ struct timeval *time;
++ int write;
++
++ request = tap->pending_requests[usr_idx];
++ if (!request)
++ continue;
++
++ write = request->operation == BLKIF_OP_WRITE;
++ time = &request->time;
++
++ s += snprintf(s, end - s,
++ "%02d: usr_idx:%02d "
++ "op:%c nr_pages:%02d time:%lu.%09lu\n",
++ usr_idx, request->usr_idx,
++ write ? 'W' : 'R', request->nr_pages,
++ time->tv_sec, time->tv_usec);
++ }
++
++ s += snprintf(s, end - s, "end pending\n");
++
++ return s - buf;
++}
++
++
++int __init
++blktap_ring_init(void)
++{
++ dev_t dev = 0;
++ int err;
++
++ cdev_init(&blktap_ring_cdev, &blktap_ring_file_operations);
++ blktap_ring_cdev.owner = THIS_MODULE;
++
++ err = alloc_chrdev_region(&dev, 0, MAX_BLKTAP_DEVICE, "blktap2");
++ if (err < 0) {
++ BTERR("error registering ring devices: %d\n", err);
++ return err;
++ }
++
++ err = cdev_add(&blktap_ring_cdev, dev, MAX_BLKTAP_DEVICE);
++ if (err) {
++ BTERR("error adding ring device: %d\n", err);
++ unregister_chrdev_region(dev, MAX_BLKTAP_DEVICE);
++ return err;
++ }
++
++ blktap_ring_major = MAJOR(dev);
++ BTINFO("blktap ring major: %d\n", blktap_ring_major);
++
++ return 0;
++}
++
++void
++blktap_ring_exit(void)
++{
++ if (!blktap_ring_major)
++ return;
++
++ cdev_del(&blktap_ring_cdev);
++ unregister_chrdev_region(MKDEV(blktap_ring_major, 0),
++ MAX_BLKTAP_DEVICE);
++
++ blktap_ring_major = 0;
++}
+diff --git a/drivers/xen/blktap/sysfs.c b/drivers/xen/blktap/sysfs.c
+new file mode 100644
+index 0000000..5d421e4
+--- /dev/null
++++ b/drivers/xen/blktap/sysfs.c
+@@ -0,0 +1,252 @@
++#include <linux/types.h>
++#include <linux/device.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/genhd.h>
++#include <linux/blkdev.h>
++
++#include "blktap.h"
++
++int blktap_debug_level = 1;
++
++static struct class *class;
++
++static ssize_t
++blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
++{
++ struct blktap *tap;
++
++ tap = dev_get_drvdata(dev);
++ if (!tap)
++ return 0;
++
++ if (size >= BLKTAP2_MAX_MESSAGE_LEN)
++ return -ENAMETOOLONG;
++
++ if (strnlen(buf, size) != size)
++ return -EINVAL;
++
++ strcpy(tap->name, buf);
++
++ return size;
++}
++
++static ssize_t
++blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ struct blktap *tap;
++ ssize_t size;
++
++ tap = dev_get_drvdata(dev);
++ if (!tap)
++ return 0;
++
++ if (tap->name[0])
++ size = sprintf(buf, "%s\n", tap->name);
++ else
++ size = sprintf(buf, "%d\n", tap->minor);
++
++ return size;
++}
++static DEVICE_ATTR(name, S_IRUGO|S_IWUSR,
++ blktap_sysfs_get_name, blktap_sysfs_set_name);
++
++static void
++blktap_sysfs_remove_work(struct work_struct *work)
++{
++ struct blktap *tap
++ = container_of(work, struct blktap, remove_work);
++ blktap_control_destroy_tap(tap);
++}
++
++static ssize_t
++blktap_sysfs_remove_device(struct device *dev,
++ struct device_attribute *attr,
++ const char *buf, size_t size)
++{
++ struct blktap *tap;
++ int err;
++
++ tap = dev_get_drvdata(dev);
++ if (!tap)
++ return size;
++
++ if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
++ goto wait;
++
++ if (tap->ring.vma) {
++ struct blkif_sring *sring = tap->ring.ring.sring;
++ sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE;
++ blktap_ring_kick_user(tap);
++ } else {
++ INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work);
++ schedule_work(&tap->remove_work);
++ }
++wait:
++ err = wait_event_interruptible(tap->remove_wait,
++ !dev_get_drvdata(dev));
++ if (err)
++ return err;
++
++ return size;
++}
++static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
++
++static ssize_t
++blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ struct blktap *tap;
++ char *s = buf, *end = buf + PAGE_SIZE;
++
++ tap = dev_get_drvdata(dev);
++ if (!tap)
++ return 0;
++
++ s += blktap_control_debug(tap, s, end - s);
++
++ s += blktap_device_debug(tap, s, end - s);
++
++ s += blktap_ring_debug(tap, s, end - s);
++
++ return s - buf;
++}
++static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL);
++
++static ssize_t
++blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf)
++{
++ struct blktap *tap;
++ ssize_t rv = 0;
++
++ tap = dev_get_drvdata(dev);
++ if (!tap)
++ return 0;
++
++ if (tap->ring.task)
++ rv = sprintf(buf, "%d\n", tap->ring.task->pid);
++
++ return rv;
++}
++static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL);
++
++int
++blktap_sysfs_create(struct blktap *tap)
++{
++ struct blktap_ring *ring = &tap->ring;
++ struct device *dev;
++ int err = 0;
++
++ init_waitqueue_head(&tap->remove_wait);
++
++ dev = device_create(class, NULL, ring->devno,
++ tap, "blktap%d", tap->minor);
++ if (IS_ERR(dev))
++ err = PTR_ERR(dev);
++ if (!err)
++ err = device_create_file(dev, &dev_attr_name);
++ if (!err)
++ err = device_create_file(dev, &dev_attr_remove);
++ if (!err)
++ err = device_create_file(dev, &dev_attr_debug);
++ if (!err)
++ err = device_create_file(dev, &dev_attr_task);
++ if (!err)
++ ring->dev = dev;
++ else
++ device_unregister(dev);
++
++ return err;
++}
++
++void
++blktap_sysfs_destroy(struct blktap *tap)
++{
++ struct blktap_ring *ring = &tap->ring;
++ struct device *dev;
++
++ dev = ring->dev;
++
++ if (!dev)
++ return;
++
++ dev_set_drvdata(dev, NULL);
++ wake_up(&tap->remove_wait);
++
++ device_unregister(dev);
++ ring->dev = NULL;
++}
++
++static ssize_t
++blktap_sysfs_show_verbosity(struct class *class, char *buf)
++{
++ return sprintf(buf, "%d\n", blktap_debug_level);
++}
++
++static ssize_t
++blktap_sysfs_set_verbosity(struct class *class, const char *buf, size_t size)
++{
++ int level;
++
++ if (sscanf(buf, "%d", &level) == 1) {
++ blktap_debug_level = level;
++ return size;
++ }
++
++ return -EINVAL;
++}
++static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR,
++ blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
++
++static ssize_t
++blktap_sysfs_show_devices(struct class *class, char *buf)
++{
++ int i, ret;
++ struct blktap *tap;
++
++ mutex_lock(&blktap_lock);
++
++ ret = 0;
++ for (i = 0; i < blktap_max_minor; i++) {
++ tap = blktaps[i];
++ if (!tap)
++ continue;
++
++ if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
++ continue;
++
++ ret += sprintf(buf + ret, "%d %s\n", tap->minor, tap->name);
++ }
++
++ mutex_unlock(&blktap_lock);
++
++ return ret;
++}
++static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL);
++
++void
++blktap_sysfs_exit(void)
++{
++ if (class)
++ class_destroy(class);
++}
++
++int __init
++blktap_sysfs_init(void)
++{
++ struct class *cls;
++ int err = 0;
++
++ cls = class_create(THIS_MODULE, "blktap2");
++ if (IS_ERR(cls))
++ err = PTR_ERR(cls);
++ if (!err)
++ err = class_create_file(cls, &class_attr_verbosity);
++ if (!err)
++ err = class_create_file(cls, &class_attr_devices);
++ if (!err)
++ class = cls;
++ else
++ class_destroy(cls);
++
++ return err;
++}
+diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
+index bdfd584..6625ffe 100644
+--- a/drivers/xen/cpu_hotplug.c
++++ b/drivers/xen/cpu_hotplug.c
+@@ -1,5 +1,6 @@
+ #include <linux/notifier.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+
+ #include <asm/xen/hypervisor.h>
+diff --git a/drivers/xen/events.c b/drivers/xen/events.c
+index a4dc7bf..4f64072 100644
+--- a/drivers/xen/events.c
++++ b/drivers/xen/events.c
+@@ -16,7 +16,7 @@
+ * (typically dom0).
+ * 2. VIRQs, typically used for timers. These are per-cpu events.
+ * 3. IPIs.
+- * 4. Hardware interrupts. Not supported at present.
++ * 4. PIRQs - Hardware interrupts.
+ *
+ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
+ */
+@@ -27,18 +27,32 @@
+ #include <linux/module.h>
+ #include <linux/string.h>
+ #include <linux/bootmem.h>
++#include <linux/irqnr.h>
++#include <linux/pci_regs.h>
++#include <linux/pci.h>
++#include <linux/msi.h>
+
++#include <asm/desc.h>
+ #include <asm/ptrace.h>
+ #include <asm/irq.h>
+ #include <asm/idle.h>
++#include <asm/io_apic.h>
+ #include <asm/sync_bitops.h>
+ #include <asm/xen/hypercall.h>
+ #include <asm/xen/hypervisor.h>
++#include <asm/xen/pci.h>
+
++#include <xen/xen.h>
++#include <xen/hvm.h>
+ #include <xen/xen-ops.h>
+ #include <xen/events.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/event_channel.h>
++#include <xen/interface/hvm/hvm_op.h>
++#include <xen/interface/hvm/params.h>
++#include <xen/page.h>
++
++#include "../pci/msi.h"
+
+ /*
+ * This lock protects updates to the following mapping and reference-count
+@@ -67,7 +81,7 @@ enum xen_irq_type {
+ * event channel - irq->event channel mapping
+ * cpu - cpu this event channel is bound to
+ * index - type-specific information:
+- * PIRQ - vector, with MSB being "needs EIO"
++ * PIRQ - with MSB being "needs EOI"
+ * VIRQ - virq number
+ * IPI - IPI vector
+ * EVTCHN -
+@@ -83,20 +97,30 @@ struct irq_info
+ enum ipi_vector ipi;
+ struct {
+ unsigned short gsi;
+- unsigned short vector;
++ unsigned char vector;
++ unsigned char flags;
++ uint16_t domid;
+ } pirq;
+ } u;
+ };
++#define PIRQ_SHAREABLE (1 << 1)
+
+-static struct irq_info irq_info[NR_IRQS];
++/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
++static bool pirq_eoi_does_unmask;
++static unsigned long *pirq_needs_eoi_bits;
+
+-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+- [0 ... NR_EVENT_CHANNELS-1] = -1
+-};
++static struct irq_info *irq_info;
++
++static int *evtchn_to_irq;
+ struct cpu_evtchn_s {
+ unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
+ };
+-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
++
++static __initdata struct cpu_evtchn_s init_evtchn_mask = {
++ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
++};
++static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
++
+ static inline unsigned long *cpu_evtchn_mask(int cpu)
+ {
+ return cpu_evtchn_mask_p[cpu].bits;
+@@ -107,6 +131,7 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
+
+ static struct irq_chip xen_dynamic_chip;
+ static struct irq_chip xen_percpu_chip;
++static struct irq_chip xen_pirq_chip;
+
+ /* Constructor for packed IRQ information. */
+ static struct irq_info mk_unbound_info(void)
+@@ -136,7 +161,8 @@ static struct irq_info mk_pirq_info(unsigned short evtchn,
+ unsigned short gsi, unsigned short vector)
+ {
+ return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
+- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
++ .cpu = 0, .u.pirq =
++ { .gsi = gsi, .vector = vector, .domid = DOMID_SELF } };
+ }
+
+ /*
+@@ -219,6 +245,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
+ return ret;
+ }
+
++static bool pirq_needs_eoi(unsigned irq)
++{
++ struct irq_info *info = info_for_irq(irq);
++
++ BUG_ON(info->type != IRQT_PIRQ);
++
++ return test_bit(info->u.pirq.gsi, pirq_needs_eoi_bits);
++}
++
+ static inline unsigned long active_evtchns(unsigned int cpu,
+ struct shared_info *sh,
+ unsigned int idx)
+@@ -300,6 +335,14 @@ static void mask_evtchn(int port)
+ sync_set_bit(port, &s->evtchn_mask[0]);
+ }
+
++static void mask_irq(unsigned int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++
++ if (VALID_EVTCHN(evtchn))
++ mask_evtchn(evtchn);
++}
++
+ static void unmask_evtchn(int port)
+ {
+ struct shared_info *s = HYPERVISOR_shared_info;
+@@ -330,27 +373,371 @@ static void unmask_evtchn(int port)
+ put_cpu();
+ }
+
++static void unmask_irq(unsigned int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++
++ if (VALID_EVTCHN(evtchn))
++ unmask_evtchn(evtchn);
++}
++
++static int get_nr_hw_irqs(void)
++{
++ int ret = 1;
++
++#ifdef CONFIG_X86_IO_APIC
++ ret = get_nr_irqs_gsi();
++#endif
++
++ return ret;
++}
++
+ static int find_unbound_irq(void)
+ {
+ int irq;
+ struct irq_desc *desc;
++ int start = get_nr_hw_irqs();
+
+- for (irq = 0; irq < nr_irqs; irq++)
++ if (start == nr_irqs)
++ goto no_irqs;
++
++	/* nr_irqs is a magic value. Must not use it. */
++ for (irq = nr_irqs-1; irq > start; irq--) {
++ desc = irq_to_desc(irq);
++		/* only IRQs 0-15 have an init'd desc; handle irq >= 16 */
++ if (desc == NULL)
++ break;
++ if (desc->chip == &no_irq_chip)
++ break;
++ if (desc->chip != &xen_dynamic_chip)
++ continue;
+ if (irq_info[irq].type == IRQT_UNBOUND)
+ break;
++ }
+
+- if (irq == nr_irqs)
+- panic("No available IRQ to bind to: increase nr_irqs!\n");
++ if (irq == start)
++ goto no_irqs;
+
+ desc = irq_to_desc_alloc_node(irq, 0);
+ if (WARN_ON(desc == NULL))
+ return -1;
+
+- dynamic_irq_init(irq);
++ dynamic_irq_init_keep_chip_data(irq);
+
+ return irq;
++
++no_irqs:
++ panic("No available IRQ to bind to: increase nr_irqs!\n");
++}
++
++static bool identity_mapped_irq(unsigned irq)
++{
++ /* identity map all the hardware irqs */
++ return irq < get_nr_hw_irqs();
++}
++
++static void pirq_eoi(int irq)
++{
++ struct irq_info *info = info_for_irq(irq);
++ struct physdev_eoi eoi = { .irq = info->u.pirq.gsi };
++ bool need_eoi;
++
++ need_eoi = pirq_needs_eoi(irq);
++
++ if (!need_eoi || !pirq_eoi_does_unmask)
++ unmask_evtchn(info->evtchn);
++
++ if (need_eoi) {
++ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
++ WARN_ON(rc);
++ }
+ }
+
++static void pirq_query_unmask(int irq)
++{
++ struct physdev_irq_status_query irq_status;
++ struct irq_info *info = info_for_irq(irq);
++
++ if (pirq_eoi_does_unmask)
++ return;
++
++ BUG_ON(info->type != IRQT_PIRQ);
++
++ irq_status.irq = info->u.pirq.gsi;
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
++ irq_status.flags = 0;
++
++ clear_bit(info->u.pirq.gsi, pirq_needs_eoi_bits);
++ if (irq_status.flags & XENIRQSTAT_needs_eoi)
++ set_bit(info->u.pirq.gsi, pirq_needs_eoi_bits);
++}
++
++static bool probing_irq(int irq)
++{
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ return desc && desc->action == NULL;
++}
++
++static unsigned int startup_pirq(unsigned int irq)
++{
++ struct evtchn_bind_pirq bind_pirq;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
++ int rc;
++
++ BUG_ON(info->type != IRQT_PIRQ);
++
++ if (VALID_EVTCHN(evtchn))
++ goto out;
++
++ bind_pirq.pirq = info->u.pirq.gsi;
++ /* NB. We are happy to share unless we are probing. */
++ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
++ BIND_PIRQ__WILL_SHARE : 0;
++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
++ if (rc != 0) {
++ if (!probing_irq(irq))
++ printk(KERN_INFO "Failed to obtain physical IRQ %d" \
++ " (GSI:%d)\n", irq, info->u.pirq.gsi);
++ return 0;
++ }
++ evtchn = bind_pirq.port;
++
++ pirq_query_unmask(irq);
++
++ evtchn_to_irq[evtchn] = irq;
++ bind_evtchn_to_cpu(evtchn, 0);
++ info->evtchn = evtchn;
++
++ out:
++ pirq_eoi(irq);
++
++ return 0;
++}
++
++static void shutdown_pirq(unsigned int irq)
++{
++ struct evtchn_close close;
++ struct irq_info *info = info_for_irq(irq);
++ int evtchn = evtchn_from_irq(irq);
++
++ BUG_ON(info->type != IRQT_PIRQ);
++
++ if (!VALID_EVTCHN(evtchn))
++ return;
++
++ mask_evtchn(evtchn);
++
++ close.port = evtchn;
++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
++ BUG();
++
++ bind_evtchn_to_cpu(evtchn, 0);
++ evtchn_to_irq[evtchn] = -1;
++ info->evtchn = 0;
++}
++
++static void ack_pirq(unsigned int irq)
++{
++ move_masked_irq(irq);
++
++ pirq_eoi(irq);
++}
++
++static void end_pirq(unsigned int irq)
++{
++ int evtchn = evtchn_from_irq(irq);
++ struct irq_desc *desc = irq_to_desc(irq);
++
++ if (WARN_ON(!desc))
++ return;
++
++ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
++ (IRQ_DISABLED|IRQ_PENDING)) {
++ shutdown_pirq(irq);
++ } else if (VALID_EVTCHN(evtchn)) {
++ pirq_eoi(irq);
++ }
++}
++
++static int find_irq_by_gsi(unsigned gsi)
++{
++ int irq;
++
++ for (irq = 0; irq < nr_irqs; irq++) {
++ struct irq_info *info = info_for_irq(irq);
++
++ if (info == NULL || info->type != IRQT_PIRQ)
++ continue;
++
++ if (gsi_from_irq(irq) == gsi)
++ return irq;
++ }
++
++ return -1;
++}
++
++/*
++ * Allocate a physical irq, along with a vector. We don't assign an
++ * event channel until the irq actually started up. Return an
++ * existing irq if we've already got one for the gsi.
++ */
++int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
++{
++ int irq;
++ struct physdev_irq irq_op;
++
++ spin_lock(&irq_mapping_update_lock);
++
++ irq = find_irq_by_gsi(gsi);
++ if (irq != -1) {
++ printk(KERN_INFO "xen_allocate_pirq: returning irq %d for gsi %u\n",
++ irq, gsi);
++ goto out; /* XXX need refcount? */
++ }
++
++	/* If we are a PV guest, we don't have GSIs (no ACPI passed).
++	 * Therefore we use !xen_initial_domain() to take this path. */
++ if (identity_mapped_irq(gsi) || !xen_initial_domain()) {
++ irq = gsi;
++ irq_to_desc_alloc_node(irq, 0);
++ dynamic_irq_init(irq);
++ } else
++ irq = find_unbound_irq();
++
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_fasteoi_irq, name);
++
++ irq_op.irq = gsi;
++ irq_op.vector = 0;
++
++ /* Only the privileged domain can do this. For non-priv, the pcifront
++ * driver provides a PCI bus that does the call to do exactly
++ * this in the priv domain. */
++ if (xen_initial_domain() &&
++ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
++ dynamic_irq_cleanup(irq);
++ irq = -ENOSPC;
++ goto out;
++ }
++
++ irq_info[irq] = mk_pirq_info(0, gsi, irq_op.vector);
++ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
++
++out:
++ spin_unlock(&irq_mapping_update_lock);
++
++ return irq;
++}
++
++#ifdef CONFIG_PCI_MSI
++int xen_destroy_irq(int irq)
++{
++ struct irq_desc *desc;
++ struct physdev_unmap_pirq unmap_irq;
++ struct irq_info *info = info_for_irq(irq);
++ int rc = -ENOENT;
++
++ spin_lock(&irq_mapping_update_lock);
++
++ desc = irq_to_desc(irq);
++ if (!desc)
++ goto out;
++
++ if (xen_initial_domain()) {
++ unmap_irq.pirq = info->u.pirq.gsi;
++ unmap_irq.domid = info->u.pirq.domid;
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
++ if (rc) {
++ printk(KERN_WARNING "unmap irq failed %d\n", rc);
++ goto out;
++ }
++ }
++ irq_info[irq] = mk_unbound_info();
++
++ dynamic_irq_cleanup(irq);
++
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return rc;
++}
++
++#ifdef CONFIG_PCI_XEN
++int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
++{
++ int irq = 0;
++ struct physdev_map_pirq map_irq;
++ int rc;
++ domid_t domid;
++ int pos;
++ u32 table_offset, bir;
++
++ domid = rc = xen_find_device_domain_owner(dev);
++ if (rc < 0)
++ domid = DOMID_SELF;
++
++ memset(&map_irq, 0, sizeof(map_irq));
++ map_irq.domid = domid;
++ map_irq.type = MAP_PIRQ_TYPE_MSI;
++ map_irq.index = -1;
++ map_irq.pirq = -1;
++ map_irq.bus = dev->bus->number;
++ map_irq.devfn = dev->devfn;
++
++ if (type == PCI_CAP_ID_MSIX) {
++ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
++
++ pci_read_config_dword(dev, msix_table_offset_reg(pos),
++ &table_offset);
++ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
++
++ map_irq.table_base = pci_resource_start(dev, bir);
++ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
++ }
++
++ spin_lock(&irq_mapping_update_lock);
++
++ irq = find_unbound_irq();
++
++ if (irq == -1)
++ goto out;
++
++ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
++ if (rc) {
++ printk(KERN_WARNING "xen map irq failed %d\n", rc);
++
++ dynamic_irq_cleanup(irq);
++
++ irq = -1;
++ goto out;
++ }
++ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, map_irq.index);
++ if (domid)
++ irq_info[irq].u.pirq.domid = domid;
++
++ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
++ handle_fasteoi_irq,
++ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
++
++out:
++ spin_unlock(&irq_mapping_update_lock);
++ return irq;
++}
++#endif
++#endif
++
++int xen_vector_from_irq(unsigned irq)
++{
++ return vector_from_irq(irq);
++}
++
++int xen_gsi_from_irq(unsigned irq)
++{
++ return gsi_from_irq(irq);
++}
++EXPORT_SYMBOL_GPL(xen_gsi_from_irq);
++
+ int bind_evtchn_to_irq(unsigned int evtchn)
+ {
+ int irq;
+@@ -363,7 +750,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
+- handle_edge_irq, "event");
++ handle_fasteoi_irq, "event");
+
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_evtchn_info(evtchn);
+@@ -410,8 +797,23 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+ return irq;
+ }
+
++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
++ unsigned int remote_port)
++{
++ struct evtchn_bind_interdomain bind_interdomain;
++ int err;
++
++ bind_interdomain.remote_dom = remote_domain;
++ bind_interdomain.remote_port = remote_port;
++
++ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
++ &bind_interdomain);
++
++ return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
++}
++
+
+-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
++int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+ {
+ struct evtchn_bind_virq bind_virq;
+ int evtchn, irq;
+@@ -421,6 +823,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+ irq = per_cpu(virq_to_irq, cpu)[virq];
+
+ if (irq == -1) {
++ irq = find_unbound_irq();
++
++ set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
++ handle_percpu_irq, "virq");
++
+ bind_virq.virq = virq;
+ bind_virq.vcpu = cpu;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+@@ -428,11 +835,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+ BUG();
+ evtchn = bind_virq.port;
+
+- irq = find_unbound_irq();
+-
+- set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+- handle_percpu_irq, "virq");
+-
+ evtchn_to_irq[evtchn] = irq;
+ irq_info[irq] = mk_virq_info(evtchn, virq);
+
+@@ -505,6 +907,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ }
+ EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
++ unsigned int remote_port,
++ irq_handler_t handler,
++ unsigned long irqflags,
++ const char *devname,
++ void *dev_id)
++{
++ int irq, retval;
++
++ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
++ if (irq < 0)
++ return irq;
++
++ retval = request_irq(irq, handler, irqflags, devname, dev_id);
++ if (retval != 0) {
++ unbind_from_irq(irq);
++ return retval;
++ }
++
++ return irq;
++}
++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
++
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname, void *dev_id)
+@@ -618,17 +1043,13 @@ static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+ * a bitset of words which contain pending event bits. The second
+ * level is a bitset of pending events themselves.
+ */
+-void xen_evtchn_do_upcall(struct pt_regs *regs)
++static void __xen_evtchn_do_upcall(struct pt_regs *regs)
+ {
+ int cpu = get_cpu();
+- struct pt_regs *old_regs = set_irq_regs(regs);
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
+ unsigned count;
+
+- exit_idle();
+- irq_enter();
+-
+ do {
+ unsigned long pending_words;
+
+@@ -651,9 +1072,16 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
++ struct irq_desc *desc;
+
+- if (irq != -1)
+- handle_irq(irq, regs);
++ mask_evtchn(port);
++ clear_evtchn(port);
++
++ if (irq != -1) {
++ desc = irq_to_desc(irq);
++ if (desc)
++ generic_handle_irq_desc(irq, desc);
++ }
+ }
+ }
+
+@@ -661,14 +1089,32 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
+
+ count = __get_cpu_var(xed_nesting_count);
+ __get_cpu_var(xed_nesting_count) = 0;
+- } while(count != 1);
++ } while (count != 1 || vcpu_info->evtchn_upcall_pending);
+
+ out:
++
++ put_cpu();
++}
++
++void xen_evtchn_do_upcall(struct pt_regs *regs)
++{
++ struct pt_regs *old_regs = set_irq_regs(regs);
++
++ exit_idle();
++ irq_enter();
++
++ __xen_evtchn_do_upcall(regs);
++
+ irq_exit();
+ set_irq_regs(old_regs);
++}
+
+- put_cpu();
++void xen_hvm_evtchn_do_upcall(void)
++{
++ struct pt_regs *regs = get_irq_regs();
++ __xen_evtchn_do_upcall(regs);
+ }
++EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
+
+ /* Rebind a new event channel to an existing irq. */
+ void rebind_evtchn_irq(int evtchn, int irq)
+@@ -705,7 +1151,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+ struct evtchn_bind_vcpu bind_vcpu;
+ int evtchn = evtchn_from_irq(irq);
+
+- if (!VALID_EVTCHN(evtchn))
++ /* events delivered via platform PCI interrupts are always
++ * routed to vcpu 0 */
++ if (!VALID_EVTCHN(evtchn) ||
++ (xen_hvm_domain() && !xen_have_vector_callback))
+ return -1;
+
+ /* Send future instances of this interrupt to other vcpu. */
+@@ -746,33 +1195,17 @@ int resend_irq_on_evtchn(unsigned int irq)
+ return 1;
+ }
+
+-static void enable_dynirq(unsigned int irq)
+-{
+- int evtchn = evtchn_from_irq(irq);
+-
+- if (VALID_EVTCHN(evtchn))
+- unmask_evtchn(evtchn);
+-}
+-
+-static void disable_dynirq(unsigned int irq)
+-{
+- int evtchn = evtchn_from_irq(irq);
+-
+- if (VALID_EVTCHN(evtchn))
+- mask_evtchn(evtchn);
+-}
+-
+ static void ack_dynirq(unsigned int irq)
+ {
+ int evtchn = evtchn_from_irq(irq);
+
+- move_native_irq(irq);
++ move_masked_irq(irq);
+
+ if (VALID_EVTCHN(evtchn))
+- clear_evtchn(evtchn);
++ unmask_evtchn(evtchn);
+ }
+
+-static int retrigger_dynirq(unsigned int irq)
++static int retrigger_irq(unsigned int irq)
+ {
+ int evtchn = evtchn_from_irq(irq);
+ struct shared_info *sh = HYPERVISOR_shared_info;
+@@ -857,7 +1290,7 @@ void xen_clear_irq_pending(int irq)
+ if (VALID_EVTCHN(evtchn))
+ clear_evtchn(evtchn);
+ }
+-
++EXPORT_SYMBOL(xen_clear_irq_pending);
+ void xen_set_irq_pending(int irq)
+ {
+ int evtchn = evtchn_from_irq(irq);
+@@ -877,9 +1310,9 @@ bool xen_test_irq_pending(int irq)
+ return ret;
+ }
+
+-/* Poll waiting for an irq to become pending. In the usual case, the
++/* Poll waiting for an irq to become pending with timeout. In the usual case, the
+ irq will be disabled so it won't deliver an interrupt. */
+-void xen_poll_irq(int irq)
++void xen_poll_irq_timeout(int irq, u64 timeout)
+ {
+ evtchn_port_t evtchn = evtchn_from_irq(irq);
+
+@@ -887,13 +1320,33 @@ void xen_poll_irq(int irq)
+ struct sched_poll poll;
+
+ poll.nr_ports = 1;
+- poll.timeout = 0;
++ poll.timeout = timeout;
+ set_xen_guest_handle(poll.ports, &evtchn);
+
+ if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
+ BUG();
+ }
+ }
++EXPORT_SYMBOL(xen_poll_irq_timeout);
++/* Poll waiting for an irq to become pending. In the usual case, the
++ irq will be disabled so it won't deliver an interrupt. */
++void xen_poll_irq(int irq)
++{
++ xen_poll_irq_timeout(irq, 0 /* no timeout */);
++}
++
++/* Return true (non-zero) if the IRQ line is not shared with other guests. */
++int xen_ignore_irq(int irq)
++{
++ struct irq_info *info = info_for_irq(irq);
++ struct physdev_irq_status_query irq_status = { .irq =
++ info->u.pirq.gsi };
++
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
++ return 0;
++ return !(irq_status.flags & XENIRQSTAT_shared);
++}
++EXPORT_SYMBOL_GPL(xen_ignore_irq);
+
+ void xen_irq_resume(void)
+ {
+@@ -916,37 +1369,117 @@ void xen_irq_resume(void)
+ restore_cpu_virqs(cpu);
+ restore_cpu_ipis(cpu);
+ }
++
++ if (pirq_eoi_does_unmask) {
++ struct physdev_pirq_eoi_gmfn eoi_gmfn;
++
++ eoi_gmfn.gmfn = virt_to_mfn(pirq_needs_eoi_bits);
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) != 0) {
++ /* Could recover by reverting to old method...? */
++ BUG();
++ }
++ }
+ }
+
+ static struct irq_chip xen_dynamic_chip __read_mostly = {
+ .name = "xen-dyn",
+
+- .disable = disable_dynirq,
+- .mask = disable_dynirq,
+- .unmask = enable_dynirq,
++ .disable = mask_irq,
++ .mask = mask_irq,
++ .unmask = unmask_irq,
+
+- .ack = ack_dynirq,
++ .eoi = ack_dynirq,
+ .set_affinity = set_affinity_irq,
+- .retrigger = retrigger_dynirq,
++ .retrigger = retrigger_irq,
+ };
+
+-static struct irq_chip en_percpu_chip __read_mostly = {
++static struct irq_chip xen_percpu_chip __read_mostly = {
+ .name = "xen-percpu",
+
+- .disable = disable_dynirq,
+- .mask = disable_dynirq,
+- .unmask = enable_dynirq,
++ .disable = mask_irq,
++ .mask = mask_irq,
++ .unmask = unmask_irq,
+
+ .ack = ack_dynirq,
+ };
+
++static struct irq_chip xen_pirq_chip __read_mostly = {
++ .name = "xen-pirq",
++
++ .startup = startup_pirq,
++ .shutdown = shutdown_pirq,
++
++ .enable = pirq_eoi,
++ .unmask = unmask_irq,
++
++ .disable = mask_irq,
++ .mask = mask_irq,
++
++ .eoi = ack_pirq,
++ .end = end_pirq,
++
++ .set_affinity = set_affinity_irq,
++
++ .retrigger = retrigger_irq,
++};
++
++int xen_set_callback_via(uint64_t via)
++{
++ struct xen_hvm_param a;
++ a.domid = DOMID_SELF;
++ a.index = HVM_PARAM_CALLBACK_IRQ;
++ a.value = via;
++ return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
++}
++EXPORT_SYMBOL_GPL(xen_set_callback_via);
++
++#ifdef CONFIG_XEN_PVHVM
++/* Vector callbacks are better than PCI interrupts to receive event
++ * channel notifications because we can receive vector callbacks on any
++ * vcpu and we don't need PCI support or APIC interactions. */
++void xen_callback_vector(void)
++{
++ int rc;
++ uint64_t callback_via;
++ if (xen_have_vector_callback) {
++ callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
++ rc = xen_set_callback_via(callback_via);
++ if (rc) {
++ printk(KERN_ERR "Request for Xen HVM callback vector"
++ " failed.\n");
++ xen_have_vector_callback = 0;
++ return;
++ }
++ printk(KERN_INFO "Xen HVM callback vector for event delivery is "
++ "enabled\n");
++ alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
++ }
++}
++#else
++void xen_callback_vector(void) {}
++#endif
++
+ void __init xen_init_IRQ(void)
+ {
+ int i;
++ struct physdev_pirq_eoi_gmfn eoi_gmfn;
++ int nr_pirqs = NR_IRQS;
+
+ cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
+ GFP_KERNEL);
+- BUG_ON(cpu_evtchn_mask_p == NULL);
++ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
++
++ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
++ GFP_KERNEL);
++	for (i = 0; i < NR_EVENT_CHANNELS; i++)
++ evtchn_to_irq[i] = -1;
++
++ i = get_order(sizeof(unsigned long) * BITS_TO_LONGS(nr_pirqs));
++ pirq_needs_eoi_bits = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, i);
++
++ eoi_gmfn.gmfn = virt_to_mfn(pirq_needs_eoi_bits);
++ if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
++ pirq_eoi_does_unmask = true;
+
+ init_evtchn_cpu_bindings();
+
+@@ -954,5 +1487,11 @@ void __init xen_init_IRQ(void)
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ mask_evtchn(i);
+
+- irq_ctx_init(smp_processor_id());
++ if (xen_hvm_domain()) {
++ xen_callback_vector();
++ native_init_IRQ();
++ } else {
++ irq_ctx_init(smp_processor_id());
++ xen_setup_pirqs();
++ }
+ }
+diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
+index 79bedba..b82666a 100644
+--- a/drivers/xen/evtchn.c
++++ b/drivers/xen/evtchn.c
+@@ -48,6 +48,8 @@
+ #include <linux/gfp.h>
+ #include <linux/mutex.h>
+ #include <linux/cpu.h>
++
++#include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/evtchn.h>
+ #include <asm/xen/hypervisor.h>
+@@ -68,10 +70,36 @@ struct per_user_data {
+ const char *name;
+ };
+
+-/* Who's bound to each port? */
+-static struct per_user_data *port_user[NR_EVENT_CHANNELS];
++/*
++ * Who's bound to each port? This is logically an array of struct
++ * per_user_data *, but we encode the current enabled-state in bit 0.
++ */
++static unsigned long *port_user;
+ static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+
++static inline struct per_user_data *get_port_user(unsigned port)
++{
++ return (struct per_user_data *)(port_user[port] & ~1);
++}
++
++static inline void set_port_user(unsigned port, struct per_user_data *u)
++{
++ port_user[port] = (unsigned long)u;
++}
++
++static inline bool get_port_enabled(unsigned port)
++{
++ return port_user[port] & 1;
++}
++
++static inline void set_port_enabled(unsigned port, bool enabled)
++{
++ if (enabled)
++ port_user[port] |= 1;
++ else
++ port_user[port] &= ~1;
++}
++
+ irqreturn_t evtchn_interrupt(int irq, void *data)
+ {
+ unsigned int port = (unsigned long)data;
+@@ -79,9 +107,14 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
+
+ spin_lock(&port_user_lock);
+
+- u = port_user[port];
++ u = get_port_user(port);
++
++ WARN(!get_port_enabled(port),
++ "Interrupt for port %d, but apparently not enabled; per-user %p\n",
++ port, u);
+
+ disable_irq_nosync(irq);
++ set_port_enabled(port, false);
+
+ if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+@@ -91,9 +124,8 @@ irqreturn_t evtchn_interrupt(int irq, void *data)
+ kill_fasync(&u->evtchn_async_queue,
+ SIGIO, POLL_IN);
+ }
+- } else {
++ } else
+ u->ring_overflow = 1;
+- }
+
+ spin_unlock(&port_user_lock);
+
+@@ -197,9 +229,18 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
+ goto out;
+
+ spin_lock_irq(&port_user_lock);
+- for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+- if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+- enable_irq(irq_from_evtchn(kbuf[i]));
++
++ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
++ unsigned port = kbuf[i];
++
++ if (port < NR_EVENT_CHANNELS &&
++ get_port_user(port) == u &&
++ !get_port_enabled(port)) {
++ set_port_enabled(port, true);
++ enable_irq(irq_from_evtchn(port));
++ }
++ }
++
+ spin_unlock_irq(&port_user_lock);
+
+ rc = count;
+@@ -221,8 +262,9 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port)
+ * interrupt handler yet, and our caller has already
+ * serialized bind operations.)
+ */
+- BUG_ON(port_user[port] != NULL);
+- port_user[port] = u;
++ BUG_ON(get_port_user(port) != NULL);
++ set_port_user(port, u);
++ set_port_enabled(port, true); /* start enabled */
+
+ rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+ u->name, (void *)(unsigned long)port);
+@@ -238,10 +280,7 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+
+ unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+
+- /* make sure we unbind the irq handler before clearing the port */
+- barrier();
+-
+- port_user[port] = NULL;
++ set_port_user(port, NULL);
+ }
+
+ static long evtchn_ioctl(struct file *file,
+@@ -332,15 +371,17 @@ static long evtchn_ioctl(struct file *file,
+ spin_lock_irq(&port_user_lock);
+
+ rc = -ENOTCONN;
+- if (port_user[unbind.port] != u) {
++ if (get_port_user(unbind.port) != u) {
+ spin_unlock_irq(&port_user_lock);
+ break;
+ }
+
+- evtchn_unbind_from_user(u, unbind.port);
++ disable_irq(irq_from_evtchn(unbind.port));
+
+ spin_unlock_irq(&port_user_lock);
+
++ evtchn_unbind_from_user(u, unbind.port);
++
+ rc = 0;
+ break;
+ }
+@@ -354,7 +395,7 @@ static long evtchn_ioctl(struct file *file,
+
+ if (notify.port >= NR_EVENT_CHANNELS) {
+ rc = -EINVAL;
+- } else if (port_user[notify.port] != u) {
++ } else if (get_port_user(notify.port) != u) {
+ rc = -ENOTCONN;
+ } else {
+ notify_remote_via_evtchn(notify.port);
+@@ -443,14 +484,21 @@ static int evtchn_release(struct inode *inode, struct file *filp)
+ free_page((unsigned long)u->ring);
+
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+- if (port_user[i] != u)
++ if (get_port_user(i) != u)
+ continue;
+
+- evtchn_unbind_from_user(port_user[i], i);
++ disable_irq(irq_from_evtchn(i));
+ }
+
+ spin_unlock_irq(&port_user_lock);
+
++ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
++ if (get_port_user(i) != u)
++ continue;
++
++ evtchn_unbind_from_user(get_port_user(i), i);
++ }
++
+ kfree(u->name);
+ kfree(u);
+
+@@ -470,7 +518,7 @@ static const struct file_operations evtchn_fops = {
+
+ static struct miscdevice evtchn_miscdev = {
+ .minor = MISC_DYNAMIC_MINOR,
+- .name = "evtchn",
++ .name = "xen/evtchn",
+ .fops = &evtchn_fops,
+ };
+ static int __init evtchn_init(void)
+@@ -480,8 +528,11 @@ static int __init evtchn_init(void)
+ if (!xen_domain())
+ return -ENODEV;
+
++ port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
++ if (port_user == NULL)
++ return -ENOMEM;
++
+ spin_lock_init(&port_user_lock);
+- memset(port_user, 0, sizeof(port_user));
+
+ /* Create '/dev/misc/evtchn'. */
+ err = misc_register(&evtchn_miscdev);
+@@ -497,6 +548,9 @@ static int __init evtchn_init(void)
+
+ static void __exit evtchn_cleanup(void)
+ {
++ kfree(port_user);
++ port_user = NULL;
++
+ misc_deregister(&evtchn_miscdev);
+ }
+
+diff --git a/drivers/xen/features.c b/drivers/xen/features.c
+index 99eda16..9e2b64f 100644
+--- a/drivers/xen/features.c
++++ b/drivers/xen/features.c
+@@ -18,7 +18,7 @@
+ u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
+ EXPORT_SYMBOL_GPL(xen_features);
+
+-void xen_setup_features(void)
++void __init xen_setup_features(void)
+ {
+ struct xen_feature_info fi;
+ int i, j;
+diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
+new file mode 100644
+index 0000000..a33e443
+--- /dev/null
++++ b/drivers/xen/gntdev.c
+@@ -0,0 +1,645 @@
++/******************************************************************************
++ * gntdev.c
++ *
++ * Device for accessing (in user-space) pages that have been granted by other
++ * domains.
++ *
++ * Copyright (c) 2006-2007, D G Murray.
++ * (c) 2009 Gerd Hoffmann <kraxel@redhat.com>
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/miscdevice.h>
++#include <linux/fs.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/mmu_notifier.h>
++#include <linux/types.h>
++#include <linux/uaccess.h>
++#include <linux/sched.h>
++#include <linux/spinlock.h>
++
++#include <xen/xen.h>
++#include <xen/grant_table.h>
++#include <xen/gntdev.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
++
++MODULE_LICENSE("GPL");
++MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, "
++ "Gerd Hoffmann <kraxel@redhat.com>");
++MODULE_DESCRIPTION("User-space granted page access driver");
++
++static int debug = 0;
++module_param(debug, int, 0644);
++static int limit = 1024;
++module_param(limit, int, 0644);
++
++struct gntdev_priv {
++ struct list_head maps;
++ uint32_t used;
++ uint32_t limit;
++ spinlock_t lock;
++ struct mm_struct *mm;
++ struct mmu_notifier mn;
++};
++
++struct grant_map {
++ struct list_head next;
++ struct gntdev_priv *priv;
++ struct vm_area_struct *vma;
++ int index;
++ int count;
++ int flags;
++ int is_mapped;
++ struct ioctl_gntdev_grant_ref *grants;
++ struct gnttab_map_grant_ref *map_ops;
++ struct gnttab_unmap_grant_ref *unmap_ops;
++};
++
++/* ------------------------------------------------------------------ */
++
++static void gntdev_print_maps(struct gntdev_priv *priv,
++ char *text, int text_index)
++{
++ struct grant_map *map;
++
++ printk("%s: maps list (priv %p, usage %d/%d)\n",
++ __FUNCTION__, priv, priv->used, priv->limit);
++ list_for_each_entry(map, &priv->maps, next)
++ printk(" index %2d, count %2d %s\n",
++ map->index, map->count,
++ map->index == text_index && text ? text : "");
++}
++
++static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count)
++{
++ struct grant_map *add;
++
++ add = kzalloc(sizeof(struct grant_map), GFP_KERNEL);
++ if (NULL == add)
++ return NULL;
++
++ add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL);
++ add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL);
++ add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL);
++ if (NULL == add->grants ||
++ NULL == add->map_ops ||
++ NULL == add->unmap_ops)
++ goto err;
++
++ add->index = 0;
++ add->count = count;
++ add->priv = priv;
++
++ if (add->count + priv->used > priv->limit)
++ goto err;
++
++ return add;
++
++err:
++ kfree(add->grants);
++ kfree(add->map_ops);
++ kfree(add->unmap_ops);
++ kfree(add);
++ return NULL;
++}
++
++static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add)
++{
++ struct grant_map *map;
++
++ list_for_each_entry(map, &priv->maps, next) {
++ if (add->index + add->count < map->index) {
++ list_add_tail(&add->next, &map->next);
++ goto done;
++ }
++ add->index = map->index + map->count;
++ }
++ list_add_tail(&add->next, &priv->maps);
++
++done:
++ priv->used += add->count;
++ if (debug)
++ gntdev_print_maps(priv, "[new]", add->index);
++}
++
++static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, int index,
++ int count)
++{
++ struct grant_map *map;
++
++ list_for_each_entry(map, &priv->maps, next) {
++ if (map->index != index)
++ continue;
++ if (map->count != count)
++ continue;
++ return map;
++ }
++ return NULL;
++}
++
++static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv,
++ unsigned long vaddr)
++{
++ struct grant_map *map;
++
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (vaddr < map->vma->vm_start)
++ continue;
++ if (vaddr >= map->vma->vm_end)
++ continue;
++ return map;
++ }
++ return NULL;
++}
++
++static int gntdev_del_map(struct grant_map *map)
++{
++ int i;
++
++ if (map->vma)
++ return -EBUSY;
++ for (i = 0; i < map->count; i++)
++ if (map->unmap_ops[i].handle)
++ return -EBUSY;
++
++ map->priv->used -= map->count;
++ list_del(&map->next);
++ return 0;
++}
++
++static void gntdev_free_map(struct grant_map *map)
++{
++ if (!map)
++ return;
++ kfree(map->grants);
++ kfree(map->map_ops);
++ kfree(map->unmap_ops);
++ kfree(map);
++}
++
++/* ------------------------------------------------------------------ */
++
++static int find_grant_ptes(pte_t *pte, pgtable_t token, unsigned long addr, void *data)
++{
++ struct grant_map *map = data;
++ unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT;
++ u64 pte_maddr;
++
++ BUG_ON(pgnr >= map->count);
++ pte_maddr = (u64)pfn_to_mfn(page_to_pfn(token)) << PAGE_SHIFT;
++ pte_maddr += (unsigned long)pte & ~PAGE_MASK;
++ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, map->flags,
++ map->grants[pgnr].ref,
++ map->grants[pgnr].domid);
++ gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, map->flags,
++ 0 /* handle */);
++ return 0;
++}
++
++static int map_grant_pages(struct grant_map *map)
++{
++ int i, err = 0;
++
++ if (debug)
++ printk("%s: map %d+%d\n", __FUNCTION__, map->index, map->count);
++ err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ map->map_ops, map->count);
++ if (WARN_ON(err))
++ return err;
++
++ for (i = 0; i < map->count; i++) {
++ if (map->map_ops[i].status)
++ err = -EINVAL;
++ map->unmap_ops[i].handle = map->map_ops[i].handle;
++ }
++ return err;
++}
++
++static int unmap_grant_pages(struct grant_map *map, int offset, int pages)
++{
++ int i, err = 0;
++
++ if (debug)
++ printk("%s: map %d+%d [%d+%d]\n", __FUNCTION__,
++ map->index, map->count, offset, pages);
++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
++ map->unmap_ops + offset, pages);
++ if (WARN_ON(err))
++ return err;
++
++ for (i = 0; i < pages; i++) {
++ if (map->unmap_ops[offset+i].status)
++ err = -EINVAL;
++ map->unmap_ops[offset+i].handle = 0;
++ }
++ return err;
++}
++
++/* ------------------------------------------------------------------ */
++
++static void gntdev_vma_close(struct vm_area_struct *vma)
++{
++ struct grant_map *map = vma->vm_private_data;
++
++ if (debug)
++ printk("%s\n", __FUNCTION__);
++ map->is_mapped = 0;
++ map->vma = NULL;
++ vma->vm_private_data = NULL;
++}
++
++static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ if (debug)
++ printk("%s: vaddr %p, pgoff %ld (shouldn't happen)\n",
++ __FUNCTION__, vmf->virtual_address, vmf->pgoff);
++ vmf->flags = VM_FAULT_ERROR;
++ return 0;
++}
++
++static struct vm_operations_struct gntdev_vmops = {
++ .close = gntdev_vma_close,
++ .fault = gntdev_vma_fault,
++};
++
++/* ------------------------------------------------------------------ */
++
++static void mn_invl_range_start(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long start, unsigned long end)
++{
++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++ struct grant_map *map;
++ unsigned long mstart, mend;
++ int err;
++
++ spin_lock(&priv->lock);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (!map->is_mapped)
++ continue;
++ if (map->vma->vm_start >= end)
++ continue;
++ if (map->vma->vm_end <= start)
++ continue;
++ mstart = max(start, map->vma->vm_start);
++ mend = min(end, map->vma->vm_end);
++ if (debug)
++ printk("%s: map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
++ __FUNCTION__, map->index, map->count,
++ map->vma->vm_start, map->vma->vm_end,
++ start, end, mstart, mend);
++ err = unmap_grant_pages(map,
++ (mstart - map->vma->vm_start) >> PAGE_SHIFT,
++ (mend - mstart) >> PAGE_SHIFT);
++ WARN_ON(err);
++ }
++ spin_unlock(&priv->lock);
++}
++
++static void mn_invl_page(struct mmu_notifier *mn,
++ struct mm_struct *mm,
++ unsigned long address)
++{
++ mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
++}
++
++static void mn_release(struct mmu_notifier *mn,
++ struct mm_struct *mm)
++{
++ struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
++ struct grant_map *map;
++ int err;
++
++ spin_lock(&priv->lock);
++ list_for_each_entry(map, &priv->maps, next) {
++ if (!map->vma)
++ continue;
++ if (debug)
++ printk("%s: map %d+%d (%lx %lx)\n",
++ __FUNCTION__, map->index, map->count,
++ map->vma->vm_start, map->vma->vm_end);
++ err = unmap_grant_pages(map, 0, map->count);
++ WARN_ON(err);
++ }
++ spin_unlock(&priv->lock);
++}
++
++struct mmu_notifier_ops gntdev_mmu_ops = {
++ .release = mn_release,
++ .invalidate_page = mn_invl_page,
++ .invalidate_range_start = mn_invl_range_start,
++};
++
++/* ------------------------------------------------------------------ */
++
++static int gntdev_open(struct inode *inode, struct file *flip)
++{
++ struct gntdev_priv *priv;
++
++ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
++ if (!priv)
++ return -ENOMEM;
++
++ INIT_LIST_HEAD(&priv->maps);
++ spin_lock_init(&priv->lock);
++ priv->limit = limit;
++
++ priv->mm = get_task_mm(current);
++ if (!priv->mm) {
++ kfree(priv);
++ return -ENOMEM;
++ }
++ priv->mn.ops = &gntdev_mmu_ops;
++ mmu_notifier_register(&priv->mn, priv->mm);
++ mmput(priv->mm);
++
++ flip->private_data = priv;
++ if (debug)
++ printk("%s: priv %p\n", __FUNCTION__, priv);
++
++ return 0;
++}
++
++static int gntdev_release(struct inode *inode, struct file *flip)
++{
++ struct gntdev_priv *priv = flip->private_data;
++ struct grant_map *map;
++ int err;
++
++ if (debug)
++ printk("%s: priv %p\n", __FUNCTION__, priv);
++
++ spin_lock(&priv->lock);
++ while (!list_empty(&priv->maps)) {
++ map = list_entry(priv->maps.next, struct grant_map, next);
++ err = gntdev_del_map(map);
++ if (WARN_ON(err))
++ gntdev_free_map(map);
++
++ }
++ spin_unlock(&priv->lock);
++
++ mmu_notifier_unregister(&priv->mn, priv->mm);
++ kfree(priv);
++ return 0;
++}
++
++static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
++ struct ioctl_gntdev_map_grant_ref __user *u)
++{
++ struct ioctl_gntdev_map_grant_ref op;
++ struct grant_map *map;
++ int err;
++
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, add %d\n", __FUNCTION__, priv,
++ op.count);
++ if (unlikely(op.count <= 0))
++ return -EINVAL;
++ if (unlikely(op.count > priv->limit))
++ return -EINVAL;
++
++ err = -ENOMEM;
++ map = gntdev_alloc_map(priv, op.count);
++ if (!map)
++ return err;
++ if (copy_from_user(map->grants, &u->refs,
++ sizeof(map->grants[0]) * op.count) != 0) {
++ gntdev_free_map(map);
++ return err;
++ }
++
++ spin_lock(&priv->lock);
++ gntdev_add_map(priv, map);
++ op.index = map->index << PAGE_SHIFT;
++ spin_unlock(&priv->lock);
++
++ if (copy_to_user(u, &op, sizeof(op)) != 0) {
++ spin_lock(&priv->lock);
++ gntdev_del_map(map);
++ spin_unlock(&priv->lock);
++ gntdev_free_map(map);
++ return err;
++ }
++ return 0;
++}
++
++static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
++ struct ioctl_gntdev_unmap_grant_ref __user *u)
++{
++ struct ioctl_gntdev_unmap_grant_ref op;
++ struct grant_map *map;
++ int err = -EINVAL;
++
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, del %d+%d\n", __FUNCTION__, priv,
++ (int)op.index, (int)op.count);
++
++ spin_lock(&priv->lock);
++ map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
++ if (map)
++ err = gntdev_del_map(map);
++ spin_unlock(&priv->lock);
++ if (!err)
++ gntdev_free_map(map);
++ return err;
++}
++
++static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
++ struct ioctl_gntdev_get_offset_for_vaddr __user *u)
++{
++ struct ioctl_gntdev_get_offset_for_vaddr op;
++ struct grant_map *map;
++
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, offset for vaddr %lx\n", __FUNCTION__, priv,
++ (unsigned long)op.vaddr);
++
++ spin_lock(&priv->lock);
++ map = gntdev_find_map_vaddr(priv, op.vaddr);
++ if (map == NULL ||
++ map->vma->vm_start != op.vaddr) {
++ spin_unlock(&priv->lock);
++ return -EINVAL;
++ }
++ op.offset = map->index << PAGE_SHIFT;
++ op.count = map->count;
++ spin_unlock(&priv->lock);
++
++ if (copy_to_user(u, &op, sizeof(op)) != 0)
++ return -EFAULT;
++ return 0;
++}
++
++static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv,
++ struct ioctl_gntdev_set_max_grants __user *u)
++{
++ struct ioctl_gntdev_set_max_grants op;
++
++ if (copy_from_user(&op, u, sizeof(op)) != 0)
++ return -EFAULT;
++ if (debug)
++ printk("%s: priv %p, limit %d\n", __FUNCTION__, priv, op.count);
++ if (op.count > limit)
++ return -EINVAL;
++
++ spin_lock(&priv->lock);
++ priv->limit = op.count;
++ spin_unlock(&priv->lock);
++ return 0;
++}
++
++static long gntdev_ioctl(struct file *flip,
++ unsigned int cmd, unsigned long arg)
++{
++ struct gntdev_priv *priv = flip->private_data;
++ void __user *ptr = (void __user *)arg;
++
++ switch (cmd) {
++ case IOCTL_GNTDEV_MAP_GRANT_REF:
++ return gntdev_ioctl_map_grant_ref(priv, ptr);
++
++ case IOCTL_GNTDEV_UNMAP_GRANT_REF:
++ return gntdev_ioctl_unmap_grant_ref(priv, ptr);
++
++ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
++ return gntdev_ioctl_get_offset_for_vaddr(priv, ptr);
++
++ case IOCTL_GNTDEV_SET_MAX_GRANTS:
++ return gntdev_ioctl_set_max_grants(priv, ptr);
++
++ default:
++ if (debug)
++ printk("%s: priv %p, unknown cmd %x\n",
++ __FUNCTION__, priv, cmd);
++ return -ENOIOCTLCMD;
++ }
++
++ return 0;
++}
++
++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
++{
++ struct gntdev_priv *priv = flip->private_data;
++ int index = vma->vm_pgoff;
++ int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++ struct grant_map *map;
++ int err = -EINVAL;
++
++ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
++ return -EINVAL;
++
++ if (debug)
++ printk("%s: map %d+%d at %lx (pgoff %lx)\n", __FUNCTION__,
++ index, count, vma->vm_start, vma->vm_pgoff);
++
++ spin_lock(&priv->lock);
++ map = gntdev_find_map_index(priv, index, count);
++ if (!map)
++ goto unlock_out;
++ if (map->vma)
++ goto unlock_out;
++ if (priv->mm != vma->vm_mm) {
++ printk("%s: Huh? Other mm?\n", __FUNCTION__);
++ goto unlock_out;
++ }
++
++ vma->vm_ops = &gntdev_vmops;
++
++ vma->vm_flags |= VM_RESERVED;
++ vma->vm_flags |= VM_DONTCOPY;
++ vma->vm_flags |= VM_DONTEXPAND;
++
++ vma->vm_private_data = map;
++ map->vma = vma;
++
++ map->flags = GNTMAP_host_map | GNTMAP_application_map | GNTMAP_contains_pte;
++ if (!(vma->vm_flags & VM_WRITE))
++ map->flags |= GNTMAP_readonly;
++
++ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
++ vma->vm_end - vma->vm_start,
++ find_grant_ptes, map);
++ if (err) {
++		if (debug)
++			printk("%s: find_grant_ptes() failure.\n", __FUNCTION__);
++		goto unlock_out;
++ }
++
++ err = map_grant_pages(map);
++ if (err) {
++		if (debug)
++			printk("%s: map_grant_pages() failure.\n", __FUNCTION__);
++		goto unlock_out;
++ }
++ map->is_mapped = 1;
++
++unlock_out:
++ spin_unlock(&priv->lock);
++ return err;
++}
++
++static const struct file_operations gntdev_fops = {
++ .owner = THIS_MODULE,
++ .open = gntdev_open,
++ .release = gntdev_release,
++ .mmap = gntdev_mmap,
++ .unlocked_ioctl = gntdev_ioctl
++};
++
++static struct miscdevice gntdev_miscdev = {
++ .minor = MISC_DYNAMIC_MINOR,
++ .name = "xen/gntdev",
++ .fops = &gntdev_fops,
++};
++
++/* ------------------------------------------------------------------ */
++
++static int __init gntdev_init(void)
++{
++ int err;
++
++ if (!xen_domain())
++ return -ENODEV;
++
++ err = misc_register(&gntdev_miscdev);
++ if (err != 0) {
++ printk(KERN_ERR "Could not register gntdev device\n");
++ return err;
++ }
++ return 0;
++}
++
++static void __exit gntdev_exit(void)
++{
++ misc_deregister(&gntdev_miscdev);
++}
++
++module_init(gntdev_init);
++module_exit(gntdev_exit);
++
++/* ------------------------------------------------------------------ */
+diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
+index 7d8f531..5a8ad45 100644
+--- a/drivers/xen/grant-table.c
++++ b/drivers/xen/grant-table.c
+@@ -36,10 +36,13 @@
+ #include <linux/mm.h>
+ #include <linux/vmalloc.h>
+ #include <linux/uaccess.h>
++#include <linux/io.h>
+
++#include <xen/xen.h>
+ #include <xen/interface/xen.h>
+ #include <xen/page.h>
+ #include <xen/grant_table.h>
++#include <xen/interface/memory.h>
+ #include <asm/xen/hypercall.h>
+
+ #include <asm/pgtable.h>
+@@ -57,6 +60,8 @@ static unsigned int boot_max_nr_grant_frames;
+ static int gnttab_free_count;
+ static grant_ref_t gnttab_free_head;
+ static DEFINE_SPINLOCK(gnttab_list_lock);
++unsigned long xen_hvm_resume_frames;
++EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
+
+ static struct grant_entry *shared;
+
+@@ -431,7 +436,7 @@ static unsigned int __max_nr_grant_frames(void)
+ return query.max_nr_frames;
+ }
+
+-static inline unsigned int max_nr_grant_frames(void)
++unsigned int gnttab_max_grant_frames(void)
+ {
+ unsigned int xen_max = __max_nr_grant_frames();
+
+@@ -439,6 +444,7 @@ static inline unsigned int max_nr_grant_frames(void)
+ return boot_max_nr_grant_frames;
+ return xen_max;
+ }
++EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
+
+ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+ {
+@@ -447,6 +453,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+ unsigned int nr_gframes = end_idx + 1;
+ int rc;
+
++ if (xen_hvm_domain()) {
++ struct xen_add_to_physmap xatp;
++ unsigned int i = end_idx;
++ rc = 0;
++ /*
++ * Loop backwards, so that the first hypercall has the largest
++ * index, ensuring that the table will grow only once.
++ */
++ do {
++ xatp.domid = DOMID_SELF;
++ xatp.idx = i;
++ xatp.space = XENMAPSPACE_grant_table;
++ xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
++ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
++ if (rc != 0) {
++ printk(KERN_WARNING
++ "grant table add_to_physmap failed, err=%d\n", rc);
++ break;
++ }
++ } while (i-- > start_idx);
++
++ return rc;
++ }
++
+ frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
+ if (!frames)
+ return -ENOMEM;
+@@ -463,7 +493,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+
+ BUG_ON(rc || setup.status);
+
+- rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
++ rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
+ &shared);
+ BUG_ON(rc);
+
+@@ -472,11 +502,134 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+ return 0;
+ }
+
++static void gnttab_page_free(struct page *page, unsigned int order)
++{
++ BUG_ON(order);
++ ClearPageForeign(page);
++ gnttab_reset_grant_page(page);
++ put_page(page);
++}
++
++/*
++ * Must not be called with IRQs off. This should only be used on the
++ * slow path.
++ *
++ * Copy a foreign granted page to local memory.
++ */
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
++{
++ struct gnttab_unmap_and_replace unmap;
++ struct mmu_update mmu;
++ struct page *page;
++ struct page *new_page;
++ void *new_addr;
++ void *addr;
++ unsigned long pfn;
++ unsigned long mfn;
++ unsigned long new_mfn;
++ int err;
++
++ page = *pagep;
++ if (!get_page_unless_zero(page))
++ return -ENOENT;
++
++ err = -ENOMEM;
++ new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
++ if (!new_page)
++ goto out;
++
++ new_addr = page_address(new_page);
++ addr = page_address(page);
++ memcpy(new_addr, addr, PAGE_SIZE);
++
++ pfn = page_to_pfn(page);
++ mfn = pfn_to_mfn(pfn);
++ new_mfn = virt_to_mfn(new_addr);
++
++// write_seqlock(&gnttab_dma_lock); /* protects __gnttab_dma_map_page on 2.6.18 */
++
++ /* Make seq visible before checking page_mapped. */
++ smp_mb();
++
++ /* Has the page been DMA-mapped? */
++ if (unlikely(page_mapped(page))) {
++ //write_sequnlock(&gnttab_dma_lock);
++ put_page(new_page);
++ err = -EBUSY;
++ goto out;
++ }
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap))
++ set_phys_to_machine(pfn, new_mfn);
++
++ //gnttab_set_replace_op(&unmap, (unsigned long)addr,
++ // (unsigned long)new_addr, ref);
++ unmap.host_addr = (unsigned long)addr;
++ unmap.new_addr = (unsigned long)new_addr;
++ unmap.handle = ref;
++
++ err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ &unmap, 1);
++ BUG_ON(err);
++ BUG_ON(unmap.status);
++
++// write_sequnlock(&gnttab_dma_lock);
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
++
++ mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
++ mmu.val = pfn;
++ err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
++ BUG_ON(err);
++ }
++
++ new_page->mapping = page->mapping;
++ SetPageForeign(new_page, _PageForeignDestructor(page));
++ if (PageReserved(page))
++ SetPageReserved(new_page);
++ *pagep = new_page;
++
++ SetPageForeign(page, gnttab_page_free);
++ ClearPageReserved(page);
++ page->mapping = NULL;
++
++out:
++ put_page(page);
++ return err;
++}
++EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
++
++void gnttab_reset_grant_page(struct page *page)
++{
++ init_page_count(page);
++ reset_page_mapcount(page);
++}
++EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
++
+ int gnttab_resume(void)
+ {
+- if (max_nr_grant_frames() < nr_grant_frames)
++ unsigned int max_nr_gframes;
++
++ max_nr_gframes = gnttab_max_grant_frames();
++ if (max_nr_gframes < nr_grant_frames)
+ return -ENOSYS;
+- return gnttab_map(0, nr_grant_frames - 1);
++
++ if (xen_pv_domain())
++ return gnttab_map(0, nr_grant_frames - 1);
++
++ if (!shared) {
++ shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
++ if (shared == NULL) {
++ printk(KERN_WARNING
++ "Failed to ioremap gnttab share frames!");
++ return -ENOMEM;
++ }
++ }
++
++ gnttab_map(0, nr_grant_frames - 1);
++
++ return 0;
+ }
+
+ int gnttab_suspend(void)
+@@ -493,7 +646,7 @@ static int gnttab_expand(unsigned int req_entries)
+ cur = nr_grant_frames;
+ extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
+ GREFS_PER_GRANT_FRAME);
+- if (cur + extra > max_nr_grant_frames())
++ if (cur + extra > gnttab_max_grant_frames())
+ return -ENOSPC;
+
+ rc = gnttab_map(cur, cur + extra - 1);
+@@ -503,15 +656,12 @@ static int gnttab_expand(unsigned int req_entries)
+ return rc;
+ }
+
+-static int __devinit gnttab_init(void)
++int gnttab_init(void)
+ {
+ int i;
+ unsigned int max_nr_glist_frames, nr_glist_frames;
+ unsigned int nr_init_grefs;
+
+- if (!xen_domain())
+- return -ENODEV;
+-
+ nr_grant_frames = 1;
+ boot_max_nr_grant_frames = __max_nr_grant_frames();
+
+@@ -554,5 +704,18 @@ static int __devinit gnttab_init(void)
+ kfree(gnttab_list);
+ return -ENOMEM;
+ }
++EXPORT_SYMBOL_GPL(gnttab_init);
++
++static int __devinit __gnttab_init(void)
++{
++ /* Delay grant-table initialization in the PV on HVM case */
++ if (xen_hvm_domain())
++ return 0;
++
++ if (!xen_pv_domain())
++ return -ENODEV;
++
++ return gnttab_init();
++}
+
+-core_initcall(gnttab_init);
++core_initcall(__gnttab_init);
+diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
+index 5d42d55..0b50906 100644
+--- a/drivers/xen/manage.c
++++ b/drivers/xen/manage.c
+@@ -8,6 +8,7 @@
+ #include <linux/stop_machine.h>
+ #include <linux/freezer.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/grant_table.h>
+ #include <xen/events.h>
+@@ -32,10 +33,30 @@ enum shutdown_state {
+ static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
+
+ #ifdef CONFIG_PM_SLEEP
+-static int xen_suspend(void *data)
++static int xen_hvm_suspend(void *data)
+ {
++ struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
+ int *cancelled = data;
++
++ BUG_ON(!irqs_disabled());
++
++ *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
++
++ xen_hvm_post_suspend(*cancelled);
++ gnttab_resume();
++
++ if (!*cancelled) {
++ xen_irq_resume();
++ xen_timer_resume();
++ }
++
++ return 0;
++}
++
++static int xen_suspend(void *data)
++{
+ int err;
++ int *cancelled = data;
+
+ BUG_ON(!irqs_disabled());
+
+@@ -111,7 +132,10 @@ static void do_suspend(void)
+ goto out_resume;
+ }
+
+- err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
++ if (xen_hvm_domain())
++ err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
++ else
++ err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+
+ dpm_resume_noirq(PMSG_RESUME);
+
+@@ -260,7 +284,19 @@ static int shutdown_event(struct notifier_block *notifier,
+ return NOTIFY_DONE;
+ }
+
+-static int __init setup_shutdown_event(void)
++static int __init __setup_shutdown_event(void)
++{
++ /* Delay initialization in the PV on HVM case */
++ if (xen_hvm_domain())
++ return 0;
++
++ if (!xen_pv_domain())
++ return -ENODEV;
++
++ return xen_setup_shutdown_event();
++}
++
++int xen_setup_shutdown_event(void)
+ {
+ static struct notifier_block xenstore_notifier = {
+ .notifier_call = shutdown_event
+@@ -269,5 +305,6 @@ static int __init setup_shutdown_event(void)
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
+
+-subsys_initcall(setup_shutdown_event);
++subsys_initcall(__setup_shutdown_event);
+diff --git a/drivers/xen/mce.c b/drivers/xen/mce.c
+new file mode 100644
+index 0000000..da566a5
+--- /dev/null
++++ b/drivers/xen/mce.c
+@@ -0,0 +1,216 @@
++/******************************************************************************
++ * mce.c
++ * Add Machine Check event Logging support in DOM0
++ *
++ * Driver for receiving and logging machine check event
++ *
++ * Copyright (c) 2008, 2009 Intel Corporation
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <xen/interface/xen.h>
++#include <asm/xen/hypervisor.h>
++#include <xen/events.h>
++#include <xen/interface/vcpu.h>
++#include <asm/xen/hypercall.h>
++#include <asm/mce.h>
++#include <xen/xen.h>
++
++static mc_info_t *g_mi;
++static mcinfo_logical_cpu_t *g_physinfo;
++static uint32_t ncpus;
++
++static int convert_log(struct mc_info *mi)
++{
++ struct mcinfo_common *mic = NULL;
++ struct mcinfo_global *mc_global;
++ struct mcinfo_bank *mc_bank;
++ struct mce m;
++ int i, found = 0;
++
++ x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL);
++ WARN_ON(!mic);
++
++ mce_setup(&m);
++ mc_global = (struct mcinfo_global *)mic;
++ m.mcgstatus = mc_global->mc_gstatus;
++ m.apicid = mc_global->mc_apicid;
++ for (i = 0; i < ncpus; i++) {
++ if (g_physinfo[i].mc_apicid == m.apicid) {
++ found = 1;
++ break;
++ }
++ }
++ WARN_ON(!found);
++
++ m.socketid = g_physinfo[i].mc_chipid;
++ m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
++ m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
++ m.mcgcap = g_physinfo[i].mc_msrvalues[0].value;
++ x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK);
++ do {
++ if (mic == NULL || mic->size == 0)
++ break;
++ if (mic->type == MC_TYPE_BANK) {
++ mc_bank = (struct mcinfo_bank *)mic;
++ m.misc = mc_bank->mc_misc;
++ m.status = mc_bank->mc_status;
++ m.addr = mc_bank->mc_addr;
++ m.tsc = mc_bank->mc_tsc;
++ m.bank = mc_bank->mc_bank;
++ m.finished = 1;
++ /*log this record*/
++ mce_log(&m);
++ }
++ mic = x86_mcinfo_next(mic);
++ } while (1);
++
++ return 0;
++}
++
++/*pv_ops domain mce virq handler, logging physical mce error info*/
++static irqreturn_t mce_dom_interrupt(int irq, void *dev_id)
++{
++ xen_mc_t mc_op;
++ int result = 0;
++
++ mc_op.cmd = XEN_MC_fetch;
++ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
++ set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi);
++urgent:
++ mc_op.u.mc_fetch.flags = XEN_MC_URGENT;
++ result = HYPERVISOR_mca(&mc_op);
++ if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
++ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
++ goto nonurgent;
++ else {
++ result = convert_log(g_mi);
++ if (result)
++ goto end;
++ /* After the error event log entry has been fetched by DOM0,
++ * decrement its refcount and release it: the entry is
++ * reserved and its refcount incremented when the error
++ * log entry is filled in.
++ */
++ mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK;
++ result = HYPERVISOR_mca(&mc_op);
++ goto urgent;
++ }
++nonurgent:
++ mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT;
++ result = HYPERVISOR_mca(&mc_op);
++ if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
++ mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
++ goto end;
++ else {
++ result = convert_log(g_mi);
++ if (result)
++ goto end;
++ /* After the error event log entry has been fetched by DOM0,
++ * decrement its refcount and release it: the entry is
++ * reserved and its refcount incremented when the error
++ * log entry is filled in.
++ */
++ mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
++ result = HYPERVISOR_mca(&mc_op);
++ goto nonurgent;
++ }
++end:
++ return IRQ_HANDLED;
++}
++
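++/*
++ * Allocate the fetch buffer, ask Xen how many physical CPUs there are,
++ * fetch their per-CPU MCA info, and finally bind a handler for VIRQ_MCA.
++ */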
++static int bind_virq_for_mce(void)
++{
++ int ret;
++ xen_mc_t mc_op;
++
++ g_mi = kmalloc(sizeof(struct mc_info), GFP_KERNEL);
++
++ if (!g_mi)
++ return -ENOMEM;
++
++ /* Fetch the number of physical CPUs */
++ mc_op.cmd = XEN_MC_physcpuinfo;
++ mc_op.interface_version = XEN_MCA_INTERFACE_VERSION;
++ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
++ ret = HYPERVISOR_mca(&mc_op);
++ if (ret) {
++ printk(KERN_ERR "MCE_DOM0_LOG: Fail to get physical CPU numbers\n");
++ kfree(g_mi);
++ return ret;
++ }
++
++ /* Fetch each physical CPU's info for later reference */
++ ncpus = mc_op.u.mc_physcpuinfo.ncpus;
++ g_physinfo = kmalloc(sizeof(struct mcinfo_logical_cpu)*ncpus,
++ GFP_KERNEL);
++ if (!g_physinfo) {
++ kfree(g_mi);
++ return -ENOMEM;
++ }
++ set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
++ ret = HYPERVISOR_mca(&mc_op);
++ if (ret) {
++ printk(KERN_ERR "MCE_DOM0_LOG: Fail to get physical CPUs info\n");
++ kfree(g_mi);
++ kfree(g_physinfo);
++ return ret;
++ }
++
++ ret = bind_virq_to_irqhandler(VIRQ_MCA, 0,
++ mce_dom_interrupt, 0, "mce", NULL);
++
++ if (ret < 0) {
++ printk(KERN_ERR "MCE_DOM0_LOG: bind_virq for DOM0 failed\n");
++ kfree(g_physinfo);
++ kfree(g_mi);
++ return ret;
++ }
++
++ return 0;
++}
++
++static int __init mcelog_init(void)
++{
++ /* Only DOM0 is responsible for MCE logging */
++ if (xen_initial_domain())
++ return bind_virq_for_mce();
++
++ return 0;
++}
++
++
++static void __exit mcelog_cleanup(void)
++{
++ kfree(g_mi);
++ kfree(g_physinfo);
++}
++module_init(mcelog_init);
++module_exit(mcelog_cleanup);
++
++MODULE_LICENSE("GPL");
+diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile
+new file mode 100644
+index 0000000..e346e81
+--- /dev/null
++++ b/drivers/xen/netback/Makefile
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
++
++xen-netback-y := netback.o xenbus.o interface.o
+diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h
+new file mode 100644
+index 0000000..b40ad72
+--- /dev/null
++++ b/drivers/xen/netback/common.h
+@@ -0,0 +1,329 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/common.h
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __NETIF__BACKEND__COMMON_H__
++#define __NETIF__BACKEND__COMMON_H__
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/interrupt.h>
++#include <linux/slab.h>
++#include <linux/ip.h>
++#include <linux/in.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++
++#include <xen/interface/io/netif.h>
++#include <asm/io.h>
++#include <asm/pgalloc.h>
++#include <xen/interface/grant_table.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
++
++#define DPRINTK(_f, _a...) \
++ pr_debug("(file=%s, line=%d) " _f, \
++ __FILE__ , __LINE__ , ## _a )
++#define IPRINTK(fmt, args...) \
++ printk(KERN_INFO "xen_net: " fmt, ##args)
++#define WPRINTK(fmt, args...) \
++ printk(KERN_WARNING "xen_net: " fmt, ##args)
++
++struct xen_netif {
++ /* Unique identifier for this interface. */
++ domid_t domid;
++ int group;
++ unsigned int handle;
++
++ u8 fe_dev_addr[6];
++
++ /* Physical parameters of the comms window. */
++ grant_handle_t tx_shmem_handle;
++ grant_ref_t tx_shmem_ref;
++ grant_handle_t rx_shmem_handle;
++ grant_ref_t rx_shmem_ref;
++ unsigned int irq;
++
++ /* The shared rings and indexes. */
++ struct xen_netif_tx_back_ring tx;
++ struct xen_netif_rx_back_ring rx;
++ struct vm_struct *tx_comms_area;
++ struct vm_struct *rx_comms_area;
++
++ /* Flags that must not be set in dev->features */
++ int features_disabled;
++
++ /* Frontend feature information. */
++ u8 can_sg:1;
++ u8 gso:1;
++ u8 gso_prefix:1;
++ u8 csum:1;
++ u8 smart_poll:1;
++
++ /* Internal feature information. */
++ u8 can_queue:1; /* can queue packets for receiver? */
++
++ /* Allow netif_be_start_xmit() to peek ahead in the rx request
++ * ring. This is a prediction of what rx_req_cons will be once
++ * all queued skbs are put on the ring. */
++ RING_IDX rx_req_cons_peek;
++
++ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
++ unsigned long credit_bytes;
++ unsigned long credit_usec;
++ unsigned long remaining_credit;
++ struct timer_list credit_timeout;
++
++ /* Enforce draining of the transmit queue. */
++ struct timer_list tx_queue_timeout;
++
++ /* Statistics */
++ int nr_copied_skbs;
++
++ /* Miscellaneous private stuff. */
++ struct list_head list; /* scheduling list */
++ atomic_t refcnt;
++ struct net_device *dev;
++ struct net_device_stats stats;
++
++ unsigned int carrier;
++
++ wait_queue_head_t waiting_to_free;
++};
++
++/*
++ * Implement our own carrier flag: the network stack's version causes delays
++ * when the carrier is re-enabled (in particular, dev_activate() may not
++ * immediately be called, which can cause packet loss; also the etherbridge
++ * can be rather lazy in activating its port).
++ */
++#define netback_carrier_on(netif) ((netif)->carrier = 1)
++#define netback_carrier_off(netif) ((netif)->carrier = 0)
++#define netback_carrier_ok(netif) ((netif)->carrier)
++
++enum {
++ NETBK_DONT_COPY_SKB,
++ NETBK_DELAYED_COPY_SKB,
++ NETBK_ALWAYS_COPY_SKB,
++};
++
++extern int netbk_copy_skb_mode;
++
++/* Function pointers into netback accelerator plugin modules */
++struct netback_accel_hooks {
++ struct module *owner;
++ int (*probe)(struct xenbus_device *dev);
++ int (*remove)(struct xenbus_device *dev);
++};
++
++/* Structure to track the state of a netback accelerator plugin */
++struct netback_accelerator {
++ struct list_head link;
++ int id;
++ char *eth_name;
++ atomic_t use_count;
++ struct netback_accel_hooks *hooks;
++};
++
++struct backend_info {
++ struct xenbus_device *dev;
++ struct xen_netif *netif;
++ enum xenbus_state frontend_state;
++ struct xenbus_watch hotplug_status_watch;
++ int have_hotplug_status_watch:1;
++
++ /* State relating to the netback accelerator */
++ void *netback_accel_priv;
++ /* The accelerator that this backend is currently using */
++ struct netback_accelerator *accelerator;
++};
++
++#define NETBACK_ACCEL_VERSION 0x00010001
++
++/*
++ * Connect an accelerator plugin module to netback. Returns zero on
++ * success, < 0 on error, > 0 (with highest version number supported)
++ * if version mismatch.
++ */
++extern int netback_connect_accelerator(unsigned version,
++ int id, const char *eth_name,
++ struct netback_accel_hooks *hooks);
++/* Disconnect a previously connected accelerator plugin module */
++extern void netback_disconnect_accelerator(int id, const char *eth_name);
++
++
++extern
++void netback_probe_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netback_remove_accelerators(struct backend_info *be,
++ struct xenbus_device *dev);
++extern
++void netif_accel_init(void);
++
++
++#define NET_TX_RING_SIZE __RING_SIZE((struct xen_netif_tx_sring *)0, PAGE_SIZE)
++#define NET_RX_RING_SIZE __RING_SIZE((struct xen_netif_rx_sring *)0, PAGE_SIZE)
++
++void netif_disconnect(struct xen_netif *netif);
++
++void netif_set_features(struct xen_netif *netif);
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++ unsigned long rx_ring_ref, unsigned int evtchn);
++
++static inline void netif_get(struct xen_netif *netif)
++{
++ atomic_inc(&netif->refcnt);
++}
++
++static inline void netif_put(struct xen_netif *netif)
++{
++ if (atomic_dec_and_test(&netif->refcnt))
++ wake_up(&netif->waiting_to_free);
++}
++
++int netif_xenbus_init(void);
++
++#define netif_schedulable(netif) \
++ (netif_running((netif)->dev) && netback_carrier_ok(netif))
++
++void netif_schedule_work(struct xen_netif *netif);
++void netif_deschedule_work(struct xen_netif *netif);
++
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
++struct net_device_stats *netif_be_get_stats(struct net_device *dev);
++irqreturn_t netif_be_int(int irq, void *dev_id);
++
++static inline int netbk_can_queue(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ return netif->can_queue;
++}
++
++static inline int netbk_can_sg(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ return netif->can_sg;
++}
++
++struct pending_tx_info {
++ struct xen_netif_tx_request req;
++ struct xen_netif *netif;
++};
++typedef unsigned int pending_ring_idx_t;
++
++struct netbk_rx_meta {
++ int id;
++ int size;
++ int gso_size;
++};
++
++struct netbk_tx_pending_inuse {
++ struct list_head list;
++ unsigned long alloc_time;
++};
++
++#define MAX_PENDING_REQS 256
++
++#define MAX_BUFFER_OFFSET PAGE_SIZE
++
++/* extra field used in struct page */
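++/*
++ * netback stores a (group, idx) pair in page->mapping for pages it has
++ * mapped from a frontend; the group is stored biased by +1 so that a page
++ * whose mapping was never set decodes to an out-of-range group and is
++ * rejected by netif_get_page_ext().
++ */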
++union page_ext {
++ struct {
++#if BITS_PER_LONG < 64
++#define IDX_WIDTH 8
++#define GROUP_WIDTH (BITS_PER_LONG - IDX_WIDTH)
++ unsigned int group:GROUP_WIDTH;
++ unsigned int idx:IDX_WIDTH;
++#else
++ unsigned int group, idx;
++#endif
++ } e;
++ void *mapping;
++};
++
++struct xen_netbk {
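++ /*
++ * The bottom half runs either as a pair of tasklets or as a single
++ * kernel thread, chosen at load time by the 'netback_kthread' module
++ * parameter, so only one member of this union is ever used.
++ */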
++ union {
++ struct {
++ struct tasklet_struct net_tx_tasklet;
++ struct tasklet_struct net_rx_tasklet;
++ } tasklet;
++
++ struct {
++ wait_queue_head_t netbk_action_wq;
++ struct task_struct *task;
++ } kthread;
++ };
++
++ struct sk_buff_head rx_queue;
++ struct sk_buff_head tx_queue;
++
++ struct timer_list net_timer;
++ struct timer_list netbk_tx_pending_timer;
++
++ struct page **mmap_pages;
++
++ pending_ring_idx_t pending_prod;
++ pending_ring_idx_t pending_cons;
++ pending_ring_idx_t dealloc_prod;
++ pending_ring_idx_t dealloc_cons;
++
++ struct list_head pending_inuse_head;
++ struct list_head net_schedule_list;
++
++ /* Protect the net_schedule_list in netif. */
++ spinlock_t net_schedule_list_lock;
++
++ atomic_t netfront_count;
++
++ struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
++ struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
++ struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
++ struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
++
++ grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
++ u16 pending_ring[MAX_PENDING_REQS];
++ u16 dealloc_ring[MAX_PENDING_REQS];
++
++ /*
++ * Each head or fragment can be up to 4096 bytes. Given
++ * MAX_BUFFER_OFFSET of 4096 the worst case is that each
++ * head/fragment uses 2 copy operations.
++ */
++ struct gnttab_copy grant_copy_op[2*NET_RX_RING_SIZE];
++ unsigned char rx_notify[NR_IRQS];
++ u16 notify_list[NET_RX_RING_SIZE];
++ struct netbk_rx_meta meta[NET_RX_RING_SIZE];
++};
++
++extern struct xen_netbk *xen_netbk;
++extern int xen_netbk_group_nr;
++
++#endif /* __NETIF__BACKEND__COMMON_H__ */
+diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c
+new file mode 100644
+index 0000000..2e8508a
+--- /dev/null
++++ b/drivers/xen/netback/interface.c
+@@ -0,0 +1,475 @@
++/******************************************************************************
++ * arch/xen/drivers/netif/backend/interface.c
++ *
++ * Network-device interface management.
++ *
++ * Copyright (c) 2004-2005, Keir Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++#include <linux/ethtool.h>
++#include <linux/rtnetlink.h>
++
++#include <xen/events.h>
++#include <asm/xen/hypercall.h>
++
++/*
++ * Module parameter 'queue_length':
++ *
++ * Enables queuing in the network stack when a client has run out of receive
++ * descriptors. Although this feature can improve receive bandwidth by avoiding
++ * packet loss, it can also result in packets sitting in the 'tx_queue' for
++ * unbounded time. This is bad if those packets hold onto foreign resources.
++ * For example, consider a packet that holds onto resources belonging to the
++ * guest for which it is queued (e.g., packet received on vif1.0, destined for
++ * vif1.1 which is not activated in the guest): in this situation the guest
++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
++ * run a timer (tx_queue_timeout) to drain the queue when the interface is
++ * blocked.
++ */
++static unsigned long netbk_queue_length = 32;
++module_param_named(queue_length, netbk_queue_length, ulong, 0644);
++
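++/* Attach a new interface to the netback group that currently serves the fewest netfronts. */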
++static void netbk_add_netif(struct xen_netbk *netbk, int group_nr,
++ struct xen_netif *netif)
++{
++ int i;
++ int min_netfront_count;
++ int min_group = 0;
++ min_netfront_count = atomic_read(&netbk[0].netfront_count);
++ for (i = 0; i < group_nr; i++) {
++ int netfront_count = atomic_read(&netbk[i].netfront_count);
++ if (netfront_count < min_netfront_count) {
++ min_group = i;
++ min_netfront_count = netfront_count;
++ }
++ }
++
++ netif->group = min_group;
++ atomic_inc(&netbk[netif->group].netfront_count);
++}
++
++static void netbk_remove_netif(struct xen_netbk *netbk, struct xen_netif *netif)
++{
++ atomic_dec(&netbk[netif->group].netfront_count);
++}
++
++static void __netif_up(struct xen_netif *netif)
++{
++ netbk_add_netif(xen_netbk, xen_netbk_group_nr, netif);
++ enable_irq(netif->irq);
++ netif_schedule_work(netif);
++}
++
++static void __netif_down(struct xen_netif *netif)
++{
++ disable_irq(netif->irq);
++ netif_deschedule_work(netif);
++ netbk_remove_netif(xen_netbk, netif);
++}
++
++static int net_open(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (netback_carrier_ok(netif)) {
++ __netif_up(netif);
++ netif_start_queue(dev);
++ }
++ return 0;
++}
++
++static int net_close(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (netback_carrier_ok(netif))
++ __netif_down(netif);
++ netif_stop_queue(dev);
++ return 0;
++}
++
++static int netbk_change_mtu(struct net_device *dev, int mtu)
++{
++ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
++
++ if (mtu > max)
++ return -EINVAL;
++ dev->mtu = mtu;
++ return 0;
++}
++
++void netif_set_features(struct xen_netif *netif)
++{
++ struct net_device *dev = netif->dev;
++ int features = dev->features;
++
++ if (netif->can_sg)
++ features |= NETIF_F_SG;
++ if (netif->gso || netif->gso_prefix)
++ features |= NETIF_F_TSO;
++ if (netif->csum)
++ features |= NETIF_F_IP_CSUM;
++
++ features &= ~(netif->features_disabled);
++
++ if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN)
++ dev->mtu = ETH_DATA_LEN;
++
++ dev->features = features;
++}
++
++static int netbk_set_tx_csum(struct net_device *dev, u32 data)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (data) {
++ if (!netif->csum)
++ return -ENOSYS;
++ netif->features_disabled &= ~NETIF_F_IP_CSUM;
++ } else {
++ netif->features_disabled |= NETIF_F_IP_CSUM;
++ }
++
++ netif_set_features(netif);
++ return 0;
++}
++
++static int netbk_set_sg(struct net_device *dev, u32 data)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (data) {
++ if (!netif->can_sg)
++ return -ENOSYS;
++ netif->features_disabled &= ~NETIF_F_SG;
++ } else {
++ netif->features_disabled |= NETIF_F_SG;
++ }
++
++ netif_set_features(netif);
++ return 0;
++}
++
++static int netbk_set_tso(struct net_device *dev, u32 data)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ if (data) {
++ if (!netif->gso && !netif->gso_prefix)
++ return -ENOSYS;
++ netif->features_disabled &= ~NETIF_F_TSO;
++ } else {
++ netif->features_disabled |= NETIF_F_TSO;
++ }
++
++ netif_set_features(netif);
++ return 0;
++}
++
++static void netbk_get_drvinfo(struct net_device *dev,
++ struct ethtool_drvinfo *info)
++{
++ strcpy(info->driver, "netbk");
++ strcpy(info->bus_info, dev_name(dev->dev.parent));
++}
++
++static const struct netif_stat {
++ char name[ETH_GSTRING_LEN];
++ u16 offset;
++} netbk_stats[] = {
++ { "copied_skbs", offsetof(struct xen_netif, nr_copied_skbs) },
++};
++
++static int netbk_get_sset_count(struct net_device *dev, int string_set)
++{
++ switch (string_set) {
++ case ETH_SS_STATS:
++ return ARRAY_SIZE(netbk_stats);
++ default:
++ return -EINVAL;
++ }
++}
++
++static void netbk_get_ethtool_stats(struct net_device *dev,
++ struct ethtool_stats *stats, u64 * data)
++{
++ void *netif = netdev_priv(dev);
++ int i;
++
++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
++ data[i] = *(int *)(netif + netbk_stats[i].offset);
++}
++
++static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
++{
++ int i;
++
++ switch (stringset) {
++ case ETH_SS_STATS:
++ for (i = 0; i < ARRAY_SIZE(netbk_stats); i++)
++ memcpy(data + i * ETH_GSTRING_LEN,
++ netbk_stats[i].name, ETH_GSTRING_LEN);
++ break;
++ }
++}
++
++static struct ethtool_ops network_ethtool_ops =
++{
++ .get_drvinfo = netbk_get_drvinfo,
++
++ .get_tx_csum = ethtool_op_get_tx_csum,
++ .set_tx_csum = netbk_set_tx_csum,
++ .get_sg = ethtool_op_get_sg,
++ .set_sg = netbk_set_sg,
++ .get_tso = ethtool_op_get_tso,
++ .set_tso = netbk_set_tso,
++ .get_link = ethtool_op_get_link,
++
++ .get_sset_count = netbk_get_sset_count,
++ .get_ethtool_stats = netbk_get_ethtool_stats,
++ .get_strings = netbk_get_strings,
++};
++
++static struct net_device_ops netback_ops =
++{
++ .ndo_start_xmit = netif_be_start_xmit,
++ .ndo_get_stats = netif_be_get_stats,
++ .ndo_open = net_open,
++ .ndo_stop = net_close,
++ .ndo_change_mtu = netbk_change_mtu,
++};
++
++struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
++{
++ int err = 0;
++ struct net_device *dev;
++ struct xen_netif *netif;
++ char name[IFNAMSIZ] = {};
++
++ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
++ dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
++ if (dev == NULL) {
++ DPRINTK("Could not create netif: out of memory\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ SET_NETDEV_DEV(dev, parent);
++
++ netif = netdev_priv(dev);
++ memset(netif, 0, sizeof(*netif));
++ netif->domid = domid;
++ netif->group = -1;
++ netif->handle = handle;
++ netif->can_sg = 1;
++ netif->csum = 1;
++ atomic_set(&netif->refcnt, 1);
++ init_waitqueue_head(&netif->waiting_to_free);
++ netif->dev = dev;
++ INIT_LIST_HEAD(&netif->list);
++
++ netback_carrier_off(netif);
++
++ netif->credit_bytes = netif->remaining_credit = ~0UL;
++ netif->credit_usec = 0UL;
++ init_timer(&netif->credit_timeout);
++ /* Initialize 'expires' now: it's used to track the credit window. */
++ netif->credit_timeout.expires = jiffies;
++
++ init_timer(&netif->tx_queue_timeout);
++
++ dev->netdev_ops = &netback_ops;
++ netif_set_features(netif);
++ SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
++
++ dev->tx_queue_len = netbk_queue_length;
++
++ /*
++ * Initialise a dummy MAC address. We choose the numerically
++ * largest non-broadcast address to prevent the address getting
++ * stolen by an Ethernet bridge for STP purposes.
++ * (FE:FF:FF:FF:FF:FF)
++ */
++ memset(dev->dev_addr, 0xFF, ETH_ALEN);
++ dev->dev_addr[0] &= ~0x01;
++
++ rtnl_lock();
++ err = register_netdevice(dev);
++ rtnl_unlock();
++ if (err) {
++ DPRINTK("Could not register new net device %s: err=%d\n",
++ dev->name, err);
++ free_netdev(dev);
++ return ERR_PTR(err);
++ }
++
++ DPRINTK("Successfully created netif\n");
++ return netif;
++}
++
++static int map_frontend_pages(
++ struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
++{
++ struct gnttab_map_grant_ref op;
++
++ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, tx_ring_ref, netif->domid);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
++
++ if (op.status) {
++ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
++ return op.status;
++ }
++
++ netif->tx_shmem_ref = tx_ring_ref;
++ netif->tx_shmem_handle = op.handle;
++
++ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr,
++ GNTMAP_host_map, rx_ring_ref, netif->domid);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
++ BUG();
++
++ if (op.status) {
++ struct gnttab_unmap_grant_ref unop;
++
++ gnttab_set_unmap_op(&unop,
++ (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, netif->tx_shmem_handle);
++ HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
++ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
++ return op.status;
++ }
++
++ netif->rx_shmem_ref = rx_ring_ref;
++ netif->rx_shmem_handle = op.handle;
++
++ return 0;
++}
++
++static void unmap_frontend_pages(struct xen_netif *netif)
++{
++ struct gnttab_unmap_grant_ref op;
++
++ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
++ GNTMAP_host_map, netif->tx_shmem_handle);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
++
++ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
++ GNTMAP_host_map, netif->rx_shmem_handle);
++
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
++ BUG();
++}
++
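++/*
++ * Map the frontend's shared tx/rx rings into this domain and bind its event
++ * channel; returns early if the interface is already connected.
++ */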
++int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
++ unsigned long rx_ring_ref, unsigned int evtchn)
++{
++ int err = -ENOMEM;
++ struct xen_netif_tx_sring *txs;
++ struct xen_netif_rx_sring *rxs;
++
++ /* Already connected through? */
++ if (netif->irq)
++ return 0;
++
++ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
++ if (netif->tx_comms_area == NULL)
++ return -ENOMEM;
++ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
++ if (netif->rx_comms_area == NULL)
++ goto err_rx;
++
++ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
++ if (err)
++ goto err_map;
++
++ err = bind_interdomain_evtchn_to_irqhandler(
++ netif->domid, evtchn, netif_be_int, 0,
++ netif->dev->name, netif);
++ if (err < 0)
++ goto err_hypervisor;
++ netif->irq = err;
++ disable_irq(netif->irq);
++
++ txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
++ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
++
++ rxs = (struct xen_netif_rx_sring *)
++ ((char *)netif->rx_comms_area->addr);
++ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
++
++ netif->rx_req_cons_peek = 0;
++
++ netif_get(netif);
++
++ rtnl_lock();
++ netback_carrier_on(netif);
++ if (netif_running(netif->dev))
++ __netif_up(netif);
++ rtnl_unlock();
++
++ return 0;
++err_hypervisor:
++ unmap_frontend_pages(netif);
++err_map:
++ free_vm_area(netif->rx_comms_area);
++err_rx:
++ free_vm_area(netif->tx_comms_area);
++ return err;
++}
++
++void netif_disconnect(struct xen_netif *netif)
++{
++ if (netback_carrier_ok(netif)) {
++ rtnl_lock();
++ netback_carrier_off(netif);
++ netif_carrier_off(netif->dev); /* discard queued packets */
++ if (netif_running(netif->dev))
++ __netif_down(netif);
++ rtnl_unlock();
++ netif_put(netif);
++ }
++
++ atomic_dec(&netif->refcnt);
++ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
++
++ del_timer_sync(&netif->credit_timeout);
++ del_timer_sync(&netif->tx_queue_timeout);
++
++ if (netif->irq)
++ unbind_from_irqhandler(netif->irq, netif);
++
++ unregister_netdev(netif->dev);
++
++ if (netif->tx.sring) {
++ unmap_frontend_pages(netif);
++ free_vm_area(netif->tx_comms_area);
++ free_vm_area(netif->rx_comms_area);
++ }
++
++ free_netdev(netif->dev);
++}
+diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c
+new file mode 100644
+index 0000000..9052895
+--- /dev/null
++++ b/drivers/xen/netback/netback.c
+@@ -0,0 +1,1881 @@
++/******************************************************************************
++ * drivers/xen/netback/netback.c
++ *
++ * Back-end of the driver for virtual network devices. This portion of the
++ * driver exports a 'unified' network-device interface that can be accessed
++ * by any operating system that implements a compatible front end. A
++ * reference front-end implementation can be found in:
++ * drivers/xen/netfront/netfront.c
++ *
++ * Copyright (c) 2002-2005, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "common.h"
++
++#include <linux/tcp.h>
++#include <linux/udp.h>
++#include <linux/kthread.h>
++
++#include <xen/balloon.h>
++#include <xen/events.h>
++#include <xen/interface/memory.h>
++
++#include <asm/xen/hypercall.h>
++#include <asm/xen/page.h>
++
++/*define NETBE_DEBUG_INTERRUPT*/
++
++struct xen_netbk *xen_netbk;
++int xen_netbk_group_nr;
++
++static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx);
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st);
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags);
++
++static void net_tx_action(unsigned long data);
++
++static void net_rx_action(unsigned long data);
++
++static inline unsigned long idx_to_pfn(struct xen_netbk *netbk,
++ unsigned int idx)
++{
++ return page_to_pfn(netbk->mmap_pages[idx]);
++}
++
++static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk,
++ unsigned int idx)
++{
++ return (unsigned long)pfn_to_kaddr(idx_to_pfn(netbk, idx));
++}
++
++/* extra field used in struct page */
++static inline void netif_set_page_ext(struct page *pg, unsigned int group,
++ unsigned int idx)
++{
++ union page_ext ext = { .e = { .group = group + 1, .idx = idx } };
++
++ BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping));
++ pg->mapping = ext.mapping;
++}
++
++static inline int netif_get_page_ext(struct page *pg, unsigned int *_group, unsigned int *_idx)
++{
++ union page_ext ext = { .mapping = pg->mapping };
++ struct xen_netbk *netbk;
++ unsigned int group, idx;
++
++ if (!PageForeign(pg))
++ return 0;
++
++ group = ext.e.group - 1;
++
++ if (group < 0 || group >= xen_netbk_group_nr)
++ return 0;
++
++ netbk = &xen_netbk[group];
++
++ if (netbk->mmap_pages == NULL)
++ return 0;
++
++ idx = ext.e.idx;
++
++ if ((idx < 0) || (idx >= MAX_PENDING_REQS))
++ return 0;
++
++ if (netbk->mmap_pages[idx] != pg)
++ return 0;
++
++ *_group = group;
++ *_idx = idx;
++
++ return 1;
++}
++
++/*
++ * This is the amount of packet we copy rather than map, so that the
++ * guest can't fiddle with the contents of the headers while we do
++ * packet processing on them (netfilter, routing, etc). 72 is enough
++ * to cover TCP+IP headers including options.
++ */
++#define PKT_PROT_LEN 72
++
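++/* MAX_PENDING_REQS is a power of two, so the ring index wraps with a mask. */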
++static inline pending_ring_idx_t pending_index(unsigned i)
++{
++ return i & (MAX_PENDING_REQS-1);
++}
++
++static inline pending_ring_idx_t nr_pending_reqs(struct xen_netbk *netbk)
++{
++ return MAX_PENDING_REQS -
++ netbk->pending_prod + netbk->pending_cons;
++}
++
++/* Setting this allows the safe use of this driver without netloop. */
++static int MODPARM_copy_skb = 1;
++module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
++MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
++
++int netbk_copy_skb_mode;
++
++static int MODPARM_netback_kthread;
++module_param_named(netback_kthread, MODPARM_netback_kthread, bool, 0);
++MODULE_PARM_DESC(netback_kthread, "Use kernel thread to replace tasklet");
++
++/*
++ * Netback bottom half handler.
++ * dir indicates the data direction.
++ * rx: 1, tx: 0.
++ */
++static inline void xen_netbk_bh_handler(struct xen_netbk *netbk, int dir)
++{
++ if (MODPARM_netback_kthread)
++ wake_up(&netbk->kthread.netbk_action_wq);
++ else if (dir)
++ tasklet_schedule(&netbk->tasklet.net_rx_tasklet);
++ else
++ tasklet_schedule(&netbk->tasklet.net_tx_tasklet);
++}
++
++static inline void maybe_schedule_tx_action(struct xen_netbk *netbk)
++{
++ smp_mb();
++ if ((nr_pending_reqs(netbk) < (MAX_PENDING_REQS/2)) &&
++ !list_empty(&netbk->net_schedule_list))
++ xen_netbk_bh_handler(netbk, 0);
++}
++
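++/*
++ * Copy an skb into a new one with a freshly allocated, at most page-sized
++ * linear area, pushing the rest of the data into page fragments; used by
++ * netif_be_start_xmit() when the original head crosses a page boundary.
++ */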
++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
++{
++ struct skb_shared_info *ninfo;
++ struct sk_buff *nskb;
++ unsigned long offset;
++ int ret;
++ int len;
++ int headlen;
++
++ BUG_ON(skb_shinfo(skb)->frag_list != NULL);
++
++ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
++ if (unlikely(!nskb))
++ goto err;
++
++ skb_reserve(nskb, NET_SKB_PAD + NET_IP_ALIGN);
++ headlen = skb_end_pointer(nskb) - nskb->data;
++ if (headlen > skb_headlen(skb))
++ headlen = skb_headlen(skb);
++ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
++ BUG_ON(ret);
++
++ ninfo = skb_shinfo(nskb);
++ ninfo->gso_size = skb_shinfo(skb)->gso_size;
++ ninfo->gso_type = skb_shinfo(skb)->gso_type;
++
++ offset = headlen;
++ len = skb->len - headlen;
++
++ nskb->len = skb->len;
++ nskb->data_len = len;
++ nskb->truesize += len;
++
++ while (len) {
++ struct page *page;
++ int copy;
++ int zero;
++
++ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
++ dump_stack();
++ goto err_free;
++ }
++
++ copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
++ zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
++
++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
++ if (unlikely(!page))
++ goto err_free;
++
++ ret = skb_copy_bits(skb, offset, page_address(page), copy);
++ BUG_ON(ret);
++
++ ninfo->frags[ninfo->nr_frags].page = page;
++ ninfo->frags[ninfo->nr_frags].page_offset = 0;
++ ninfo->frags[ninfo->nr_frags].size = copy;
++ ninfo->nr_frags++;
++
++ offset += copy;
++ len -= copy;
++ }
++
++#ifdef NET_SKBUFF_DATA_USES_OFFSET
++ offset = 0;
++#else
++ offset = nskb->data - skb->data;
++#endif
++
++ nskb->transport_header = skb->transport_header + offset;
++ nskb->network_header = skb->network_header + offset;
++ nskb->mac_header = skb->mac_header + offset;
++
++ return nskb;
++
++ err_free:
++ kfree_skb(nskb);
++ err:
++ return NULL;
++}
++
++static inline int netbk_max_required_rx_slots(struct xen_netif *netif)
++{
++ if (netif->can_sg || netif->gso || netif->gso_prefix)
++ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
++ return 1; /* all in one */
++}
++
++static inline int netbk_queue_full(struct xen_netif *netif)
++{
++ RING_IDX peek = netif->rx_req_cons_peek;
++ RING_IDX needed = netbk_max_required_rx_slots(netif);
++
++ return ((netif->rx.sring->req_prod - peek) < needed) ||
++ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
++}
++
++static void tx_queue_callback(unsigned long data)
++{
++ struct xen_netif *netif = (struct xen_netif *)data;
++ if (netif_schedulable(netif))
++ netif_wake_queue(netif->dev);
++}
++
++/* Figure out how many ring slots we're going to need to send @skb to
++ the guest. */
++static unsigned count_skb_slots(struct sk_buff *skb, struct xen_netif *netif)
++{
++ unsigned count;
++ unsigned copy_off;
++ unsigned i;
++
++ copy_off = 0;
++ count = 1;
++
++ BUG_ON(offset_in_page(skb->data) + skb_headlen(skb) > MAX_BUFFER_OFFSET);
++
++ copy_off = skb_headlen(skb);
++
++ if (skb_shinfo(skb)->gso_size)
++ count++;
++
++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
++ unsigned long size = skb_shinfo(skb)->frags[i].size;
++ unsigned long bytes;
++ while (size > 0) {
++ BUG_ON(copy_off > MAX_BUFFER_OFFSET);
++
++ /* These checks are the same as in netbk_gop_frag_copy */
++ if (copy_off == MAX_BUFFER_OFFSET
++ || ((copy_off + size > MAX_BUFFER_OFFSET) && (size <= MAX_BUFFER_OFFSET) && copy_off)) {
++ count++;
++ copy_off = 0;
++ }
++
++ bytes = size;
++ if (copy_off + bytes > MAX_BUFFER_OFFSET)
++ bytes = MAX_BUFFER_OFFSET - copy_off;
++
++ copy_off += bytes;
++ size -= bytes;
++ }
++ }
++ return count;
++}
++
++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ struct xen_netbk *netbk;
++
++ BUG_ON(skb->dev != dev);
++
++ if (netif->group == -1)
++ goto drop;
++
++ netbk = &xen_netbk[netif->group];
++
++ /* Drop the packet if the target domain has no receive buffers. */
++ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
++ goto drop;
++
++ /*
++ * XXX For now we also copy skbuffs whose head crosses a page
++ * boundary, because netbk_gop_skb can't handle them.
++ */
++ if ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE) {
++ struct sk_buff *nskb = netbk_copy_skb(skb);
++ if ( unlikely(nskb == NULL) )
++ goto drop;
++ /* Copy only the header fields we use in this driver. */
++ nskb->dev = skb->dev;
++ nskb->ip_summed = skb->ip_summed;
++ dev_kfree_skb(skb);
++ skb = nskb;
++ }
++
++ /* Reserve ring slots for the worst-case number of
++ * fragments. */
++ netif->rx_req_cons_peek += count_skb_slots(skb, netif);
++ netif_get(netif);
++
++ if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
++ netif->rx.sring->req_event = netif->rx_req_cons_peek +
++ netbk_max_required_rx_slots(netif);
++ mb(); /* request notification /then/ check & stop the queue */
++ if (netbk_queue_full(netif)) {
++ netif_stop_queue(dev);
++ /*
++ * Schedule 500ms timeout to restart the queue, thus
++ * ensuring that an inactive queue will be drained.
++ * Packets will be dropped immediately until more
++ * receive buffers become available (see
++ * netbk_queue_full() check above).
++ */
++ netif->tx_queue_timeout.data = (unsigned long)netif;
++ netif->tx_queue_timeout.function = tx_queue_callback;
++ mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
++ }
++ }
++ skb_queue_tail(&netbk->rx_queue, skb);
++
++ xen_netbk_bh_handler(netbk, 1);
++
++ return 0;
++
++ drop:
++ netif->stats.tx_dropped++;
++ dev_kfree_skb(skb);
++ return 0;
++}
++
++struct netrx_pending_operations {
++ unsigned copy_prod, copy_cons;
++ unsigned meta_prod, meta_cons;
++ struct gnttab_copy *copy;
++ struct netbk_rx_meta *meta;
++ int copy_off;
++ grant_ref_t copy_gref;
++};
++
++/* Set up the grant copy operations for this fragment. */
++
++static void netbk_gop_frag_copy(struct xen_netif *netif,
++ struct netrx_pending_operations *npo,
++ struct page *page, unsigned long size,
++ unsigned long offset, int head)
++{
++ struct gnttab_copy *copy_gop;
++ struct netbk_rx_meta *meta;
++ /*
++ * These variables are used iff netif_get_page_ext returns true,
++ * in which case they are guaranteed to be initialized.
++ */
++ unsigned int uninitialized_var(group), uninitialized_var(idx);
++ int foreign = netif_get_page_ext(page, &group, &idx);
++ unsigned long bytes;
++
++ /* Data must not cross a page boundary. */
++ BUG_ON(size + offset > PAGE_SIZE);
++
++ meta = npo->meta + npo->meta_prod - 1;
++
++ while (size > 0) {
++ BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
++
++ /*
++ * Move to a new receive buffer if:
++ *
++ * simple case: we have completely filled the current buffer.
++ *
++ * complex case: the current frag would overflow
++ * the current buffer but only if:
++ * (i) this frag would fit completely in the next buffer
++ * and (ii) there is already some data in the current buffer
++ * and (iii) this is not the head buffer.
++ *
++ * Where:
++ * - (i) stops us splitting a frag into two copies
++ * unless the frag is too large for a single buffer.
++ * - (ii) stops us from leaving a buffer pointlessly empty.
++ * - (iii) stops us leaving the first buffer
++ * empty. Strictly speaking this is already covered
++ * by (ii) but is explicitly checked because
++ * netfront relies on the first buffer being
++ * non-empty and can crash otherwise.
++ *
++ * This means we will effectively linearise small
++ * frags but do not needlessly split large buffers
++ * into multiple copies; large frags tend to get
++ * their own buffers as before.
++ */
++ if (npo->copy_off == MAX_BUFFER_OFFSET
++ || ((npo->copy_off + size > MAX_BUFFER_OFFSET) && (size <= MAX_BUFFER_OFFSET) && npo->copy_off && !head)) {
++ struct xen_netif_rx_request *req;
++
++ BUG_ON(head); /* Netfront requires there to be some data in the head buffer. */
++ /* Overflowed this request, go to the next one */
++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++);
++ meta = npo->meta + npo->meta_prod++;
++ meta->gso_size = 0;
++ meta->size = 0;
++ meta->id = req->id;
++ npo->copy_off = 0;
++ npo->copy_gref = req->gref;
++ }
++
++ bytes = size;
++ if (npo->copy_off + bytes > MAX_BUFFER_OFFSET)
++ bytes = MAX_BUFFER_OFFSET - npo->copy_off;
++
++ copy_gop = npo->copy + npo->copy_prod++;
++ copy_gop->flags = GNTCOPY_dest_gref;
++ if (foreign) {
++ struct xen_netbk *netbk = &xen_netbk[group];
++ struct pending_tx_info *src_pend;
++
++ src_pend = &netbk->pending_tx_info[idx];
++
++ copy_gop->source.domid = src_pend->netif->domid;
++ copy_gop->source.u.ref = src_pend->req.gref;
++ copy_gop->flags |= GNTCOPY_source_gref;
++ } else {
++ copy_gop->source.domid = DOMID_SELF;
++ copy_gop->source.u.gmfn = virt_to_mfn(page_address(page));
++ }
++ copy_gop->source.offset = offset;
++ copy_gop->dest.domid = netif->domid;
++
++ copy_gop->dest.offset = npo->copy_off;
++ copy_gop->dest.u.ref = npo->copy_gref;
++ copy_gop->len = bytes;
++
++ npo->copy_off += bytes;
++ meta->size += bytes;
++
++ offset += bytes;
++ size -= bytes;
++ head = 0; /* Must be something in this buffer now */
++ }
++}
++
++/* Prepare an SKB to be transmitted to the frontend. This is
++ responsible for allocating grant operations, meta structures, etc.
++ It returns the number of meta structures consumed. The number of
++ ring slots used is always equal to the number of meta slots used
++ plus the number of GSO descriptors used. Currently, we use either
++ zero GSO descriptors (for non-GSO packets) or one descriptor (for
++ frontend-side LRO). */
++static int netbk_gop_skb(struct sk_buff *skb,
++ struct netrx_pending_operations *npo)
++{
++ struct xen_netif *netif = netdev_priv(skb->dev);
++ int nr_frags = skb_shinfo(skb)->nr_frags;
++ int i;
++ struct xen_netif_rx_request *req;
++ struct netbk_rx_meta *meta;
++ int old_meta_prod;
++
++ old_meta_prod = npo->meta_prod;
++
++ /* Set up a GSO prefix descriptor, if necessary */
++ if (skb_shinfo(skb)->gso_size && netif->gso_prefix) {
++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++);
++ meta = npo->meta + npo->meta_prod++;
++ meta->gso_size = skb_shinfo(skb)->gso_size;
++ meta->size = 0;
++ meta->id = req->id;
++ }
++
++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons++);
++ meta = npo->meta + npo->meta_prod++;
++
++ if (!netif->gso_prefix)
++ meta->gso_size = skb_shinfo(skb)->gso_size;
++ else
++ meta->gso_size = 0;
++
++ meta->size = 0;
++ meta->id = req->id;
++ npo->copy_off = 0;
++ npo->copy_gref = req->gref;
++
++ netbk_gop_frag_copy(netif,
++ npo, virt_to_page(skb->data),
++ skb_headlen(skb),
++ offset_in_page(skb->data), 1);
++
++ /* Leave a gap for the GSO descriptor. */
++ if (skb_shinfo(skb)->gso_size && !netif->gso_prefix)
++ netif->rx.req_cons++;
++
++ for (i = 0; i < nr_frags; i++) {
++ netbk_gop_frag_copy(netif, npo,
++ skb_shinfo(skb)->frags[i].page,
++ skb_shinfo(skb)->frags[i].size,
++ skb_shinfo(skb)->frags[i].page_offset,
++ 0);
++ }
++
++ return npo->meta_prod - old_meta_prod;
++}
++
++/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was
++ used to set up the operations on the top of
++ netrx_pending_operations, which have since been done. Check that
++ they didn't give any errors and advance over them. */
++static int netbk_check_gop(int nr_meta_slots, domid_t domid,
++ struct netrx_pending_operations *npo)
++{
++ struct gnttab_copy *copy_op;
++ int status = NETIF_RSP_OKAY;
++ int i;
++
++ for (i = 0; i < nr_meta_slots; i++) {
++ copy_op = npo->copy + npo->copy_cons++;
++ if (copy_op->status != GNTST_okay) {
++ DPRINTK("Bad status %d from copy to DOM%d.\n",
++ copy_op->status, domid);
++ status = NETIF_RSP_ERROR;
++ }
++ }
++
++ return status;
++}
++
++static void netbk_add_frag_responses(struct xen_netif *netif, int status,
++ struct netbk_rx_meta *meta,
++ int nr_meta_slots)
++{
++ int i;
++ unsigned long offset;
++
++ for (i = 0; i < nr_meta_slots; i++) {
++ int flags;
++ if (i == nr_meta_slots - 1)
++ flags = 0;
++ else
++ flags = NETRXF_more_data;
++
++ offset = 0;
++ make_rx_response(netif, meta[i].id, status, offset,
++ meta[i].size, flags);
++ }
++}
++
++struct skb_cb_overlay {
++ int meta_slots_used;
++};
++
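++/*
++ * RX bottom half: dequeue skbs destined for frontends, build a batch of
++ * grant-copy operations for their data, issue one GNTTABOP_copy, then write
++ * the rx responses and notify the frontends that need an event.
++ */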
++static void net_rx_action(unsigned long data)
++{
++ struct xen_netif *netif = NULL;
++ struct xen_netbk *netbk = (struct xen_netbk *)data;
++ s8 status;
++ u16 irq, flags;
++ struct xen_netif_rx_response *resp;
++ struct sk_buff_head rxq;
++ struct sk_buff *skb;
++ int notify_nr = 0;
++ int ret;
++ int nr_frags;
++ int count;
++ unsigned long offset;
++ struct skb_cb_overlay *sco;
++
++ struct netrx_pending_operations npo = {
++ .copy = netbk->grant_copy_op,
++ .meta = netbk->meta,
++ };
++
++ skb_queue_head_init(&rxq);
++
++ count = 0;
++
++ while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) {
++ netif = netdev_priv(skb->dev);
++ nr_frags = skb_shinfo(skb)->nr_frags;
++
++ sco = (struct skb_cb_overlay *)skb->cb;
++ sco->meta_slots_used = netbk_gop_skb(skb, &npo);
++
++ count += nr_frags + 1;
++
++ __skb_queue_tail(&rxq, skb);
++
++ /* Filled the batch queue? */
++ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
++ break;
++ }
++
++ BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta));
++
++ if (!npo.copy_prod)
++ return;
++
++ BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, &netbk->grant_copy_op,
++ npo.copy_prod);
++ BUG_ON(ret != 0);
++
++ while ((skb = __skb_dequeue(&rxq)) != NULL) {
++ sco = (struct skb_cb_overlay *)skb->cb;
++
++ netif = netdev_priv(skb->dev);
++
++ if (netbk->meta[npo.meta_cons].gso_size && netif->gso_prefix) {
++ resp = RING_GET_RESPONSE(&netif->rx,
++ netif->rx.rsp_prod_pvt++);
++
++ resp->flags = NETRXF_gso_prefix | NETRXF_more_data;
++
++ resp->offset = netbk->meta[npo.meta_cons].gso_size;
++ resp->id = netbk->meta[npo.meta_cons].id;
++ resp->status = sco->meta_slots_used;
++
++ npo.meta_cons++;
++ sco->meta_slots_used--;
++ }
++
++
++ netif->stats.tx_bytes += skb->len;
++ netif->stats.tx_packets++;
++
++ status = netbk_check_gop(sco->meta_slots_used,
++ netif->domid, &npo);
++
++ if (sco->meta_slots_used == 1)
++ flags = 0;
++ else
++ flags = NETRXF_more_data;
++
++ if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
++ flags |= NETRXF_csum_blank | NETRXF_data_validated;
++ else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
++ /* remote but checksummed. */
++ flags |= NETRXF_data_validated;
++
++ offset = 0;
++ resp = make_rx_response(netif, netbk->meta[npo.meta_cons].id,
++ status, offset,
++ netbk->meta[npo.meta_cons].size,
++ flags);
++
++ if (netbk->meta[npo.meta_cons].gso_size && !netif->gso_prefix) {
++ struct xen_netif_extra_info *gso =
++ (struct xen_netif_extra_info *)
++ RING_GET_RESPONSE(&netif->rx,
++ netif->rx.rsp_prod_pvt++);
++
++ resp->flags |= NETRXF_extra_info;
++
++ gso->u.gso.size = netbk->meta[npo.meta_cons].gso_size;
++ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
++ gso->u.gso.pad = 0;
++ gso->u.gso.features = 0;
++
++ gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
++ gso->flags = 0;
++ }
++
++ if (sco->meta_slots_used > 1) {
++ netbk_add_frag_responses(netif, status,
++ netbk->meta + npo.meta_cons + 1,
++ sco->meta_slots_used - 1);
++ }
++
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
++ irq = netif->irq;
++ if (ret && !netbk->rx_notify[irq] &&
++ (netif->smart_poll != 1)) {
++ netbk->rx_notify[irq] = 1;
++ netbk->notify_list[notify_nr++] = irq;
++ }
++
++ if (netif_queue_stopped(netif->dev) &&
++ netif_schedulable(netif) &&
++ !netbk_queue_full(netif))
++ netif_wake_queue(netif->dev);
++
++ /*
++ * netfront_smartpoll_active indicates whether
++ * netfront timer is active.
++ */
++ if ((netif->smart_poll == 1) &&
++ !(netif->rx.sring->private.netif.smartpoll_active)) {
++ notify_remote_via_irq(irq);
++ netif->rx.sring->private.netif.smartpoll_active = 1;
++ }
++
++ netif_put(netif);
++ npo.meta_cons += sco->meta_slots_used;
++ dev_kfree_skb(skb);
++ }
++
++ while (notify_nr != 0) {
++ irq = netbk->notify_list[--notify_nr];
++ netbk->rx_notify[irq] = 0;
++ notify_remote_via_irq(irq);
++ }
++
++ /* More work to do? */
++ if (!skb_queue_empty(&netbk->rx_queue) &&
++ !timer_pending(&netbk->net_timer))
++ xen_netbk_bh_handler(netbk, 1);
++}
++
++static void net_alarm(unsigned long data)
++{
++ struct xen_netbk *netbk = (struct xen_netbk *)data;
++ xen_netbk_bh_handler(netbk, 1);
++}
++
++static void netbk_tx_pending_timeout(unsigned long data)
++{
++ struct xen_netbk *netbk = (struct xen_netbk *)data;
++ xen_netbk_bh_handler(netbk, 0);
++}
++
++struct net_device_stats *netif_be_get_stats(struct net_device *dev)
++{
++ struct xen_netif *netif = netdev_priv(dev);
++ return &netif->stats;
++}
++
++static int __on_net_schedule_list(struct xen_netif *netif)
++{
++ return !list_empty(&netif->list);
++}
++
++static void remove_from_net_schedule_list(struct xen_netif *netif)
++{
++ struct xen_netbk *netbk = &xen_netbk[netif->group];
++ spin_lock_irq(&netbk->net_schedule_list_lock);
++ if (likely(__on_net_schedule_list(netif))) {
++ list_del_init(&netif->list);
++ netif_put(netif);
++ }
++ spin_unlock_irq(&netbk->net_schedule_list_lock);
++}
++
++static void add_to_net_schedule_list_tail(struct xen_netif *netif)
++{
++ unsigned long flags;
++
++ struct xen_netbk *netbk = &xen_netbk[netif->group];
++ if (__on_net_schedule_list(netif))
++ return;
++
++ spin_lock_irqsave(&netbk->net_schedule_list_lock, flags);
++ if (!__on_net_schedule_list(netif) &&
++ likely(netif_schedulable(netif))) {
++ list_add_tail(&netif->list, &netbk->net_schedule_list);
++ netif_get(netif);
++ }
++ spin_unlock_irqrestore(&netbk->net_schedule_list_lock, flags);
++}
++
++void netif_schedule_work(struct xen_netif *netif)
++{
++ struct xen_netbk *netbk = &xen_netbk[netif->group];
++ int more_to_do;
++
++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
++
++ if (more_to_do) {
++ add_to_net_schedule_list_tail(netif);
++ maybe_schedule_tx_action(netbk);
++ }
++}
++
++void netif_deschedule_work(struct xen_netif *netif)
++{
++ remove_from_net_schedule_list(netif);
++}
++
++
++static void tx_add_credit(struct xen_netif *netif)
++{
++ unsigned long max_burst, max_credit;
++
++ /*
++ * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
++ * Otherwise the interface can seize up due to insufficient credit.
++ */
++ max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
++ max_burst = min(max_burst, 131072UL);
++ max_burst = max(max_burst, netif->credit_bytes);
++
++ /* Take care that adding a new chunk of credit doesn't wrap to zero. */
++ max_credit = netif->remaining_credit + netif->credit_bytes;
++ if (max_credit < netif->remaining_credit)
++ max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++
++ netif->remaining_credit = min(max_credit, max_burst);
++}
++
++static void tx_credit_callback(unsigned long data)
++{
++ struct xen_netif *netif = (struct xen_netif *)data;
++ tx_add_credit(netif);
++ netif_schedule_work(netif);
++}
++
++static inline int copy_pending_req(struct xen_netbk *netbk,
++ pending_ring_idx_t pending_idx)
++{
++ return gnttab_copy_grant_page(
++ netbk->grant_tx_handle[pending_idx],
++ &netbk->mmap_pages[pending_idx]);
++}
++
++static inline void net_tx_action_dealloc(struct xen_netbk *netbk)
++{
++ struct netbk_tx_pending_inuse *inuse, *n;
++ struct gnttab_unmap_grant_ref *gop;
++ u16 pending_idx;
++ pending_ring_idx_t dc, dp;
++ struct xen_netif *netif;
++ int ret;
++ LIST_HEAD(list);
++
++ dc = netbk->dealloc_cons;
++ gop = netbk->tx_unmap_ops;
++
++ /*
++ * Free up any grants we have finished using
++ */
++ do {
++ dp = netbk->dealloc_prod;
++
++ /* Ensure we see all indices enqueued by netif_idx_release(). */
++ smp_rmb();
++
++ while (dc != dp) {
++ unsigned long pfn;
++ struct netbk_tx_pending_inuse *pending_inuse =
++ netbk->pending_inuse;
++
++ pending_idx = netbk->dealloc_ring[pending_index(dc++)];
++ list_move_tail(&pending_inuse[pending_idx].list, &list);
++
++ pfn = idx_to_pfn(netbk, pending_idx);
++ /* Already unmapped? */
++ if (!phys_to_machine_mapping_valid(pfn))
++ continue;
++
++ gnttab_set_unmap_op(gop,
++ idx_to_kaddr(netbk, pending_idx),
++ GNTMAP_host_map,
++ netbk->grant_tx_handle[pending_idx]);
++ gop++;
++ }
++
++ if (netbk_copy_skb_mode != NETBK_DELAYED_COPY_SKB ||
++ list_empty(&netbk->pending_inuse_head))
++ break;
++
++ /* Copy any entries that have been pending for too long. */
++ list_for_each_entry_safe(inuse, n,
++ &netbk->pending_inuse_head, list) {
++ struct pending_tx_info *pending_tx_info;
++ pending_tx_info = netbk->pending_tx_info;
++
++ if (time_after(inuse->alloc_time + HZ / 2, jiffies))
++ break;
++
++ pending_idx = inuse - netbk->pending_inuse;
++
++ pending_tx_info[pending_idx].netif->nr_copied_skbs++;
++
++ switch (copy_pending_req(netbk, pending_idx)) {
++ case 0:
++ list_move_tail(&inuse->list, &list);
++ continue;
++ case -EBUSY:
++ list_del_init(&inuse->list);
++ continue;
++ case -ENOENT:
++ continue;
++ }
++
++ break;
++ }
++ } while (dp != netbk->dealloc_prod);
++
++ netbk->dealloc_cons = dc;
++
++ ret = HYPERVISOR_grant_table_op(
++ GNTTABOP_unmap_grant_ref, netbk->tx_unmap_ops,
++ gop - netbk->tx_unmap_ops);
++ BUG_ON(ret);
++
++ list_for_each_entry_safe(inuse, n, &list, list) {
++ struct pending_tx_info *pending_tx_info;
++ pending_ring_idx_t index;
++
++ pending_tx_info = netbk->pending_tx_info;
++ pending_idx = inuse - netbk->pending_inuse;
++
++ netif = pending_tx_info[pending_idx].netif;
++
++ make_tx_response(netif, &pending_tx_info[pending_idx].req,
++ NETIF_RSP_OKAY);
++
++ /* Ready for next use. */
++ gnttab_reset_grant_page(netbk->mmap_pages[pending_idx]);
++
++ index = pending_index(netbk->pending_prod++);
++ netbk->pending_ring[index] = pending_idx;
++
++ netif_put(netif);
++
++ list_del_init(&inuse->list);
++ }
++}
++
++static void netbk_tx_err(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp, RING_IDX end)
++{
++ RING_IDX cons = netif->tx.req_cons;
++
++ do {
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ if (cons >= end)
++ break;
++ txp = RING_GET_REQUEST(&netif->tx, cons++);
++ } while (1);
++ netif->tx.req_cons = cons;
++ netif_schedule_work(netif);
++ netif_put(netif);
++}
++
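++/*
++ * Copy the chain of follow-on tx requests that make up a multi-fragment
++ * packet into txp[], validating each one; returns the number of fragments,
++ * or a negative value if the chain is malformed.
++ */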
++static int netbk_count_requests(struct xen_netif *netif,
++ struct xen_netif_tx_request *first,
++ struct xen_netif_tx_request *txp, int work_to_do)
++{
++ RING_IDX cons = netif->tx.req_cons;
++ int frags = 0;
++
++ if (!(first->flags & NETTXF_more_data))
++ return 0;
++
++ do {
++ if (frags >= work_to_do) {
++ DPRINTK("Need more frags\n");
++ return -frags;
++ }
++
++ if (unlikely(frags >= MAX_SKB_FRAGS)) {
++ DPRINTK("Too many frags\n");
++ return -frags;
++ }
++
++ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
++ sizeof(*txp));
++ if (txp->size > first->size) {
++ DPRINTK("Frags galore\n");
++ return -frags;
++ }
++
++ first->size -= txp->size;
++ frags++;
++
++ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
++ DPRINTK("txp->offset: %x, size: %u\n",
++ txp->offset, txp->size);
++ return -frags;
++ }
++ } while ((txp++)->flags & NETTXF_more_data);
++
++ return frags;
++}
++
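++/* Reserve a pending slot and queue a grant-map operation for each fragment of the packet. */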
++static struct gnttab_map_grant_ref *netbk_get_requests(struct xen_netbk *netbk,
++ struct xen_netif *netif,
++ struct sk_buff *skb,
++ struct xen_netif_tx_request *txp,
++ struct gnttab_map_grant_ref *mop)
++{
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ skb_frag_t *frags = shinfo->frags;
++ unsigned long pending_idx = *((u16 *)skb->data);
++ int i, start;
++
++ /* Skip first skb fragment if it is on same page as header fragment. */
++ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++
++ for (i = start; i < shinfo->nr_frags; i++, txp++) {
++ pending_ring_idx_t index;
++ struct pending_tx_info *pending_tx_info =
++ netbk->pending_tx_info;
++
++ index = pending_index(netbk->pending_cons++);
++ pending_idx = netbk->pending_ring[index];
++
++ gnttab_set_map_op(mop++, idx_to_kaddr(netbk, pending_idx),
++ GNTMAP_host_map | GNTMAP_readonly,
++ txp->gref, netif->domid);
++
++ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
++ netif_get(netif);
++ pending_tx_info[pending_idx].netif = netif;
++ frags[i].page = (void *)pending_idx;
++ }
++
++ return mop;
++}
++
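++/*
++ * Check the results of the grant-map operations issued for an skb: remember
++ * the grant handles that succeeded, and on error respond to the frontend
++ * and release any fragments that were already mapped.
++ */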
++static int netbk_tx_check_mop(struct xen_netbk *netbk,
++ struct sk_buff *skb,
++ struct gnttab_map_grant_ref **mopp)
++{
++ struct gnttab_map_grant_ref *mop = *mopp;
++ int pending_idx = *((u16 *)skb->data);
++ struct pending_tx_info *pending_tx_info = netbk->pending_tx_info;
++ struct xen_netif *netif = pending_tx_info[pending_idx].netif;
++ struct xen_netif_tx_request *txp;
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ int i, err, start;
++
++ /* Check status of header. */
++ err = mop->status;
++ if (unlikely(err)) {
++ pending_ring_idx_t index;
++ index = pending_index(netbk->pending_prod++);
++ txp = &pending_tx_info[pending_idx].req;
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ netbk->pending_ring[index] = pending_idx;
++ netif_put(netif);
++ } else {
++ set_phys_to_machine(
++ __pa(idx_to_kaddr(netbk, pending_idx)) >> PAGE_SHIFT,
++ FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
++ netbk->grant_tx_handle[pending_idx] = mop->handle;
++ }
++
++ /* Skip first skb fragment if it is on same page as header fragment. */
++ start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++
++ for (i = start; i < nr_frags; i++) {
++ int j, newerr;
++ pending_ring_idx_t index;
++
++ pending_idx = (unsigned long)shinfo->frags[i].page;
++
++ /* Check error status: if okay then remember grant handle. */
++ newerr = (++mop)->status;
++ if (likely(!newerr)) {
++ unsigned long addr;
++ addr = idx_to_kaddr(netbk, pending_idx);
++ set_phys_to_machine(
++ __pa(addr)>>PAGE_SHIFT,
++ FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
++ netbk->grant_tx_handle[pending_idx] = mop->handle;
++ /* Had a previous error? Invalidate this fragment. */
++ if (unlikely(err))
++ netif_idx_release(netbk, pending_idx);
++ continue;
++ }
++
++ /* Error on this fragment: respond to client with an error. */
++ txp = &netbk->pending_tx_info[pending_idx].req;
++ make_tx_response(netif, txp, NETIF_RSP_ERROR);
++ index = pending_index(netbk->pending_prod++);
++ netbk->pending_ring[index] = pending_idx;
++ netif_put(netif);
++
++ /* Not the first error? Preceding frags already invalidated. */
++ if (err)
++ continue;
++
++ /* First error: invalidate header and preceding fragments. */
++ pending_idx = *((u16 *)skb->data);
++ netif_idx_release(netbk, pending_idx);
++ for (j = start; j < i; j++) {
++			pending_idx = (unsigned long)shinfo->frags[j].page;
++ netif_idx_release(netbk, pending_idx);
++ }
++
++ /* Remember the error: invalidate all subsequent fragments. */
++ err = newerr;
++ }
++
++ *mopp = mop + 1;
++ return err;
++}
++
++static void netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb)
++{
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ int i;
++
++ for (i = 0; i < nr_frags; i++) {
++ skb_frag_t *frag = shinfo->frags + i;
++ struct xen_netif_tx_request *txp;
++ unsigned long pending_idx;
++
++ pending_idx = (unsigned long)frag->page;
++
++ netbk->pending_inuse[pending_idx].alloc_time = jiffies;
++ list_add_tail(&netbk->pending_inuse[pending_idx].list,
++ &netbk->pending_inuse_head);
++
++ txp = &netbk->pending_tx_info[pending_idx].req;
++ frag->page = virt_to_page(idx_to_kaddr(netbk, pending_idx));
++ frag->size = txp->size;
++ frag->page_offset = txp->offset;
++
++ skb->len += txp->size;
++ skb->data_len += txp->size;
++ skb->truesize += txp->size;
++ }
++}
++
++int netbk_get_extras(struct xen_netif *netif, struct xen_netif_extra_info *extras,
++ int work_to_do)
++{
++ struct xen_netif_extra_info extra;
++ RING_IDX cons = netif->tx.req_cons;
++
++ do {
++ if (unlikely(work_to_do-- <= 0)) {
++ DPRINTK("Missing extra info\n");
++ return -EBADR;
++ }
++
++ memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
++ sizeof(extra));
++ if (unlikely(!extra.type ||
++ extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
++ netif->tx.req_cons = ++cons;
++ DPRINTK("Invalid extra type: %d\n", extra.type);
++ return -EINVAL;
++ }
++
++ memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
++ netif->tx.req_cons = ++cons;
++ } while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
++
++ return work_to_do;
++}
++
++static int netbk_set_skb_gso(struct sk_buff *skb, struct xen_netif_extra_info *gso)
++{
++ if (!gso->u.gso.size) {
++ DPRINTK("GSO size must not be zero.\n");
++ return -EINVAL;
++ }
++
++ /* Currently only TCPv4 S.O. is supported. */
++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
++ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
++ return -EINVAL;
++ }
++
++ skb_shinfo(skb)->gso_size = gso->u.gso.size;
++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
++
++ /* Header must be checked, and gso_segs computed. */
++ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
++ skb_shinfo(skb)->gso_segs = 0;
++
++ return 0;
++}
++
++static int skb_checksum_setup(struct sk_buff *skb)
++{
++ struct iphdr *iph;
++ unsigned char *th;
++ int err = -EPROTO;
++
++ if (skb->protocol != htons(ETH_P_IP))
++ goto out;
++
++ iph = (void *)skb->data;
++ th = skb->data + 4 * iph->ihl;
++ if (th >= skb_tail_pointer(skb))
++ goto out;
++
++ skb->csum_start = th - skb->head;
++ switch (iph->protocol) {
++ case IPPROTO_TCP:
++ skb->csum_offset = offsetof(struct tcphdr, check);
++ break;
++ case IPPROTO_UDP:
++ skb->csum_offset = offsetof(struct udphdr, check);
++ break;
++ default:
++ if (net_ratelimit())
++ printk(KERN_ERR "Attempting to checksum a non-"
++ "TCP/UDP packet, dropping a protocol"
++ " %d packet", iph->protocol);
++ goto out;
++ }
++
++ if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
++ goto out;
++
++ err = 0;
++
++out:
++ return err;
++}
++
++static bool tx_credit_exceeded(struct xen_netif *netif, unsigned size)
++{
++ unsigned long now = jiffies;
++ unsigned long next_credit =
++ netif->credit_timeout.expires +
++ msecs_to_jiffies(netif->credit_usec / 1000);
++
++ /* Timer could already be pending in rare cases. */
++ if (timer_pending(&netif->credit_timeout))
++ return true;
++
++ /* Passed the point where we can replenish credit? */
++ if (time_after_eq(now, next_credit)) {
++ netif->credit_timeout.expires = now;
++ tx_add_credit(netif);
++ }
++
++ /* Still too big to send right now? Set a callback. */
++ if (size > netif->remaining_credit) {
++ netif->credit_timeout.data =
++ (unsigned long)netif;
++ netif->credit_timeout.function =
++ tx_credit_callback;
++ mod_timer(&netif->credit_timeout,
++ next_credit);
++
++ return true;
++ }
++
++ return false;
++}
++
++static unsigned net_tx_build_mops(struct xen_netbk *netbk)
++{
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
++ int ret;
++
++ mop = netbk->tx_map_ops;
++ while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
++ !list_empty(&netbk->net_schedule_list)) {
++ struct xen_netif *netif;
++ struct xen_netif_tx_request txreq;
++ struct xen_netif_tx_request txfrags[MAX_SKB_FRAGS];
++ struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
++ u16 pending_idx;
++ RING_IDX idx;
++ int work_to_do;
++ unsigned int data_len;
++ pending_ring_idx_t index;
++
++ /* Get a netif from the list with work to do. */
++ netif = list_first_entry(&netbk->net_schedule_list,
++ struct xen_netif, list);
++ netif_get(netif);
++ remove_from_net_schedule_list(netif);
++
++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
++ if (!work_to_do) {
++ netif_put(netif);
++ continue;
++ }
++
++ idx = netif->tx.req_cons;
++ rmb(); /* Ensure that we see the request before we copy it. */
++ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, idx), sizeof(txreq));
++
++ /* Credit-based scheduling. */
++ if (txreq.size > netif->remaining_credit &&
++ tx_credit_exceeded(netif, txreq.size)) {
++ netif_put(netif);
++ continue;
++ }
++
++ netif->remaining_credit -= txreq.size;
++
++ work_to_do--;
++ netif->tx.req_cons = ++idx;
++
++ memset(extras, 0, sizeof(extras));
++ if (txreq.flags & NETTXF_extra_info) {
++ work_to_do = netbk_get_extras(netif, extras,
++ work_to_do);
++ idx = netif->tx.req_cons;
++ if (unlikely(work_to_do < 0)) {
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++ }
++
++ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
++ if (unlikely(ret < 0)) {
++ netbk_tx_err(netif, &txreq, idx - ret);
++ continue;
++ }
++ idx += ret;
++
++ if (unlikely(txreq.size < ETH_HLEN)) {
++ DPRINTK("Bad packet size: %d\n", txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++
++ /* No crossing a page as the payload mustn't fragment. */
++ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
++ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n",
++ txreq.offset, txreq.size,
++ (txreq.offset &~PAGE_MASK) + txreq.size);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++
++ index = pending_index(netbk->pending_cons);
++ pending_idx = netbk->pending_ring[index];
++
++ data_len = (txreq.size > PKT_PROT_LEN &&
++ ret < MAX_SKB_FRAGS) ?
++ PKT_PROT_LEN : txreq.size;
++
++ skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
++ GFP_ATOMIC | __GFP_NOWARN);
++ if (unlikely(skb == NULL)) {
++ DPRINTK("Can't allocate a skb in start_xmit.\n");
++ netbk_tx_err(netif, &txreq, idx);
++ break;
++ }
++
++ /* Packets passed to netif_rx() must have some headroom. */
++ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
++
++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
++ struct xen_netif_extra_info *gso;
++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
++
++ if (netbk_set_skb_gso(skb, gso)) {
++ kfree_skb(skb);
++ netbk_tx_err(netif, &txreq, idx);
++ continue;
++ }
++ }
++
++ gnttab_set_map_op(mop, idx_to_kaddr(netbk, pending_idx),
++ GNTMAP_host_map | GNTMAP_readonly,
++ txreq.gref, netif->domid);
++ mop++;
++
++ memcpy(&netbk->pending_tx_info[pending_idx].req,
++ &txreq, sizeof(txreq));
++ netbk->pending_tx_info[pending_idx].netif = netif;
++ *((u16 *)skb->data) = pending_idx;
++
++ __skb_put(skb, data_len);
++
++ skb_shinfo(skb)->nr_frags = ret;
++ if (data_len < txreq.size) {
++ skb_shinfo(skb)->nr_frags++;
++ skb_shinfo(skb)->frags[0].page =
++ (void *)(unsigned long)pending_idx;
++ } else {
++ /* Discriminate from any valid pending_idx value. */
++ skb_shinfo(skb)->frags[0].page = (void *)~0UL;
++ }
++
++ __skb_queue_tail(&netbk->tx_queue, skb);
++
++ netbk->pending_cons++;
++
++ mop = netbk_get_requests(netbk, netif, skb, txfrags, mop);
++
++ netif->tx.req_cons = idx;
++ netif_schedule_work(netif);
++
++ if ((mop - netbk->tx_map_ops) >= ARRAY_SIZE(netbk->tx_map_ops))
++ break;
++ }
++
++ return mop - netbk->tx_map_ops;
++}
++
++static void net_tx_submit(struct xen_netbk *netbk)
++{
++ struct gnttab_map_grant_ref *mop;
++ struct sk_buff *skb;
++
++ mop = netbk->tx_map_ops;
++ while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) {
++ struct xen_netif_tx_request *txp;
++ struct xen_netif *netif;
++ u16 pending_idx;
++ unsigned data_len;
++
++ pending_idx = *((u16 *)skb->data);
++ netif = netbk->pending_tx_info[pending_idx].netif;
++ txp = &netbk->pending_tx_info[pending_idx].req;
++
++ /* Check the remap error code. */
++ if (unlikely(netbk_tx_check_mop(netbk, skb, &mop))) {
++ DPRINTK("netback grant failed.\n");
++ skb_shinfo(skb)->nr_frags = 0;
++ kfree_skb(skb);
++ continue;
++ }
++
++ data_len = skb->len;
++ memcpy(skb->data,
++ (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset),
++ data_len);
++ if (data_len < txp->size) {
++ /* Append the packet payload as a fragment. */
++ txp->offset += data_len;
++ txp->size -= data_len;
++ } else {
++ /* Schedule a response immediately. */
++ netif_idx_release(netbk, pending_idx);
++ }
++
++ if (txp->flags & NETTXF_csum_blank)
++ skb->ip_summed = CHECKSUM_PARTIAL;
++ else if (txp->flags & NETTXF_data_validated)
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++
++ netbk_fill_frags(netbk, skb);
++
++ /*
++ * If the initial fragment was < PKT_PROT_LEN then
++ * pull through some bytes from the other fragments to
++ * increase the linear region to PKT_PROT_LEN bytes.
++ */
++ if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
++ int target = min_t(int, skb->len, PKT_PROT_LEN);
++ __pskb_pull_tail(skb, target - skb_headlen(skb));
++ }
++
++ skb->dev = netif->dev;
++ skb->protocol = eth_type_trans(skb, skb->dev);
++
++ netif->stats.rx_bytes += skb->len;
++ netif->stats.rx_packets++;
++
++ if (skb->ip_summed == CHECKSUM_PARTIAL) {
++ if (skb_checksum_setup(skb)) {
++ DPRINTK("Can't setup checksum in net_tx_action\n");
++ kfree_skb(skb);
++ continue;
++ }
++ }
++
++ if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
++ unlikely(skb_linearize(skb))) {
++ DPRINTK("Can't linearize skb in net_tx_action.\n");
++ kfree_skb(skb);
++ continue;
++ }
++
++ netif_rx_ni(skb);
++ netif->dev->last_rx = jiffies;
++ }
++}
++
++/* Called after netfront has transmitted */
++static void net_tx_action(unsigned long data)
++{
++ struct xen_netbk *netbk = (struct xen_netbk *)data;
++ unsigned nr_mops;
++ int ret;
++
++ net_tx_action_dealloc(netbk);
++
++ nr_mops = net_tx_build_mops(netbk);
++
++ if (nr_mops == 0)
++ goto out;
++
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ netbk->tx_map_ops, nr_mops);
++ BUG_ON(ret);
++
++ net_tx_submit(netbk);
++out:
++ if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
++ !list_empty(&netbk->pending_inuse_head)) {
++ struct netbk_tx_pending_inuse *oldest;
++
++ oldest = list_entry(netbk->pending_inuse_head.next,
++ struct netbk_tx_pending_inuse, list);
++ mod_timer(&netbk->netbk_tx_pending_timer,
++ oldest->alloc_time + HZ);
++ }
++}
++
++static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx)
++{
++ static DEFINE_SPINLOCK(_lock);
++ unsigned long flags;
++ pending_ring_idx_t index;
++
++ spin_lock_irqsave(&_lock, flags);
++ index = pending_index(netbk->dealloc_prod);
++ netbk->dealloc_ring[index] = pending_idx;
++ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
++ smp_wmb();
++ netbk->dealloc_prod++;
++ spin_unlock_irqrestore(&_lock, flags);
++
++ xen_netbk_bh_handler(netbk, 0);
++}
++
++static void netif_page_release(struct page *page, unsigned int order)
++{
++ unsigned int group, idx;
++ int foreign = netif_get_page_ext(page, &group, &idx);
++
++ BUG_ON(!foreign);
++ BUG_ON(order);
++
++ netif_idx_release(&xen_netbk[group], idx);
++}
++
++irqreturn_t netif_be_int(int irq, void *dev_id)
++{
++ struct xen_netif *netif = dev_id;
++ struct xen_netbk *netbk;
++
++ if (netif->group == -1)
++ return IRQ_NONE;
++
++ netbk = &xen_netbk[netif->group];
++
++ add_to_net_schedule_list_tail(netif);
++ maybe_schedule_tx_action(netbk);
++
++ if (netif_schedulable(netif) && !netbk_queue_full(netif))
++ netif_wake_queue(netif->dev);
++
++ return IRQ_HANDLED;
++}
++
++static void make_tx_response(struct xen_netif *netif,
++ struct xen_netif_tx_request *txp,
++ s8 st)
++{
++ RING_IDX i = netif->tx.rsp_prod_pvt;
++ struct xen_netif_tx_response *resp;
++ int notify;
++
++ resp = RING_GET_RESPONSE(&netif->tx, i);
++ resp->id = txp->id;
++ resp->status = st;
++
++ if (txp->flags & NETTXF_extra_info)
++ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL;
++
++ netif->tx.rsp_prod_pvt = ++i;
++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
++
++ /*
++ * netfront_smartpoll_active indicates whether netfront timer
++ * is active.
++ */
++	if (netif->smart_poll == 1) {
++ if (!(netif->rx.sring->private.netif.smartpoll_active)) {
++ notify_remote_via_irq(netif->irq);
++ netif->rx.sring->private.netif.smartpoll_active = 1;
++ }
++ } else if (notify)
++ notify_remote_via_irq(netif->irq);
++}
++
++static struct xen_netif_rx_response *make_rx_response(struct xen_netif *netif,
++ u16 id,
++ s8 st,
++ u16 offset,
++ u16 size,
++ u16 flags)
++{
++ RING_IDX i = netif->rx.rsp_prod_pvt;
++ struct xen_netif_rx_response *resp;
++
++ resp = RING_GET_RESPONSE(&netif->rx, i);
++ resp->offset = offset;
++ resp->flags = flags;
++ resp->id = id;
++ resp->status = (s16)size;
++ if (st < 0)
++ resp->status = (s16)st;
++
++ netif->rx.rsp_prod_pvt = ++i;
++
++ return resp;
++}
++
++#ifdef NETBE_DEBUG_INTERRUPT
++static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
++{
++ struct list_head *ent;
++ struct xen_netif *netif;
++ int i = 0;
++ int group = 0;
++
++ printk(KERN_ALERT "netif_schedule_list:\n");
++
++ for (group = 0; group < xen_netbk_group_nr; group++) {
++ struct xen_netbk *netbk = &xen_netbk[group];
++ spin_lock_irq(&netbk->net_schedule_list_lock);
++ printk(KERN_ALERT "xen_netback group number: %d\n", group);
++ list_for_each(ent, &netbk->net_schedule_list) {
++ netif = list_entry(ent, struct xen_netif, list);
++ printk(KERN_ALERT " %d: private(rx_req_cons=%08x "
++ "rx_resp_prod=%08x\n",
++ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt);
++ printk(KERN_ALERT
++ " tx_req_cons=%08x, tx_resp_prod=%08x)\n",
++ netif->tx.req_cons, netif->tx.rsp_prod_pvt);
++ printk(KERN_ALERT
++ " shared(rx_req_prod=%08x "
++ "rx_resp_prod=%08x\n",
++ netif->rx.sring->req_prod,
++ netif->rx.sring->rsp_prod);
++ printk(KERN_ALERT
++ " rx_event=%08x, tx_req_prod=%08x\n",
++ netif->rx.sring->rsp_event,
++ netif->tx.sring->req_prod);
++ printk(KERN_ALERT
++ " tx_resp_prod=%08x, tx_event=%08x)\n",
++ netif->tx.sring->rsp_prod,
++ netif->tx.sring->rsp_event);
++ i++;
++ }
++ spin_unlock_irq(&netbk->net_schedule_list_lock);
++ }
++
++ printk(KERN_ALERT " ** End of netif_schedule_list **\n");
++
++ return IRQ_HANDLED;
++}
++#endif
++
++static inline int rx_work_todo(struct xen_netbk *netbk)
++{
++ return !skb_queue_empty(&netbk->rx_queue);
++}
++
++static inline int tx_work_todo(struct xen_netbk *netbk)
++{
++ if (netbk->dealloc_cons != netbk->dealloc_prod)
++ return 1;
++
++ if (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
++ !list_empty(&netbk->net_schedule_list))
++ return 1;
++
++ return 0;
++}
++
++static int netbk_action_thread(void *data)
++{
++ struct xen_netbk *netbk = (struct xen_netbk *)data;
++ while (!kthread_should_stop()) {
++ wait_event_interruptible(netbk->kthread.netbk_action_wq,
++ rx_work_todo(netbk)
++ || tx_work_todo(netbk)
++ || kthread_should_stop());
++ cond_resched();
++
++ if (kthread_should_stop())
++ break;
++
++ if (rx_work_todo(netbk))
++ net_rx_action((unsigned long)netbk);
++
++ if (tx_work_todo(netbk))
++ net_tx_action((unsigned long)netbk);
++ }
++
++ return 0;
++}
++
++static int __init netback_init(void)
++{
++ int i;
++ struct page *page;
++ int rc = 0;
++ int group;
++
++ if (!xen_pv_domain())
++ return -ENODEV;
++
++ xen_netbk_group_nr = num_online_cpus();
++ xen_netbk = vmalloc(sizeof(struct xen_netbk) * xen_netbk_group_nr);
++ if (!xen_netbk) {
++ printk(KERN_ALERT "%s: out of memory\n", __func__);
++ return -ENOMEM;
++ }
++ memset(xen_netbk, 0, sizeof(struct xen_netbk) * xen_netbk_group_nr);
++
++ /* We can increase reservation by this much in net_rx_action(). */
++// balloon_update_driver_allowance(NET_RX_RING_SIZE);
++
++ for (group = 0; group < xen_netbk_group_nr; group++) {
++ struct xen_netbk *netbk = &xen_netbk[group];
++ skb_queue_head_init(&netbk->rx_queue);
++ skb_queue_head_init(&netbk->tx_queue);
++
++ init_timer(&netbk->net_timer);
++ netbk->net_timer.data = (unsigned long)netbk;
++ netbk->net_timer.function = net_alarm;
++
++ init_timer(&netbk->netbk_tx_pending_timer);
++ netbk->netbk_tx_pending_timer.data = (unsigned long)netbk;
++ netbk->netbk_tx_pending_timer.function =
++ netbk_tx_pending_timeout;
++
++ netbk->mmap_pages =
++ alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
++ if (!netbk->mmap_pages) {
++ printk(KERN_ALERT "%s: out of memory\n", __func__);
++ del_timer(&netbk->netbk_tx_pending_timer);
++ del_timer(&netbk->net_timer);
++ rc = -ENOMEM;
++ goto failed_init;
++ }
++
++ for (i = 0; i < MAX_PENDING_REQS; i++) {
++ page = netbk->mmap_pages[i];
++ SetPageForeign(page, netif_page_release);
++ netif_set_page_ext(page, group, i);
++ INIT_LIST_HEAD(&netbk->pending_inuse[i].list);
++ }
++
++ netbk->pending_cons = 0;
++ netbk->pending_prod = MAX_PENDING_REQS;
++ for (i = 0; i < MAX_PENDING_REQS; i++)
++ netbk->pending_ring[i] = i;
++
++ if (MODPARM_netback_kthread) {
++ init_waitqueue_head(&netbk->kthread.netbk_action_wq);
++ netbk->kthread.task =
++ kthread_create(netbk_action_thread,
++ (void *)netbk,
++ "netback/%u", group);
++
++ if (!IS_ERR(netbk->kthread.task)) {
++ kthread_bind(netbk->kthread.task, group);
++ } else {
++ printk(KERN_ALERT
++			       "kthread_create() failed at netback\n");
++ free_empty_pages_and_pagevec(netbk->mmap_pages,
++ MAX_PENDING_REQS);
++ del_timer(&netbk->netbk_tx_pending_timer);
++ del_timer(&netbk->net_timer);
++ rc = PTR_ERR(netbk->kthread.task);
++ goto failed_init;
++ }
++ } else {
++ tasklet_init(&netbk->tasklet.net_tx_tasklet,
++ net_tx_action,
++ (unsigned long)netbk);
++ tasklet_init(&netbk->tasklet.net_rx_tasklet,
++ net_rx_action,
++ (unsigned long)netbk);
++ }
++
++ INIT_LIST_HEAD(&netbk->pending_inuse_head);
++ INIT_LIST_HEAD(&netbk->net_schedule_list);
++
++ spin_lock_init(&netbk->net_schedule_list_lock);
++
++ atomic_set(&netbk->netfront_count, 0);
++
++ if (MODPARM_netback_kthread)
++ wake_up_process(netbk->kthread.task);
++ }
++
++ netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
++ if (MODPARM_copy_skb) {
++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
++ NULL, 0))
++ netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
++ else
++ netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
++ }
++
++ //netif_accel_init();
++
++ rc = netif_xenbus_init();
++ if (rc)
++ goto failed_init;
++
++#ifdef NETBE_DEBUG_INTERRUPT
++ (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
++ 0,
++ netif_be_dbg,
++ IRQF_SHARED,
++ "net-be-dbg",
++ &netif_be_dbg);
++#endif
++
++ return 0;
++
++failed_init:
++ for (i = 0; i < group; i++) {
++ struct xen_netbk *netbk = &xen_netbk[i];
++ free_empty_pages_and_pagevec(netbk->mmap_pages,
++ MAX_PENDING_REQS);
++ del_timer(&netbk->netbk_tx_pending_timer);
++ del_timer(&netbk->net_timer);
++ if (MODPARM_netback_kthread)
++ kthread_stop(netbk->kthread.task);
++ }
++ vfree(xen_netbk);
++ return rc;
++
++}
++
++module_init(netback_init);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c
+new file mode 100644
+index 0000000..1930f64
+--- /dev/null
++++ b/drivers/xen/netback/xenbus.c
+@@ -0,0 +1,518 @@
++/* Xenbus code for netif backend
++ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++ Copyright (C) 2005 XenSource Ltd
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <xen/xenbus.h>
++#include "common.h"
++
++#if 0
++#undef DPRINTK
++#define DPRINTK(fmt, args...) \
++ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
++#endif
++
++
++static int connect_rings(struct backend_info *);
++static void connect(struct backend_info *);
++static void backend_create_netif(struct backend_info *be);
++static void unregister_hotplug_status_watch(struct backend_info *be);
++
++static int netback_remove(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ //netback_remove_accelerators(be, dev);
++
++ unregister_hotplug_status_watch(be);
++ if (be->netif) {
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
++ }
++ kfree(be);
++ dev_set_drvdata(&dev->dev, NULL);
++ return 0;
++}
++
++
++/**
++ * Entry point to this code when a new device is created. Allocate the basic
++ * structures and switch to InitWait.
++ */
++static int netback_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
++{
++ const char *message;
++ struct xenbus_transaction xbt;
++ int err;
++ int sg;
++ struct backend_info *be = kzalloc(sizeof(struct backend_info),
++ GFP_KERNEL);
++ if (!be) {
++ xenbus_dev_fatal(dev, -ENOMEM,
++ "allocating backend structure");
++ return -ENOMEM;
++ }
++
++ be->dev = dev;
++ dev_set_drvdata(&dev->dev, be);
++
++ sg = 1;
++ if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
++ sg = 0;
++
++ do {
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "starting transaction");
++ goto fail;
++ }
++
++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
++ if (err) {
++ message = "writing feature-sg";
++ goto abort_transaction;
++ }
++
++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
++ "%d", sg);
++ if (err) {
++ message = "writing feature-gso-tcpv4";
++ goto abort_transaction;
++ }
++
++ /* We support rx-copy path. */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-copy", "%d", 1);
++ if (err) {
++ message = "writing feature-rx-copy";
++ goto abort_transaction;
++ }
++
++ /*
++ * We don't support rx-flip path (except old guests who don't
++ * grok this feature flag).
++ */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-rx-flip", "%d", 0);
++ if (err) {
++ message = "writing feature-rx-flip";
++ goto abort_transaction;
++ }
++
++ /* We support data smart poll mechanism */
++ err = xenbus_printf(xbt, dev->nodename,
++ "feature-smart-poll", "%d", 1);
++ if (err) {
++ message = "writing feature-smart-poll";
++ goto abort_transaction;
++ }
++
++ err = xenbus_transaction_end(xbt, 0);
++ } while (err == -EAGAIN);
++
++ if (err) {
++ xenbus_dev_fatal(dev, err, "completing transaction");
++ goto fail;
++ }
++
++ //netback_probe_accelerators(be, dev);
++
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto fail;
++
++ /* This kicks hotplug scripts, so do it immediately. */
++ backend_create_netif(be);
++
++ return 0;
++
++abort_transaction:
++ xenbus_transaction_end(xbt, 1);
++ xenbus_dev_fatal(dev, err, "%s", message);
++fail:
++ DPRINTK("failed");
++ netback_remove(dev);
++ return err;
++}
++
++
++/**
++ * Handle the creation of the hotplug script environment. We add the script
++ * and vif variables to the environment, for the benefit of the vif-* hotplug
++ * scripts.
++ */
++static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
++{
++ struct backend_info *be = dev_get_drvdata(&xdev->dev);
++ char *val;
++
++ DPRINTK("netback_uevent");
++
++ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
++ if (IS_ERR(val)) {
++ int err = PTR_ERR(val);
++ xenbus_dev_fatal(xdev, err, "reading script");
++ return err;
++ }
++ else {
++ if (add_uevent_var(env, "script=%s", val)) {
++ kfree(val);
++ return -ENOMEM;
++ }
++ kfree(val);
++ }
++
++ if (be && be->netif && add_uevent_var(env, "vif=%s", be->netif->dev->name))
++ return -ENOMEM;
++
++ return 0;
++}
++
++
++static void backend_create_netif(struct backend_info *be)
++{
++ int err;
++ long handle;
++ struct xenbus_device *dev = be->dev;
++
++ if (be->netif != NULL)
++ return;
++
++ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
++ if (err != 1) {
++ xenbus_dev_fatal(dev, err, "reading handle");
++ return;
++ }
++
++ be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
++ if (IS_ERR(be->netif)) {
++ err = PTR_ERR(be->netif);
++ be->netif = NULL;
++ xenbus_dev_fatal(dev, err, "creating interface");
++ return;
++ }
++
++ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
++}
++
++
++static void disconnect_backend(struct xenbus_device *dev)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ if (be->netif) {
++ xenbus_rm(XBT_NIL, dev->nodename, "hotplug-status");
++ netif_disconnect(be->netif);
++ be->netif = NULL;
++ }
++}
++
++/**
++ * Callback received when the frontend's state changes.
++ */
++static void frontend_changed(struct xenbus_device *dev,
++ enum xenbus_state frontend_state)
++{
++ struct backend_info *be = dev_get_drvdata(&dev->dev);
++
++ DPRINTK("%s", xenbus_strstate(frontend_state));
++
++ be->frontend_state = frontend_state;
++
++ switch (frontend_state) {
++ case XenbusStateInitialising:
++ if (dev->state == XenbusStateClosed) {
++ printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++ __FUNCTION__, dev->nodename);
++ xenbus_switch_state(dev, XenbusStateInitWait);
++ }
++ break;
++
++ case XenbusStateInitialised:
++ break;
++
++ case XenbusStateConnected:
++ if (dev->state == XenbusStateConnected)
++ break;
++ backend_create_netif(be);
++ if (be->netif)
++ connect(be);
++ break;
++
++ case XenbusStateClosing:
++ if (be->netif)
++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
++ disconnect_backend(dev);
++ xenbus_switch_state(dev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ xenbus_switch_state(dev, XenbusStateClosed);
++ if (xenbus_dev_is_online(dev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ device_unregister(&dev->dev);
++ break;
++
++ default:
++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++ frontend_state);
++ break;
++ }
++}
++
++
++static void xen_net_read_rate(struct xenbus_device *dev,
++ unsigned long *bytes, unsigned long *usec)
++{
++ char *s, *e;
++ unsigned long b, u;
++ char *ratestr;
++
++ /* Default to unlimited bandwidth. */
++ *bytes = ~0UL;
++ *usec = 0;
++
++ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
++ if (IS_ERR(ratestr))
++ return;
++
++ s = ratestr;
++ b = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != ','))
++ goto fail;
++
++ s = e + 1;
++ u = simple_strtoul(s, &e, 10);
++ if ((s == e) || (*e != '\0'))
++ goto fail;
++
++ *bytes = b;
++ *usec = u;
++
++ kfree(ratestr);
++ return;
++
++ fail:
++ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
++ kfree(ratestr);
++}
++
++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
++{
++ char *s, *e, *macstr;
++ int i;
++
++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
++ if (IS_ERR(macstr))
++ return PTR_ERR(macstr);
++
++ for (i = 0; i < ETH_ALEN; i++) {
++ mac[i] = simple_strtoul(s, &e, 16);
++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) {
++ kfree(macstr);
++ return -ENOENT;
++ }
++ s = e+1;
++ }
++
++ kfree(macstr);
++ return 0;
++}
++
++static void unregister_hotplug_status_watch(struct backend_info *be)
++{
++ if (be->have_hotplug_status_watch) {
++ unregister_xenbus_watch(&be->hotplug_status_watch);
++ kfree(be->hotplug_status_watch.node);
++ }
++ be->have_hotplug_status_watch = 0;
++}
++
++static void hotplug_status_changed(struct xenbus_watch *watch,
++ const char **vec,
++ unsigned int vec_size)
++{
++ struct backend_info *be = container_of(watch,
++ struct backend_info,
++ hotplug_status_watch);
++ char *str;
++ unsigned int len;
++
++ str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
++ if (IS_ERR(str))
++ return;
++ if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
++ xenbus_switch_state(be->dev, XenbusStateConnected);
++ /* Not interested in this watch anymore. */
++ unregister_hotplug_status_watch(be);
++ }
++ kfree(str);
++}
++
++static void connect(struct backend_info *be)
++{
++ int err;
++ struct xenbus_device *dev = be->dev;
++
++ err = connect_rings(be);
++ if (err)
++ return;
++
++ err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
++ if (err) {
++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
++ return;
++ }
++
++ xen_net_read_rate(dev, &be->netif->credit_bytes,
++ &be->netif->credit_usec);
++ be->netif->remaining_credit = be->netif->credit_bytes;
++
++ unregister_hotplug_status_watch(be);
++ err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
++ hotplug_status_changed,
++ "%s/%s", dev->nodename, "hotplug-status");
++ if (err) {
++ /* Switch now, since we can't do a watch. */
++ xenbus_switch_state(dev, XenbusStateConnected);
++ } else {
++ be->have_hotplug_status_watch = 1;
++ }
++
++ netif_wake_queue(be->netif->dev);
++}
++
++
++static int connect_rings(struct backend_info *be)
++{
++ struct xen_netif *netif = be->netif;
++ struct xenbus_device *dev = be->dev;
++ unsigned long tx_ring_ref, rx_ring_ref;
++ unsigned int evtchn, rx_copy;
++ int err;
++ int val;
++
++ DPRINTK("");
++
++ err = xenbus_gather(XBT_NIL, dev->otherend,
++ "tx-ring-ref", "%lu", &tx_ring_ref,
++ "rx-ring-ref", "%lu", &rx_ring_ref,
++ "event-channel", "%u", &evtchn, NULL);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "reading %s/ring-ref and event-channel",
++ dev->otherend);
++ return err;
++ }
++
++ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
++ &rx_copy);
++ if (err == -ENOENT) {
++ err = 0;
++ rx_copy = 0;
++ }
++ if (err < 0) {
++ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
++ dev->otherend);
++ return err;
++ }
++ if (!rx_copy)
++ return -EOPNOTSUPP;
++
++ if (netif->dev->tx_queue_len != 0) {
++ if (xenbus_scanf(XBT_NIL, dev->otherend,
++ "feature-rx-notify", "%d", &val) < 0)
++ val = 0;
++ if (val)
++ netif->can_queue = 1;
++ else
++ /* Must be non-zero for pfifo_fast to work. */
++ netif->dev->tx_queue_len = 1;
++ }
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg",
++ "%d", &val) < 0)
++ val = 0;
++ netif->can_sg = !!val;
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4",
++ "%d", &val) < 0)
++ val = 0;
++ netif->gso = !!val;
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4-prefix",
++ "%d", &val) < 0)
++ val = 0;
++ netif->gso_prefix = !!val;
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
++ "%d", &val) < 0)
++ val = 0;
++ netif->csum = !val;
++
++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
++ "%d", &val) < 0)
++ val = 0;
++ netif->smart_poll = !!val;
++
++ /* Set dev->features */
++ netif_set_features(netif);
++
++ /* Map the shared frame, irq etc. */
++ err = netif_map(netif, tx_ring_ref, rx_ring_ref, evtchn);
++ if (err) {
++ xenbus_dev_fatal(dev, err,
++ "mapping shared-frames %lu/%lu port %u",
++ tx_ring_ref, rx_ring_ref, evtchn);
++ return err;
++ }
++ return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++static const struct xenbus_device_id netback_ids[] = {
++ { "vif" },
++ { "" }
++};
++
++
++static struct xenbus_driver netback = {
++ .name = "vif",
++ .owner = THIS_MODULE,
++ .ids = netback_ids,
++ .probe = netback_probe,
++ .remove = netback_remove,
++ .uevent = netback_uevent,
++ .otherend_changed = frontend_changed,
++};
++
++
++int netif_xenbus_init(void)
++{
++ printk(KERN_CRIT "registering netback\n");
++ return xenbus_register_backend(&netback);
++}
+diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
+new file mode 100644
+index 0000000..ae693e7
+--- /dev/null
++++ b/drivers/xen/pci.c
+@@ -0,0 +1,124 @@
++/*
++ * Copyright (c) 2009, Intel Corporation.
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms and conditions of the GNU General Public License,
++ * version 2, as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ * more details.
++ *
++ * You should have received a copy of the GNU General Public License along with
++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ *
++ * Author: Weidong Han <weidong.han@intel.com>
++ */
++
++#include <linux/pci.h>
++
++#include <xen/interface/xen.h>
++#include <xen/interface/physdev.h>
++
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++#include "../pci/pci.h"
++
++
++#ifdef CONFIG_PCI_IOV
++#define HANDLE_PCI_IOV 1
++#else
++#define HANDLE_PCI_IOV 0
++#endif
++
++static int xen_add_device(struct device *dev)
++{
++ int r;
++ struct pci_dev *pci_dev = to_pci_dev(dev);
++
++ if (HANDLE_PCI_IOV && pci_dev->is_virtfn) {
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_virtfn = 1,
++#ifdef CONFIG_PCI_IOV
++ .physfn.bus = pci_dev->physfn->bus->number,
++ .physfn.devfn = pci_dev->physfn->devfn,
++#endif
++ };
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
++ struct physdev_manage_pci_ext manage_pci_ext = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ .is_extfn = 1,
++ };
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
++ &manage_pci_ext);
++ } else {
++ struct physdev_manage_pci manage_pci = {
++ .bus = pci_dev->bus->number,
++ .devfn = pci_dev->devfn,
++ };
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
++ &manage_pci);
++ }
++
++ return r;
++}
++
++static int xen_remove_device(struct device *dev)
++{
++ int r;
++ struct pci_dev *pci_dev = to_pci_dev(dev);
++ struct physdev_manage_pci manage_pci;
++
++ manage_pci.bus = pci_dev->bus->number;
++ manage_pci.devfn = pci_dev->devfn;
++
++ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
++ &manage_pci);
++
++ return r;
++}
++
++static int xen_pci_notifier(struct notifier_block *nb,
++ unsigned long action, void *data)
++{
++ struct device *dev = data;
++ int r = 0;
++
++ switch (action) {
++ case BUS_NOTIFY_ADD_DEVICE:
++ r = xen_add_device(dev);
++ break;
++ case BUS_NOTIFY_DEL_DEVICE:
++ r = xen_remove_device(dev);
++ break;
++ default:
++ break;
++ }
++
++ return r;
++}
++
++struct notifier_block device_nb = {
++ .notifier_call = xen_pci_notifier,
++};
++
++static int __init register_xen_pci_notifier(void)
++{
++ if (!xen_pv_domain())
++ return 0;
++
++ return bus_register_notifier(&pci_bus_type, &device_nb);
++}
++
++arch_initcall(register_xen_pci_notifier);
+diff --git a/drivers/xen/pciback/Makefile b/drivers/xen/pciback/Makefile
+new file mode 100644
+index 0000000..38bc123
+--- /dev/null
++++ b/drivers/xen/pciback/Makefile
+@@ -0,0 +1,17 @@
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
++
++xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
++xen-pciback-y += conf_space.o conf_space_header.o \
++ conf_space_capability.o \
++ conf_space_capability_vpd.o \
++ conf_space_capability_pm.o \
++ conf_space_quirks.o
++xen-pciback-$(CONFIG_PCI_MSI) += conf_space_capability_msi.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o
++xen-pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
++
++ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y)
++EXTRA_CFLAGS += -DDEBUG
++endif
+diff --git a/drivers/xen/pciback/conf_space.c b/drivers/xen/pciback/conf_space.c
+new file mode 100644
+index 0000000..370c18e
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space.c
+@@ -0,0 +1,435 @@
++/*
++ * PCI Backend - Functions for creating a virtual configuration space for
++ * exported PCI Devices.
++ * It's dangerous to allow PCI Driver Domains to change their
++ * device's resources (memory, i/o ports, interrupts). We need to
++ * restrict changes to certain PCI Configuration registers:
++ * BARs, INTERRUPT_PIN, most registers in the header...
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++static int permissive;
++module_param(permissive, bool, 0644);
++
++#define DEFINE_PCI_CONFIG(op, size, type) \
++int pciback_##op##_config_##size \
++(struct pci_dev *dev, int offset, type value, void *data) \
++{ \
++ return pci_##op##_config_##size(dev, offset, value); \
++}
++
++DEFINE_PCI_CONFIG(read, byte, u8 *)
++DEFINE_PCI_CONFIG(read, word, u16 *)
++DEFINE_PCI_CONFIG(read, dword, u32 *)
++
++DEFINE_PCI_CONFIG(write, byte, u8)
++DEFINE_PCI_CONFIG(write, word, u16)
++DEFINE_PCI_CONFIG(write, dword, u32)
++
++static int conf_space_read(struct pci_dev *dev,
++ const struct config_field_entry *entry,
++ int offset, u32 *value)
++{
++ int ret = 0;
++ const struct config_field *field = entry->field;
++
++ *value = 0;
++
++ switch (field->size) {
++ case 1:
++ if (field->u.b.read)
++ ret = field->u.b.read(dev, offset, (u8 *) value,
++ entry->data);
++ break;
++ case 2:
++ if (field->u.w.read)
++ ret = field->u.w.read(dev, offset, (u16 *) value,
++ entry->data);
++ break;
++ case 4:
++ if (field->u.dw.read)
++ ret = field->u.dw.read(dev, offset, value, entry->data);
++ break;
++ }
++ return ret;
++}
++
++static int conf_space_write(struct pci_dev *dev,
++ const struct config_field_entry *entry,
++ int offset, u32 value)
++{
++ int ret = 0;
++ const struct config_field *field = entry->field;
++
++ switch (field->size) {
++ case 1:
++ if (field->u.b.write)
++ ret = field->u.b.write(dev, offset, (u8) value,
++ entry->data);
++ break;
++ case 2:
++ if (field->u.w.write)
++ ret = field->u.w.write(dev, offset, (u16) value,
++ entry->data);
++ break;
++ case 4:
++ if (field->u.dw.write)
++ ret = field->u.dw.write(dev, offset, value,
++ entry->data);
++ break;
++ }
++ return ret;
++}
++
++static inline u32 get_mask(int size)
++{
++ if (size == 1)
++ return 0xff;
++ else if (size == 2)
++ return 0xffff;
++ else
++ return 0xffffffff;
++}
++
++static inline int valid_request(int offset, int size)
++{
++ /* Validate request (no un-aligned requests) */
++ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0)
++ return 1;
++ return 0;
++}
++
++static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask,
++ int offset)
++{
++ if (offset >= 0) {
++ new_val_mask <<= (offset * 8);
++ new_val <<= (offset * 8);
++ } else {
++ new_val_mask >>= (offset * -8);
++ new_val >>= (offset * -8);
++ }
++ val = (val & ~new_val_mask) | (new_val & new_val_mask);
++
++ return val;
++}
++
++static int pcibios_err_to_errno(int err)
++{
++ switch (err) {
++ case PCIBIOS_SUCCESSFUL:
++ return XEN_PCI_ERR_success;
++ case PCIBIOS_DEVICE_NOT_FOUND:
++ return XEN_PCI_ERR_dev_not_found;
++ case PCIBIOS_BAD_REGISTER_NUMBER:
++ return XEN_PCI_ERR_invalid_offset;
++ case PCIBIOS_FUNC_NOT_SUPPORTED:
++ return XEN_PCI_ERR_not_implemented;
++ case PCIBIOS_SET_FAILED:
++ return XEN_PCI_ERR_access_denied;
++ }
++ return err;
++}
++
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++ u32 *ret_val)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
++ int req_start, req_end, field_start, field_end;
++ /* if read fails for any reason, return 0
++ * (as if device didn't respond) */
++ u32 value = 0, tmp_val;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n",
++ pci_name(dev), size, offset);
++
++ if (!valid_request(offset, size)) {
++ err = XEN_PCI_ERR_invalid_offset;
++ goto out;
++ }
++
++ /* Get the real value first, then modify as appropriate */
++ switch (size) {
++ case 1:
++ err = pci_read_config_byte(dev, offset, (u8 *) &value);
++ break;
++ case 2:
++ err = pci_read_config_word(dev, offset, (u16 *) &value);
++ break;
++ case 4:
++ err = pci_read_config_dword(dev, offset, &value);
++ break;
++ }
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
++
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ err = conf_space_read(dev, cfg_entry, field_start,
++ &tmp_val);
++ if (err)
++ goto out;
++
++ value = merge_value(value, tmp_val,
++ get_mask(field->size),
++ field_start - req_start);
++ }
++ }
++
++out:
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
++
++ *ret_val = value;
++ return pcibios_err_to_errno(err);
++}
++
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
++{
++ int err = 0, handled = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
++ u32 tmp_val;
++ int req_start, req_end, field_start, field_end;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG
++ "pciback: %s: write request %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
++
++ if (!valid_request(offset, size))
++ return XEN_PCI_ERR_invalid_offset;
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
++
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ tmp_val = 0;
++
++ err = pciback_config_read(dev, field_start,
++ field->size, &tmp_val);
++ if (err)
++ break;
++
++ tmp_val = merge_value(tmp_val, value, get_mask(size),
++ req_start - field_start);
++
++ err = conf_space_write(dev, cfg_entry, field_start,
++ tmp_val);
++
++ /* handled is set true here, but not every byte
++ * may have been written! Properly detecting if
++ * every byte is handled is unnecessary as the
++ * flag is used to detect devices that need
++ * special helpers to work correctly.
++ */
++ handled = 1;
++ }
++ }
++
++ if (!handled && !err) {
++		/* By default, anything not specifically handled above is
++ * read-only. The permissive flag changes this behavior so
++ * that anything not specifically handled above is writable.
++ * This means that some fields may still be read-only because
++ * they have entries in the config_field list that intercept
++ * the write and do nothing. */
++ if (dev_data->permissive || permissive) {
++ switch (size) {
++ case 1:
++ err = pci_write_config_byte(dev, offset,
++ (u8) value);
++ break;
++ case 2:
++ err = pci_write_config_word(dev, offset,
++ (u16) value);
++ break;
++ case 4:
++ err = pci_write_config_dword(dev, offset,
++ (u32) value);
++ break;
++ }
++ } else if (!dev_data->warned_on_write) {
++ dev_data->warned_on_write = 1;
++ dev_warn(&dev->dev, "Driver tried to write to a "
++ "read-only configuration space field at offset"
++ " 0x%x, size %d. This may be harmless, but if "
++ "you have problems with your device:\n"
++ "1) see permissive attribute in sysfs\n"
++ "2) report problems to the xen-devel "
++ "mailing list along with details of your "
++ "device obtained from lspci.\n", offset, size);
++ }
++ }
++
++ return pcibios_err_to_errno(err);
++}
++
++void pciback_config_free_dyn_fields(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ const struct config_field *field;
++
++	dev_dbg(&dev->dev, "freeing dynamically allocated virtual "
++ "configuration space fields\n");
++ if (!dev_data)
++ return;
++
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ if (field->clean) {
++ field->clean((struct config_field *)field);
++
++ kfree(cfg_entry->data);
++
++ list_del(&cfg_entry->list);
++ kfree(cfg_entry);
++ }
++
++ }
++}
++
++void pciback_config_reset_dev(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ const struct config_field_entry *cfg_entry;
++ const struct config_field *field;
++
++ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
++ if (!dev_data)
++ return;
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ if (field->reset)
++ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
++ }
++}
++
++void pciback_config_free_dev(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ const struct config_field *field;
++
++	dev_dbg(&dev->dev, "freeing virtual configuration space fields\n");
++ if (!dev_data)
++ return;
++
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ list_del(&cfg_entry->list);
++
++ field = cfg_entry->field;
++
++ if (field->release)
++ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
++
++ kfree(cfg_entry);
++ }
++}
++
++int pciback_config_add_field_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int base_offset)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
++ void *tmp;
++
++ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
++ if (!cfg_entry) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ cfg_entry->data = NULL;
++ cfg_entry->field = field;
++ cfg_entry->base_offset = base_offset;
++
++ /* silently ignore duplicate fields */
++ err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
++ if (err)
++ goto out;
++
++ if (field->init) {
++ tmp = field->init(dev, OFFSET(cfg_entry));
++
++ if (IS_ERR(tmp)) {
++ err = PTR_ERR(tmp);
++ goto out;
++ }
++
++ cfg_entry->data = tmp;
++ }
++
++ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
++ OFFSET(cfg_entry));
++ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
++
++out:
++ if (err)
++ kfree(cfg_entry);
++
++ return err;
++}
++
++/* This sets up the device's virtual configuration space to keep track of
++ * certain registers (like the base address registers (BARs)) so that we can
++ * keep the client from manipulating them directly.
++ */
++int pciback_config_init_dev(struct pci_dev *dev)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++
++ dev_dbg(&dev->dev, "initializing virtual configuration space\n");
++
++ INIT_LIST_HEAD(&dev_data->config_fields);
++
++ err = pciback_config_header_add_fields(dev);
++ if (err)
++ goto out;
++
++ err = pciback_config_capability_add_fields(dev);
++ if (err)
++ goto out;
++
++ err = pciback_config_quirks_init(dev);
++
++out:
++ return err;
++}
++
++int pciback_config_init(void)
++{
++ return pciback_config_capability_init();
++}
+diff --git a/drivers/xen/pciback/conf_space.h b/drivers/xen/pciback/conf_space.h
+new file mode 100644
+index 0000000..50ebef2
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space.h
+@@ -0,0 +1,126 @@
++/*
++ * PCI Backend - Common data structures for overriding the configuration space
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#ifndef __XEN_PCIBACK_CONF_SPACE_H__
++#define __XEN_PCIBACK_CONF_SPACE_H__
++
++#include <linux/list.h>
++#include <linux/err.h>
++
++/* conf_field_init can return an errno in a ptr with ERR_PTR() */
++typedef void *(*conf_field_init) (struct pci_dev *dev, int offset);
++typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data);
++typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data);
++
++typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value,
++ void *data);
++typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value,
++ void *data);
++typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value,
++ void *data);
++typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value,
++ void *data);
++typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value,
++ void *data);
++typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value,
++ void *data);
++
++/* These are the fields within the configuration space which we
++ * are interested in intercepting reads/writes to and changing their
++ * values.
++ */
++struct config_field {
++ unsigned int offset;
++ unsigned int size;
++ unsigned int mask;
++ conf_field_init init;
++ conf_field_reset reset;
++ conf_field_free release;
++ void (*clean) (struct config_field *field);
++ union {
++ struct {
++ conf_dword_write write;
++ conf_dword_read read;
++ } dw;
++ struct {
++ conf_word_write write;
++ conf_word_read read;
++ } w;
++ struct {
++ conf_byte_write write;
++ conf_byte_read read;
++ } b;
++ } u;
++ struct list_head list;
++};
++
++struct config_field_entry {
++ struct list_head list;
++ const struct config_field *field;
++ unsigned int base_offset;
++ void *data;
++};
++
++#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset)
++
++/* Add fields to a device - the add_fields macro expects to get a pointer to
++ * the first entry in an array (of which the ending is marked by size==0)
++ */
++int pciback_config_add_field_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int offset);
++
++static inline int pciback_config_add_field(struct pci_dev *dev,
++ const struct config_field *field)
++{
++ return pciback_config_add_field_offset(dev, field, 0);
++}
++
++static inline int pciback_config_add_fields(struct pci_dev *dev,
++ const struct config_field *field)
++{
++ int i, err = 0;
++ for (i = 0; field[i].size != 0; i++) {
++ err = pciback_config_add_field(dev, &field[i]);
++ if (err)
++ break;
++ }
++ return err;
++}
++
++static inline int pciback_config_add_fields_offset(struct pci_dev *dev,
++ const struct config_field *field,
++ unsigned int offset)
++{
++ int i, err = 0;
++ for (i = 0; field[i].size != 0; i++) {
++ err = pciback_config_add_field_offset(dev, &field[i], offset);
++ if (err)
++ break;
++ }
++ return err;
++}
++
++/* Read/Write the real configuration space */
++int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 *value,
++ void *data);
++int pciback_read_config_word(struct pci_dev *dev, int offset, u16 *value,
++ void *data);
++int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 *value,
++ void *data);
++int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value,
++ void *data);
++int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value,
++ void *data);
++int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value,
++ void *data);
++
++int pciback_config_capability_init(void);
++
++int pciback_config_header_add_fields(struct pci_dev *dev);
++int pciback_config_capability_add_fields(struct pci_dev *dev);
++
++#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */
+diff --git a/drivers/xen/pciback/conf_space_capability.c b/drivers/xen/pciback/conf_space_capability.c
+new file mode 100644
+index 0000000..0ea84d6
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability.c
+@@ -0,0 +1,66 @@
++/*
++ * PCI Backend - Handles the virtual fields found on the capability lists
++ * in the configuration space.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++static LIST_HEAD(capabilities);
++
++static const struct config_field caplist_header[] = {
++ {
++ .offset = PCI_CAP_LIST_ID,
++ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = NULL,
++ },
++ {}
++};
++
++static inline void register_capability(struct pciback_config_capability *cap)
++{
++ list_add_tail(&cap->cap_list, &capabilities);
++}
++
++int pciback_config_capability_add_fields(struct pci_dev *dev)
++{
++ int err = 0;
++ struct pciback_config_capability *cap;
++ int cap_offset;
++
++ list_for_each_entry(cap, &capabilities, cap_list) {
++ cap_offset = pci_find_capability(dev, cap->capability);
++ if (cap_offset) {
++ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n",
++ cap->capability, cap_offset);
++
++ err = pciback_config_add_fields_offset(dev,
++ caplist_header,
++ cap_offset);
++ if (err)
++ goto out;
++ err = pciback_config_add_fields_offset(dev,
++ cap->fields,
++ cap_offset);
++ if (err)
++ goto out;
++ }
++ }
++
++out:
++ return err;
++}
++
++int pciback_config_capability_init(void)
++{
++ register_capability(&pciback_config_capability_vpd);
++ register_capability(&pciback_config_capability_pm);
++
++ return 0;
++}
+diff --git a/drivers/xen/pciback/conf_space_capability.h b/drivers/xen/pciback/conf_space_capability.h
+new file mode 100644
+index 0000000..8da3ac4
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability.h
+@@ -0,0 +1,26 @@
++/*
++ * PCI Backend - Data structures for special overlays for structures on
++ * the capability list.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#ifndef __PCIBACK_CONFIG_CAPABILITY_H__
++#define __PCIBACK_CONFIG_CAPABILITY_H__
++
++#include <linux/pci.h>
++#include <linux/list.h>
++
++struct pciback_config_capability {
++ struct list_head cap_list;
++
++ int capability;
++
++ /* If the device has the capability found above, add these fields */
++ const struct config_field *fields;
++};
++
++extern struct pciback_config_capability pciback_config_capability_vpd;
++extern struct pciback_config_capability pciback_config_capability_pm;
++
++#endif
+diff --git a/drivers/xen/pciback/conf_space_capability_msi.c b/drivers/xen/pciback/conf_space_capability_msi.c
+new file mode 100644
+index 0000000..b15131e
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_msi.c
+@@ -0,0 +1,110 @@
++/*
++ * PCI Backend -- Configuration overlay for MSI capability
++ */
++#include <linux/pci.h>
++#include <linux/slab.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++#include <xen/interface/io/pciif.h>
++#include <xen/events.h>
++#include "pciback.h"
++
++int pciback_enable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ struct pciback_dev_data *dev_data;
++ int otherend = pdev->xdev->otherend_id;
++ int status;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: enable MSI\n", pci_name(dev));
++
++ status = pci_enable_msi(dev);
++
++ if (status) {
++		printk(KERN_ERR "error enabling MSI for guest %x, status %x\n",
++ otherend, status);
++ op->value = 0;
++ return XEN_PCI_ERR_op_failed;
++ }
++
++	/* The value the guest needs is actually the IDT vector, not
++	 * the local domain's IRQ number. */
++ op->value = xen_gsi_from_irq(dev->irq);
++ dev_data = pci_get_drvdata(dev);
++ if (dev_data)
++ dev_data->ack_intr = 0;
++
++ return 0;
++}
++
++int pciback_disable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ struct pciback_dev_data *dev_data;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: disable MSI\n", pci_name(dev));
++ pci_disable_msi(dev);
++
++ op->value = xen_gsi_from_irq(dev->irq);
++ dev_data = pci_get_drvdata(dev);
++ if (dev_data)
++ dev_data->ack_intr = 1;
++ return 0;
++}
++
++int pciback_enable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ struct pciback_dev_data *dev_data;
++ int i, result;
++ struct msix_entry *entries;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: enable MSI-X\n", pci_name(dev));
++ if (op->value > SH_INFO_MAX_VEC)
++ return -EINVAL;
++
++ entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL);
++ if (entries == NULL)
++ return -ENOMEM;
++
++ for (i = 0; i < op->value; i++) {
++ entries[i].entry = op->msix_entries[i].entry;
++ entries[i].vector = op->msix_entries[i].vector;
++ }
++
++ result = pci_enable_msix(dev, entries, op->value);
++
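++	/* Translate each allocated vector with xen_gsi_from_irq(), as in the
++	 * MSI case above, before handing it back to the guest. */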
++ for (i = 0; i < op->value; i++) {
++ op->msix_entries[i].entry = entries[i].entry;
++ op->msix_entries[i].vector =
++ xen_gsi_from_irq(entries[i].vector);
++ }
++
++ kfree(entries);
++
++ op->value = result;
++ dev_data = pci_get_drvdata(dev);
++ if (dev_data)
++ dev_data->ack_intr = 0;
++
++ return result;
++}
++
++int pciback_disable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op)
++{
++ struct pciback_dev_data *dev_data;
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: disable MSI-X\n", pci_name(dev));
++ pci_disable_msix(dev);
++
++ op->value = xen_gsi_from_irq(dev->irq);
++ dev_data = pci_get_drvdata(dev);
++ if (dev_data)
++ dev_data->ack_intr = 1;
++ return 0;
++}
++
+diff --git a/drivers/xen/pciback/conf_space_capability_pm.c b/drivers/xen/pciback/conf_space_capability_pm.c
+new file mode 100644
+index 0000000..0442616
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_pm.c
+@@ -0,0 +1,113 @@
++/*
++ * PCI Backend - Configuration space overlay for power management
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
++ void *data)
++{
++ int err;
++ u16 real_value;
++
++ err = pci_read_config_word(dev, offset, &real_value);
++ if (err)
++ goto out;
++
++ *value = real_value & ~PCI_PM_CAP_PME_MASK;
++
++out:
++ return err;
++}
++
++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
++ * Can't allow driver domain to enable PMEs - they're shared */
++#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
++
++static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
++ void *data)
++{
++ int err;
++ u16 old_value;
++ pci_power_t new_state, old_state;
++
++ err = pci_read_config_word(dev, offset, &old_value);
++ if (err)
++ goto out;
++
++ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK);
++ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
++
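++	/* Only let the driver domain change the PM_OK_BITS; everything else
++	 * is preserved from the current register value. */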
++ new_value &= PM_OK_BITS;
++ if ((old_value & PM_OK_BITS) != new_value) {
++ new_value = (old_value & ~PM_OK_BITS) | new_value;
++ err = pci_write_config_word(dev, offset, new_value);
++ if (err)
++ goto out;
++ }
++
++ /* Let pci core handle the power management change */
++ dev_dbg(&dev->dev, "set power state to %x\n", new_state);
++ err = pci_set_power_state(dev, new_state);
++ if (err) {
++ err = PCIBIOS_SET_FAILED;
++ goto out;
++ }
++
++out:
++ return err;
++}
++
++/* Ensure PMEs are disabled */
++static void *pm_ctrl_init(struct pci_dev *dev, int offset)
++{
++ int err;
++ u16 value;
++
++ err = pci_read_config_word(dev, offset, &value);
++ if (err)
++ goto out;
++
++ if (value & PCI_PM_CTRL_PME_ENABLE) {
++ value &= ~PCI_PM_CTRL_PME_ENABLE;
++ err = pci_write_config_word(dev, offset, value);
++ }
++
++out:
++ return ERR_PTR(err);
++}
++
++static const struct config_field caplist_pm[] = {
++ {
++ .offset = PCI_PM_PMC,
++ .size = 2,
++ .u.w.read = pm_caps_read,
++ },
++ {
++ .offset = PCI_PM_CTRL,
++ .size = 2,
++ .init = pm_ctrl_init,
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = pm_ctrl_write,
++ },
++ {
++ .offset = PCI_PM_PPB_EXTENSIONS,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ .offset = PCI_PM_DATA_REGISTER,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {}
++};
++
++struct pciback_config_capability pciback_config_capability_pm = {
++ .capability = PCI_CAP_ID_PM,
++ .fields = caplist_pm,
++};
+diff --git a/drivers/xen/pciback/conf_space_capability_vpd.c b/drivers/xen/pciback/conf_space_capability_vpd.c
+new file mode 100644
+index 0000000..e7b4d66
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_capability_vpd.c
+@@ -0,0 +1,40 @@
++/*
++ * PCI Backend - Configuration space overlay for Vital Product Data
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/pci.h>
++#include "conf_space.h"
++#include "conf_space_capability.h"
++
++static int vpd_address_write(struct pci_dev *dev, int offset, u16 value,
++ void *data)
++{
++ /* Disallow writes to the vital product data */
++ if (value & PCI_VPD_ADDR_F)
++ return PCIBIOS_SET_FAILED;
++ else
++ return pci_write_config_word(dev, offset, value);
++}
++
++static const struct config_field caplist_vpd[] = {
++ {
++ .offset = PCI_VPD_ADDR,
++ .size = 2,
++ .u.w.read = pciback_read_config_word,
++ .u.w.write = vpd_address_write,
++ },
++ {
++ .offset = PCI_VPD_DATA,
++ .size = 4,
++ .u.dw.read = pciback_read_config_dword,
++ .u.dw.write = NULL,
++ },
++ {}
++};
++
++struct pciback_config_capability pciback_config_capability_vpd = {
++ .capability = PCI_CAP_ID_VPD,
++ .fields = caplist_vpd,
++};
+diff --git a/drivers/xen/pciback/conf_space_header.c b/drivers/xen/pciback/conf_space_header.c
+new file mode 100644
+index 0000000..cb450f4
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_header.c
+@@ -0,0 +1,385 @@
++/*
++ * PCI Backend - Handles the virtual fields in the configuration space headers.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++
++struct pci_bar_info {
++ u32 val;
++ u32 len_val;
++ int which;
++};
++
++#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO))
++#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER)
++
++static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
++{
++ int i;
++ int ret;
++
++ ret = pciback_read_config_word(dev, offset, value, data);
++ if (!atomic_read(&dev->enable_cnt))
++ return ret;
++
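++	/* The device is enabled, so report IO/memory decode as on for
++	 * whichever resource types the device actually has. */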
++ for (i = 0; i < PCI_ROM_RESOURCE; i++) {
++ if (dev->resource[i].flags & IORESOURCE_IO)
++ *value |= PCI_COMMAND_IO;
++ if (dev->resource[i].flags & IORESOURCE_MEM)
++ *value |= PCI_COMMAND_MEMORY;
++ }
++
++ return ret;
++}
++
++static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
++{
++ struct pciback_dev_data *dev_data;
++ int err;
++
++ dev_data = pci_get_drvdata(dev);
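++	/* Record guest enable/disable requests in dev_data->enable_intx. */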
++ if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: enable\n",
++ pci_name(dev));
++ err = pci_enable_device(dev);
++ if (err)
++ return err;
++ if (dev_data)
++ dev_data->enable_intx = 1;
++ } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: disable\n",
++ pci_name(dev));
++ pci_disable_device(dev);
++ if (dev_data)
++ dev_data->enable_intx = 0;
++ }
++
++ if (!dev->is_busmaster && is_master_cmd(value)) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: set bus master\n",
++ pci_name(dev));
++ pci_set_master(dev);
++ }
++
++ if (value & PCI_COMMAND_INVALIDATE) {
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG
++ "pciback: %s: enable memory-write-invalidate\n",
++ pci_name(dev));
++ err = pci_set_mwi(dev);
++ if (err) {
++ printk(KERN_WARNING
++ "pciback: %s: cannot enable "
++ "memory-write-invalidate (%d)\n",
++ pci_name(dev), err);
++ value &= ~PCI_COMMAND_INVALIDATE;
++ }
++ }
++
++ return pci_write_config_word(dev, offset, value);
++}
++
++static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data)
++{
++ struct pci_bar_info *bar = data;
++
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
++ }
++
++ /* A write to obtain the length must happen as a 32-bit write.
++ * This does not (yet) support writing individual bytes
++ */
++ if (value == ~PCI_ROM_ADDRESS_ENABLE)
++ bar->which = 1;
++ else {
++ u32 tmpval;
++ pci_read_config_dword(dev, offset, &tmpval);
++ if (tmpval != bar->val && value == bar->val) {
++ /* Allow restoration of bar value. */
++ pci_write_config_dword(dev, offset, bar->val);
++ }
++ bar->which = 0;
++ }
++
++ /* Do we need to support enabling/disabling the rom address here? */
++
++ return 0;
++}
++
++/* For the BARs, only allow writes which write ~0 or
++ * the correct resource information
++ * (Needed for when the driver probes the resource usage)
++ */
++static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data)
++{
++ struct pci_bar_info *bar = data;
++
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
++ }
++
++ /* A write to obtain the length must happen as a 32-bit write.
++ * This does not (yet) support writing individual bytes
++ */
++ if (value == ~0)
++ bar->which = 1;
++ else {
++ u32 tmpval;
++ pci_read_config_dword(dev, offset, &tmpval);
++ if (tmpval != bar->val && value == bar->val) {
++ /* Allow restoration of bar value. */
++ pci_write_config_dword(dev, offset, bar->val);
++ }
++ bar->which = 0;
++ }
++
++ return 0;
++}
++
++static int bar_read(struct pci_dev *dev, int offset, u32 *value, void *data)
++{
++ struct pci_bar_info *bar = data;
++
++ if (unlikely(!bar)) {
++ printk(KERN_WARNING "pciback: driver data not found for %s\n",
++ pci_name(dev));
++ return XEN_PCI_ERR_op_failed;
++ }
++
++ *value = bar->which ? bar->len_val : bar->val;
++
++ return 0;
++}
++
++static inline void read_dev_bar(struct pci_dev *dev,
++ struct pci_bar_info *bar_info, int offset,
++ u32 len_mask)
++{
++ int pos;
++ struct resource *res = dev->resource;
++
++ if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1)
++ pos = PCI_ROM_RESOURCE;
++ else {
++ pos = (offset - PCI_BASE_ADDRESS_0) / 4;
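++		/* If this register is the upper half of a 64-bit memory BAR,
++		 * expose the upper 32 bits of the preceding resource instead. */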
++ if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE |
++ PCI_BASE_ADDRESS_MEM_TYPE_MASK)) ==
++ (PCI_BASE_ADDRESS_SPACE_MEMORY |
++ PCI_BASE_ADDRESS_MEM_TYPE_64))) {
++ bar_info->val = res[pos - 1].start >> 32;
++ bar_info->len_val = res[pos - 1].end >> 32;
++ return;
++ }
++ }
++
++ bar_info->val = res[pos].start |
++ (res[pos].flags & PCI_REGION_FLAG_MASK);
++ bar_info->len_val = res[pos].end - res[pos].start + 1;
++}
++
++static void *bar_init(struct pci_dev *dev, int offset)
++{
++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
++
++ if (!bar)
++ return ERR_PTR(-ENOMEM);
++
++ read_dev_bar(dev, bar, offset, ~0);
++ bar->which = 0;
++
++ return bar;
++}
++
++static void *rom_init(struct pci_dev *dev, int offset)
++{
++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL);
++
++ if (!bar)
++ return ERR_PTR(-ENOMEM);
++
++ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE);
++ bar->which = 0;
++
++ return bar;
++}
++
++static void bar_reset(struct pci_dev *dev, int offset, void *data)
++{
++ struct pci_bar_info *bar = data;
++
++ bar->which = 0;
++}
++
++static void bar_release(struct pci_dev *dev, int offset, void *data)
++{
++ kfree(data);
++}
++
++static int pciback_read_vendor(struct pci_dev *dev, int offset,
++ u16 *value, void *data)
++{
++ *value = dev->vendor;
++
++ return 0;
++}
++
++static int pciback_read_device(struct pci_dev *dev, int offset,
++ u16 *value, void *data)
++{
++ *value = dev->device;
++
++ return 0;
++}
++
++static int interrupt_read(struct pci_dev *dev, int offset, u8 *value,
++ void *data)
++{
++ *value = (u8) dev->irq;
++
++ return 0;
++}
++
++static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data)
++{
++ u8 cur_value;
++ int err;
++
++ err = pci_read_config_byte(dev, offset, &cur_value);
++ if (err)
++ goto out;
++
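++	/* Only allow writes that leave the non-START bits unchanged, or that
++	 * simply start a BIST. */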
++ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START)
++ || value == PCI_BIST_START)
++ err = pci_write_config_byte(dev, offset, value);
++
++out:
++ return err;
++}
++
++static const struct config_field header_common[] = {
++ {
++ .offset = PCI_VENDOR_ID,
++ .size = 2,
++ .u.w.read = pciback_read_vendor,
++ },
++ {
++ .offset = PCI_DEVICE_ID,
++ .size = 2,
++ .u.w.read = pciback_read_device,
++ },
++ {
++ .offset = PCI_COMMAND,
++ .size = 2,
++ .u.w.read = command_read,
++ .u.w.write = command_write,
++ },
++ {
++ .offset = PCI_INTERRUPT_LINE,
++ .size = 1,
++ .u.b.read = interrupt_read,
++ },
++ {
++ .offset = PCI_INTERRUPT_PIN,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ /* Any side effects of letting driver domain control cache line? */
++ .offset = PCI_CACHE_LINE_SIZE,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ .u.b.write = pciback_write_config_byte,
++ },
++ {
++ .offset = PCI_LATENCY_TIMER,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ },
++ {
++ .offset = PCI_BIST,
++ .size = 1,
++ .u.b.read = pciback_read_config_byte,
++ .u.b.write = bist_write,
++ },
++ {}
++};
++
++#define CFG_FIELD_BAR(reg_offset) \
++ { \
++ .offset = reg_offset, \
++ .size = 4, \
++ .init = bar_init, \
++ .reset = bar_reset, \
++ .release = bar_release, \
++ .u.dw.read = bar_read, \
++ .u.dw.write = bar_write, \
++ }
++
++#define CFG_FIELD_ROM(reg_offset) \
++ { \
++ .offset = reg_offset, \
++ .size = 4, \
++ .init = rom_init, \
++ .reset = bar_reset, \
++ .release = bar_release, \
++ .u.dw.read = bar_read, \
++ .u.dw.write = rom_write, \
++ }
++
++static const struct config_field header_0[] = {
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5),
++ CFG_FIELD_ROM(PCI_ROM_ADDRESS),
++ {}
++};
++
++static const struct config_field header_1[] = {
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0),
++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1),
++ CFG_FIELD_ROM(PCI_ROM_ADDRESS1),
++ {}
++};
++
++int pciback_config_header_add_fields(struct pci_dev *dev)
++{
++ int err;
++
++ err = pciback_config_add_fields(dev, header_common);
++ if (err)
++ goto out;
++
++ switch (dev->hdr_type) {
++ case PCI_HEADER_TYPE_NORMAL:
++ err = pciback_config_add_fields(dev, header_0);
++ break;
++
++ case PCI_HEADER_TYPE_BRIDGE:
++ err = pciback_config_add_fields(dev, header_1);
++ break;
++
++ default:
++ err = -EINVAL;
++ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n",
++ pci_name(dev), dev->hdr_type);
++ break;
++ }
++
++out:
++ return err;
++}
+diff --git a/drivers/xen/pciback/conf_space_quirks.c b/drivers/xen/pciback/conf_space_quirks.c
+new file mode 100644
+index 0000000..45c31fb
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_quirks.c
+@@ -0,0 +1,140 @@
++/*
++ * PCI Backend - Handle special overlays for broken devices.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Author: Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
++
++#include <linux/kernel.h>
++#include <linux/pci.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++LIST_HEAD(pciback_quirks);
++
++static inline const struct pci_device_id *
++match_one_device(const struct pci_device_id *id, const struct pci_dev *dev)
++{
++ if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) &&
++ (id->device == PCI_ANY_ID || id->device == dev->device) &&
++ (id->subvendor == PCI_ANY_ID ||
++ id->subvendor == dev->subsystem_vendor) &&
++ (id->subdevice == PCI_ANY_ID ||
++ id->subdevice == dev->subsystem_device) &&
++ !((id->class ^ dev->class) & id->class_mask))
++ return id;
++ return NULL;
++}
++
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev)
++{
++ struct pciback_config_quirk *tmp_quirk;
++
++ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list)
++ if (match_one_device(&tmp_quirk->devid, dev) != NULL)
++ goto out;
++ tmp_quirk = NULL;
++ printk(KERN_DEBUG
++ "quirk didn't match any device pciback knows about\n");
++out:
++ return tmp_quirk;
++}
++
++static inline void register_quirk(struct pciback_config_quirk *quirk)
++{
++ list_add_tail(&quirk->quirks_list, &pciback_quirks);
++}
++
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg)
++{
++ int ret = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ if (OFFSET(cfg_entry) == reg) {
++ ret = 1;
++ break;
++ }
++ }
++ return ret;
++}
++
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++ *field)
++{
++ int err = 0;
++
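++	/* Wire the quirk field up to the generic passthrough accessors that
++	 * match its size. */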
++ switch (field->size) {
++ case 1:
++ field->u.b.read = pciback_read_config_byte;
++ field->u.b.write = pciback_write_config_byte;
++ break;
++ case 2:
++ field->u.w.read = pciback_read_config_word;
++ field->u.w.write = pciback_write_config_word;
++ break;
++ case 4:
++ field->u.dw.read = pciback_read_config_dword;
++ field->u.dw.write = pciback_write_config_dword;
++ break;
++ default:
++ err = -EINVAL;
++ goto out;
++ }
++
++ pciback_config_add_field(dev, field);
++
++out:
++ return err;
++}
++
++int pciback_config_quirks_init(struct pci_dev *dev)
++{
++ struct pciback_config_quirk *quirk;
++ int ret = 0;
++
++ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC);
++ if (!quirk) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ quirk->devid.vendor = dev->vendor;
++ quirk->devid.device = dev->device;
++ quirk->devid.subvendor = dev->subsystem_vendor;
++ quirk->devid.subdevice = dev->subsystem_device;
++ quirk->devid.class = 0;
++ quirk->devid.class_mask = 0;
++ quirk->devid.driver_data = 0UL;
++
++ quirk->pdev = dev;
++
++ register_quirk(quirk);
++out:
++ return ret;
++}
++
++void pciback_config_field_free(struct config_field *field)
++{
++ kfree(field);
++}
++
++int pciback_config_quirk_release(struct pci_dev *dev)
++{
++ struct pciback_config_quirk *quirk;
++ int ret = 0;
++
++ quirk = pciback_find_quirk(dev);
++ if (!quirk) {
++ ret = -ENXIO;
++ goto out;
++ }
++
++ list_del(&quirk->quirks_list);
++ kfree(quirk);
++
++out:
++ return ret;
++}
+diff --git a/drivers/xen/pciback/conf_space_quirks.h b/drivers/xen/pciback/conf_space_quirks.h
+new file mode 100644
+index 0000000..acd0e1a
+--- /dev/null
++++ b/drivers/xen/pciback/conf_space_quirks.h
+@@ -0,0 +1,35 @@
++/*
++ * PCI Backend - Data structures for special overlays for broken devices.
++ *
++ * Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
++
++#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
++#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__
++
++#include <linux/pci.h>
++#include <linux/list.h>
++
++struct pciback_config_quirk {
++ struct list_head quirks_list;
++ struct pci_device_id devid;
++ struct pci_dev *pdev;
++};
++
++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev);
++
++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field
++ *field);
++
++int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg);
++
++int pciback_config_quirks_init(struct pci_dev *dev);
++
++void pciback_config_field_free(struct config_field *field);
++
++int pciback_config_quirk_release(struct pci_dev *dev);
++
++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg);
++
++#endif
+diff --git a/drivers/xen/pciback/controller.c b/drivers/xen/pciback/controller.c
+new file mode 100644
+index 0000000..7f04f11
+--- /dev/null
++++ b/drivers/xen/pciback/controller.c
+@@ -0,0 +1,442 @@
++/*
++ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
++ * Alex Williamson <alex.williamson@hp.com>
++ *
++ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
++ * controllers. Devices under the same PCI controller are exposed on the
++ * same virtual domain:bus. Within a bus, device slots are virtualized
++ * to compact the bus.
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ */
++
++#include <linux/acpi.h>
++#include <linux/list.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++#define PCI_MAX_BUSSES 255
++#define PCI_MAX_SLOTS 32
++
++struct controller_dev_entry {
++ struct list_head list;
++ struct pci_dev *dev;
++ unsigned int devfn;
++};
++
++struct controller_list_entry {
++ struct list_head list;
++ struct pci_controller *controller;
++ unsigned int domain;
++ unsigned int bus;
++ unsigned int next_devfn;
++ struct list_head dev_list;
++};
++
++struct controller_dev_data {
++ struct list_head list;
++ unsigned int next_domain;
++ unsigned int next_bus;
++ spinlock_t lock;
++};
++
++struct walk_info {
++ struct pciback_device *pdev;
++ int resource_count;
++ int root_num;
++};
++
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ struct pci_dev *dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->domain != domain ||
++ cntrl_entry->bus != bus)
++ continue;
++
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if (devfn == dev_entry->devfn) {
++ dev = dev_entry->dev;
++ goto found;
++ }
++ }
++ }
++found:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++
++ return dev;
++}
++
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
++ unsigned long flags;
++ int ret = 0, found = 0;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ /* Look to see if we already have a domain:bus for this controller */
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->controller == dev_controller) {
++ found = 1;
++ break;
++ }
++ }
++
++ if (!found) {
++ cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
++ if (!cntrl_entry) {
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ cntrl_entry->controller = dev_controller;
++ cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
++
++ cntrl_entry->domain = dev_data->next_domain;
++ cntrl_entry->bus = dev_data->next_bus++;
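++		/* Once the bus numbers in this virtual domain are exhausted,
++		 * roll over to a new virtual domain. */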
++ if (dev_data->next_bus > PCI_MAX_BUSSES) {
++ dev_data->next_domain++;
++ dev_data->next_bus = 0;
++ }
++
++ INIT_LIST_HEAD(&cntrl_entry->dev_list);
++
++ list_add_tail(&cntrl_entry->list, &dev_data->list);
++ }
++
++ if (PCI_SLOT(cntrl_entry->next_devfn) > PCI_MAX_SLOTS) {
++ /*
++ * While it seems unlikely, this can actually happen if
++ * a controller has P2P bridges under it.
++ */
++ xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
++ "is full, no room to export %04x:%02x:%02x.%x",
++ cntrl_entry->domain, cntrl_entry->bus,
++ pci_domain_nr(dev->bus), dev->bus->number,
++ PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
++ ret = -ENOSPC;
++ goto out;
++ }
++
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
++ if (!dev_entry) {
++ if (list_empty(&cntrl_entry->dev_list)) {
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
++ ret = -ENOMEM;
++ goto out;
++ }
++
++ dev_entry->dev = dev;
++ dev_entry->devfn = cntrl_entry->next_devfn;
++
++ list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
++
++ cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
++
++out:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++
++ /* TODO: Publish virtual domain:bus:slot.func here. */
++
++ return ret;
++}
++
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry;
++ struct controller_dev_entry *dev_entry = NULL;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ if (cntrl_entry->controller != PCI_CONTROLLER(dev))
++ continue;
++
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if (dev_entry->dev == dev) {
++ found_dev = dev_entry->dev;
++ break;
++ }
++ }
++ }
++
++ if (!found_dev) {
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ return;
++ }
++
++ list_del(&dev_entry->list);
++ kfree(dev_entry);
++
++ if (list_empty(&cntrl_entry->dev_list)) {
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
++
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ pcistub_put_pci_dev(found_dev);
++}
++
++int pciback_init_devices(struct pciback_device *pdev)
++{
++ struct controller_dev_data *dev_data;
++
++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
++ if (!dev_data)
++ return -ENOMEM;
++
++ spin_lock_init(&dev_data->lock);
++
++ INIT_LIST_HEAD(&dev_data->list);
++
++ /* Starting domain:bus numbers */
++ dev_data->next_domain = 0;
++ dev_data->next_bus = 0;
++
++ pdev->pci_dev_data = dev_data;
++
++ return 0;
++}
++
++static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
++{
++ struct walk_info *info = data;
++ struct acpi_resource_address64 addr;
++ acpi_status status;
++ int i, len, err;
++ char str[32], tmp[3];
++ unsigned char *ptr, *buf;
++
++ status = acpi_resource_to_address64(res, &addr);
++
++ /* Do we care about this range? Let's check. */
++ if (!ACPI_SUCCESS(status) ||
++ !(addr.resource_type == ACPI_MEMORY_RANGE ||
++ addr.resource_type == ACPI_IO_RANGE) ||
++ !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
++ return AE_OK;
++
++ /*
++ * Furthermore, we really only care to tell the guest about
++ * address ranges that require address translation of some sort.
++ */
++ if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
++ addr.info.mem.translation) &&
++ !(addr.resource_type == ACPI_IO_RANGE &&
++ addr.info.io.translation))
++ return AE_OK;
++
++ /* Store the resource in xenbus for the guest */
++ len = snprintf(str, sizeof(str), "root-%d-resource-%d",
++ info->root_num, info->resource_count);
++ if (unlikely(len >= (sizeof(str) - 1)))
++ return AE_OK;
++
++ buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
++ if (!buf)
++ return AE_OK;
++
++ /* Clean out resource_source */
++ res->data.address64.resource_source.index = 0xFF;
++ res->data.address64.resource_source.string_length = 0;
++ res->data.address64.resource_source.string_ptr = NULL;
++
++ ptr = (unsigned char *)res;
++
++ /* Turn the acpi_resource into an ASCII byte stream */
++ for (i = 0; i < sizeof(*res); i++) {
++ snprintf(tmp, sizeof(tmp), "%02x", ptr[i]);
++ strncat(buf, tmp, 2);
++ }
++
++ err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
++ str, "%s", buf);
++
++ if (!err)
++ info->resource_count++;
++
++ kfree(buf);
++
++ return AE_OK;
++}
++
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_root_cb)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry;
++ int i, root_num, len, err = 0;
++ unsigned int domain, bus;
++ char str[64];
++ struct walk_info info;
++
++ spin_lock(&dev_data->lock);
++
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ /* First publish all the domain:bus info */
++ err = publish_root_cb(pdev, cntrl_entry->domain,
++ cntrl_entry->bus);
++ if (err)
++ goto out;
++
++ /*
++ * Now figure out which root-%d this belongs to
++ * so we can associate resources with it.
++ */
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", &root_num);
++
++ if (err != 1)
++ goto out;
++
++ for (i = 0; i < root_num; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ str, "%x:%x", &domain, &bus);
++ if (err != 2)
++ goto out;
++
++ /* Is this the one we just published? */
++ if (domain == cntrl_entry->domain &&
++ bus == cntrl_entry->bus)
++ break;
++ }
++
++ if (i == root_num)
++ goto out;
++
++ info.pdev = pdev;
++ info.resource_count = 0;
++ info.root_num = i;
++
++ /* Let ACPI do the heavy lifting on decoding resources */
++ acpi_walk_resources(cntrl_entry->controller->acpi_handle,
++ METHOD_NAME__CRS, write_xenbus_resource,
++ &info);
++
++		/* No resources. OK. On to the next one. */
++ if (!info.resource_count)
++ continue;
++
++ /* Store the number of resources we wrote for this root-%d */
++ len = snprintf(str, sizeof(str), "root-%d-resources", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%d", info.resource_count);
++ if (err)
++ goto out;
++ }
++
++ /* Finally, write some magic to synchronize with the guest. */
++ len = snprintf(str, sizeof(str), "root-resource-magic");
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%lx", (sizeof(struct acpi_resource) * 2) + 1);
++
++out:
++ spin_unlock(&dev_data->lock);
++
++ return err;
++}
++
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_list_entry *cntrl_entry, *c;
++ struct controller_dev_entry *dev_entry, *d;
++
++ list_for_each_entry_safe(cntrl_entry, c, &dev_data->list, list) {
++ list_for_each_entry_safe(dev_entry, d,
++ &cntrl_entry->dev_list, list) {
++ list_del(&dev_entry->list);
++ pcistub_put_pci_dev(dev_entry->dev);
++ kfree(dev_entry);
++ }
++ list_del(&cntrl_entry->list);
++ kfree(cntrl_entry);
++ }
++
++ kfree(dev_data);
++ pdev->pci_dev_data = NULL;
++}
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus, unsigned int *devfn)
++{
++ struct controller_dev_data *dev_data = pdev->pci_dev_data;
++ struct controller_dev_entry *dev_entry;
++ struct controller_list_entry *cntrl_entry;
++ unsigned long flags;
++ int found = 0;
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry(cntrl_entry, &dev_data->list, list) {
++ list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
++ if ((dev_entry->dev->bus->number ==
++ pcidev->bus->number) &&
++ (dev_entry->dev->devfn ==
++ pcidev->devfn) &&
++ (pci_domain_nr(dev_entry->dev->bus) ==
++ pci_domain_nr(pcidev->bus))) {
++ found = 1;
++ *domain = cntrl_entry->domain;
++ *bus = cntrl_entry->bus;
++ *devfn = dev_entry->devfn;
++ goto out;
++ }
++ }
++ }
++out:
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++ return found;
++
++}
++
+diff --git a/drivers/xen/pciback/passthrough.c b/drivers/xen/pciback/passthrough.c
+new file mode 100644
+index 0000000..5386bebf
+--- /dev/null
++++ b/drivers/xen/pciback/passthrough.c
+@@ -0,0 +1,178 @@
++/*
++ * PCI Backend - Provides restricted access to the real PCI bus topology
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/list.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++struct passthrough_dev_data {
++ /* Access to dev_list must be protected by lock */
++ struct list_head dev_list;
++ spinlock_t lock;
++};
++
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
++{
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry;
++ struct pci_dev *dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
++ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus)
++ && bus == (unsigned int)dev_entry->dev->bus->number
++ && devfn == dev_entry->dev->devfn) {
++ dev = dev_entry->dev;
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++
++ return dev;
++}
++
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
++{
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry;
++ unsigned long flags;
++ unsigned int domain, bus, devfn;
++ int err;
++
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
++ if (!dev_entry)
++ return -ENOMEM;
++ dev_entry->dev = dev;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++ list_add_tail(&dev_entry->list, &dev_data->dev_list);
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++
++ /* Publish this device. */
++ domain = (unsigned int)pci_domain_nr(dev->bus);
++ bus = (unsigned int)dev->bus->number;
++ devfn = dev->devfn;
++ err = publish_cb(pdev, domain, bus, devfn, devid);
++
++ return err;
++}
++
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *t;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&dev_data->lock, flags);
++
++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
++ if (dev_entry->dev == dev) {
++ list_del(&dev_entry->list);
++ found_dev = dev_entry->dev;
++ kfree(dev_entry);
++ }
++ }
++
++ spin_unlock_irqrestore(&dev_data->lock, flags);
++
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
++}
++
++int pciback_init_devices(struct pciback_device *pdev)
++{
++ struct passthrough_dev_data *dev_data;
++
++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL);
++ if (!dev_data)
++ return -ENOMEM;
++
++ spin_lock_init(&dev_data->lock);
++
++ INIT_LIST_HEAD(&dev_data->dev_list);
++
++ pdev->pci_dev_data = dev_data;
++
++ return 0;
++}
++
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_root_cb)
++{
++ int err = 0;
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *e;
++ struct pci_dev *dev;
++ int found;
++ unsigned int domain, bus;
++
++ spin_lock(&dev_data->lock);
++
++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) {
++ /* Only publish this device as a root if none of its
++ * parent bridges are exported
++ */
++ found = 0;
++ dev = dev_entry->dev->bus->self;
++ for (; !found && dev != NULL; dev = dev->bus->self) {
++ list_for_each_entry(e, &dev_data->dev_list, list) {
++ if (dev == e->dev) {
++ found = 1;
++ break;
++ }
++ }
++ }
++
++ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus);
++ bus = (unsigned int)dev_entry->dev->bus->number;
++
++ if (!found) {
++ err = publish_root_cb(pdev, domain, bus);
++ if (err)
++ break;
++ }
++ }
++
++ spin_unlock(&dev_data->lock);
++
++ return err;
++}
++
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data;
++ struct pci_dev_entry *dev_entry, *t;
++
++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) {
++ list_del(&dev_entry->list);
++ pcistub_put_pci_dev(dev_entry->dev);
++ kfree(dev_entry);
++ }
++
++ kfree(dev_data);
++ pdev->pci_dev_data = NULL;
++}
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
++
++{
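++	/* In passthrough mode the virtual topology mirrors the physical one,
++	 * so report the real domain:bus:devfn unchanged. */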
++ *domain = pci_domain_nr(pcidev->bus);
++ *bus = pcidev->bus->number;
++ *devfn = pcidev->devfn;
++ return 1;
++}
+diff --git a/drivers/xen/pciback/pci_stub.c b/drivers/xen/pciback/pci_stub.c
+new file mode 100644
+index 0000000..88c7ca1
+--- /dev/null
++++ b/drivers/xen/pciback/pci_stub.c
+@@ -0,0 +1,1370 @@
++/*
++ * PCI Stub Driver - Grabs devices in backend to be exported later
++ *
++ * Ryan Wilson <hap9@epoch.ncsc.mil>
++ * Chris Bookholt <hap10@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/rwsem.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/kref.h>
++#include <linux/pci.h>
++#include <linux/wait.h>
++#include <linux/sched.h>
++#include <asm/atomic.h>
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++#include <asm/xen/hypervisor.h>
++#include "pciback.h"
++#include "conf_space.h"
++#include "conf_space_quirks.h"
++
++#define DRV_NAME "pciback"
++
++static char *pci_devs_to_hide;
++wait_queue_head_t aer_wait_queue;
++/* Add a semaphore to synchronize AER handling with pciback remove/reconfigure
++ * ops; we want to avoid pciback devices being removed in the middle of AER ops.
++ */
++static DECLARE_RWSEM(pcistub_sem);
++module_param_named(hide, pci_devs_to_hide, charp, 0444);
++
++struct pcistub_device_id {
++ struct list_head slot_list;
++ int domain;
++ unsigned char bus;
++ unsigned int devfn;
++};
++static LIST_HEAD(pcistub_device_ids);
++static DEFINE_SPINLOCK(device_ids_lock);
++
++struct pcistub_device {
++ struct kref kref;
++ struct list_head dev_list;
++ spinlock_t lock;
++
++ struct pci_dev *dev;
++ struct pciback_device *pdev;/* non-NULL if struct pci_dev is in use */
++};
++
++/* Access to pcistub_devices & seized_devices lists and the initialize_devices
++ * flag must be locked with pcistub_devices_lock
++ */
++static DEFINE_SPINLOCK(pcistub_devices_lock);
++static LIST_HEAD(pcistub_devices);
++
++/* wait for device_initcall before initializing our devices
++ * (see pcistub_init_devices_late)
++ */
++static int initialize_devices;
++static LIST_HEAD(seized_devices);
++
++static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
++
++ dev_dbg(&dev->dev, "pcistub_device_alloc\n");
++
++ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC);
++ if (!psdev)
++ return NULL;
++
++ psdev->dev = pci_dev_get(dev);
++ if (!psdev->dev) {
++ kfree(psdev);
++ return NULL;
++ }
++
++ kref_init(&psdev->kref);
++ spin_lock_init(&psdev->lock);
++
++ return psdev;
++}
++
++/* Don't call this directly as it's called by pcistub_device_put */
++static void pcistub_device_release(struct kref *kref)
++{
++ struct pcistub_device *psdev;
++
++ psdev = container_of(kref, struct pcistub_device, kref);
++
++ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
++
++ xen_unregister_device_domain_owner(psdev->dev);
++
++ /* Clean-up the device */
++ pciback_reset_device(psdev->dev);
++ pciback_config_free_dyn_fields(psdev->dev);
++ pciback_config_free_dev(psdev->dev);
++ kfree(pci_get_drvdata(psdev->dev));
++ pci_set_drvdata(psdev->dev, NULL);
++
++ pci_dev_put(psdev->dev);
++
++ kfree(psdev);
++}
++
++static inline void pcistub_device_get(struct pcistub_device *psdev)
++{
++ kref_get(&psdev->kref);
++}
++
++static inline void pcistub_device_put(struct pcistub_device *psdev)
++{
++ kref_put(&psdev->kref, pcistub_device_release);
++}
++
++static struct pcistub_device *pcistub_device_find(int domain, int bus,
++ int slot, int func)
++{
++ struct pcistub_device *psdev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev != NULL
++ && domain == pci_domain_nr(psdev->dev->bus)
++ && bus == psdev->dev->bus->number
++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
++ pcistub_device_get(psdev);
++ goto out;
++ }
++ }
++
++ /* didn't find it */
++ psdev = NULL;
++
++out:
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return psdev;
++}
++
++static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev,
++ struct pcistub_device *psdev)
++{
++ struct pci_dev *pci_dev = NULL;
++ unsigned long flags;
++
++ pcistub_device_get(psdev);
++
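++	/* Only hand the device out if no pciback instance owns it yet;
++	 * otherwise drop the reference taken above. */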
++ spin_lock_irqsave(&psdev->lock, flags);
++ if (!psdev->pdev) {
++ psdev->pdev = pdev;
++ pci_dev = psdev->dev;
++ }
++ spin_unlock_irqrestore(&psdev->lock, flags);
++
++ if (!pci_dev)
++ pcistub_device_put(psdev);
++
++ return pci_dev;
++}
++
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++ int domain, int bus,
++ int slot, int func)
++{
++ struct pcistub_device *psdev;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev != NULL
++ && domain == pci_domain_nr(psdev->dev->bus)
++ && bus == psdev->dev->bus->number
++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) {
++ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return found_dev;
++}
++
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++ struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_dev = pcistub_device_get_pci_dev(pdev, psdev);
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return found_dev;
++}
++
++void pcistub_put_pci_dev(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev, *found_psdev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_psdev = psdev;
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++	/* Hold this lock to avoid breaking the link between pcistub and
++	 * pciback while AER handling is in progress.
++	 */
++ down_write(&pcistub_sem);
++ /* Cleanup our device
++ * (so it's ready for the next domain)
++ */
++ pciback_reset_device(found_psdev->dev);
++ pciback_config_free_dyn_fields(found_psdev->dev);
++ pciback_config_reset_dev(found_psdev->dev);
++
++ spin_lock_irqsave(&found_psdev->lock, flags);
++ found_psdev->pdev = NULL;
++ spin_unlock_irqrestore(&found_psdev->lock, flags);
++
++ pcistub_device_put(found_psdev);
++ up_write(&pcistub_sem);
++}
++
++static int __devinit pcistub_match_one(struct pci_dev *dev,
++ struct pcistub_device_id *pdev_id)
++{
++ /* Match the specified device by domain, bus, slot, func and also if
++ * any of the device's parent bridges match.
++ */
++ for (; dev != NULL; dev = dev->bus->self) {
++ if (pci_domain_nr(dev->bus) == pdev_id->domain
++ && dev->bus->number == pdev_id->bus
++ && dev->devfn == pdev_id->devfn)
++ return 1;
++
++		/* Sometimes the topmost bridge links to itself. */
++ if (dev == dev->bus->self)
++ break;
++ }
++
++ return 0;
++}
++
++static int __devinit pcistub_match(struct pci_dev *dev)
++{
++ struct pcistub_device_id *pdev_id;
++ unsigned long flags;
++ int found = 0;
++
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) {
++ if (pcistub_match_one(dev, pdev_id)) {
++ found = 1;
++ break;
++ }
++ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
++
++ return found;
++}
++
++static int __devinit pcistub_init_device(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data;
++ int err = 0;
++
++ dev_dbg(&dev->dev, "initializing...\n");
++
++	/* The PCI backend is not intended to be a module (or to work with
++	 * removable PCI devices) yet. If it were, pciback_config_free()
++ * would need to be called somewhere to free the memory allocated
++ * here and then to call kfree(pci_get_drvdata(psdev->dev)).
++ */
++ dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]")
++ + strlen(pci_name(dev)) + 1, GFP_ATOMIC);
++ if (!dev_data) {
++ err = -ENOMEM;
++ goto out;
++ }
++ pci_set_drvdata(dev, dev_data);
++
++ /*
++ * Setup name for fake IRQ handler. It will only be enabled
++ * once the device is turned on by the guest.
++ */
++ sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
++
++ dev_dbg(&dev->dev, "initializing config\n");
++
++ init_waitqueue_head(&aer_wait_queue);
++ err = pciback_config_init_dev(dev);
++ if (err)
++ goto out;
++
++ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we
++ * must do this here because pcibios_enable_device may specify
++ * the pci device's true irq (and possibly its other resources)
++ * if they differ from what's in the configuration space.
++ * This makes the assumption that the device's resources won't
++ * change after this point (otherwise this code may break!)
++ */
++ dev_dbg(&dev->dev, "enabling device\n");
++ err = pci_enable_device(dev);
++ if (err)
++ goto config_release;
++
++ /* Now disable the device (this also ensures some private device
++ * data is setup before we export)
++ */
++ dev_dbg(&dev->dev, "reset device\n");
++ pciback_reset_device(dev);
++
++ return 0;
++
++config_release:
++ pciback_config_free_dev(dev);
++
++out:
++ pci_set_drvdata(dev, NULL);
++ kfree(dev_data);
++ return err;
++}
++
++/*
++ * Because some initialization still happens on
++ * devices during fs_initcall, we need to defer
++ * full initialization of our devices until
++ * device_initcall.
++ */
++static int __init pcistub_init_devices_late(void)
++{
++ struct pcistub_device *psdev;
++ unsigned long flags;
++ int err = 0;
++
++ pr_debug("pciback: pcistub_init_devices_late\n");
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ while (!list_empty(&seized_devices)) {
++ psdev = container_of(seized_devices.next,
++ struct pcistub_device, dev_list);
++ list_del(&psdev->dev_list);
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ err = pcistub_init_device(psdev->dev);
++ if (err) {
++ dev_err(&psdev->dev->dev,
++ "error %d initializing device\n", err);
++ kfree(psdev);
++ psdev = NULL;
++ }
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ if (psdev)
++ list_add_tail(&psdev->dev_list, &pcistub_devices);
++ }
++
++ initialize_devices = 1;
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ return 0;
++}
++
++static int __devinit pcistub_seize(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
++ unsigned long flags;
++ int err = 0;
++
++ psdev = pcistub_device_alloc(dev);
++ if (!psdev)
++ return -ENOMEM;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ if (initialize_devices) {
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ /* don't want irqs disabled when calling pcistub_init_device */
++ err = pcistub_init_device(psdev->dev);
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ if (!err)
++ list_add(&psdev->dev_list, &pcistub_devices);
++ } else {
++ dev_dbg(&dev->dev, "deferring initialization\n");
++ list_add(&psdev->dev_list, &seized_devices);
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ if (err)
++ pcistub_device_put(psdev);
++
++ return err;
++}
++
++static int __devinit pcistub_probe(struct pci_dev *dev,
++ const struct pci_device_id *id)
++{
++ int err = 0;
++
++ dev_dbg(&dev->dev, "probing...\n");
++
++ if (pcistub_match(dev)) {
++
++ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL
++ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
++ dev_err(&dev->dev, "can't export pci devices that "
++ "don't have a normal (0) or bridge (1) "
++ "header type!\n");
++ err = -ENODEV;
++ goto out;
++ }
++
++ dev_info(&dev->dev, "seizing device\n");
++ err = pcistub_seize(dev);
++ } else
++ /* Didn't find the device */
++ err = -ENODEV;
++
++out:
++ return err;
++}
++
++static void pcistub_remove(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev, *found_psdev = NULL;
++ unsigned long flags;
++
++ dev_dbg(&dev->dev, "removing\n");
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++
++ pciback_config_quirk_release(dev);
++
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (psdev->dev == dev) {
++ found_psdev = psdev;
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ if (found_psdev) {
++ dev_dbg(&dev->dev, "found device to remove - in use? %p\n",
++ found_psdev->pdev);
++
++ if (found_psdev->pdev) {
++ printk(KERN_WARNING "pciback: ****** removing device "
++ "%s while still in-use! ******\n",
++ pci_name(found_psdev->dev));
++ printk(KERN_WARNING "pciback: ****** driver domain may "
++ "still access this device's i/o resources!\n");
++ printk(KERN_WARNING "pciback: ****** shutdown driver "
++ "domain before binding device\n");
++ printk(KERN_WARNING "pciback: ****** to other drivers "
++ "or domains\n");
++
++ pciback_release_pci_dev(found_psdev->pdev,
++ found_psdev->dev);
++ }
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ list_del(&found_psdev->dev_list);
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++
++ /* the final put for releasing from the list */
++ pcistub_device_put(found_psdev);
++ }
++}
++
++static const struct pci_device_id pcistub_ids[] = {
++ {
++ .vendor = PCI_ANY_ID,
++ .device = PCI_ANY_ID,
++ .subvendor = PCI_ANY_ID,
++ .subdevice = PCI_ANY_ID,
++ },
++ {0,},
++};
++
++#define PCI_NODENAME_MAX 40
++static void kill_domain_by_device(struct pcistub_device *psdev)
++{
++ struct xenbus_transaction xbt;
++ int err;
++ char nodename[PCI_NODENAME_MAX];
++
++	if (!psdev) {
++		printk(KERN_ERR "pciback: device is NULL when doing "
++		       "AER recovery/kill_domain\n");
++		return;
++	}
++ snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0",
++ psdev->pdev->xdev->otherend_id);
++ nodename[strlen(nodename)] = '\0';
++
++again:
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ dev_err(&psdev->dev->dev,
++			"error %d when starting xenbus transaction\n", err);
++ return;
++ }
++ /*PV AER handlers will set this flag*/
++	xenbus_printf(xbt, nodename, "aerState", "aerfail");
++ err = xenbus_transaction_end(xbt, 0);
++ if (err) {
++ if (err == -EAGAIN)
++ goto again;
++ dev_err(&psdev->dev->dev,
++			"error %d when ending xenbus transaction\n", err);
++ return;
++ }
++}
++
++/* For each AER recovery step (error_detected, mmio_enabled, etc.), the
++ * frontend and backend need to cooperate. In pciback, those steps all do
++ * similar work: send a service request and wait for the frontend's response.
++*/
++static pci_ers_result_t common_process(struct pcistub_device *psdev,
++ pci_channel_state_t state, int aer_cmd, pci_ers_result_t result)
++{
++ pci_ers_result_t res = result;
++ struct xen_pcie_aer_op *aer_op;
++ int ret;
++
++ /*with PV AER drivers*/
++ aer_op = &(psdev->pdev->sh_info->aer_op);
++	aer_op->cmd = aer_cmd;
++ /*useful for error_detected callback*/
++ aer_op->err = state;
++ /*pcifront_end BDF*/
++ ret = pciback_get_pcifront_dev(psdev->dev, psdev->pdev,
++ &aer_op->domain, &aer_op->bus, &aer_op->devfn);
++ if (!ret) {
++ dev_err(&psdev->dev->dev,
++ "pciback: failed to get pcifront device\n");
++ return PCI_ERS_RESULT_NONE;
++ }
++ wmb();
++
++ dev_dbg(&psdev->dev->dev,
++ "pciback: aer_op %x dom %x bus %x devfn %x\n",
++ aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn);
++	/* Local flag to mark that an AER request is pending; the pciback
++	 * callback uses it to decide whether to check for the pcifront AER
++	 * service ack signal.
++ */
++ set_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++	/* It is possible that a pcifront conf_read_write op request invokes
++	 * the callback, which causes a spurious execution of wake_up.
++	 * Yet it is harmless and better than a spinlock here.
++ */
++ set_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags);
++ wmb();
++ notify_remote_via_irq(psdev->pdev->evtchn_irq);
++
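++	/* Wait up to 300 seconds for pcifront to clear _XEN_PCIB_active. */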
++ ret = wait_event_timeout(aer_wait_queue, !(test_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)), 300*HZ);
++
++ if (!ret) {
++ if (test_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&psdev->dev->dev,
++ "pcifront aer process not responding!\n");
++ clear_bit(_XEN_PCIB_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags);
++ aer_op->err = PCI_ERS_RESULT_NONE;
++ return res;
++ }
++ }
++ clear_bit(_PCIB_op_pending, (unsigned long *)&psdev->pdev->flags);
++
++ if (test_bit(_XEN_PCIF_active,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_dbg(&psdev->dev->dev,
++			"schedule pci_conf service in pciback\n");
++ test_and_schedule_op(psdev->pdev);
++ }
++
++ res = (pci_ers_result_t)aer_op->err;
++ return res;
++}
++
++/*
++* pciback_slot_reset: send the slot_reset request to pcifront, in case the
++* device driver can provide this service, and then wait for the pcifront
++* ack.
++* @dev: pointer to the PCI device
++* The return value is used by the AER core's do_recovery policy.
++*/
++static pci_ers_result_t pciback_slot_reset(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
++
++ result = PCI_ERS_RESULT_RECOVERED;
++ dev_dbg(&dev->dev, "pciback_slot_reset(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
++
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ goto release;
++ }
++ result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result);
++
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER slot_reset service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
++
++}
++
++
++/*pciback_mmio_enabled: send the mmio_enabled request to pcifront, in case
++* the device driver can provide this service, and then wait for the
++* pcifront ack.
++* @dev: pointer to the PCI device
++* The return value is used by the AER core's do_recovery policy.
++*/
++
++static pci_ers_result_t pciback_mmio_enabled(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
++
++ result = PCI_ERS_RESULT_RECOVERED;
++ dev_dbg(&dev->dev, "pciback_mmio_enabled(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
++
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ goto release;
++ }
++ result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result);
++
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER mmio_enabled service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
++}
++
++/* pciback_error_detected: send the error_detected request to pcifront in
++* case the device driver can provide this service, and then wait
++* for the pcifront ack.
++* @dev: pointer to the PCI device
++* @error: the current PCI connection state
++* The return value is used by the aer_core do_recovery policy.
++*/
++
++static pci_ers_result_t pciback_error_detected(struct pci_dev *dev,
++ pci_channel_state_t error)
++{
++ struct pcistub_device *psdev;
++ pci_ers_result_t result;
++
++ result = PCI_ERS_RESULT_CAN_RECOVER;
++ dev_dbg(&dev->dev, "pciback_error_detected(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
++
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++
++ /* Guest owns the device yet no AER handler is registered; kill the guest */
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++ result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result);
++
++ if (result == PCI_ERS_RESULT_NONE ||
++ result == PCI_ERS_RESULT_DISCONNECT) {
++ dev_dbg(&dev->dev,
++ "No AER error_detected service or disconnected!\n");
++ kill_domain_by_device(psdev);
++ }
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return result;
++}
++
++/* pciback_error_resume: send the error_resume request to pcifront in case
++* the device driver can provide this service, and then wait
++* for the pcifront ack.
++* @dev: pointer to the PCI device
++*/
++
++static void pciback_error_resume(struct pci_dev *dev)
++{
++ struct pcistub_device *psdev;
++
++ dev_dbg(&dev->dev, "pciback_error_resume(bus:%x,devfn:%x)\n",
++ dev->bus->number, dev->devfn);
++
++ down_write(&pcistub_sem);
++ psdev = pcistub_device_find(pci_domain_nr(dev->bus),
++ dev->bus->number,
++ PCI_SLOT(dev->devfn),
++ PCI_FUNC(dev->devfn));
++
++ if (!psdev || !psdev->pdev) {
++ dev_err(&dev->dev,
++ "pciback device is not found/assigned\n");
++ goto end;
++ }
++
++ if (!psdev->pdev->sh_info) {
++ dev_err(&dev->dev, "pciback device is not connected or owned"
++ " by HVM, kill it\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++
++ if (!test_bit(_XEN_PCIB_AERHANDLER,
++ (unsigned long *)&psdev->pdev->sh_info->flags)) {
++ dev_err(&dev->dev,
++ "guest with no AER driver should have been killed\n");
++ kill_domain_by_device(psdev);
++ goto release;
++ }
++ common_process(psdev, 1, XEN_PCI_OP_aer_resume,
++ PCI_ERS_RESULT_RECOVERED);
++release:
++ pcistub_device_put(psdev);
++end:
++ up_write(&pcistub_sem);
++ return;
++}
++
++/*add pciback AER handling*/
++static struct pci_error_handlers pciback_error_handler = {
++ .error_detected = pciback_error_detected,
++ .mmio_enabled = pciback_mmio_enabled,
++ .slot_reset = pciback_slot_reset,
++ .resume = pciback_error_resume,
++};
++
++/*
++ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't
++ * for a normal device. I don't want it to be loaded automatically.
++ */
++
++static struct pci_driver pciback_pci_driver = {
++ .name = DRV_NAME,
++ .id_table = pcistub_ids,
++ .probe = pcistub_probe,
++ .remove = pcistub_remove,
++ .err_handler = &pciback_error_handler,
++};
++
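++/* Accepts either "domain:bus:slot.func" or "bus:slot.func" (all fields in
++ * hex); when the domain is omitted it defaults to 0.
++ */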
++static inline int str_to_slot(const char *buf, int *domain, int *bus,
++ int *slot, int *func)
++{
++ int err;
++
++ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func);
++ if (err == 4)
++ return 0;
++ else if (err < 0)
++ return -EINVAL;
++
++ /* try again without domain */
++ *domain = 0;
++ err = sscanf(buf, " %x:%x.%x", bus, slot, func);
++ if (err == 3)
++ return 0;
++
++ return -EINVAL;
++}
++
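++/* Quirk strings have the form "domain:bus:slot.func-reg:size:mask" (all in
++ * hex), for example "0000:00:19.0-00000040:2:0000ffff".
++ */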
++static inline int str_to_quirk(const char *buf, int *domain, int *bus, int
++ *slot, int *func, int *reg, int *size, int *mask)
++{
++ int err;
++
++ err =
++ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot,
++ func, reg, size, mask);
++ if (err == 7)
++ return 0;
++ return -EINVAL;
++}
++
++static int pcistub_device_id_add(int domain, int bus, int slot, int func)
++{
++ struct pcistub_device_id *pci_dev_id;
++ unsigned long flags;
++
++ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL);
++ if (!pci_dev_id)
++ return -ENOMEM;
++
++ pci_dev_id->domain = domain;
++ pci_dev_id->bus = bus;
++ pci_dev_id->devfn = PCI_DEVFN(slot, func);
++
++ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n",
++ domain, bus, slot, func);
++
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids);
++ spin_unlock_irqrestore(&device_ids_lock, flags);
++
++ return 0;
++}
++
++static int pcistub_device_id_remove(int domain, int bus, int slot, int func)
++{
++ struct pcistub_device_id *pci_dev_id, *t;
++ int devfn = PCI_DEVFN(slot, func);
++ int err = -ENOENT;
++ unsigned long flags;
++
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids,
++ slot_list) {
++ if (pci_dev_id->domain == domain
++ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) {
++ /* Don't break; here because it's possible the same
++ * slot could be in the list more than once
++ */
++ list_del(&pci_dev_id->slot_list);
++ kfree(pci_dev_id);
++
++ err = 0;
++
++ pr_debug("pciback: removed %04x:%02x:%02x.%01x from "
++ "seize list\n", domain, bus, slot, func);
++ }
++ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
++
++ return err;
++}
++
++static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg,
++ int size, int mask)
++{
++ int err = 0;
++ struct pcistub_device *psdev;
++ struct pci_dev *dev;
++ struct config_field *field;
++
++ psdev = pcistub_device_find(domain, bus, slot, func);
++ if (!psdev || !psdev->dev) {
++ err = -ENODEV;
++ goto out;
++ }
++ dev = psdev->dev;
++
++ field = kzalloc(sizeof(*field), GFP_ATOMIC);
++ if (!field) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ field->offset = reg;
++ field->size = size;
++ field->mask = mask;
++ field->init = NULL;
++ field->reset = NULL;
++ field->release = NULL;
++ field->clean = pciback_config_field_free;
++
++ err = pciback_config_quirks_add_field(dev, field);
++ if (err)
++ kfree(field);
++out:
++ return err;
++}
++
++static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func;
++ int err;
++
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
++
++ err = pcistub_device_id_add(domain, bus, slot, func);
++
++out:
++ if (!err)
++ err = count;
++ return err;
++}
++
++DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add);
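++/* Assuming DRV_NAME is "pciback", a device can be handed to the stub from
++ * userspace with e.g.:
++ *   echo "0000:00:19.0" > /sys/bus/pci/drivers/pciback/new_slot
++ */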
++
++static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func;
++ int err;
++
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
++
++ err = pcistub_device_id_remove(domain, bus, slot, func);
++
++out:
++ if (!err)
++ err = count;
++ return err;
++}
++
++DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove);
++
++static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
++{
++ struct pcistub_device_id *pci_dev_id;
++ size_t count = 0;
++ unsigned long flags;
++
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) {
++ if (count >= PAGE_SIZE)
++ break;
++
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "%04x:%02x:%02x.%01x\n",
++ pci_dev_id->domain, pci_dev_id->bus,
++ PCI_SLOT(pci_dev_id->devfn),
++ PCI_FUNC(pci_dev_id->devfn));
++ }
++ spin_unlock_irqrestore(&device_ids_lock, flags);
++
++ return count;
++}
++
++DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
++
++static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
++{
++ struct pcistub_device *psdev;
++ struct pciback_dev_data *dev_data;
++ size_t count = 0;
++ unsigned long flags;
++
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (count >= PAGE_SIZE)
++ break;
++ if (!psdev->dev)
++ continue;
++ dev_data = pci_get_drvdata(psdev->dev);
++ if (!dev_data)
++ continue;
++ count +=
++ scnprintf(buf + count, PAGE_SIZE - count, "%s:%s:%sing:%ld\n",
++ pci_name(psdev->dev),
++ dev_data->isr_on ? "on" : "off",
++ dev_data->ack_intr ? "ack" : "not ack",
++ dev_data->handled);
++ }
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return count;
++}
++
++DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL);
++
++static ssize_t pcistub_irq_handler_switch(struct device_driver *drv,
++ const char *buf,
++ size_t count)
++{
++ struct pcistub_device *psdev;
++ struct pciback_dev_data *dev_data;
++ int domain, bus, slot, func;
++ int err = -ENOENT;
++
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
++
++ psdev = pcistub_device_find(domain, bus, slot, func);
++
++ if (!psdev)
++ goto out;
++
++ dev_data = pci_get_drvdata(psdev->dev);
++ if (!dev_data)
++ goto out;
++
++ dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n",
++ dev_data->irq_name, dev_data->isr_on,
++ !dev_data->isr_on);
++
++ dev_data->isr_on = !(dev_data->isr_on);
++ if (dev_data->isr_on)
++ dev_data->ack_intr = 1;
++out:
++ if (!err)
++ err = count;
++ return err;
++}
++DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch);
++
++static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func, reg, size, mask;
++ int err;
++
++ err = str_to_quirk(buf, &domain, &bus, &slot, &func, &reg, &size,
++ &mask);
++ if (err)
++ goto out;
++
++ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask);
++
++out:
++ if (!err)
++ err = count;
++ return err;
++}
++
++static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf)
++{
++ int count = 0;
++ unsigned long flags;
++ struct pciback_config_quirk *quirk;
++ struct pciback_dev_data *dev_data;
++ const struct config_field *field;
++ const struct config_field_entry *cfg_entry;
++
++ spin_lock_irqsave(&device_ids_lock, flags);
++ list_for_each_entry(quirk, &pciback_quirks, quirks_list) {
++ if (count >= PAGE_SIZE)
++ goto out;
++
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n",
++ quirk->pdev->bus->number,
++ PCI_SLOT(quirk->pdev->devfn),
++ PCI_FUNC(quirk->pdev->devfn),
++ quirk->devid.vendor, quirk->devid.device,
++ quirk->devid.subvendor,
++ quirk->devid.subdevice);
++
++ dev_data = pci_get_drvdata(quirk->pdev);
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++ if (count >= PAGE_SIZE)
++ goto out;
++
++ count += scnprintf(buf + count, PAGE_SIZE - count,
++ "\t\t%08x:%01x:%08x\n",
++ cfg_entry->base_offset +
++ field->offset, field->size,
++ field->mask);
++ }
++ }
++
++out:
++ spin_unlock_irqrestore(&device_ids_lock, flags);
++
++ return count;
++}
++
++DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add);
++
++static ssize_t permissive_add(struct device_driver *drv, const char *buf,
++ size_t count)
++{
++ int domain, bus, slot, func;
++ int err;
++ struct pcistub_device *psdev;
++ struct pciback_dev_data *dev_data;
++ err = str_to_slot(buf, &domain, &bus, &slot, &func);
++ if (err)
++ goto out;
++ psdev = pcistub_device_find(domain, bus, slot, func);
++ if (!psdev) {
++ err = -ENODEV;
++ goto out;
++ }
++ if (!psdev->dev) {
++ err = -ENODEV;
++ goto release;
++ }
++ dev_data = pci_get_drvdata(psdev->dev);
++ /* the driver data for a device should never be null at this point */
++ if (!dev_data) {
++ err = -ENXIO;
++ goto release;
++ }
++ if (!dev_data->permissive) {
++ dev_data->permissive = 1;
++ /* Let user know that what they're doing could be unsafe */
++ dev_warn(&psdev->dev->dev, "enabling permissive mode "
++ "configuration space accesses!\n");
++ dev_warn(&psdev->dev->dev,
++ "permissive mode is potentially unsafe!\n");
++ }
++release:
++ pcistub_device_put(psdev);
++out:
++ if (!err)
++ err = count;
++ return err;
++}
++
++static ssize_t permissive_show(struct device_driver *drv, char *buf)
++{
++ struct pcistub_device *psdev;
++ struct pciback_dev_data *dev_data;
++ size_t count = 0;
++ unsigned long flags;
++ spin_lock_irqsave(&pcistub_devices_lock, flags);
++ list_for_each_entry(psdev, &pcistub_devices, dev_list) {
++ if (count >= PAGE_SIZE)
++ break;
++ if (!psdev->dev)
++ continue;
++ dev_data = pci_get_drvdata(psdev->dev);
++ if (!dev_data || !dev_data->permissive)
++ continue;
++ count +=
++ scnprintf(buf + count, PAGE_SIZE - count, "%s\n",
++ pci_name(psdev->dev));
++ }
++ spin_unlock_irqrestore(&pcistub_devices_lock, flags);
++ return count;
++}
++
++DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add);
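++/* Assuming DRV_NAME is "pciback", permissive mode can be enabled per device
++ * with e.g.:
++ *   echo "0000:00:19.0" > /sys/bus/pci/drivers/pciback/permissive
++ */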
++
++static void pcistub_exit(void)
++{
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot);
++ driver_remove_file(&pciback_pci_driver.driver,
++ &driver_attr_remove_slot);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks);
++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive);
++ driver_remove_file(&pciback_pci_driver.driver,
++ &driver_attr_irq_handlers);
++ driver_remove_file(&pciback_pci_driver.driver,
++ &driver_attr_irq_handler_state);
++ pci_unregister_driver(&pciback_pci_driver);
++}
++
++static int __init pcistub_init(void)
++{
++ int pos = 0;
++ int err = 0;
++ int domain, bus, slot, func;
++ int parsed;
++
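++ /* pci_devs_to_hide (typically supplied via the driver's "hide" option)
++ * is a list of parenthesised BDFs, e.g. "(0000:00:19.0)(01:00.0)"; an
++ * omitted domain defaults to 0.
++ */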
++ if (pci_devs_to_hide && *pci_devs_to_hide) {
++ do {
++ parsed = 0;
++
++ err = sscanf(pci_devs_to_hide + pos,
++ " (%x:%x:%x.%x) %n",
++ &domain, &bus, &slot, &func, &parsed);
++ if (err != 4) {
++ domain = 0;
++ err = sscanf(pci_devs_to_hide + pos,
++ " (%x:%x.%x) %n",
++ &bus, &slot, &func, &parsed);
++ if (err != 3)
++ goto parse_error;
++ }
++
++ err = pcistub_device_id_add(domain, bus, slot, func);
++ if (err)
++ goto out;
++
++ /* if parsed<=0, we've reached the end of the string */
++ pos += parsed;
++ } while (parsed > 0 && pci_devs_to_hide[pos]);
++ }
++
++ /* If we're the first PCI Device Driver to register, we're the
++ * first one to get offered PCI devices as they become
++ * available (and thus we can be the first to grab them)
++ */
++ err = pci_register_driver(&pciback_pci_driver);
++ if (err < 0)
++ goto out;
++
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_new_slot);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_remove_slot);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_slots);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_quirks);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_permissive);
++
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_irq_handlers);
++ if (!err)
++ err = driver_create_file(&pciback_pci_driver.driver,
++ &driver_attr_irq_handler_state);
++ if (err)
++ pcistub_exit();
++
++out:
++ return err;
++
++parse_error:
++ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n",
++ pci_devs_to_hide + pos);
++ return -EINVAL;
++}
++
++#ifndef MODULE
++/*
++ * fs_initcall happens before device_initcall
++ * so pciback *should* get called first (b/c we
++ * want to suck up any device before other drivers
++ * get a chance by being the first pci device
++ * driver to register)
++ */
++fs_initcall(pcistub_init);
++#endif
++
++static int __init pciback_init(void)
++{
++ int err;
++
++ if (!xen_initial_domain())
++ return -ENODEV;
++
++ err = pciback_config_init();
++ if (err)
++ return err;
++
++#ifdef MODULE
++ err = pcistub_init();
++ if (err < 0)
++ return err;
++#endif
++
++ pcistub_init_devices_late();
++ err = pciback_xenbus_register();
++ if (err)
++ pcistub_exit();
++
++ return err;
++}
++
++static void __exit pciback_cleanup(void)
++{
++ pciback_xenbus_unregister();
++ pcistub_exit();
++}
++
++module_init(pciback_init);
++module_exit(pciback_cleanup);
++
++MODULE_LICENSE("Dual BSD/GPL");
+diff --git a/drivers/xen/pciback/pciback.h b/drivers/xen/pciback/pciback.h
+new file mode 100644
+index 0000000..fc31052
+--- /dev/null
++++ b/drivers/xen/pciback/pciback.h
+@@ -0,0 +1,142 @@
++/*
++ * PCI Backend Common Data Structures & Function Declarations
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#ifndef __XEN_PCIBACK_H__
++#define __XEN_PCIBACK_H__
++
++#include <linux/pci.h>
++#include <linux/interrupt.h>
++#include <xen/xenbus.h>
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/workqueue.h>
++#include <asm/atomic.h>
++#include <xen/interface/io/pciif.h>
++
++struct pci_dev_entry {
++ struct list_head list;
++ struct pci_dev *dev;
++};
++
++#define _PDEVF_op_active (0)
++#define PDEVF_op_active (1<<(_PDEVF_op_active))
++#define _PCIB_op_pending (1)
++#define PCIB_op_pending (1<<(_PCIB_op_pending))
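++/* _PDEVF_op_active is set while pciback is processing a frontend operation;
++ * _PCIB_op_pending is set while pciback waits for the frontend's AER ack.
++ */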
++
++struct pciback_device {
++ void *pci_dev_data;
++ spinlock_t dev_lock;
++
++ struct xenbus_device *xdev;
++
++ struct xenbus_watch be_watch;
++ u8 be_watching;
++
++ int evtchn_irq;
++
++ struct xen_pci_sharedinfo *sh_info;
++
++ unsigned long flags;
++
++ struct work_struct op_work;
++};
++
++struct pciback_dev_data {
++ struct list_head config_fields;
++ unsigned int permissive : 1;
++ unsigned int warned_on_write : 1;
++ unsigned int enable_intx : 1;
++ unsigned int isr_on : 1; /* Whether the IRQ handler is installed. */
++ unsigned int ack_intr : 1; /* .. and ACK-ing */
++ unsigned long handled;
++ unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
++ char irq_name[0]; /* pciback[000:04:00.0] */
++};
++
++/* Used by XenBus and pciback_ops.c */
++extern wait_queue_head_t aer_wait_queue;
++extern struct workqueue_struct *pciback_wq;
++/* Used by pcistub.c and conf_space_quirks.c */
++extern struct list_head pciback_quirks;
++
++/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */
++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev,
++ int domain, int bus,
++ int slot, int func);
++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev,
++ struct pci_dev *dev);
++void pcistub_put_pci_dev(struct pci_dev *dev);
++
++/* Ensure a device is turned off or reset */
++void pciback_reset_device(struct pci_dev *pdev);
++
++/* Access a virtual configuration space for a PCI device */
++int pciback_config_init(void);
++int pciback_config_init_dev(struct pci_dev *dev);
++void pciback_config_free_dyn_fields(struct pci_dev *dev);
++void pciback_config_reset_dev(struct pci_dev *dev);
++void pciback_config_free_dev(struct pci_dev *dev);
++int pciback_config_read(struct pci_dev *dev, int offset, int size,
++ u32 *ret_val);
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value);
++
++/* Handle requests for specific devices from the frontend */
++typedef int (*publish_pci_dev_cb) (struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn, unsigned int devid);
++typedef int (*publish_pci_root_cb) (struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus);
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb);
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev);
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn);
++
++/**
++* Added for dom0 PCIe AER handling. Look up the guest domain/bus/devfn in
++* pciback before sending the AER request to pcifront, so that the guest can
++* identify the device and cooperate with pciback to finish the AER recovery
++* job if its device driver has the capability.
++*/
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn);
++int pciback_init_devices(struct pciback_device *pdev);
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb cb);
++void pciback_release_devices(struct pciback_device *pdev);
++
++/* Handles events from front-end */
++irqreturn_t pciback_handle_event(int irq, void *dev_id);
++void pciback_do_op(struct work_struct *data);
++
++int pciback_xenbus_register(void);
++void pciback_xenbus_unregister(void);
++
++#ifdef CONFIG_PCI_MSI
++int pciback_enable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
++
++int pciback_disable_msi(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
++
++
++int pciback_enable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
++
++int pciback_disable_msix(struct pciback_device *pdev,
++ struct pci_dev *dev, struct xen_pci_op *op);
++#endif
++extern int verbose_request;
++
++void test_and_schedule_op(struct pciback_device *pdev);
++#endif
++
++/* Handles shared IRQs that can go to both the device domain and the control domain. */
++void pciback_irq_handler(struct pci_dev *dev, int reset);
++irqreturn_t pciback_guest_interrupt(int irq, void *dev_id);
+diff --git a/drivers/xen/pciback/pciback_ops.c b/drivers/xen/pciback/pciback_ops.c
+new file mode 100644
+index 0000000..5543881
+--- /dev/null
++++ b/drivers/xen/pciback/pciback_ops.c
+@@ -0,0 +1,242 @@
++/*
++ * PCI Backend Operations - respond to PCI requests from Frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/wait.h>
++#include <linux/bitops.h>
++#include <xen/events.h>
++#include <linux/sched.h>
++#include "pciback.h"
++
++int verbose_request;
++module_param(verbose_request, int, 0644);
++
++/* Ensure a device has the fake IRQ handler "turned on/off" and is
++ * ready to be exported. This MUST be run after pciback_reset_device
++ * which does the actual PCI device enable/disable.
++ */
++void pciback_control_isr(struct pci_dev *dev, int reset)
++{
++ struct pciback_dev_data *dev_data;
++ int rc;
++ int enable = 0;
++
++ dev_data = pci_get_drvdata(dev);
++ if (!dev_data)
++ return;
++
++ /* We don't deal with bridges */
++ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
++ return;
++
++ if (reset) {
++ dev_data->enable_intx = 0;
++ dev_data->ack_intr = 0;
++ }
++ enable = dev_data->enable_intx;
++
++ /* Asked to disable, but the ISR isn't running */
++ if (!enable && !dev_data->isr_on)
++ return;
++
++ /* Squirrel away the IRQs in the dev_data. We need this
++ * b/c when device transitions to MSI, the dev->irq is
++ * overwritten with the MSI vector.
++ */
++ if (enable)
++ dev_data->irq = dev->irq;
++
++ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n",
++ dev_data->irq_name,
++ dev_data->irq,
++ pci_is_enabled(dev) ? "on" : "off",
++ dev->msi_enabled ? "MSI" : "",
++ dev->msix_enabled ? "MSI/X" : "",
++ dev_data->isr_on ? "enable" : "disable",
++ enable ? "enable" : "disable");
++
++ if (enable) {
++ rc = request_irq(dev_data->irq,
++ pciback_guest_interrupt, IRQF_SHARED,
++ dev_data->irq_name, dev);
++ if (rc) {
++ dev_err(&dev->dev, "%s: failed to install fake IRQ " \
++ "handler for IRQ %d! (rc:%d)\n", dev_data->irq_name,
++ dev_data->irq, rc);
++ goto out;
++ }
++ } else {
++ free_irq(dev_data->irq, dev);
++ dev_data->irq = 0;
++ }
++ dev_data->isr_on = enable;
++ dev_data->ack_intr = enable;
++out:
++ dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n",
++ dev_data->irq_name,
++ dev_data->irq,
++ pci_is_enabled(dev) ? "on" : "off",
++ dev->msi_enabled ? "MSI" : "",
++ dev->msix_enabled ? "MSI/X" : "",
++ enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
++ (dev_data->isr_on ? "failed to disable" : "disabled"));
++}
++
++/* Ensure a device is "turned off" and ready to be exported.
++ * (Also see pciback_config_reset to ensure virtual configuration space is
++ * ready to be re-exported)
++ */
++void pciback_reset_device(struct pci_dev *dev)
++{
++ u16 cmd;
++
++ pciback_control_isr(dev, 1 /* reset device */);
++
++ /* Disable devices (but not bridges) */
++ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
++#ifdef CONFIG_PCI_MSI
++ /* The guest could have been abruptly killed without
++ * disabling MSI/MSI-X interrupts.*/
++ if (dev->msix_enabled)
++ pci_disable_msix(dev);
++ if (dev->msi_enabled)
++ pci_disable_msi(dev);
++#endif
++ pci_disable_device(dev);
++
++ pci_write_config_word(dev, PCI_COMMAND, 0);
++
++ dev->is_busmaster = 0;
++ } else {
++ pci_read_config_word(dev, PCI_COMMAND, &cmd);
++ if (cmd & (PCI_COMMAND_INVALIDATE)) {
++ cmd &= ~(PCI_COMMAND_INVALIDATE);
++ pci_write_config_word(dev, PCI_COMMAND, cmd);
++
++ dev->is_busmaster = 0;
++ }
++ }
++}
++/*
++* Now the same evtchn is used for both pcifront conf_read_write requests
++* and the PCIe AER frontend ack. We use a separate workqueue to schedule
++* the pciback conf_read_write service, to avoid conflicts with the aer_core
++* do_recovery job, which also uses the system default workqueue.
++*/
++void test_and_schedule_op(struct pciback_device *pdev)
++{
++ /* Check that frontend is requesting an operation and that we are not
++ * already processing a request */
++ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
++ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
++ queue_work(pciback_wq, &pdev->op_work);
++ }
++ /* _XEN_PCIB_active should have been cleared by pcifront. Also make
++ * sure pciback is waiting for an ack by checking _PCIB_op_pending. */
++ if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
++ && test_bit(_PCIB_op_pending, &pdev->flags)) {
++ wake_up(&aer_wait_queue);
++ }
++}
++
++/* Performing the configuration space reads/writes must not be done in atomic
++ * context because some of the pci_* functions can sleep (mostly due to ACPI
++ * use of semaphores). This function is intended to be called from a work
++ * queue in process context taking a struct pciback_device as a parameter */
++
++void pciback_do_op(struct work_struct *data)
++{
++ struct pciback_device *pdev =
++ container_of(data, struct pciback_device, op_work);
++ struct pci_dev *dev;
++ struct pciback_dev_data *dev_data = NULL;
++ struct xen_pci_op *op = &pdev->sh_info->op;
++ int test_intx = 0;
++
++ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn);
++
++ if (dev == NULL)
++ op->err = XEN_PCI_ERR_dev_not_found;
++ else {
++ dev_data = pci_get_drvdata(dev);
++ if (dev_data)
++ test_intx = dev_data->enable_intx;
++ switch (op->cmd) {
++ case XEN_PCI_OP_conf_read:
++ op->err = pciback_config_read(dev,
++ op->offset, op->size, &op->value);
++ break;
++ case XEN_PCI_OP_conf_write:
++ op->err = pciback_config_write(dev,
++ op->offset, op->size, op->value);
++ break;
++#ifdef CONFIG_PCI_MSI
++ case XEN_PCI_OP_enable_msi:
++ op->err = pciback_enable_msi(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_disable_msi:
++ op->err = pciback_disable_msi(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_enable_msix:
++ op->err = pciback_enable_msix(pdev, dev, op);
++ break;
++ case XEN_PCI_OP_disable_msix:
++ op->err = pciback_disable_msix(pdev, dev, op);
++ break;
++#endif
++ default:
++ op->err = XEN_PCI_ERR_not_implemented;
++ break;
++ }
++ }
++ if (!op->err && dev && dev_data) {
++ /* Transition detected */
++ if ((dev_data->enable_intx != test_intx))
++ pciback_control_isr(dev, 0 /* no reset */);
++ }
++ /* Tell the driver domain that we're done. */
++ wmb();
++ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
++ notify_remote_via_irq(pdev->evtchn_irq);
++
++ /* Mark that we're done. */
++ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */
++ clear_bit(_PDEVF_op_active, &pdev->flags);
++ smp_mb__after_clear_bit(); /* /before/ final check for work */
++
++ /* Check to see if the driver domain tried to start another request in
++ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
++ */
++ test_and_schedule_op(pdev);
++}
++
++irqreturn_t pciback_handle_event(int irq, void *dev_id)
++{
++ struct pciback_device *pdev = dev_id;
++
++ test_and_schedule_op(pdev);
++
++ return IRQ_HANDLED;
++}
++irqreturn_t pciback_guest_interrupt(int irq, void *dev_id)
++{
++ struct pci_dev *dev = (struct pci_dev *)dev_id;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++
++ if (dev_data->isr_on && dev_data->ack_intr) {
++ dev_data->handled++;
++ if ((dev_data->handled % 1000) == 0) {
++ if (xen_ignore_irq(irq)) {
++ printk(KERN_INFO "%s IRQ line is not shared "
++ "with other domains. Turning ISR off\n",
++ dev_data->irq_name);
++ dev_data->ack_intr = 0;
++ }
++ }
++ return IRQ_HANDLED;
++ }
++ return IRQ_NONE;
++}
+diff --git a/drivers/xen/pciback/slot.c b/drivers/xen/pciback/slot.c
+new file mode 100644
+index 0000000..efb922d
+--- /dev/null
++++ b/drivers/xen/pciback/slot.c
+@@ -0,0 +1,191 @@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
++ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
++ */
++
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++/* There are at most 32 slots in a pci bus. */
++#define PCI_SLOT_MAX 32
++
++#define PCI_BUS_NBR 2
++
++struct slot_dev_data {
++ /* Access to dev_list must be protected by lock */
++ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
++ spinlock_t lock;
++};
++
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
++{
++ struct pci_dev *dev = NULL;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ unsigned long flags;
++
++ if (domain != 0 || PCI_FUNC(devfn) != 0)
++ return NULL;
++
++ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR)
++ return NULL;
++
++ spin_lock_irqsave(&slot_dev->lock, flags);
++ dev = slot_dev->slots[bus][PCI_SLOT(devfn)];
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
++
++ return dev;
++}
++
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
++{
++ int err = 0, slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ unsigned long flags;
++
++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++ err = -EFAULT;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Can't export bridges on the virtual PCI bus");
++ goto out;
++ }
++
++ spin_lock_irqsave(&slot_dev->lock, flags);
++
++ /* Assign to a new slot on the virtual PCI bus */
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (slot_dev->slots[bus][slot] == NULL) {
++ printk(KERN_INFO
++ "pciback: slot: %s: assign to virtual "
++ "slot %d, bus %d\n",
++ pci_name(dev), slot, bus);
++ slot_dev->slots[bus][slot] = dev;
++ goto unlock;
++ }
++ }
++
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "No more space on root virtual PCI bus");
++
++unlock:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
++
++ /* Publish this device. */
++ if (!err)
++ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, 0), devid);
++
++out:
++ return err;
++}
++
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++ int slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&slot_dev->lock, flags);
++
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (slot_dev->slots[bus][slot] == dev) {
++ slot_dev->slots[bus][slot] = NULL;
++ found_dev = dev;
++ goto out;
++ }
++ }
++
++out:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
++
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
++}
++
++int pciback_init_devices(struct pciback_device *pdev)
++{
++ int slot, bus;
++ struct slot_dev_data *slot_dev;
++
++ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL);
++ if (!slot_dev)
++ return -ENOMEM;
++
++ spin_lock_init(&slot_dev->lock);
++
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
++ slot_dev->slots[bus][slot] = NULL;
++
++ pdev->pci_dev_data = slot_dev;
++
++ return 0;
++}
++
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_cb)
++{
++ /* The Virtual PCI bus has only one root */
++ return publish_cb(pdev, 0, 0);
++}
++
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ int slot, bus;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *dev;
++
++ for (bus = 0; bus < PCI_BUS_NBR; bus++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ dev = slot_dev->slots[bus][slot];
++ if (dev != NULL)
++ pcistub_put_pci_dev(dev);
++ }
++
++ kfree(slot_dev);
++ pdev->pci_dev_data = NULL;
++}
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
++{
++ int slot, busnr;
++ struct slot_dev_data *slot_dev = pdev->pci_dev_data;
++ struct pci_dev *dev;
++ int found = 0;
++ unsigned long flags;
++
++ spin_lock_irqsave(&slot_dev->lock, flags);
++
++ for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ dev = slot_dev->slots[busnr][slot];
++ if (dev && dev->bus->number == pcidev->bus->number
++ && dev->devfn == pcidev->devfn
++ && pci_domain_nr(dev->bus) ==
++ pci_domain_nr(pcidev->bus)) {
++ found = 1;
++ *domain = 0;
++ *bus = busnr;
++ *devfn = PCI_DEVFN(slot, 0);
++ goto out;
++ }
++ }
++out:
++ spin_unlock_irqrestore(&slot_dev->lock, flags);
++ return found;
++
++}
+diff --git a/drivers/xen/pciback/vpci.c b/drivers/xen/pciback/vpci.c
+new file mode 100644
+index 0000000..2857ab8
+--- /dev/null
++++ b/drivers/xen/pciback/vpci.c
+@@ -0,0 +1,244 @@
++/*
++ * PCI Backend - Provides a Virtual PCI bus (with real devices)
++ * to the frontend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++
++#include <linux/list.h>
++#include <linux/slab.h>
++#include <linux/pci.h>
++#include <linux/spinlock.h>
++#include "pciback.h"
++
++#define PCI_SLOT_MAX 32
++
++struct vpci_dev_data {
++ /* Access to dev_list must be protected by lock */
++ struct list_head dev_list[PCI_SLOT_MAX];
++ spinlock_t lock;
++};
++
++static inline struct list_head *list_first(struct list_head *head)
++{
++ return head->next;
++}
++
++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn)
++{
++ struct pci_dev_entry *entry;
++ struct pci_dev *dev = NULL;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
++
++ if (domain != 0 || bus != 0)
++ return NULL;
++
++ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) {
++ spin_lock_irqsave(&vpci_dev->lock, flags);
++
++ list_for_each_entry(entry,
++ &vpci_dev->dev_list[PCI_SLOT(devfn)],
++ list) {
++ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) {
++ dev = entry->dev;
++ break;
++ }
++ }
++
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ }
++ return dev;
++}
++
++static inline int match_slot(struct pci_dev *l, struct pci_dev *r)
++{
++ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus)
++ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn))
++ return 1;
++
++ return 0;
++}
++
++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev,
++ int devid, publish_pci_dev_cb publish_cb)
++{
++ int err = 0, slot, func = -1;
++ struct pci_dev_entry *t, *dev_entry;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
++
++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
++ err = -EFAULT;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Can't export bridges on the virtual PCI bus");
++ goto out;
++ }
++
++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL);
++ if (!dev_entry) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error adding entry to virtual PCI bus");
++ goto out;
++ }
++
++ dev_entry->dev = dev;
++
++ spin_lock_irqsave(&vpci_dev->lock, flags);
++
++ /* Keep multi-function devices together on the virtual PCI bus */
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (!list_empty(&vpci_dev->dev_list[slot])) {
++ t = list_entry(list_first(&vpci_dev->dev_list[slot]),
++ struct pci_dev_entry, list);
++
++ if (match_slot(dev, t->dev)) {
++ pr_info("pciback: vpci: %s: "
++ "assign to virtual slot %d func %d\n",
++ pci_name(dev), slot,
++ PCI_FUNC(dev->devfn));
++ list_add_tail(&dev_entry->list,
++ &vpci_dev->dev_list[slot]);
++ func = PCI_FUNC(dev->devfn);
++ goto unlock;
++ }
++ }
++ }
++
++ /* Assign to a new slot on the virtual PCI bus */
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ if (list_empty(&vpci_dev->dev_list[slot])) {
++ printk(KERN_INFO
++ "pciback: vpci: %s: assign to virtual slot %d\n",
++ pci_name(dev), slot);
++ list_add_tail(&dev_entry->list,
++ &vpci_dev->dev_list[slot]);
++ func = PCI_FUNC(dev->devfn);
++ goto unlock;
++ }
++ }
++
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "No more space on root virtual PCI bus");
++
++unlock:
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++
++ /* Publish this device. */
++ if (!err)
++ err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid);
++
++out:
++ return err;
++}
++
++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ struct pci_dev *found_dev = NULL;
++ unsigned long flags;
++
++ spin_lock_irqsave(&vpci_dev->lock, flags);
++
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ struct pci_dev_entry *e, *tmp;
++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
++ list) {
++ if (e->dev == dev) {
++ list_del(&e->list);
++ found_dev = e->dev;
++ kfree(e);
++ goto out;
++ }
++ }
++ }
++
++out:
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++
++ if (found_dev)
++ pcistub_put_pci_dev(found_dev);
++}
++
++int pciback_init_devices(struct pciback_device *pdev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev;
++
++ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL);
++ if (!vpci_dev)
++ return -ENOMEM;
++
++ spin_lock_init(&vpci_dev->lock);
++
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++)
++ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]);
++
++ pdev->pci_dev_data = vpci_dev;
++
++ return 0;
++}
++
++int pciback_publish_pci_roots(struct pciback_device *pdev,
++ publish_pci_root_cb publish_cb)
++{
++ /* The Virtual PCI bus has only one root */
++ return publish_cb(pdev, 0, 0);
++}
++
++void pciback_release_devices(struct pciback_device *pdev)
++{
++ int slot;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ struct pci_dev_entry *e, *tmp;
++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot],
++ list) {
++ list_del(&e->list);
++ pcistub_put_pci_dev(e->dev);
++ kfree(e);
++ }
++ }
++
++ kfree(vpci_dev);
++ pdev->pci_dev_data = NULL;
++}
++
++int pciback_get_pcifront_dev(struct pci_dev *pcidev,
++ struct pciback_device *pdev,
++ unsigned int *domain, unsigned int *bus,
++ unsigned int *devfn)
++{
++ struct pci_dev_entry *entry;
++ struct pci_dev *dev = NULL;
++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
++ unsigned long flags;
++ int found = 0, slot;
++
++ spin_lock_irqsave(&vpci_dev->lock, flags);
++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
++ list_for_each_entry(entry,
++ &vpci_dev->dev_list[slot],
++ list) {
++ dev = entry->dev;
++ if (dev && dev->bus->number == pcidev->bus->number
++ && pci_domain_nr(dev->bus) ==
++ pci_domain_nr(pcidev->bus)
++ && dev->devfn == pcidev->devfn) {
++ found = 1;
++ *domain = 0;
++ *bus = 0;
++ *devfn = PCI_DEVFN(slot,
++ PCI_FUNC(pcidev->devfn));
++ }
++ }
++ }
++ spin_unlock_irqrestore(&vpci_dev->lock, flags);
++ return found;
++}
+diff --git a/drivers/xen/pciback/xenbus.c b/drivers/xen/pciback/xenbus.c
+new file mode 100644
+index 0000000..f0d5426
+--- /dev/null
++++ b/drivers/xen/pciback/xenbus.c
+@@ -0,0 +1,730 @@
++/*
++ * PCI Backend Xenbus Setup - handles setup with frontend and xend
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/vmalloc.h>
++#include <linux/workqueue.h>
++#include <xen/xenbus.h>
++#include <xen/events.h>
++#include <asm/xen/pci.h>
++#include <linux/workqueue.h>
++#include "pciback.h"
++
++#define INVALID_EVTCHN_IRQ (-1)
++struct workqueue_struct *pciback_wq;
++
++static struct pciback_device *alloc_pdev(struct xenbus_device *xdev)
++{
++ struct pciback_device *pdev;
++
++ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL);
++ if (pdev == NULL)
++ goto out;
++ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev);
++
++ pdev->xdev = xdev;
++ dev_set_drvdata(&xdev->dev, pdev);
++
++ spin_lock_init(&pdev->dev_lock);
++
++ pdev->sh_info = NULL;
++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++ pdev->be_watching = 0;
++
++ INIT_WORK(&pdev->op_work, pciback_do_op);
++
++ if (pciback_init_devices(pdev)) {
++ kfree(pdev);
++ pdev = NULL;
++ }
++out:
++ return pdev;
++}
++
++static void pciback_disconnect(struct pciback_device *pdev)
++{
++ spin_lock(&pdev->dev_lock);
++
++ /* Ensure the guest can't trigger our handler before removing devices */
++ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) {
++ unbind_from_irqhandler(pdev->evtchn_irq, pdev);
++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
++ }
++ spin_unlock(&pdev->dev_lock);
++
++ /* If the driver domain started an op, make sure we complete it
++ * before releasing the shared memory */
++
++ /* Note: the workqueue does not use spinlocks at all. */
++ flush_workqueue(pciback_wq);
++
++ spin_lock(&pdev->dev_lock);
++ if (pdev->sh_info != NULL) {
++ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
++ pdev->sh_info = NULL;
++ }
++ spin_unlock(&pdev->dev_lock);
++
++}
++
++static void free_pdev(struct pciback_device *pdev)
++{
++ spin_lock(&pdev->dev_lock);
++ if (pdev->be_watching) {
++ unregister_xenbus_watch(&pdev->be_watch);
++ pdev->be_watching = 0;
++ }
++ spin_unlock(&pdev->dev_lock);
++
++ pciback_disconnect(pdev);
++
++ pciback_release_devices(pdev);
++
++ dev_set_drvdata(&pdev->xdev->dev, NULL);
++ pdev->xdev = NULL;
++
++ kfree(pdev);
++}
++
++static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref,
++ int remote_evtchn)
++{
++ int err = 0;
++ void *vaddr;
++
++ dev_dbg(&pdev->xdev->dev,
++ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
++ gnt_ref, remote_evtchn);
++
++ err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error mapping other domain page in ours.");
++ goto out;
++ }
++
++ spin_lock(&pdev->dev_lock);
++ pdev->sh_info = vaddr;
++ spin_unlock(&pdev->dev_lock);
++
++ err = bind_interdomain_evtchn_to_irqhandler(
++ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
++ 0, "pciback", pdev);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error binding event channel to IRQ");
++ goto out;
++ }
++
++ spin_lock(&pdev->dev_lock);
++ pdev->evtchn_irq = err;
++ spin_unlock(&pdev->dev_lock);
++ err = 0;
++
++ dev_dbg(&pdev->xdev->dev, "Attached!\n");
++out:
++ return err;
++}
++
++static int pciback_attach(struct pciback_device *pdev)
++{
++ int err = 0;
++ int gnt_ref, remote_evtchn;
++ char *magic = NULL;
++
++
++ /* Make sure we only do this setup once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitialised)
++ goto out;
++
++ /* Wait for frontend to state that it has published the configuration */
++ if (xenbus_read_driver_state(pdev->xdev->otherend) !=
++ XenbusStateInitialised)
++ goto out;
++
++ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n");
++
++ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend,
++ "pci-op-ref", "%u", &gnt_ref,
++ "event-channel", "%u", &remote_evtchn,
++ "magic", NULL, &magic, NULL);
++ if (err) {
++ /* If configuration didn't get read correctly, wait longer */
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading configuration from frontend");
++ goto out;
++ }
++
++ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) {
++ xenbus_dev_fatal(pdev->xdev, -EFAULT,
++ "version mismatch (%s/%s) with pcifront - "
++ "halting pciback",
++ magic, XEN_PCI_MAGIC);
++ goto out;
++ }
++
++ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn);
++ if (err)
++ goto out;
++
++ dev_dbg(&pdev->xdev->dev, "Connecting...\n");
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
++ if (err)
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to connected state!");
++
++ dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err);
++out:
++
++ kfree(magic);
++
++ return err;
++}
++
++static int pciback_publish_pci_dev(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus,
++ unsigned int devfn, unsigned int devid)
++{
++ int err;
++ int len;
++ char str[64];
++
++ len = snprintf(str, sizeof(str), "vdev-%d", devid);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
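++ /* Record the device in the backend's xenstore directory as
++ * vdev-<devid> = "domain:bus:slot.func".
++ */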
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%04x:%02x:%02x.%02x", domain, bus,
++ PCI_SLOT(devfn), PCI_FUNC(devfn));
++
++out:
++ return err;
++}
++
++static int pciback_export_device(struct pciback_device *pdev,
++ int domain, int bus, int slot, int func,
++ int devid)
++{
++ struct pci_dev *dev;
++ int err = 0;
++
++ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n",
++ domain, bus, slot, func);
++
++ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func);
++ if (!dev) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Couldn't locate PCI device "
++ "(%04x:%02x:%02x.%01x)! "
++ "perhaps already in-use?",
++ domain, bus, slot, func);
++ goto out;
++ }
++
++ err = pciback_add_pci_dev(pdev, dev, devid, pciback_publish_pci_dev);
++ if (err)
++ goto out;
++
++ dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
++ if (xen_register_device_domain_owner(dev,
++ pdev->xdev->otherend_id) != 0) {
++ dev_err(&dev->dev, "device has been assigned to another " \
++ "domain! Over-writting the ownership, but beware.\n");
++ xen_unregister_device_domain_owner(dev);
++ xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
++ }
++
++ /* TODO: It'd be nice to export a bridge and have all of its children
++ * get exported with it. This may be best done in xend (which will
++ * have to calculate resource usage anyway) but we probably want to
++ * put something in here to ensure that if a bridge gets given to a
++ * driver domain, that all devices under that bridge are not given
++ * to other driver domains (as he who controls the bridge can disable
++ * it and stop the other devices from working).
++ */
++out:
++ return err;
++}
++
++static int pciback_remove_device(struct pciback_device *pdev,
++ int domain, int bus, int slot, int func)
++{
++ int err = 0;
++ struct pci_dev *dev;
++
++ dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n",
++ domain, bus, slot, func);
++
++ dev = pciback_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func));
++ if (!dev) {
++ err = -EINVAL;
++ dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device "
++ "(%04x:%02x:%02x.%01x)! not owned by this domain\n",
++ domain, bus, slot, func);
++ goto out;
++ }
++
++ dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
++ xen_unregister_device_domain_owner(dev);
++
++ pciback_release_pci_dev(pdev, dev);
++
++out:
++ return err;
++}
++
++static int pciback_publish_pci_root(struct pciback_device *pdev,
++ unsigned int domain, unsigned int bus)
++{
++ unsigned int d, b;
++ int i, root_num, len, err;
++ char str[64];
++
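++ /* Roots are recorded in xenstore as root-<n> = "domain:bus", with
++ * root_num holding the count; a root that is already listed is skipped.
++ */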
++ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n");
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", &root_num);
++ if (err == 0 || err == -ENOENT)
++ root_num = 0;
++ else if (err < 0)
++ goto out;
++
++ /* Verify that we haven't already published this pci root */
++ for (i = 0; i < root_num; i++) {
++ len = snprintf(str, sizeof(str), "root-%d", i);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ str, "%x:%x", &d, &b);
++ if (err < 0)
++ goto out;
++ if (err != 2) {
++ err = -EINVAL;
++ goto out;
++ }
++
++ if (d == domain && b == bus) {
++ err = 0;
++ goto out;
++ }
++ }
++
++ len = snprintf(str, sizeof(str), "root-%d", root_num);
++ if (unlikely(len >= (sizeof(str) - 1))) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n",
++ root_num, domain, bus);
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
++ "%04x:%02x", domain, bus);
++ if (err)
++ goto out;
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++ "root_num", "%d", (root_num + 1));
++
++out:
++ return err;
++}
++
++static int pciback_reconfigure(struct pciback_device *pdev)
++{
++ int err = 0;
++ int num_devs;
++ int domain, bus, slot, func;
++ int substate;
++ int i, len;
++ char state_str[64];
++ char dev_str[64];
++
++
++ dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
++
++ /* Make sure we only reconfigure once */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateReconfiguring)
++ goto out;
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of devices");
++ goto out;
++ }
++
++ for (i = 0; i < num_devs; i++) {
++ len = snprintf(state_str, sizeof(state_str), "state-%d", i);
++ if (unlikely(len >= (sizeof(state_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str,
++ "%d", &substate);
++ if (err != 1)
++ substate = XenbusStateUnknown;
++
++ switch (substate) {
++ case XenbusStateInitialising:
++ dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i);
++
++ len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++ if (unlikely(len >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while "
++ "reading configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ dev_str, "%x:%x:%x.%x",
++ &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device "
++ "configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
++
++ err = pciback_export_device(pdev, domain, bus, slot,
++ func, i);
++ if (err)
++ goto out;
++
++ /* Publish pci roots. */
++ err = pciback_publish_pci_roots(pdev,
++ pciback_publish_pci_root);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error while publish PCI root"
++ "buses for frontend");
++ goto out;
++ }
++
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename,
++ state_str, "%d",
++ XenbusStateInitialised);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching substate of "
++ "dev-%d\n", i);
++ goto out;
++ }
++ break;
++
++ case XenbusStateClosing:
++ dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i);
++
++ len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i);
++ if (unlikely(len >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while "
++ "reading configuration");
++ goto out;
++ }
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
++ dev_str, "%x:%x:%x.%x",
++ &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device "
++ "configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
++
++ err = pciback_remove_device(pdev, domain, bus, slot,
++ func);
++ if (err)
++ goto out;
++
++ /* TODO: If at some point we implement support for pci
++ * root hot-remove on pcifront side, we'll need to
++ * remove unnecessary xenstore nodes of pci roots here.
++ */
++
++ break;
++
++ default:
++ break;
++ }
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to reconfigured state!");
++ goto out;
++ }
++
++out:
++ return 0;
++}
++
++static void pciback_frontend_changed(struct xenbus_device *xdev,
++ enum xenbus_state fe_state)
++{
++ struct pciback_device *pdev = dev_get_drvdata(&xdev->dev);
++
++ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state);
++
++ switch (fe_state) {
++ case XenbusStateInitialised:
++ pciback_attach(pdev);
++ break;
++
++ case XenbusStateReconfiguring:
++ pciback_reconfigure(pdev);
++ break;
++
++ case XenbusStateConnected:
++ /* pcifront switched its state from reconfiguring to connected,
++ * so switch to the connected state as well.
++ */
++ xenbus_switch_state(xdev, XenbusStateConnected);
++ break;
++
++ case XenbusStateClosing:
++ pciback_disconnect(pdev);
++ xenbus_switch_state(xdev, XenbusStateClosing);
++ break;
++
++ case XenbusStateClosed:
++ pciback_disconnect(pdev);
++ xenbus_switch_state(xdev, XenbusStateClosed);
++ if (xenbus_dev_is_online(xdev))
++ break;
++ /* fall through if not online */
++ case XenbusStateUnknown:
++ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n");
++ device_unregister(&xdev->dev);
++ break;
++
++ default:
++ break;
++ }
++}
++
++static int pciback_setup_backend(struct pciback_device *pdev)
++{
++ /* Get configuration from xend (if available now) */
++ int domain, bus, slot, func;
++ int err = 0;
++ int i, num_devs;
++ char dev_str[64];
++ char state_str[64];
++
++ /* It's possible we could get the call to setup twice, so make sure
++ * we're not already connected.
++ */
++ if (xenbus_read_driver_state(pdev->xdev->nodename) !=
++ XenbusStateInitWait)
++ goto out;
++
++ dev_dbg(&pdev->xdev->dev, "getting be setup\n");
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
++ &num_devs);
++ if (err != 1) {
++ if (err >= 0)
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading number of devices");
++ goto out;
++ }
++
++ for (i = 0; i < num_devs; i++) {
++ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i);
++ if (unlikely(l >= (sizeof(dev_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
++
++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str,
++ "%x:%x:%x.%x", &domain, &bus, &slot, &func);
++ if (err < 0) {
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error reading device configuration");
++ goto out;
++ }
++ if (err != 4) {
++ err = -EINVAL;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error parsing pci device "
++ "configuration");
++ goto out;
++ }
++
++ err = pciback_export_device(pdev, domain, bus, slot, func, i);
++ if (err)
++ goto out;
++
++ /* Switch substate of this device. */
++ l = snprintf(state_str, sizeof(state_str), "state-%d", i);
++ if (unlikely(l >= (sizeof(state_str) - 1))) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(pdev->xdev, err,
++ "String overflow while reading "
++ "configuration");
++ goto out;
++ }
++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str,
++ "%d", XenbusStateInitialised);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err, "Error switching "
++ "substate of dev-%d\n", i);
++ goto out;
++ }
++ }
++
++ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root);
++ if (err) {
++ xenbus_dev_fatal(pdev->xdev, err,
++				 "Error while publishing PCI root buses "
++ "for frontend");
++ goto out;
++ }
++
++ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
++ if (err)
++ xenbus_dev_fatal(pdev->xdev, err,
++ "Error switching to initialised state!");
++
++out:
++ if (!err)
++ /* see if pcifront is already configured (if not, we'll wait) */
++ pciback_attach(pdev);
++
++ return err;
++}
++
++static void pciback_be_watch(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ struct pciback_device *pdev =
++ container_of(watch, struct pciback_device, be_watch);
++
++ switch (xenbus_read_driver_state(pdev->xdev->nodename)) {
++ case XenbusStateInitWait:
++ pciback_setup_backend(pdev);
++ break;
++
++ default:
++ break;
++ }
++}
++
++static int pciback_xenbus_probe(struct xenbus_device *dev,
++ const struct xenbus_device_id *id)
++{
++ int err = 0;
++ struct pciback_device *pdev = alloc_pdev(dev);
++
++ if (pdev == NULL) {
++ err = -ENOMEM;
++ xenbus_dev_fatal(dev, err,
++ "Error allocating pciback_device struct");
++ goto out;
++ }
++
++ /* wait for xend to configure us */
++ err = xenbus_switch_state(dev, XenbusStateInitWait);
++ if (err)
++ goto out;
++
++ /* watch the backend node for backend configuration information */
++ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
++ pciback_be_watch);
++ if (err)
++ goto out;
++
++ spin_lock(&pdev->dev_lock);
++ pdev->be_watching = 1;
++ spin_unlock(&pdev->dev_lock);
++
++ /* We need to force a call to our callback here in case
++ * xend already configured us!
++ */
++ pciback_be_watch(&pdev->be_watch, NULL, 0);
++
++out:
++ return err;
++}
++
++static int pciback_xenbus_remove(struct xenbus_device *dev)
++{
++ struct pciback_device *pdev = dev_get_drvdata(&dev->dev);
++
++ if (pdev != NULL)
++ free_pdev(pdev);
++
++ return 0;
++}
++
++static const struct xenbus_device_id xenpci_ids[] = {
++ {"pci"},
++ {""},
++};
++
++static struct xenbus_driver xenbus_pciback_driver = {
++ .name = "pciback",
++ .owner = THIS_MODULE,
++ .ids = xenpci_ids,
++ .probe = pciback_xenbus_probe,
++ .remove = pciback_xenbus_remove,
++ .otherend_changed = pciback_frontend_changed,
++};
++
++int __init pciback_xenbus_register(void)
++{
++ pciback_wq = create_workqueue("pciback_workqueue");
++ if (!pciback_wq) {
++		printk(KERN_ERR "%s: create "
++				"pciback_workqueue failed\n", __FUNCTION__);
++ return -EFAULT;
++ }
++ return xenbus_register_backend(&xenbus_pciback_driver);
++}
++
++void __exit pciback_xenbus_unregister(void)
++{
++ destroy_workqueue(pciback_wq);
++ xenbus_unregister_driver(&xenbus_pciback_driver);
++}
+diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c
+new file mode 100644
+index 0000000..6d1a770
+--- /dev/null
++++ b/drivers/xen/pcpu.c
+@@ -0,0 +1,452 @@
++/*
++ * pcpu.c - manage physical cpus in the dom0 environment
++ */
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <linux/cpu.h>
++#include <xen/xenbus.h>
++#include <xen/pcpu.h>
++#include <xen/events.h>
++#include <xen/acpi.h>
++
++static struct sysdev_class xen_pcpu_sysdev_class = {
++ .name = "xen_pcpu",
++};
++
++static DEFINE_MUTEX(xen_pcpu_lock);
++static RAW_NOTIFIER_HEAD(xen_pcpu_chain);
++
++/* No need to disable irqs since hotplug notification runs in workqueue context */
++#define get_pcpu_lock() mutex_lock(&xen_pcpu_lock);
++#define put_pcpu_lock() mutex_unlock(&xen_pcpu_lock);
++
++struct xen_pcpus {
++ struct list_head list;
++ int present;
++};
++static struct xen_pcpus xen_pcpus;
++
++int register_xen_pcpu_notifier(struct notifier_block *nb)
++{
++ int ret;
++
++	/* All references to the notifier chain are protected by the pcpu lock */
++ get_pcpu_lock();
++ ret = raw_notifier_chain_register(&xen_pcpu_chain, nb);
++ put_pcpu_lock();
++ return ret;
++}
++EXPORT_SYMBOL_GPL(register_xen_pcpu_notifier);
++
++void unregister_xen_pcpu_notifier(struct notifier_block *nb)
++{
++ get_pcpu_lock();
++ raw_notifier_chain_unregister(&xen_pcpu_chain, nb);
++ put_pcpu_lock();
++}
++EXPORT_SYMBOL_GPL(unregister_xen_pcpu_notifier);
++
++static int xen_pcpu_down(uint32_t xen_id)
++{
++ int ret;
++ xen_platform_op_t op = {
++ .cmd = XENPF_cpu_offline,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.cpu_ol.cpuid = xen_id,
++ };
++
++ ret = HYPERVISOR_dom0_op(&op);
++ return ret;
++}
++
++static int xen_pcpu_up(uint32_t xen_id)
++{
++ int ret;
++ xen_platform_op_t op = {
++ .cmd = XENPF_cpu_online,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ .u.cpu_ol.cpuid = xen_id,
++ };
++
++ ret = HYPERVISOR_dom0_op(&op);
++ return ret;
++}
++
++static ssize_t show_online(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ char *buf)
++{
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++ return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE));
++}
++
++static ssize_t __ref store_online(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ const char *buf, size_t count)
++{
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++ ssize_t ret;
++
++ switch (buf[0]) {
++ case '0':
++ ret = xen_pcpu_down(cpu->xen_id);
++ break;
++ case '1':
++ ret = xen_pcpu_up(cpu->xen_id);
++ break;
++ default:
++ ret = -EINVAL;
++ }
++
++ if (ret >= 0)
++ ret = count;
++ return ret;
++}
++
++static SYSDEV_ATTR(online, 0644, show_online, store_online);
++
++static ssize_t show_apicid(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ char *buf)
++{
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++ return sprintf(buf, "%u\n", cpu->apic_id);
++}
++
++static ssize_t show_acpiid(struct sys_device *dev,
++ struct sysdev_attribute *attr,
++ char *buf)
++{
++ struct pcpu *cpu = container_of(dev, struct pcpu, sysdev);
++
++ return sprintf(buf, "%u\n", cpu->acpi_id);
++}
++static SYSDEV_ATTR(apic_id, 0444, show_apicid, NULL);
++static SYSDEV_ATTR(acpi_id, 0444, show_acpiid, NULL);
++
++static int xen_pcpu_free(struct pcpu *pcpu)
++{
++ if (!pcpu)
++ return 0;
++
++ sysdev_remove_file(&pcpu->sysdev, &attr_online);
++ sysdev_unregister(&pcpu->sysdev);
++ list_del(&pcpu->pcpu_list);
++ kfree(pcpu);
++
++ return 0;
++}
++
++static inline int same_pcpu(struct xenpf_pcpuinfo *info,
++ struct pcpu *pcpu)
++{
++ return (pcpu->apic_id == info->apic_id) &&
++ (pcpu->xen_id == info->xen_cpuid);
++}
++
++/*
++ * Return 1 if online status changed
++ */
++static int xen_pcpu_online_check(struct xenpf_pcpuinfo *info,
++ struct pcpu *pcpu)
++{
++ int result = 0;
++
++ if (info->xen_cpuid != pcpu->xen_id)
++ return 0;
++
++ if (xen_pcpu_online(info->flags) && !xen_pcpu_online(pcpu->flags)) {
++ /* the pcpu is onlined */
++ pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
++ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_ONLINE);
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_ONLINE, (void *)(long)pcpu->xen_id);
++ result = 1;
++ } else if (!xen_pcpu_online(info->flags) &&
++ xen_pcpu_online(pcpu->flags)) {
++ /* The pcpu is offlined now */
++ pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
++ kobject_uevent(&pcpu->sysdev.kobj, KOBJ_OFFLINE);
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_OFFLINE, (void *)(long)pcpu->xen_id);
++ result = 1;
++ }
++
++ return result;
++}
++
++static int pcpu_sysdev_init(struct pcpu *cpu)
++{
++ int error;
++
++ error = sysdev_register(&cpu->sysdev);
++ if (error) {
++ printk(KERN_WARNING "xen_pcpu_add: Failed to register pcpu\n");
++ kfree(cpu);
++ return -1;
++ }
++ sysdev_create_file(&cpu->sysdev, &attr_online);
++ sysdev_create_file(&cpu->sysdev, &attr_apic_id);
++ sysdev_create_file(&cpu->sysdev, &attr_acpi_id);
++ return 0;
++}
++
++static struct pcpu *get_pcpu(int xen_id)
++{
++ struct pcpu *pcpu = NULL;
++
++ list_for_each_entry(pcpu, &xen_pcpus.list, pcpu_list) {
++ if (pcpu->xen_id == xen_id)
++ return pcpu;
++ }
++ return NULL;
++}
++
++static struct pcpu *init_pcpu(struct xenpf_pcpuinfo *info)
++{
++ struct pcpu *pcpu;
++
++ if (info->flags & XEN_PCPU_FLAGS_INVALID)
++ return NULL;
++
++ /* The PCPU is just added */
++ pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
++ if (!pcpu)
++ return NULL;
++
++ INIT_LIST_HEAD(&pcpu->pcpu_list);
++ pcpu->xen_id = info->xen_cpuid;
++ pcpu->apic_id = info->apic_id;
++ pcpu->acpi_id = info->acpi_id;
++ pcpu->flags = info->flags;
++
++ pcpu->sysdev.cls = &xen_pcpu_sysdev_class;
++ pcpu->sysdev.id = info->xen_cpuid;
++
++ if (pcpu_sysdev_init(pcpu)) {
++ kfree(pcpu);
++ return NULL;
++ }
++
++ list_add_tail(&pcpu->pcpu_list, &xen_pcpus.list);
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_ADD,
++ (void *)(long)pcpu->xen_id);
++ return pcpu;
++}
++
++#define PCPU_NO_CHANGE 0
++#define PCPU_ADDED 1
++#define PCPU_ONLINE_OFFLINE 2
++#define PCPU_REMOVED 3
++/*
++ * Caller should hold the pcpu lock; *result is set as follows:
++ * < 0: Something went wrong
++ *   0: No changes
++ * > 0: State changed
++ */
++static struct pcpu *_sync_pcpu(int cpu_num, int *max_id, int *result)
++{
++ struct pcpu *pcpu = NULL;
++ struct xenpf_pcpuinfo *info;
++ xen_platform_op_t op = {
++ .cmd = XENPF_get_cpuinfo,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ };
++ int ret;
++
++ *result = -1;
++
++ info = &op.u.pcpu_info;
++ info->xen_cpuid = cpu_num;
++
++ ret = HYPERVISOR_dom0_op(&op);
++ if (ret)
++ return NULL;
++
++ if (max_id)
++ *max_id = op.u.pcpu_info.max_present;
++
++ pcpu = get_pcpu(cpu_num);
++
++ if (info->flags & XEN_PCPU_FLAGS_INVALID) {
++ /* The pcpu has been removed */
++ *result = PCPU_NO_CHANGE;
++ if (pcpu) {
++ raw_notifier_call_chain(&xen_pcpu_chain,
++ XEN_PCPU_REMOVE,
++ (void *)(long)pcpu->xen_id);
++ xen_pcpu_free(pcpu);
++ *result = PCPU_REMOVED;
++ }
++ return NULL;
++ }
++
++
++ if (!pcpu) {
++ *result = PCPU_ADDED;
++ pcpu = init_pcpu(info);
++ if (pcpu == NULL) {
++ printk(KERN_WARNING "Failed to init pcpu %x\n",
++ info->xen_cpuid);
++ *result = -1;
++ }
++ } else {
++ *result = PCPU_NO_CHANGE;
++ /*
++		 * The old pcpu has been replaced with a new one, which means
++		 * several virqs were missed. Can this happen?
++ */
++ if (!same_pcpu(info, pcpu)) {
++ printk(KERN_WARNING "Pcpu %x changed!\n",
++ pcpu->xen_id);
++ pcpu->apic_id = info->apic_id;
++ pcpu->acpi_id = info->acpi_id;
++ }
++ if (xen_pcpu_online_check(info, pcpu))
++ *result = PCPU_ONLINE_OFFLINE;
++ }
++ return pcpu;
++}
++
++int xen_pcpu_index(uint32_t id, int is_acpiid)
++{
++ int cpu_num = 0, max_id = 0, ret;
++ xen_platform_op_t op = {
++ .cmd = XENPF_get_cpuinfo,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ };
++ struct xenpf_pcpuinfo *info = &op.u.pcpu_info;
++
++ info->xen_cpuid = 0;
++ ret = HYPERVISOR_dom0_op(&op);
++ if (ret)
++ return -1;
++ max_id = op.u.pcpu_info.max_present;
++
++ while ((cpu_num <= max_id)) {
++ info->xen_cpuid = cpu_num;
++ ret = HYPERVISOR_dom0_op(&op);
++ if (ret)
++ continue;
++
++ if (op.u.pcpu_info.max_present > max_id)
++ max_id = op.u.pcpu_info.max_present;
++ if (id == (is_acpiid ? info->acpi_id : info->apic_id))
++ return cpu_num;
++ cpu_num++;
++ }
++
++ return -1;
++}
++EXPORT_SYMBOL(xen_pcpu_index);
++
++/*
++ * Sync dom0's pcpu information with xen hypervisor's
++ */
++static int xen_sync_pcpus(void)
++{
++ /*
++	 * The boot cpu always has cpu_id 0 in Xen
++ */
++ int cpu_num = 0, max_id = 0, result = 0, present = 0;
++ struct list_head *elem, *tmp;
++ struct pcpu *pcpu;
++
++ get_pcpu_lock();
++
++ while ((result >= 0) && (cpu_num <= max_id)) {
++ pcpu = _sync_pcpu(cpu_num, &max_id, &result);
++
++		printk(KERN_DEBUG "sync cpu %x: result %x, max_id %x\n",
++ cpu_num, result, max_id);
++
++ switch (result) {
++ case PCPU_NO_CHANGE:
++ if (pcpu)
++ present++;
++ break;
++ case PCPU_ADDED:
++ case PCPU_ONLINE_OFFLINE:
++ present++;
++ case PCPU_REMOVED:
++ break;
++ default:
++ printk(KERN_WARNING "Failed to sync pcpu %x\n",
++ cpu_num);
++ break;
++
++ }
++ cpu_num++;
++ }
++
++ if (result < 0) {
++ list_for_each_safe(elem, tmp, &xen_pcpus.list) {
++ pcpu = list_entry(elem, struct pcpu, pcpu_list);
++ xen_pcpu_free(pcpu);
++ }
++ present = 0;
++ }
++
++ xen_pcpus.present = present;
++
++ put_pcpu_lock();
++
++ return 0;
++}
++
++static void xen_pcpu_dpc(struct work_struct *work)
++{
++ if (xen_sync_pcpus() < 0)
++ printk(KERN_WARNING
++ "xen_pcpu_dpc: Failed to sync pcpu information\n");
++}
++static DECLARE_WORK(xen_pcpu_work, xen_pcpu_dpc);
++
++int xen_pcpu_hotplug(int type, uint32_t apic_id)
++{
++ schedule_work(&xen_pcpu_work);
++
++ return 0;
++}
++EXPORT_SYMBOL(xen_pcpu_hotplug);
++
++static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
++{
++ schedule_work(&xen_pcpu_work);
++ return IRQ_HANDLED;
++}
++
++static int __init xen_pcpu_init(void)
++{
++ int err;
++
++ if (!xen_initial_domain())
++ return 0;
++
++ err = sysdev_class_register(&xen_pcpu_sysdev_class);
++ if (err) {
++ printk(KERN_WARNING
++			"xen_pcpu_init: failed to register xen_pcpu sysdev class\n");
++ return err;
++ }
++
++ INIT_LIST_HEAD(&xen_pcpus.list);
++ xen_pcpus.present = 0;
++
++ xen_sync_pcpus();
++ if (xen_pcpus.present > 0)
++ err = bind_virq_to_irqhandler(VIRQ_PCPU_STATE,
++ 0, xen_pcpu_interrupt, 0, "pcpu", NULL);
++ if (err < 0)
++ printk(KERN_WARNING "xen_pcpu_init: "
++ "Failed to bind pcpu_state virq\n"
++			"You will lose the latest information!\n");
++ return err;
++}
++
++arch_initcall(xen_pcpu_init);
+diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
+new file mode 100644
+index 0000000..c01b5dd
+--- /dev/null
++++ b/drivers/xen/platform-pci.c
+@@ -0,0 +1,207 @@
++/******************************************************************************
++ * platform-pci.c
++ *
++ * Xen platform PCI device driver
++ * Copyright (c) 2005, Intel Corporation.
++ * Copyright (c) 2007, XenSource Inc.
++ * Copyright (c) 2010, Citrix
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms and conditions of the GNU General Public License,
++ * version 2, as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
++ * more details.
++ *
++ * You should have received a copy of the GNU General Public License along with
++ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++ * Place - Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++
++#include <linux/interrupt.h>
++#include <linux/io.h>
++#include <linux/module.h>
++#include <linux/pci.h>
++
++#include <xen/platform_pci.h>
++#include <xen/grant_table.h>
++#include <xen/xenbus.h>
++#include <xen/events.h>
++#include <xen/hvm.h>
++#include <xen/xen-ops.h>
++
++#define DRV_NAME "xen-platform-pci"
++
++MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com");
++MODULE_DESCRIPTION("Xen platform PCI device");
++MODULE_LICENSE("GPL");
++
++static unsigned long platform_mmio;
++static unsigned long platform_mmio_alloc;
++static unsigned long platform_mmiolen;
++static uint64_t callback_via;
++
++unsigned long alloc_xen_mmio(unsigned long len)
++{
++ unsigned long addr;
++
++ addr = platform_mmio + platform_mmio_alloc;
++ platform_mmio_alloc += len;
++ BUG_ON(platform_mmio_alloc > platform_mmiolen);
++
++ return addr;
++}
++
++static uint64_t get_callback_via(struct pci_dev *pdev)
++{
++ u8 pin;
++ int irq;
++
++ irq = pdev->irq;
++ if (irq < 16)
++ return irq; /* ISA IRQ */
++
++ pin = pdev->pin;
++
++ /* We don't know the GSI. Specify the PCI INTx line instead. */
++ return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */
++ ((uint64_t)pci_domain_nr(pdev->bus) << 32) |
++ ((uint64_t)pdev->bus->number << 16) |
++ ((uint64_t)(pdev->devfn & 0xff) << 8) |
++ ((uint64_t)(pin - 1) & 3);
++}
++
++static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
++{
++ xen_hvm_evtchn_do_upcall();
++ return IRQ_HANDLED;
++}
++
++static int xen_allocate_irq(struct pci_dev *pdev)
++{
++ return request_irq(pdev->irq, do_hvm_evtchn_intr,
++ IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
++ "xen-platform-pci", pdev);
++}
++
++static int platform_pci_resume(struct pci_dev *pdev)
++{
++ int err;
++ if (xen_have_vector_callback)
++ return 0;
++ err = xen_set_callback_via(callback_via);
++ if (err) {
++ dev_err(&pdev->dev, "platform_pci_resume failure!\n");
++ return err;
++ }
++ return 0;
++}
++
++static int __devinit platform_pci_init(struct pci_dev *pdev,
++ const struct pci_device_id *ent)
++{
++ int i, ret;
++ long ioaddr, iolen;
++ long mmio_addr, mmio_len;
++ unsigned int max_nr_gframes;
++
++ i = pci_enable_device(pdev);
++ if (i)
++ return i;
++
++ ioaddr = pci_resource_start(pdev, 0);
++ iolen = pci_resource_len(pdev, 0);
++
++ mmio_addr = pci_resource_start(pdev, 1);
++ mmio_len = pci_resource_len(pdev, 1);
++
++ if (mmio_addr == 0 || ioaddr == 0) {
++ dev_err(&pdev->dev, "no resources found\n");
++ ret = -ENOENT;
++ goto pci_out;
++ }
++
++ if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) {
++ dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n",
++ mmio_addr, mmio_len);
++ ret = -EBUSY;
++ goto pci_out;
++ }
++
++ if (request_region(ioaddr, iolen, DRV_NAME) == NULL) {
++ dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n",
++ iolen, ioaddr);
++ ret = -EBUSY;
++ goto mem_out;
++ }
++
++ platform_mmio = mmio_addr;
++ platform_mmiolen = mmio_len;
++
++ if (!xen_have_vector_callback) {
++ ret = xen_allocate_irq(pdev);
++ if (ret) {
++ dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
++ goto out;
++ }
++ callback_via = get_callback_via(pdev);
++ ret = xen_set_callback_via(callback_via);
++ if (ret) {
++ dev_warn(&pdev->dev, "Unable to set the evtchn callback "
++ "err=%d\n", ret);
++ goto out;
++ }
++ }
++
++ max_nr_gframes = gnttab_max_grant_frames();
++ xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
++ ret = gnttab_init();
++ if (ret)
++ goto out;
++ xenbus_probe(NULL);
++ ret = xen_setup_shutdown_event();
++ if (ret)
++ goto out;
++ return 0;
++
++out:
++ release_region(ioaddr, iolen);
++mem_out:
++ release_mem_region(mmio_addr, mmio_len);
++pci_out:
++ pci_disable_device(pdev);
++ return ret;
++}
++
++static struct pci_device_id platform_pci_tbl[] __devinitdata = {
++ {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
++ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
++ {0,}
++};
++
++MODULE_DEVICE_TABLE(pci, platform_pci_tbl);
++
++static struct pci_driver platform_driver = {
++ .name = DRV_NAME,
++ .probe = platform_pci_init,
++ .id_table = platform_pci_tbl,
++#ifdef CONFIG_PM
++ .resume_early = platform_pci_resume,
++#endif
++};
++
++static int __init platform_pci_module_init(void)
++{
++	/* No unplug has been done and IGNORE hasn't been specified:
++	 * just return now. */
++ if (!xen_platform_pci_unplug)
++ return -ENODEV;
++
++ return pci_register_driver(&platform_driver);
++}
++
++module_init(platform_pci_module_init);
+diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
+index 88a60e0..ae5cb05 100644
+--- a/drivers/xen/sys-hypervisor.c
++++ b/drivers/xen/sys-hypervisor.c
+@@ -14,6 +14,7 @@
+ #include <asm/xen/hypervisor.h>
+ #include <asm/xen/hypercall.h>
+
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/interface/xen.h>
+ #include <xen/interface/version.h>
+diff --git a/drivers/xen/xen_acpi_memhotplug.c b/drivers/xen/xen_acpi_memhotplug.c
+new file mode 100644
+index 0000000..0c4af99
+--- /dev/null
++++ b/drivers/xen/xen_acpi_memhotplug.c
+@@ -0,0 +1,209 @@
++/*
++ * xen_acpi_memhotplug.c - interface to notify Xen on memory device hotadd
++ *
++ * Copyright (C) 2008, Intel corporation
++ *
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or (at
++ * your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with this program; if not, write to the Free Software Foundation, Inc.,
++ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
++ *
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/types.h>
++#include <linux/memory_hotplug.h>
++#include <acpi/acpi_drivers.h>
++#include <xen/interface/platform.h>
++#include <linux/interrupt.h>
++#include <linux/spinlock.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++#include <xen/acpi.h>
++
++struct xen_hotmem_entry {
++ struct list_head hotmem_list;
++ uint64_t start;
++ uint64_t end;
++ uint32_t flags;
++ uint32_t pxm;
++};
++
++struct xen_hotmem_list {
++ struct list_head list;
++ int entry_nr;
++} xen_hotmem;
++
++DEFINE_SPINLOCK(xen_hotmem_lock);
++
++static int xen_hyper_addmem(struct xen_hotmem_entry *entry)
++{
++ int ret;
++
++ xen_platform_op_t op = {
++ .cmd = XENPF_mem_hotadd,
++ .interface_version = XENPF_INTERFACE_VERSION,
++ };
++ op.u.mem_add.spfn = entry->start >> PAGE_SHIFT;
++ op.u.mem_add.epfn = entry->end >> PAGE_SHIFT;
++ op.u.mem_add.flags = entry->flags;
++ op.u.mem_add.pxm = entry->pxm;
++
++ ret = HYPERVISOR_dom0_op(&op);
++ return ret;
++}
++
++static int add_hotmem_entry(int pxm, uint64_t start,
++ uint64_t length, uint32_t flags)
++{
++ struct xen_hotmem_entry *entry;
++
++ if (pxm < 0 || !length)
++ return -EINVAL;
++
++ entry = kzalloc(sizeof(struct xen_hotmem_entry), GFP_ATOMIC);
++ if (!entry)
++ return -ENOMEM;
++
++ INIT_LIST_HEAD(&entry->hotmem_list);
++ entry->start = start;
++ entry->end = start + length;
++ entry->flags = flags;
++ entry->pxm = pxm;
++
++ spin_lock(&xen_hotmem_lock);
++
++ list_add_tail(&entry->hotmem_list, &xen_hotmem.list);
++ xen_hotmem.entry_nr++;
++
++ spin_unlock(&xen_hotmem_lock);
++
++ return 0;
++}
++
++static int free_hotmem_entry(struct xen_hotmem_entry *entry)
++{
++ list_del(&entry->hotmem_list);
++ kfree(entry);
++
++ return 0;
++}
++
++static void xen_hotadd_mem_dpc(struct work_struct *work)
++{
++ struct list_head *elem, *tmp;
++ struct xen_hotmem_entry *entry;
++ unsigned long flags;
++ int ret;
++
++ spin_lock_irqsave(&xen_hotmem_lock, flags);
++ list_for_each_safe(elem, tmp, &xen_hotmem.list) {
++ entry = list_entry(elem, struct xen_hotmem_entry, hotmem_list);
++ ret = xen_hyper_addmem(entry);
++ if (ret)
++ printk(KERN_WARNING "xen addmem failed with %x\n", ret);
++ free_hotmem_entry(entry);
++ xen_hotmem.entry_nr--;
++ }
++ spin_unlock_irqrestore(&xen_hotmem_lock, flags);
++}
++
++static DECLARE_WORK(xen_hotadd_mem_work, xen_hotadd_mem_dpc);
++
++static int xen_acpi_get_pxm(acpi_handle h)
++{
++ unsigned long long pxm;
++ acpi_status status;
++ acpi_handle handle;
++ acpi_handle phandle = h;
++
++ do {
++ handle = phandle;
++ status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm);
++ if (ACPI_SUCCESS(status))
++ return pxm;
++ status = acpi_get_parent(handle, &phandle);
++ } while (ACPI_SUCCESS(status));
++
++ return -1;
++}
++
++int xen_hotadd_memory(struct acpi_memory_device *mem_device)
++{
++ int pxm, result;
++ int num_enabled = 0;
++ struct acpi_memory_info *info;
++
++ if (!mem_device)
++ return -EINVAL;
++
++ pxm = xen_acpi_get_pxm(mem_device->device->handle);
++
++ if (pxm < 0)
++ return -EINVAL;
++
++ /*
++	 * Always return success to the ACPI driver and notify the hypervisor
++	 * later, because the hypervisor will utilize the memory in the
++	 * memory hotadd hypercall.
++ */
++ list_for_each_entry(info, &mem_device->res_list, list) {
++ if (info->enabled) { /* just sanity check...*/
++ num_enabled++;
++ continue;
++ }
++ /*
++		 * If the memory block size is zero, ignore it and skip
++		 * the memory hotplug flow below.
++ */
++ if (!info->length)
++ continue;
++
++ result = add_hotmem_entry(pxm, info->start_addr,
++ info->length, 0);
++ if (result)
++ continue;
++ info->enabled = 1;
++ num_enabled++;
++ }
++
++ if (!num_enabled)
++ return -EINVAL;
++
++ schedule_work(&xen_hotadd_mem_work);
++
++ return 0;
++}
++EXPORT_SYMBOL(xen_hotadd_memory);
++
++static int xen_hotadd_mem_init(void)
++{
++ if (!xen_initial_domain())
++ return -ENODEV;
++
++ INIT_LIST_HEAD(&xen_hotmem.list);
++ xen_hotmem.entry_nr = 0;
++
++ return 0;
++}
++
++static void xen_hotadd_mem_exit(void)
++{
++ flush_scheduled_work();
++}
++
++module_init(xen_hotadd_mem_init);
++module_exit(xen_hotadd_mem_exit);
++MODULE_LICENSE("GPL");
+diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile
+index 5571f5b..8dca685 100644
+--- a/drivers/xen/xenbus/Makefile
++++ b/drivers/xen/xenbus/Makefile
+@@ -5,3 +5,8 @@ xenbus-objs += xenbus_client.o
+ xenbus-objs += xenbus_comms.o
+ xenbus-objs += xenbus_xs.o
+ xenbus-objs += xenbus_probe.o
++
++xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
++xenbus-objs += $(xenbus-be-objs-y)
++
++obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
+diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
+index 92a1ef8..89f2e42 100644
+--- a/drivers/xen/xenbus/xenbus_client.c
++++ b/drivers/xen/xenbus/xenbus_client.c
+@@ -49,6 +49,8 @@ const char *xenbus_strstate(enum xenbus_state state)
+ [ XenbusStateConnected ] = "Connected",
+ [ XenbusStateClosing ] = "Closing",
+ [ XenbusStateClosed ] = "Closed",
++ [ XenbusStateReconfiguring ] = "Reconfiguring",
++ [ XenbusStateReconfigured ] = "Reconfigured",
+ };
+ return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
+ }
+@@ -132,17 +134,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev,
+ }
+ EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
+
++static void xenbus_switch_fatal(struct xenbus_device *, int, int,
++ const char *, ...);
+
+-/**
+- * xenbus_switch_state
+- * @dev: xenbus device
+- * @state: new state
+- *
+- * Advertise in the store a change of the given driver to the given new_state.
+- * Return 0 on success, or -errno on error. On error, the device will switch
+- * to XenbusStateClosing, and the error will be saved in the store.
+- */
+-int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
++static int
++__xenbus_switch_state(struct xenbus_device *dev,
++ enum xenbus_state state, int depth)
+ {
+ /* We check whether the state is currently set to the given value, and
+ if not, then the state is set. We don't want to unconditionally
+@@ -151,35 +148,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+ to it, as the device will be tearing down, and we don't want to
+ resurrect that directory.
+
+- Note that, because of this cached value of our state, this function
+- will not work inside a Xenstore transaction (something it was
+- trying to in the past) because dev->state would not get reset if
+- the transaction was aborted.
+-
++ Note that, because of this cached value of our state, this
++ function will not take a caller's Xenstore transaction
++   (something it tried to do in the past) because dev->state
++ would not get reset if the transaction was aborted.
+ */
+
++ struct xenbus_transaction xbt;
+ int current_state;
+- int err;
++ int err, abort;
+
+ if (state == dev->state)
+ return 0;
+
+- err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
+- &current_state);
+- if (err != 1)
++again:
++ abort = 1;
++
++ err = xenbus_transaction_start(&xbt);
++ if (err) {
++ xenbus_switch_fatal(dev, depth, err, "starting transaction");
+ return 0;
++ }
++
++ err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state);
++ if (err != 1)
++ goto abort;
+
+- err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
++ err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
+ if (err) {
+- if (state != XenbusStateClosing) /* Avoid looping */
+- xenbus_dev_fatal(dev, err, "writing new state");
+- return err;
++ xenbus_switch_fatal(dev, depth, err, "writing new state");
++ goto abort;
+ }
+
+- dev->state = state;
++ abort = 0;
++abort:
++ err = xenbus_transaction_end(xbt, abort);
++ if (err) {
++ if (err == -EAGAIN && !abort)
++ goto again;
++ xenbus_switch_fatal(dev, depth, err, "ending transaction");
++ } else
++ dev->state = state;
+
+ return 0;
+ }
++
++/**
++ * xenbus_switch_state
++ * @dev: xenbus device
++ * @state: new state
++ *
++ * Advertise in the store a change of the given driver to the given new_state.
++ * Return 0 on success, or -errno on error. On error, the device will switch
++ * to XenbusStateClosing, and the error will be saved in the store.
++ */
++int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
++{
++ return __xenbus_switch_state(dev, state, 0);
++}
++
+ EXPORT_SYMBOL_GPL(xenbus_switch_state);
+
+ int xenbus_frontend_closed(struct xenbus_device *dev)
+@@ -283,6 +310,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
+ EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
+
+ /**
++ * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps
++ * avoid recursion within xenbus_switch_state.
++ */
++static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
++ const char *fmt, ...)
++{
++ va_list ap;
++
++ va_start(ap, fmt);
++ xenbus_va_dev_error(dev, err, fmt, ap);
++ va_end(ap);
++
++ if (!depth)
++ __xenbus_switch_state(dev, XenbusStateClosing, 1);
++}
++
++/**
+ * xenbus_grant_ring
+ * @dev: xenbus device
+ * @ring_mfn: mfn of ring to grant
+diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
+index 649fcdf..3a83ba2 100644
+--- a/drivers/xen/xenbus/xenbus_probe.c
++++ b/drivers/xen/xenbus/xenbus_probe.c
+@@ -49,31 +49,29 @@
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+ #include <asm/xen/hypervisor.h>
++
++#include <xen/xen.h>
+ #include <xen/xenbus.h>
+ #include <xen/events.h>
+ #include <xen/page.h>
+
++#include <xen/platform_pci.h>
++#include <xen/hvm.h>
++
+ #include "xenbus_comms.h"
+ #include "xenbus_probe.h"
+
+
+ int xen_store_evtchn;
+-EXPORT_SYMBOL(xen_store_evtchn);
++EXPORT_SYMBOL_GPL(xen_store_evtchn);
+
+ struct xenstore_domain_interface *xen_store_interface;
++EXPORT_SYMBOL_GPL(xen_store_interface);
++
+ static unsigned long xen_store_mfn;
+
+ static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
+
+-static void wait_for_devices(struct xenbus_driver *xendrv);
+-
+-static int xenbus_probe_frontend(const char *type, const char *name);
+-
+-static void xenbus_dev_shutdown(struct device *_dev);
+-
+-static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+-static int xenbus_dev_resume(struct device *dev);
+-
+ /* If something in array of ids matches this device, return it. */
+ static const struct xenbus_device_id *
+ match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
+@@ -94,34 +92,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
+
+ return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
+ }
+-
+-static int xenbus_uevent(struct device *_dev, struct kobj_uevent_env *env)
+-{
+- struct xenbus_device *dev = to_xenbus_device(_dev);
+-
+- if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
+- return -ENOMEM;
+-
+- return 0;
+-}
+-
+-/* device/<type>/<id> => <type>-<id> */
+-static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+-{
+- nodename = strchr(nodename, '/');
+- if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+- printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
+- return -EINVAL;
+- }
+-
+- strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+- if (!strchr(bus_id, '/')) {
+- printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
+- return -EINVAL;
+- }
+- *strchr(bus_id, '/') = '-';
+- return 0;
+-}
++EXPORT_SYMBOL_GPL(xenbus_match);
+
+
+ static void free_otherend_details(struct xenbus_device *dev)
+@@ -141,7 +112,28 @@ static void free_otherend_watch(struct xenbus_device *dev)
+ }
+
+
+-int read_otherend_details(struct xenbus_device *xendev,
++static int talk_to_otherend(struct xenbus_device *dev)
++{
++ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
++
++ free_otherend_watch(dev);
++ free_otherend_details(dev);
++
++ return drv->read_otherend_details(dev);
++}
++
++
++
++static int watch_otherend(struct xenbus_device *dev)
++{
++ struct xen_bus_type *bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
++
++ return xenbus_watch_pathfmt(dev, &dev->otherend_watch, bus->otherend_changed,
++ "%s/%s", dev->otherend, "state");
++}
++
++
++int xenbus_read_otherend_details(struct xenbus_device *xendev,
+ char *id_node, char *path_node)
+ {
+ int err = xenbus_gather(XBT_NIL, xendev->nodename,
+@@ -166,39 +158,11 @@ int read_otherend_details(struct xenbus_device *xendev,
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
+
+-
+-static int read_backend_details(struct xenbus_device *xendev)
+-{
+- return read_otherend_details(xendev, "backend-id", "backend");
+-}
+-
+-static struct device_attribute xenbus_dev_attrs[] = {
+- __ATTR_NULL
+-};
+-
+-/* Bus type for frontend drivers. */
+-static struct xen_bus_type xenbus_frontend = {
+- .root = "device",
+- .levels = 2, /* device/type/<id> */
+- .get_bus_id = frontend_bus_id,
+- .probe = xenbus_probe_frontend,
+- .bus = {
+- .name = "xen",
+- .match = xenbus_match,
+- .uevent = xenbus_uevent,
+- .probe = xenbus_dev_probe,
+- .remove = xenbus_dev_remove,
+- .shutdown = xenbus_dev_shutdown,
+- .dev_attrs = xenbus_dev_attrs,
+-
+- .suspend = xenbus_dev_suspend,
+- .resume = xenbus_dev_resume,
+- },
+-};
+-
+-static void otherend_changed(struct xenbus_watch *watch,
+- const char **vec, unsigned int len)
++void xenbus_otherend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len,
++ int ignore_on_shutdown)
+ {
+ struct xenbus_device *dev =
+ container_of(watch, struct xenbus_device, otherend_watch);
+@@ -226,11 +190,7 @@ static void otherend_changed(struct xenbus_watch *watch,
+ * work that can fail e.g., when the rootfs is gone.
+ */
+ if (system_state > SYSTEM_RUNNING) {
+- struct xen_bus_type *bus = bus;
+- bus = container_of(dev->dev.bus, struct xen_bus_type, bus);
+- /* If we're frontend, drive the state machine to Closed. */
+- /* This should cause the backend to release our resources. */
+- if ((bus == &xenbus_frontend) && (state == XenbusStateClosing))
++ if (ignore_on_shutdown && (state == XenbusStateClosing))
+ xenbus_frontend_closed(dev);
+ return;
+ }
+@@ -238,25 +198,7 @@ static void otherend_changed(struct xenbus_watch *watch,
+ if (drv->otherend_changed)
+ drv->otherend_changed(dev, state);
+ }
+-
+-
+-static int talk_to_otherend(struct xenbus_device *dev)
+-{
+- struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+-
+- free_otherend_watch(dev);
+- free_otherend_details(dev);
+-
+- return drv->read_otherend_details(dev);
+-}
+-
+-
+-static int watch_otherend(struct xenbus_device *dev)
+-{
+- return xenbus_watch_pathfmt(dev, &dev->otherend_watch, otherend_changed,
+- "%s/%s", dev->otherend, "state");
+-}
+-
++EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
+
+ int xenbus_dev_probe(struct device *_dev)
+ {
+@@ -300,8 +242,9 @@ int xenbus_dev_probe(struct device *_dev)
+ fail:
+ xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
+ xenbus_switch_state(dev, XenbusStateClosed);
+- return -ENODEV;
++ return err;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_probe);
+
+ int xenbus_dev_remove(struct device *_dev)
+ {
+@@ -319,8 +262,9 @@ int xenbus_dev_remove(struct device *_dev)
+ xenbus_switch_state(dev, XenbusStateClosed);
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_remove);
+
+-static void xenbus_dev_shutdown(struct device *_dev)
++void xenbus_dev_shutdown(struct device *_dev)
+ {
+ struct xenbus_device *dev = to_xenbus_device(_dev);
+ unsigned long timeout = 5*HZ;
+@@ -341,6 +285,7 @@ static void xenbus_dev_shutdown(struct device *_dev)
+ out:
+ put_device(&dev->dev);
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
+
+ int xenbus_register_driver_common(struct xenbus_driver *drv,
+ struct xen_bus_type *bus,
+@@ -354,25 +299,7 @@ int xenbus_register_driver_common(struct xenbus_driver *drv,
+
+ return driver_register(&drv->driver);
+ }
+-
+-int __xenbus_register_frontend(struct xenbus_driver *drv,
+- struct module *owner, const char *mod_name)
+-{
+- int ret;
+-
+- drv->read_otherend_details = read_backend_details;
+-
+- ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+- owner, mod_name);
+- if (ret)
+- return ret;
+-
+- /* If this driver is loaded as a module wait for devices to attach. */
+- wait_for_devices(drv);
+-
+- return 0;
+-}
+-EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
++EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
+
+ void xenbus_unregister_driver(struct xenbus_driver *drv)
+ {
+@@ -543,24 +470,7 @@ fail:
+ kfree(xendev);
+ return err;
+ }
+-
+-/* device/<typename>/<name> */
+-static int xenbus_probe_frontend(const char *type, const char *name)
+-{
+- char *nodename;
+- int err;
+-
+- nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
+- xenbus_frontend.root, type, name);
+- if (!nodename)
+- return -ENOMEM;
+-
+- DPRINTK("%s", nodename);
+-
+- err = xenbus_probe_node(&xenbus_frontend, type, nodename);
+- kfree(nodename);
+- return err;
+-}
++EXPORT_SYMBOL_GPL(xenbus_probe_node);
+
+ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
+ {
+@@ -574,10 +484,11 @@ static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
+ return PTR_ERR(dir);
+
+ for (i = 0; i < dir_n; i++) {
+- err = bus->probe(type, dir[i]);
++ err = bus->probe(bus, type, dir[i]);
+ if (err)
+ break;
+ }
++
+ kfree(dir);
+ return err;
+ }
+@@ -597,9 +508,11 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
+ if (err)
+ break;
+ }
++
+ kfree(dir);
+ return err;
+ }
++EXPORT_SYMBOL_GPL(xenbus_probe_devices);
+
+ static unsigned int char_count(const char *str, char c)
+ {
+@@ -662,32 +575,17 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
+ }
+ EXPORT_SYMBOL_GPL(xenbus_dev_changed);
+
+-static void frontend_changed(struct xenbus_watch *watch,
+- const char **vec, unsigned int len)
+-{
+- DPRINTK("");
+-
+- xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+-}
+-
+-/* We watch for devices appearing and vanishing. */
+-static struct xenbus_watch fe_watch = {
+- .node = "device",
+- .callback = frontend_changed,
+-};
+-
+-static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
++int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+ {
+ int err = 0;
+ struct xenbus_driver *drv;
+- struct xenbus_device *xdev;
++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev);
+
+- DPRINTK("");
++ DPRINTK("%s", xdev->nodename);
+
+ if (dev->driver == NULL)
+ return 0;
+ drv = to_xenbus_driver(dev->driver);
+- xdev = container_of(dev, struct xenbus_device, dev);
+ if (drv->suspend)
+ err = drv->suspend(xdev, state);
+ if (err)
+@@ -695,21 +593,19 @@ static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
+ "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
+
+-static int xenbus_dev_resume(struct device *dev)
++int xenbus_dev_resume(struct device *dev)
+ {
+ int err;
+ struct xenbus_driver *drv;
+- struct xenbus_device *xdev;
++ struct xenbus_device *xdev = container_of(dev, struct xenbus_device, dev);
+
+- DPRINTK("");
++ DPRINTK("%s", xdev->nodename);
+
+ if (dev->driver == NULL)
+ return 0;
+-
+ drv = to_xenbus_driver(dev->driver);
+- xdev = container_of(dev, struct xenbus_device, dev);
+-
+ err = talk_to_otherend(xdev);
+ if (err) {
+ printk(KERN_WARNING
+@@ -740,6 +636,7 @@ static int xenbus_dev_resume(struct device *dev)
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(xenbus_dev_resume);
+
+ /* A flag to determine if xenstored is 'ready' (i.e. has started) */
+ int xenstored_ready = 0;
+@@ -749,10 +646,7 @@ int register_xenstore_notifier(struct notifier_block *nb)
+ {
+ int ret = 0;
+
+- if (xenstored_ready > 0)
+- ret = nb->notifier_call(nb, 0, NULL);
+- else
+- blocking_notifier_chain_register(&xenstore_chain, nb);
++ blocking_notifier_chain_register(&xenstore_chain, nb);
+
+ return ret;
+ }
+@@ -768,57 +662,93 @@ void xenbus_probe(struct work_struct *unused)
+ {
+ BUG_ON((xenstored_ready <= 0));
+
+- /* Enumerate devices in xenstore and watch for changes. */
+- xenbus_probe_devices(&xenbus_frontend);
+- register_xenbus_watch(&fe_watch);
+- xenbus_backend_probe_and_watch();
+-
+ /* Notify others that xenstore is up */
+ blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
+ }
++EXPORT_SYMBOL_GPL(xenbus_probe);
++
++static int __init xenbus_probe_initcall(void)
++{
++ if (!xen_domain())
++ return -ENODEV;
++
++ if (xen_initial_domain() || xen_hvm_domain())
++ return 0;
++
++ xenbus_probe(NULL);
++ return 0;
++}
++
++device_initcall(xenbus_probe_initcall);
+
+-static int __init xenbus_probe_init(void)
++static int __init xenbus_init(void)
+ {
+ int err = 0;
++ unsigned long page = 0;
+
+ DPRINTK("");
+
+ err = -ENODEV;
+ if (!xen_domain())
+- goto out_error;
+-
+- /* Register ourselves with the kernel bus subsystem */
+- err = bus_register(&xenbus_frontend.bus);
+- if (err)
+- goto out_error;
+-
+- err = xenbus_backend_bus_register();
+- if (err)
+- goto out_unreg_front;
++ return err;
+
+ /*
+ * Domain0 doesn't have a store_evtchn or store_mfn yet.
+ */
+ if (xen_initial_domain()) {
+- /* dom0 not yet supported */
++ struct evtchn_alloc_unbound alloc_unbound;
++
++ /* Allocate Xenstore page */
++ page = get_zeroed_page(GFP_KERNEL);
++ if (!page)
++ goto out_error;
++
++ xen_store_mfn = xen_start_info->store_mfn =
++ pfn_to_mfn(virt_to_phys((void *)page) >>
++ PAGE_SHIFT);
++
++ /* Next allocate a local port which xenstored can bind to */
++ alloc_unbound.dom = DOMID_SELF;
++ alloc_unbound.remote_dom = 0;
++
++ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
++ &alloc_unbound);
++ if (err == -ENOSYS)
++ goto out_error;
++
++ BUG_ON(err);
++ xen_store_evtchn = xen_start_info->store_evtchn =
++ alloc_unbound.port;
++
++ xen_store_interface = mfn_to_virt(xen_store_mfn);
+ } else {
+ xenstored_ready = 1;
+- xen_store_evtchn = xen_start_info->store_evtchn;
+- xen_store_mfn = xen_start_info->store_mfn;
++ if (xen_hvm_domain()) {
++ uint64_t v = 0;
++ err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
++ if (err)
++ goto out_error;
++ xen_store_evtchn = (int)v;
++ err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
++ if (err)
++ goto out_error;
++ xen_store_mfn = (unsigned long)v;
++ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
++ } else {
++ xen_store_evtchn = xen_start_info->store_evtchn;
++ xen_store_mfn = xen_start_info->store_mfn;
++ xen_store_interface = mfn_to_virt(xen_store_mfn);
++ }
+ }
+- xen_store_interface = mfn_to_virt(xen_store_mfn);
+
+ /* Initialize the interface to xenstore. */
+ err = xs_init();
+ if (err) {
+ printk(KERN_WARNING
+ "XENBUS: Error initializing xenstore comms: %i\n", err);
+- goto out_unreg_back;
++ goto out_error;
+ }
+
+- if (!xen_initial_domain())
+- xenbus_probe(NULL);
+-
+ #ifdef CONFIG_XEN_COMPAT_XENFS
+ /*
+ * Create xenfs mountpoint in /proc for compatibility with
+@@ -829,128 +759,13 @@ static int __init xenbus_probe_init(void)
+
+ return 0;
+
+- out_unreg_back:
+- xenbus_backend_bus_unregister();
+-
+- out_unreg_front:
+- bus_unregister(&xenbus_frontend.bus);
+-
+ out_error:
++ if (page != 0)
++ free_page(page);
++
+ return err;
+ }
+
+-postcore_initcall(xenbus_probe_init);
++postcore_initcall(xenbus_init);
+
+ MODULE_LICENSE("GPL");
+-
+-static int is_device_connecting(struct device *dev, void *data)
+-{
+- struct xenbus_device *xendev = to_xenbus_device(dev);
+- struct device_driver *drv = data;
+- struct xenbus_driver *xendrv;
+-
+- /*
+- * A device with no driver will never connect. We care only about
+- * devices which should currently be in the process of connecting.
+- */
+- if (!dev->driver)
+- return 0;
+-
+- /* Is this search limited to a particular driver? */
+- if (drv && (dev->driver != drv))
+- return 0;
+-
+- xendrv = to_xenbus_driver(dev->driver);
+- return (xendev->state < XenbusStateConnected ||
+- (xendev->state == XenbusStateConnected &&
+- xendrv->is_ready && !xendrv->is_ready(xendev)));
+-}
+-
+-static int exists_connecting_device(struct device_driver *drv)
+-{
+- return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+- is_device_connecting);
+-}
+-
+-static int print_device_status(struct device *dev, void *data)
+-{
+- struct xenbus_device *xendev = to_xenbus_device(dev);
+- struct device_driver *drv = data;
+-
+- /* Is this operation limited to a particular driver? */
+- if (drv && (dev->driver != drv))
+- return 0;
+-
+- if (!dev->driver) {
+- /* Information only: is this too noisy? */
+- printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
+- xendev->nodename);
+- } else if (xendev->state < XenbusStateConnected) {
+- enum xenbus_state rstate = XenbusStateUnknown;
+- if (xendev->otherend)
+- rstate = xenbus_read_driver_state(xendev->otherend);
+- printk(KERN_WARNING "XENBUS: Timeout connecting "
+- "to device: %s (local state %d, remote state %d)\n",
+- xendev->nodename, xendev->state, rstate);
+- }
+-
+- return 0;
+-}
+-
+-/* We only wait for device setup after most initcalls have run. */
+-static int ready_to_wait_for_devices;
+-
+-/*
+- * On a 5-minute timeout, wait for all devices currently configured. We need
+- * to do this to guarantee that the filesystems and / or network devices
+- * needed for boot are available, before we can allow the boot to proceed.
+- *
+- * This needs to be on a late_initcall, to happen after the frontend device
+- * drivers have been initialised, but before the root fs is mounted.
+- *
+- * A possible improvement here would be to have the tools add a per-device
+- * flag to the store entry, indicating whether it is needed at boot time.
+- * This would allow people who knew what they were doing to accelerate their
+- * boot slightly, but of course needs tools or manual intervention to set up
+- * those flags correctly.
+- */
+-static void wait_for_devices(struct xenbus_driver *xendrv)
+-{
+- unsigned long start = jiffies;
+- struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+- unsigned int seconds_waited = 0;
+-
+- if (!ready_to_wait_for_devices || !xen_domain())
+- return;
+-
+- while (exists_connecting_device(drv)) {
+- if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+- if (!seconds_waited)
+- printk(KERN_WARNING "XENBUS: Waiting for "
+- "devices to initialise: ");
+- seconds_waited += 5;
+- printk("%us...", 300 - seconds_waited);
+- if (seconds_waited == 300)
+- break;
+- }
+-
+- schedule_timeout_interruptible(HZ/10);
+- }
+-
+- if (seconds_waited)
+- printk("\n");
+-
+- bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+- print_device_status);
+-}
+-
+-#ifndef MODULE
+-static int __init boot_wait_for_devices(void)
+-{
+- ready_to_wait_for_devices = 1;
+- wait_for_devices(NULL);
+- return 0;
+-}
+-
+-late_initcall(boot_wait_for_devices);
+-#endif
+diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h
+index 6c5e318..0e5fc4c 100644
+--- a/drivers/xen/xenbus/xenbus_probe.h
++++ b/drivers/xen/xenbus/xenbus_probe.h
+@@ -36,26 +36,13 @@
+
+ #define XEN_BUS_ID_SIZE 20
+
+-#ifdef CONFIG_XEN_BACKEND
+-extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
+-extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
+-extern void xenbus_backend_probe_and_watch(void);
+-extern int xenbus_backend_bus_register(void);
+-extern void xenbus_backend_bus_unregister(void);
+-#else
+-static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
+-static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
+-static inline void xenbus_backend_probe_and_watch(void) {}
+-static inline int xenbus_backend_bus_register(void) { return 0; }
+-static inline void xenbus_backend_bus_unregister(void) {}
+-#endif
+-
+ struct xen_bus_type
+ {
+ char *root;
+ unsigned int levels;
+ int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
+- int (*probe)(const char *type, const char *dir);
++ int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir);
++ void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, unsigned int len);
+ struct bus_type bus;
+ };
+
+@@ -73,4 +60,16 @@ extern int xenbus_probe_devices(struct xen_bus_type *bus);
+
+ extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus);
+
++extern void xenbus_dev_shutdown(struct device *_dev);
++
++extern int xenbus_dev_suspend(struct device *dev, pm_message_t state);
++extern int xenbus_dev_resume(struct device *dev);
++
++extern void xenbus_otherend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len,
++ int ignore_on_shutdown);
++
++extern int xenbus_read_otherend_details(struct xenbus_device *xendev,
++ char *id_node, char *path_node);
++
+ #endif
+diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
+new file mode 100644
+index 0000000..9b9dd36
+--- /dev/null
++++ b/drivers/xen/xenbus/xenbus_probe_backend.c
+@@ -0,0 +1,293 @@
++/******************************************************************************
++ * Talks to Xen Store to figure out what devices we have (backend half).
++ *
++ * Copyright (C) 2005 Rusty Russell, IBM Corporation
++ * Copyright (C) 2005 Mike Wray, Hewlett-Packard
++ * Copyright (C) 2005, 2006 XenSource Ltd
++ * Copyright (C) 2007 Solarflare Communications, Inc.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#define DPRINTK(fmt, args...) \
++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
++ __func__, __LINE__, ##args)
++
++#include <linux/kernel.h>
++#include <linux/err.h>
++#include <linux/string.h>
++#include <linux/ctype.h>
++#include <linux/fcntl.h>
++#include <linux/mm.h>
++#include <linux/notifier.h>
++
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/features.h>
++
++#include "xenbus_comms.h"
++#include "xenbus_probe.h"
++
++/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
++static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
++{
++ int domid, err;
++ const char *devid, *type, *frontend;
++ unsigned int typelen;
++
++ type = strchr(nodename, '/');
++ if (!type)
++ return -EINVAL;
++ type++;
++ typelen = strcspn(type, "/");
++ if (!typelen || type[typelen] != '/')
++ return -EINVAL;
++
++ devid = strrchr(nodename, '/') + 1;
++
++ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid,
++ "frontend", NULL, &frontend,
++ NULL);
++ if (err)
++ return err;
++ if (strlen(frontend) == 0)
++ err = -ERANGE;
++ if (!err && !xenbus_exists(XBT_NIL, frontend, ""))
++ err = -ENOENT;
++ kfree(frontend);
++
++ if (err)
++ return err;
++
++ if (snprintf(bus_id, XEN_BUS_ID_SIZE,
++ "%.*s-%i-%s", typelen, type, domid, devid) >= XEN_BUS_ID_SIZE)
++ return -ENOSPC;
++ return 0;
++}
++
++static int xenbus_uevent_backend(struct device *dev,
++ struct kobj_uevent_env *env)
++{
++ struct xenbus_device *xdev;
++ struct xenbus_driver *drv;
++ struct xen_bus_type *bus;
++
++ DPRINTK("");
++
++ if (dev == NULL)
++ return -ENODEV;
++
++ xdev = to_xenbus_device(dev);
++ bus = container_of(xdev->dev.bus, struct xen_bus_type, bus);
++ if (xdev == NULL)
++ return -ENODEV;
++
++ /* stuff we want to pass to /sbin/hotplug */
++ if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype))
++ return -ENOMEM;
++
++ if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename))
++ return -ENOMEM;
++
++ if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root))
++ return -ENOMEM;
++
++ if (dev->driver) {
++ drv = to_xenbus_driver(dev->driver);
++ if (drv && drv->uevent)
++ return drv->uevent(xdev, env);
++ }
++
++ return 0;
++}
++
++/* backend/<typename>/<frontend-uuid>/<name> */
++static int xenbus_probe_backend_unit(struct xen_bus_type *bus,
++ const char *dir,
++ const char *type,
++ const char *name)
++{
++ char *nodename;
++ int err;
++
++ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
++ if (!nodename)
++ return -ENOMEM;
++
++ DPRINTK("%s\n", nodename);
++
++ err = xenbus_probe_node(bus, type, nodename);
++ kfree(nodename);
++ return err;
++}
++
++/* backend/<typename>/<frontend-domid> */
++static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, const char *domid)
++{
++ char *nodename;
++ int err = 0;
++ char **dir;
++ unsigned int i, dir_n = 0;
++
++ DPRINTK("");
++
++ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid);
++ if (!nodename)
++ return -ENOMEM;
++
++ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n);
++ if (IS_ERR(dir)) {
++ kfree(nodename);
++ return PTR_ERR(dir);
++ }
++
++ for (i = 0; i < dir_n; i++) {
++ err = xenbus_probe_backend_unit(bus, nodename, type, dir[i]);
++ if (err)
++ break;
++ }
++ kfree(dir);
++ kfree(nodename);
++ return err;
++}
++
++static void frontend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ xenbus_otherend_changed(watch, vec, len, 0);
++}
++
++static struct device_attribute xenbus_backend_dev_attrs[] = {
++ __ATTR_NULL
++};
++
++static struct xen_bus_type xenbus_backend = {
++ .root = "backend",
++ .levels = 3, /* backend/type/<frontend>/<id> */
++ .get_bus_id = backend_bus_id,
++ .probe = xenbus_probe_backend,
++ .otherend_changed = frontend_changed,
++ .bus = {
++ .name = "xen-backend",
++ .match = xenbus_match,
++ .uevent = xenbus_uevent_backend,
++ .probe = xenbus_dev_probe,
++ .remove = xenbus_dev_remove,
++ .shutdown = xenbus_dev_shutdown,
++ .dev_attrs = xenbus_backend_dev_attrs,
++ },
++};
++
++static void backend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ DPRINTK("");
++
++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend);
++}
++
++static struct xenbus_watch be_watch = {
++ .node = "backend",
++ .callback = backend_changed,
++};
++
++static int read_frontend_details(struct xenbus_device *xendev)
++{
++ return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
++}
++
++//void xenbus_backend_suspend(int (*fn)(struct device *, void *))
++//{
++// DPRINTK("");
++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
++//}
++
++//void xenbus_backend_resume(int (*fn)(struct device *, void *))
++//{
++// DPRINTK("");
++// bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
++//}
++
++//int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *))
++//{
++// return bus_for_each_dev(&xenbus_backend.bus, NULL, arg, fn);
++//}
++//EXPORT_SYMBOL_GPL(xenbus_for_each_backend);
++
++int xenbus_dev_is_online(struct xenbus_device *dev)
++{
++ int rc, val;
++
++ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val);
++ if (rc != 1)
++ val = 0; /* no online node present */
++
++ return val;
++}
++EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
++
++int __xenbus_register_backend(struct xenbus_driver *drv,
++ struct module *owner, const char *mod_name)
++{
++ drv->read_otherend_details = read_frontend_details;
++
++ return xenbus_register_driver_common(drv, &xenbus_backend,
++ owner, mod_name);
++}
++EXPORT_SYMBOL_GPL(__xenbus_register_backend);
++
++static int backend_probe_and_watch(struct notifier_block *notifier,
++ unsigned long event,
++ void *data)
++{
++ /* Enumerate devices in xenstore and watch for changes. */
++ xenbus_probe_devices(&xenbus_backend);
++ register_xenbus_watch(&be_watch);
++
++ return NOTIFY_DONE;
++}
++
++static int __init xenbus_probe_backend_init(void)
++{
++ static struct notifier_block xenstore_notifier = {
++ .notifier_call = backend_probe_and_watch
++ };
++ int err;
++
++ DPRINTK("");
++
++ /* Register ourselves with the kernel bus subsystem */
++ err = bus_register(&xenbus_backend.bus);
++ if (err)
++ return err;
++
++ register_xenstore_notifier(&xenstore_notifier);
++
++ return 0;
++}
++subsys_initcall(xenbus_probe_backend_init);
+diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c
+new file mode 100644
+index 0000000..5413248
+--- /dev/null
++++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
+@@ -0,0 +1,292 @@
++#define DPRINTK(fmt, args...) \
++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \
++ __func__, __LINE__, ##args)
++
++#include <linux/kernel.h>
++#include <linux/err.h>
++#include <linux/string.h>
++#include <linux/ctype.h>
++#include <linux/fcntl.h>
++#include <linux/mm.h>
++#include <linux/proc_fs.h>
++#include <linux/notifier.h>
++#include <linux/kthread.h>
++#include <linux/mutex.h>
++#include <linux/io.h>
++
++#include <asm/page.h>
++#include <asm/pgtable.h>
++#include <asm/xen/hypervisor.h>
++#include <xen/xenbus.h>
++#include <xen/events.h>
++#include <xen/page.h>
++#include <xen/xen.h>
++#include <xen/platform_pci.h>
++
++#include "xenbus_comms.h"
++#include "xenbus_probe.h"
++
++/* device/<type>/<id> => <type>-<id> */
++static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
++{
++ nodename = strchr(nodename, '/');
++ if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
++ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename);
++ return -EINVAL;
++ }
++
++ strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
++ if (!strchr(bus_id, '/')) {
++ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id);
++ return -EINVAL;
++ }
++ *strchr(bus_id, '/') = '-';
++ return 0;
++}
++
++/* device/<typename>/<name> */
++static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, const char *name)
++{
++ char *nodename;
++ int err;
++
++ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
++ if (!nodename)
++ return -ENOMEM;
++
++ DPRINTK("%s", nodename);
++
++ err = xenbus_probe_node(bus, type, nodename);
++ kfree(nodename);
++ return err;
++}
++
++static int xenbus_uevent_frontend(struct device *_dev, struct kobj_uevent_env *env)
++{
++ struct xenbus_device *dev = to_xenbus_device(_dev);
++
++ if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype))
++ return -ENOMEM;
++
++ return 0;
++}
++
++
++static void backend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ xenbus_otherend_changed(watch, vec, len, 1);
++}
++
++static struct device_attribute xenbus_frontend_dev_attrs[] = {
++ __ATTR_NULL
++};
++
++
++static struct xen_bus_type xenbus_frontend = {
++ .root = "device",
++ .levels = 2, /* device/type/<id> */
++ .get_bus_id = frontend_bus_id,
++ .probe = xenbus_probe_frontend,
++ .otherend_changed = backend_changed,
++ .bus = {
++ .name = "xen",
++ .match = xenbus_match,
++ .uevent = xenbus_uevent_frontend,
++ .probe = xenbus_dev_probe,
++ .remove = xenbus_dev_remove,
++ .shutdown = xenbus_dev_shutdown,
++ .dev_attrs= xenbus_frontend_dev_attrs,
++
++ .suspend = xenbus_dev_suspend,
++ .resume = xenbus_dev_resume,
++ },
++};
++
++static void frontend_changed(struct xenbus_watch *watch,
++ const char **vec, unsigned int len)
++{
++ DPRINTK("");
++
++ xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
++}
++
++
++/* We watch for devices appearing and vanishing. */
++static struct xenbus_watch fe_watch = {
++ .node = "device",
++ .callback = frontend_changed,
++};
++
++static int read_backend_details(struct xenbus_device *xendev)
++{
++ return xenbus_read_otherend_details(xendev, "backend-id", "backend");
++}
++
++static int is_device_connecting(struct device *dev, void *data)
++{
++ struct xenbus_device *xendev = to_xenbus_device(dev);
++ struct device_driver *drv = data;
++ struct xenbus_driver *xendrv;
++
++ /*
++ * A device with no driver will never connect. We care only about
++ * devices which should currently be in the process of connecting.
++ */
++ if (!dev->driver)
++ return 0;
++
++ /* Is this search limited to a particular driver? */
++ if (drv && (dev->driver != drv))
++ return 0;
++
++ xendrv = to_xenbus_driver(dev->driver);
++ return (xendev->state < XenbusStateConnected ||
++ (xendev->state == XenbusStateConnected &&
++ xendrv->is_ready && !xendrv->is_ready(xendev)));
++}
++
++static int exists_connecting_device(struct device_driver *drv)
++{
++ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
++ is_device_connecting);
++}
++
++static int print_device_status(struct device *dev, void *data)
++{
++ struct xenbus_device *xendev = to_xenbus_device(dev);
++ struct device_driver *drv = data;
++
++ /* Is this operation limited to a particular driver? */
++ if (drv && (dev->driver != drv))
++ return 0;
++
++ if (!dev->driver) {
++ /* Information only: is this too noisy? */
++ printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
++ xendev->nodename);
++ } else if (xendev->state < XenbusStateConnected) {
++ enum xenbus_state rstate = XenbusStateUnknown;
++ if (xendev->otherend)
++ rstate = xenbus_read_driver_state(xendev->otherend);
++ printk(KERN_WARNING "XENBUS: Timeout connecting "
++ "to device: %s (local state %d, remote state %d)\n",
++ xendev->nodename, xendev->state, rstate);
++ }
++
++ return 0;
++}
++
++/* We only wait for device setup after most initcalls have run. */
++static int ready_to_wait_for_devices;
++
++/*
++ * With a 5-minute timeout, wait for all currently configured devices. We need
++ * to do this to guarantee that the filesystems and/or network devices
++ * needed for boot are available before we can allow the boot to proceed.
++ *
++ * This needs to be on a late_initcall, to happen after the frontend device
++ * drivers have been initialised, but before the root fs is mounted.
++ *
++ * A possible improvement here would be to have the tools add a per-device
++ * flag to the store entry, indicating whether it is needed at boot time.
++ * This would allow people who knew what they were doing to accelerate their
++ * boot slightly, but of course needs tools or manual intervention to set up
++ * those flags correctly.
++ */
++static void wait_for_devices(struct xenbus_driver *xendrv)
++{
++ unsigned long start = jiffies;
++ struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
++ unsigned int seconds_waited = 0;
++
++ if (!ready_to_wait_for_devices || !xen_domain())
++ return;
++
++ while (exists_connecting_device(drv)) {
++ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
++ if (!seconds_waited)
++ printk(KERN_WARNING "XENBUS: Waiting for "
++ "devices to initialise: ");
++ seconds_waited += 5;
++ printk("%us...", 300 - seconds_waited);
++ if (seconds_waited == 300)
++ break;
++ }
++
++ schedule_timeout_interruptible(HZ/10);
++ }
++
++ if (seconds_waited)
++ printk("\n");
++
++ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
++ print_device_status);
++}
++
++int __xenbus_register_frontend(struct xenbus_driver *drv,
++ struct module *owner, const char *mod_name)
++{
++ int ret;
++
++ drv->read_otherend_details = read_backend_details;
++
++ ret = xenbus_register_driver_common(drv, &xenbus_frontend,
++ owner, mod_name);
++ if (ret)
++ return ret;
++
++	/* If this driver is loaded as a module, wait for devices to attach. */
++ wait_for_devices(drv);
++
++ return 0;
++}
++EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
++
++static int frontend_probe_and_watch(struct notifier_block *notifier,
++ unsigned long event,
++ void *data)
++{
++ /* Enumerate devices in xenstore and watch for changes. */
++ xenbus_probe_devices(&xenbus_frontend);
++ register_xenbus_watch(&fe_watch);
++
++ return NOTIFY_DONE;
++}
++
++
++static int __init xenbus_probe_frontend_init(void)
++{
++ static struct notifier_block xenstore_notifier = {
++ .notifier_call = frontend_probe_and_watch
++ };
++ int err;
++
++ DPRINTK("");
++
++ /* Register ourselves with the kernel bus subsystem */
++ err = bus_register(&xenbus_frontend.bus);
++ if (err)
++ return err;
++
++ register_xenstore_notifier(&xenstore_notifier);
++
++ return 0;
++}
++subsys_initcall(xenbus_probe_frontend_init);
++
++#ifndef MODULE
++static int __init boot_wait_for_devices(void)
++{
++ if (xen_hvm_domain() && !xen_platform_pci_unplug)
++ return -ENODEV;
++
++ ready_to_wait_for_devices = 1;
++ wait_for_devices(NULL);
++ return 0;
++}
++
++late_initcall(boot_wait_for_devices);
++#endif
++
++MODULE_LICENSE("GPL");
+diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
+index 7b547f5..5534690 100644
+--- a/drivers/xen/xenbus/xenbus_xs.c
++++ b/drivers/xen/xenbus/xenbus_xs.c
+@@ -76,6 +76,14 @@ struct xs_handle {
+ /*
+ * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
+ * response_mutex is never taken simultaneously with the other three.
++ *
++ * transaction_mutex must be held before incrementing
++ * transaction_count. The mutex is held while a suspend is in
++ * progress to prevent new transactions from starting.
++ *
++ * When transaction_count is decremented to zero the wait queue
++ * should be woken up; the suspend code waits for the count to
++ * reach zero.
+ */
+
+ /* One request at a time. */
+@@ -85,7 +93,9 @@ struct xs_handle {
+ struct mutex response_mutex;
+
+ /* Protect transactions against save/restore. */
+- struct rw_semaphore transaction_mutex;
++ struct mutex transaction_mutex;
++ atomic_t transaction_count;
++ wait_queue_head_t transaction_wq;
+
+ /* Protect watch (de)register against save/restore. */
+ struct rw_semaphore watch_mutex;
+@@ -157,6 +167,31 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
+ return body;
+ }
+
++static void transaction_start(void)
++{
++ mutex_lock(&xs_state.transaction_mutex);
++ atomic_inc(&xs_state.transaction_count);
++ mutex_unlock(&xs_state.transaction_mutex);
++}
++
++static void transaction_end(void)
++{
++ if (atomic_dec_and_test(&xs_state.transaction_count))
++ wake_up(&xs_state.transaction_wq);
++}
++
++static void transaction_suspend(void)
++{
++ mutex_lock(&xs_state.transaction_mutex);
++ wait_event(xs_state.transaction_wq,
++ atomic_read(&xs_state.transaction_count) == 0);
++}
++
++static void transaction_resume(void)
++{
++ mutex_unlock(&xs_state.transaction_mutex);
++}
++
+ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+ {
+ void *ret;
+@@ -164,7 +199,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+ int err;
+
+ if (req_msg.type == XS_TRANSACTION_START)
+- down_read(&xs_state.transaction_mutex);
++ transaction_start();
+
+ mutex_lock(&xs_state.request_mutex);
+
+@@ -180,7 +215,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
+ if ((msg->type == XS_TRANSACTION_END) ||
+ ((req_msg.type == XS_TRANSACTION_START) &&
+ (msg->type == XS_ERROR)))
+- up_read(&xs_state.transaction_mutex);
++ transaction_end();
+
+ return ret;
+ }
+@@ -432,11 +467,11 @@ int xenbus_transaction_start(struct xenbus_transaction *t)
+ {
+ char *id_str;
+
+- down_read(&xs_state.transaction_mutex);
++ transaction_start();
+
+ id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
+ if (IS_ERR(id_str)) {
+- up_read(&xs_state.transaction_mutex);
++ transaction_end();
+ return PTR_ERR(id_str);
+ }
+
+@@ -461,7 +496,7 @@ int xenbus_transaction_end(struct xenbus_transaction t, int abort)
+
+ err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
+
+- up_read(&xs_state.transaction_mutex);
++ transaction_end();
+
+ return err;
+ }
+@@ -662,7 +697,7 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
+
+ void xs_suspend(void)
+ {
+- down_write(&xs_state.transaction_mutex);
++ transaction_suspend();
+ down_write(&xs_state.watch_mutex);
+ mutex_lock(&xs_state.request_mutex);
+ mutex_lock(&xs_state.response_mutex);
+@@ -677,7 +712,7 @@ void xs_resume(void)
+
+ mutex_unlock(&xs_state.response_mutex);
+ mutex_unlock(&xs_state.request_mutex);
+- up_write(&xs_state.transaction_mutex);
++ transaction_resume();
+
+ /* No need for watches_lock: the watch_mutex is sufficient. */
+ list_for_each_entry(watch, &watches, list) {
+@@ -693,7 +728,7 @@ void xs_suspend_cancel(void)
+ mutex_unlock(&xs_state.response_mutex);
+ mutex_unlock(&xs_state.request_mutex);
+ up_write(&xs_state.watch_mutex);
+- up_write(&xs_state.transaction_mutex);
++ mutex_unlock(&xs_state.transaction_mutex);
+ }
+
+ static int xenwatch_thread(void *unused)
+@@ -843,8 +878,10 @@ int xs_init(void)
+
+ mutex_init(&xs_state.request_mutex);
+ mutex_init(&xs_state.response_mutex);
+- init_rwsem(&xs_state.transaction_mutex);
++ mutex_init(&xs_state.transaction_mutex);
+ init_rwsem(&xs_state.watch_mutex);
++ atomic_set(&xs_state.transaction_count, 0);
++ init_waitqueue_head(&xs_state.transaction_wq);
+
+ /* Initialize the shared memory rings to talk to xenstored */
+ err = xb_init_comms();
+diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
+index 25275c3..4a0be9a 100644
+--- a/drivers/xen/xenfs/Makefile
++++ b/drivers/xen/xenfs/Makefile
+@@ -1,3 +1,4 @@
+ obj-$(CONFIG_XENFS) += xenfs.o
+
+-xenfs-objs = super.o xenbus.o
+\ No newline at end of file
++xenfs-y = super.o xenbus.o
++xenfs-$(CONFIG_XEN_DOM0) += xenstored.o privcmd.o
+diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
+new file mode 100644
+index 0000000..f80be7f
+--- /dev/null
++++ b/drivers/xen/xenfs/privcmd.c
+@@ -0,0 +1,404 @@
++/******************************************************************************
++ * privcmd.c
++ *
++ * Interface to privileged domain-0 commands.
++ *
++ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/uaccess.h>
++#include <linux/swap.h>
++#include <linux/smp_lock.h>
++#include <linux/highmem.h>
++#include <linux/pagemap.h>
++#include <linux/seq_file.h>
++
++#include <asm/pgalloc.h>
++#include <asm/pgtable.h>
++#include <asm/tlb.h>
++#include <asm/xen/hypervisor.h>
++#include <asm/xen/hypercall.h>
++
++#include <xen/xen.h>
++#include <xen/privcmd.h>
++#include <xen/interface/xen.h>
++#include <xen/features.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++#ifndef HAVE_ARCH_PRIVCMD_MMAP
++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
++#endif
++
++static long privcmd_ioctl_hypercall(void __user *udata)
++{
++ struct privcmd_hypercall hypercall;
++ long ret;
++
++ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
++ return -EFAULT;
++
++ ret = privcmd_call(hypercall.op,
++ hypercall.arg[0], hypercall.arg[1],
++ hypercall.arg[2], hypercall.arg[3],
++ hypercall.arg[4]);
++
++ return ret;
++}
++
++static void free_page_list(struct list_head *pages)
++{
++ struct page *p, *n;
++
++ list_for_each_entry_safe(p, n, pages, lru)
++ __free_page(p);
++
++ INIT_LIST_HEAD(pages);
++}
++
++/*
++ * Given an array of items in userspace, return a list of pages
++ * containing the data. If copying fails, either because of memory
++ * allocation failure or a problem reading user memory, return an
++ * error code; it's up to the caller to dispose of any partial list.
++ */
++static int gather_array(struct list_head *pagelist,
++ unsigned nelem, size_t size,
++ void __user *data)
++{
++ unsigned pageidx;
++ void *pagedata;
++ int ret;
++
++ if (size > PAGE_SIZE)
++ return 0;
++
++ pageidx = PAGE_SIZE;
++ pagedata = NULL; /* quiet, gcc */
++ while (nelem--) {
++ if (pageidx > PAGE_SIZE-size) {
++ struct page *page = alloc_page(GFP_KERNEL);
++
++ ret = -ENOMEM;
++ if (page == NULL)
++ goto fail;
++
++ pagedata = page_address(page);
++
++ list_add_tail(&page->lru, pagelist);
++ pageidx = 0;
++ }
++
++ ret = -EFAULT;
++ if (copy_from_user(pagedata + pageidx, data, size))
++ goto fail;
++
++ data += size;
++ pageidx += size;
++ }
++
++ ret = 0;
++
++fail:
++ return ret;
++}
++
++/*
++ * Call function "fn" on each element of the array fragmented
++ * over a list of pages.
++ */
++static int traverse_pages(unsigned nelem, size_t size,
++ struct list_head *pos,
++ int (*fn)(void *data, void *state),
++ void *state)
++{
++ void *pagedata;
++ unsigned pageidx;
++ int ret = 0;
++
++ BUG_ON(size > PAGE_SIZE);
++
++ pageidx = PAGE_SIZE;
++ pagedata = NULL; /* hush, gcc */
++
++ while (nelem--) {
++ if (pageidx > PAGE_SIZE-size) {
++ struct page *page;
++ pos = pos->next;
++ page = list_entry(pos, struct page, lru);
++ pagedata = page_address(page);
++ pageidx = 0;
++ }
++
++ ret = (*fn)(pagedata + pageidx, state);
++ if (ret)
++ break;
++ pageidx += size;
++ }
++
++ return ret;
++}
++
++struct mmap_mfn_state {
++ unsigned long va;
++ struct vm_area_struct *vma;
++ domid_t domain;
++};
++
++static int mmap_mfn_range(void *data, void *state)
++{
++ struct privcmd_mmap_entry *msg = data;
++ struct mmap_mfn_state *st = state;
++ struct vm_area_struct *vma = st->vma;
++ int rc;
++
++ /* Do not allow range to wrap the address space. */
++ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
++ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
++ return -EINVAL;
++
++ /* Range chunks must be contiguous in va space. */
++ if ((msg->va != st->va) ||
++ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
++ return -EINVAL;
++
++ rc = xen_remap_domain_mfn_range(vma,
++ msg->va & PAGE_MASK,
++ msg->mfn, msg->npages,
++ vma->vm_page_prot,
++ st->domain);
++ if (rc < 0)
++ return rc;
++
++ st->va += msg->npages << PAGE_SHIFT;
++
++ return 0;
++}
++
++static long privcmd_ioctl_mmap(void __user *udata)
++{
++ struct privcmd_mmap mmapcmd;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ int rc;
++ LIST_HEAD(pagelist);
++ struct mmap_mfn_state state;
++
++ if (!xen_initial_domain())
++ return -EPERM;
++
++ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
++ return -EFAULT;
++
++ rc = gather_array(&pagelist,
++ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
++ mmapcmd.entry);
++
++ if (rc || list_empty(&pagelist))
++ goto out;
++
++ down_write(&mm->mmap_sem);
++
++ {
++ struct page *page = list_first_entry(&pagelist,
++ struct page, lru);
++ struct privcmd_mmap_entry *msg = page_address(page);
++
++ vma = find_vma(mm, msg->va);
++ rc = -EINVAL;
++
++ if (!vma || (msg->va != vma->vm_start) ||
++ !privcmd_enforce_singleshot_mapping(vma))
++ goto out_up;
++ }
++
++ state.va = vma->vm_start;
++ state.vma = vma;
++ state.domain = mmapcmd.dom;
++
++ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
++ &pagelist,
++ mmap_mfn_range, &state);
++
++
++out_up:
++ up_write(&mm->mmap_sem);
++
++out:
++ free_page_list(&pagelist);
++
++ return rc;
++}
++
++struct mmap_batch_state {
++ domid_t domain;
++ unsigned long va;
++ struct vm_area_struct *vma;
++ int err;
++
++ xen_pfn_t __user *user;
++};
++
++static int mmap_batch_fn(void *data, void *state)
++{
++ xen_pfn_t *mfnp = data;
++ struct mmap_batch_state *st = state;
++
++ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
++ st->vma->vm_page_prot, st->domain) < 0) {
++ *mfnp |= 0xf0000000U;
++ st->err++;
++ }
++ st->va += PAGE_SIZE;
++
++ return 0;
++}
++
++static int mmap_return_errors(void *data, void *state)
++{
++ xen_pfn_t *mfnp = data;
++ struct mmap_batch_state *st = state;
++
++ put_user(*mfnp, st->user++);
++
++ return 0;
++}
++
++static struct vm_operations_struct privcmd_vm_ops;
++
++static long privcmd_ioctl_mmap_batch(void __user *udata)
++{
++ int ret;
++ struct privcmd_mmapbatch m;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ unsigned long nr_pages;
++ LIST_HEAD(pagelist);
++ struct mmap_batch_state state;
++
++ if (!xen_initial_domain())
++ return -EPERM;
++
++ if (copy_from_user(&m, udata, sizeof(m)))
++ return -EFAULT;
++
++ nr_pages = m.num;
++ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
++ return -EINVAL;
++
++ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
++ m.arr);
++
++ if (ret || list_empty(&pagelist))
++ goto out;
++
++ down_write(&mm->mmap_sem);
++
++ vma = find_vma(mm, m.addr);
++ ret = -EINVAL;
++ if (!vma ||
++ vma->vm_ops != &privcmd_vm_ops ||
++ (m.addr != vma->vm_start) ||
++ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
++ !privcmd_enforce_singleshot_mapping(vma)) {
++ up_write(&mm->mmap_sem);
++ goto out;
++ }
++
++ state.domain = m.dom;
++ state.vma = vma;
++ state.va = m.addr;
++ state.err = 0;
++
++ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
++ &pagelist, mmap_batch_fn, &state);
++
++ up_write(&mm->mmap_sem);
++
++ if (state.err > 0) {
++ ret = 0;
++
++ state.user = m.arr;
++ traverse_pages(m.num, sizeof(xen_pfn_t),
++ &pagelist,
++ mmap_return_errors, &state);
++ }
++
++out:
++ free_page_list(&pagelist);
++
++ return ret;
++}
++
++static long privcmd_ioctl(struct file *file,
++ unsigned int cmd, unsigned long data)
++{
++ int ret = -ENOSYS;
++ void __user *udata = (void __user *) data;
++
++ switch (cmd) {
++ case IOCTL_PRIVCMD_HYPERCALL:
++ ret = privcmd_ioctl_hypercall(udata);
++ break;
++
++ case IOCTL_PRIVCMD_MMAP:
++ ret = privcmd_ioctl_mmap(udata);
++ break;
++
++ case IOCTL_PRIVCMD_MMAPBATCH:
++ ret = privcmd_ioctl_mmap_batch(udata);
++ break;
++
++ default:
++ ret = -EINVAL;
++ break;
++ }
++
++ return ret;
++}
++
++#ifndef HAVE_ARCH_PRIVCMD_MMAP
++static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
++{
++ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
++ vma, vma->vm_start, vma->vm_end,
++ vmf->pgoff, vmf->virtual_address);
++
++ return VM_FAULT_SIGBUS;
++}
++
++static struct vm_operations_struct privcmd_vm_ops = {
++ .fault = privcmd_fault
++};
++
++static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
++{
++ /* Unsupported for auto-translate guests. */
++ if (xen_feature(XENFEAT_auto_translated_physmap))
++ return -ENOSYS;
++
++ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
++ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
++ vma->vm_ops = &privcmd_vm_ops;
++ vma->vm_private_data = NULL;
++
++ return 0;
++}
++
++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
++{
++ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
++}
++#endif
++
++const struct file_operations privcmd_file_ops = {
++ .unlocked_ioctl = privcmd_ioctl,
++ .mmap = privcmd_mmap,
++};
+diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
+index 6559e0c..afaa6ed 100644
+--- a/drivers/xen/xenfs/super.c
++++ b/drivers/xen/xenfs/super.c
+@@ -12,6 +12,10 @@
+ #include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/magic.h>
++#include <linux/mm.h>
++#include <linux/backing-dev.h>
++
++#include <xen/xen.h>
+
+ #include "xenfs.h"
+
+@@ -20,6 +24,62 @@
+ MODULE_DESCRIPTION("Xen filesystem");
+ MODULE_LICENSE("GPL");
+
++static int xenfs_set_page_dirty(struct page *page)
++{
++ return !TestSetPageDirty(page);
++}
++
++static const struct address_space_operations xenfs_aops = {
++ .set_page_dirty = xenfs_set_page_dirty,
++};
++
++static struct backing_dev_info xenfs_backing_dev_info = {
++ .ra_pages = 0, /* No readahead */
++ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
++};
++
++static struct inode *xenfs_make_inode(struct super_block *sb, int mode)
++{
++ struct inode *ret = new_inode(sb);
++
++ if (ret) {
++ ret->i_mode = mode;
++ ret->i_mapping->a_ops = &xenfs_aops;
++ ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info;
++ ret->i_uid = ret->i_gid = 0;
++ ret->i_blocks = 0;
++ ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
++ }
++ return ret;
++}
++
++static struct dentry *xenfs_create_file(struct super_block *sb,
++ struct dentry *parent,
++ const char *name,
++ const struct file_operations *fops,
++ void *data,
++ int mode)
++{
++ struct dentry *dentry;
++ struct inode *inode;
++
++ dentry = d_alloc_name(parent, name);
++ if (!dentry)
++ return NULL;
++
++ inode = xenfs_make_inode(sb, S_IFREG | mode);
++ if (!inode) {
++ dput(dentry);
++ return NULL;
++ }
++
++ inode->i_fop = fops;
++ inode->i_private = data;
++
++ d_add(dentry, inode);
++ return dentry;
++}
++
+ static ssize_t capabilities_read(struct file *file, char __user *buf,
+ size_t size, loff_t *off)
+ {
+@@ -43,8 +103,22 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
+ { "capabilities", &capabilities_file_ops, S_IRUGO },
+ {""},
+ };
+-
+- return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
++ int rc;
++
++ rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
++ if (rc < 0)
++ return rc;
++
++ if (xen_initial_domain()) {
++ xenfs_create_file(sb, sb->s_root, "xsd_kva",
++ &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR);
++ xenfs_create_file(sb, sb->s_root, "xsd_port",
++ &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR);
++ xenfs_create_file(sb, sb->s_root, "privcmd",
++ &privcmd_file_ops, NULL, S_IRUSR|S_IWUSR);
++ }
++
++ return rc;
+ }
+
+ static int xenfs_get_sb(struct file_system_type *fs_type,
+@@ -63,16 +137,30 @@ static struct file_system_type xenfs_type = {
+
+ static int __init xenfs_init(void)
+ {
+- if (xen_pv_domain())
+- return register_filesystem(&xenfs_type);
++ int err;
++ if (!xen_domain()) {
++ printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n");
++ return 0;
++ }
++
++ err = register_filesystem(&xenfs_type);
++ if (err) {
++ printk(KERN_ERR "xenfs: Unable to register filesystem!\n");
++ goto out;
++ }
++
++ err = bdi_init(&xenfs_backing_dev_info);
++ if (err)
++ unregister_filesystem(&xenfs_type);
++
++ out:
+
+- printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
+- return 0;
++ return err;
+ }
+
+ static void __exit xenfs_exit(void)
+ {
+- if (xen_pv_domain())
++ if (xen_domain())
+ unregister_filesystem(&xenfs_type);
+ }
+
+diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
+index 6c4269b..0ddef43 100644
+--- a/drivers/xen/xenfs/xenbus.c
++++ b/drivers/xen/xenfs/xenbus.c
+@@ -121,8 +121,12 @@ static ssize_t xenbus_file_read(struct file *filp,
+ int ret;
+
+ mutex_lock(&u->reply_mutex);
++again:
+ while (list_empty(&u->read_buffers)) {
+ mutex_unlock(&u->reply_mutex);
++ if (filp->f_flags & O_NONBLOCK)
++ return -EAGAIN;
++
+ ret = wait_event_interruptible(u->read_waitq,
+ !list_empty(&u->read_buffers));
+ if (ret)
+@@ -140,7 +144,7 @@ static ssize_t xenbus_file_read(struct file *filp,
+ i += sz - ret;
+ rb->cons += sz - ret;
+
+- if (ret != sz) {
++ if (ret != 0) {
+ if (i == 0)
+ i = -EFAULT;
+ goto out;
+@@ -156,6 +160,8 @@ static ssize_t xenbus_file_read(struct file *filp,
+ struct read_buffer, list);
+ }
+ }
++ if (i == 0)
++ goto again;
+
+ out:
+ mutex_unlock(&u->reply_mutex);
+@@ -403,6 +409,7 @@ static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u)
+
+ mutex_lock(&u->reply_mutex);
+ rc = queue_reply(&u->read_buffers, &reply, sizeof(reply));
++ wake_up(&u->read_waitq);
+ mutex_unlock(&u->reply_mutex);
+ }
+
+@@ -451,7 +458,7 @@ static ssize_t xenbus_file_write(struct file *filp,
+
+ ret = copy_from_user(u->u.buffer + u->len, ubuf, len);
+
+- if (ret == len) {
++ if (ret != 0) {
+ rc = -EFAULT;
+ goto out;
+ }
+@@ -484,21 +491,6 @@ static ssize_t xenbus_file_write(struct file *filp,
+ msg_type = u->u.msg.type;
+
+ switch (msg_type) {
+- case XS_TRANSACTION_START:
+- case XS_TRANSACTION_END:
+- case XS_DIRECTORY:
+- case XS_READ:
+- case XS_GET_PERMS:
+- case XS_RELEASE:
+- case XS_GET_DOMAIN_PATH:
+- case XS_WRITE:
+- case XS_MKDIR:
+- case XS_RM:
+- case XS_SET_PERMS:
+- /* Send out a transaction */
+- ret = xenbus_write_transaction(msg_type, u);
+- break;
+-
+ case XS_WATCH:
+ case XS_UNWATCH:
+ /* (Un)Ask for some path to be watched for changes */
+@@ -506,7 +498,8 @@ static ssize_t xenbus_file_write(struct file *filp,
+ break;
+
+ default:
+- ret = -EINVAL;
++ /* Send out a transaction */
++ ret = xenbus_write_transaction(msg_type, u);
+ break;
+ }
+ if (ret != 0)
+diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
+index 51f08b2..b68aa62 100644
+--- a/drivers/xen/xenfs/xenfs.h
++++ b/drivers/xen/xenfs/xenfs.h
+@@ -2,5 +2,8 @@
+ #define _XENFS_XENBUS_H
+
+ extern const struct file_operations xenbus_file_ops;
++extern const struct file_operations privcmd_file_ops;
++extern const struct file_operations xsd_kva_file_ops;
++extern const struct file_operations xsd_port_file_ops;
+
+ #endif /* _XENFS_XENBUS_H */
+diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c
+new file mode 100644
+index 0000000..af10804
+--- /dev/null
++++ b/drivers/xen/xenfs/xenstored.c
+@@ -0,0 +1,67 @@
++#include <linux/types.h>
++#include <linux/mm.h>
++#include <linux/fs.h>
++
++#include <xen/page.h>
++
++#include "xenfs.h"
++#include "../xenbus/xenbus_comms.h"
++
++static ssize_t xsd_read(struct file *file, char __user *buf,
++ size_t size, loff_t *off)
++{
++ const char *str = (const char *)file->private_data;
++ return simple_read_from_buffer(buf, size, off, str, strlen(str));
++}
++
++static int xsd_release(struct inode *inode, struct file *file)
++{
++ kfree(file->private_data);
++ return 0;
++}
++
++static int xsd_kva_open(struct inode *inode, struct file *file)
++{
++ file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p",
++ xen_store_interface);
++ if (!file->private_data)
++ return -ENOMEM;
++ return 0;
++}
++
++static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
++{
++ size_t size = vma->vm_end - vma->vm_start;
++
++ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
++ return -EINVAL;
++
++ if (remap_pfn_range(vma, vma->vm_start,
++ virt_to_pfn(xen_store_interface),
++ size, vma->vm_page_prot))
++ return -EAGAIN;
++
++ return 0;
++}
++
++const struct file_operations xsd_kva_file_ops = {
++ .open = xsd_kva_open,
++ .mmap = xsd_kva_mmap,
++ .read = xsd_read,
++ .release = xsd_release,
++};
++
++static int xsd_port_open(struct inode *inode, struct file *file)
++{
++ file->private_data = (void *)kasprintf(GFP_KERNEL, "%d",
++ xen_store_evtchn);
++ if (!file->private_data)
++ return -ENOMEM;
++ return 0;
++}
++
++const struct file_operations xsd_port_file_ops = {
++ .open = xsd_port_open,
++ .read = xsd_read,
++ .release = xsd_release,
++};
+diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
+index f4906f6..e7233e8 100644
+--- a/include/acpi/acpi_drivers.h
++++ b/include/acpi/acpi_drivers.h
+@@ -154,4 +154,25 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle)
+ }
+ #endif
+
++/*--------------------------------------------------------------------------
++ Memory
++ -------------------------------------------------------------------------- */
++#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
++ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
++struct acpi_memory_info {
++ struct list_head list;
++ u64 start_addr; /* Memory Range start physical addr */
++ u64 length; /* Memory Range length */
++ unsigned short caching; /* memory cache attribute */
++ unsigned short write_protect; /* memory read/write attribute */
++ unsigned int enabled:1;
++};
++
++struct acpi_memory_device {
++ struct acpi_device *device;
++ unsigned int state; /* State of the memory device */
++ struct list_head res_list;
++};
++#endif
++
+ #endif /*__ACPI_DRIVERS_H__*/
+diff --git a/include/acpi/processor.h b/include/acpi/processor.h
+index e7bdaaf..6aa3111 100644
+--- a/include/acpi/processor.h
++++ b/include/acpi/processor.h
+@@ -239,6 +239,25 @@ struct acpi_processor_errata {
+ } piix4;
+ };
+
++extern int acpi_processor_errata(struct acpi_processor *pr);
++#ifdef CONFIG_ACPI_PROCFS
++extern int acpi_processor_add_fs(struct acpi_device *device);
++extern int acpi_processor_remove_fs(struct acpi_device *device);
++#else
++static inline int acpi_processor_add_fs(struct acpi_device *device)
++{
++ return 0;
++}
++
++static inline int acpi_processor_remove_fs(struct acpi_device *device)
++{
++ return 0;
++}
++#endif
++extern int acpi_processor_set_pdc(struct acpi_processor *pr);
++extern int acpi_processor_remove(struct acpi_device *device, int type);
++extern void acpi_processor_notify(struct acpi_device *device, u32 event);
++
+ extern int acpi_processor_preregister_performance(struct
+ acpi_processor_performance
+ *performance);
+@@ -296,6 +315,8 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx
+ void acpi_processor_ppc_init(void);
+ void acpi_processor_ppc_exit(void);
+ int acpi_processor_ppc_has_changed(struct acpi_processor *pr);
++int acpi_processor_get_performance_info(struct acpi_processor *pr);
++int acpi_processor_get_psd(struct acpi_processor *pr);
+ #else
+ static inline void acpi_processor_ppc_init(void)
+ {
+@@ -332,6 +353,7 @@ int acpi_processor_power_init(struct acpi_processor *pr,
+ int acpi_processor_cst_has_changed(struct acpi_processor *pr);
+ int acpi_processor_power_exit(struct acpi_processor *pr,
+ struct acpi_device *device);
++int acpi_processor_get_power_info(struct acpi_processor *pr);
+ int acpi_processor_suspend(struct acpi_device * device, pm_message_t state);
+ int acpi_processor_resume(struct acpi_device * device);
+ extern struct cpuidle_driver acpi_idle_driver;
+diff --git a/include/asm-generic/pci.h b/include/asm-generic/pci.h
+index 26373cf..9fb4270 100644
+--- a/include/asm-generic/pci.h
++++ b/include/asm-generic/pci.h
+@@ -43,6 +43,8 @@ pcibios_select_root(struct pci_dev *pdev, struct resource *res)
+ return root;
+ }
+
++#ifndef HAVE_ARCH_PCIBIOS_SCAN_ALL_FNS
++#endif
+ #ifndef HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
+ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
+ {
+diff --git a/include/drm/drmP.h b/include/drm/drmP.h
+index 7ad3faa..cf9ddce 100644
+--- a/include/drm/drmP.h
++++ b/include/drm/drmP.h
+@@ -1388,7 +1388,7 @@ extern int drm_vma_info(struct seq_file *m, void *data);
+ #endif
+
+ /* Scatter Gather Support (drm_scatter.h) */
+-extern void drm_sg_cleanup(struct drm_sg_mem * entry);
++extern void drm_sg_cleanup(struct drm_device *dev, struct drm_sg_mem * entry);
+ extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data,
+ struct drm_file *file_priv);
+ extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request);
+diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
+index dd97fb8..b10ec49 100644
+--- a/include/linux/bootmem.h
++++ b/include/linux/bootmem.h
+@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
+ unsigned long addr,
+ unsigned long size);
+ extern void free_bootmem(unsigned long addr, unsigned long size);
++extern void free_bootmem_late(unsigned long addr, unsigned long size);
+
+ /*
+ * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
+diff --git a/include/linux/dmar.h b/include/linux/dmar.h
+index 4a2b162..5de4c9e 100644
+--- a/include/linux/dmar.h
++++ b/include/linux/dmar.h
+@@ -208,16 +208,9 @@ struct dmar_atsr_unit {
+ u8 include_all:1; /* include all ports */
+ };
+
+-/* Intel DMAR initialization functions */
+ extern int intel_iommu_init(void);
+-#else
+-static inline int intel_iommu_init(void)
+-{
+-#ifdef CONFIG_INTR_REMAP
+- return dmar_dev_scope_init();
+-#else
+- return -ENODEV;
+-#endif
+-}
+-#endif /* !CONFIG_DMAR */
++#else /* !CONFIG_DMAR: */
++static inline int intel_iommu_init(void) { return -ENODEV; }
++#endif /* CONFIG_DMAR */
++
+ #endif /* __DMAR_H__ */
+diff --git a/include/linux/fb.h b/include/linux/fb.h
+index 862e7d4..74d67ca 100644
+--- a/include/linux/fb.h
++++ b/include/linux/fb.h
+@@ -763,6 +763,7 @@ struct fb_tile_ops {
+ * takes over; acceleration engine should be in a quiescent state */
+
+ /* hints */
++#define FBINFO_VIRTFB 0x0004 /* FB is System RAM, not device. */
+ #define FBINFO_PARTIAL_PAN_OK 0x0040 /* otw use pan only for double-buffering */
+ #define FBINFO_READS_FAST 0x0080 /* soft-copy faster than rendering */
+
+diff --git a/include/linux/if_link.h b/include/linux/if_link.h
+index 176c518..d681cc9 100644
+--- a/include/linux/if_link.h
++++ b/include/linux/if_link.h
+@@ -81,6 +81,8 @@ enum
+ #define IFLA_LINKINFO IFLA_LINKINFO
+ IFLA_NET_NS_PID,
+ IFLA_IFALIAS,
++ IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */
++ IFLA_VFINFO_LIST,
+ __IFLA_MAX
+ };
+
+@@ -190,4 +192,47 @@ struct ifla_vlan_qos_mapping
+ __u32 to;
+ };
+
++/* SR-IOV virtual function management section */
++
++enum {
++ IFLA_VF_INFO_UNSPEC,
++ IFLA_VF_INFO,
++ __IFLA_VF_INFO_MAX,
++};
++
++#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1)
++
++enum {
++ IFLA_VF_UNSPEC,
++ IFLA_VF_MAC, /* Hardware queue specific attributes */
++ IFLA_VF_VLAN,
++ IFLA_VF_TX_RATE, /* TX Bandwidth Allocation */
++ __IFLA_VF_MAX,
++};
++
++#define IFLA_VF_MAX (__IFLA_VF_MAX - 1)
++
++struct ifla_vf_mac {
++ __u32 vf;
++ __u8 mac[32]; /* MAX_ADDR_LEN */
++};
++
++struct ifla_vf_vlan {
++ __u32 vf;
++ __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */
++ __u32 qos;
++};
++
++struct ifla_vf_tx_rate {
++ __u32 vf;
++ __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */
++};
++
++struct ifla_vf_info {
++ __u32 vf;
++ __u8 mac[32];
++ __u32 vlan;
++ __u32 qos;
++ __u32 tx_rate;
++};
+ #endif /* _LINUX_IF_LINK_H */
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index a8d25e4..1bc4927 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -109,6 +109,12 @@ extern unsigned int kobjsize(const void *objp);
+ #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
+ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
+ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
++#ifdef CONFIG_XEN
++#define VM_FOREIGN 0x20000000 /* Has pages belonging to another VM */
++struct vm_foreign_map {
++ struct page **map;
++};
++#endif
+
+ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
+ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
+@@ -199,6 +205,11 @@ struct vm_operations_struct {
+ */
+ int (*access)(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write);
++
++ /* Area-specific function for clearing the PTE at @ptep. Returns the
++ * original value of @ptep. */
++ pte_t (*zap_pte)(struct vm_area_struct *vma,
++ unsigned long addr, pte_t *ptep, int is_fullmm);
+ #ifdef CONFIG_NUMA
+ /*
+ * set_policy() op must add a reference to any non-NULL @new mempolicy
+diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
+index ec12f8c..3f4991c 100644
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -28,6 +28,7 @@
+ #include <linux/if.h>
+ #include <linux/if_ether.h>
+ #include <linux/if_packet.h>
++#include <linux/if_link.h>
+
+ #ifdef __KERNEL__
+ #include <linux/timer.h>
+@@ -577,6 +578,13 @@ struct netdev_queue {
+ * this function is called when a VLAN id is unregistered.
+ *
+ * void (*ndo_poll_controller)(struct net_device *dev);
++ *
++ * SR-IOV management functions.
++ * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
++ * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
++ * int (*ndo_set_vf_tx_rate)(struct net_device *dev, int vf, int rate);
++ * int (*ndo_get_vf_config)(struct net_device *dev,
++ * int vf, struct ifla_vf_info *ivf);
+ */
+ #define HAVE_NET_DEVICE_OPS
+ struct net_device_ops {
+@@ -626,6 +634,15 @@ struct net_device_ops {
+ #define HAVE_NETDEV_POLL
+ void (*ndo_poll_controller)(struct net_device *dev);
+ #endif
++ int (*ndo_set_vf_mac)(struct net_device *dev,
++ int queue, u8 *mac);
++ int (*ndo_set_vf_vlan)(struct net_device *dev,
++ int queue, u16 vlan, u8 qos);
++ int (*ndo_set_vf_tx_rate)(struct net_device *dev,
++ int vf, int rate);
++ int (*ndo_get_vf_config)(struct net_device *dev,
++ int vf,
++ struct ifla_vf_info *ivf);
+ #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
+ int (*ndo_fcoe_enable)(struct net_device *dev);
+ int (*ndo_fcoe_disable)(struct net_device *dev);
+diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
+index 6b202b1..b03950e 100644
+--- a/include/linux/page-flags.h
++++ b/include/linux/page-flags.h
+@@ -105,6 +105,9 @@ enum pageflags {
+ #ifdef CONFIG_ARCH_USES_PG_UNCACHED
+ PG_uncached, /* Page has been mapped as uncached */
+ #endif
++#ifdef CONFIG_XEN
++ PG_foreign,
++#endif
+ #ifdef CONFIG_MEMORY_FAILURE
+ PG_hwpoison, /* hardware poisoned page. Don't touch */
+ #endif
+@@ -275,6 +278,23 @@ PAGEFLAG(Uncached, uncached)
+ PAGEFLAG_FALSE(Uncached)
+ #endif
+
++#ifdef CONFIG_XEN
++TESTPAGEFLAG(Foreign, foreign)
++__SETPAGEFLAG(Foreign, foreign)
++CLEARPAGEFLAG(Foreign, foreign)
++#define SetPageForeign(_page, dtor) do { \
++ __SetPageForeign(_page); \
++ BUG_ON((dtor) == (void (*)(struct page *, unsigned int))0); \
++ (_page)->index = (long)(dtor); \
++} while (0)
++#define _PageForeignDestructor(_page) \
++ ((void (*)(struct page *, unsigned int))(_page)->index)
++#define PageForeignDestructor(_page, order) \
++ _PageForeignDestructor(_page)(_page, order)
++#else
++PAGEFLAG_FALSE(Foreign)
++#endif
++
+ #ifdef CONFIG_MEMORY_FAILURE
+ PAGEFLAG(HWPoison, hwpoison)
+ TESTSETFLAG(HWPoison, hwpoison)
+diff --git a/include/linux/pci.h b/include/linux/pci.h
+index e07d194..ca28e46 100644
+--- a/include/linux/pci.h
++++ b/include/linux/pci.h
+@@ -609,6 +609,9 @@ extern void pci_remove_bus_device(struct pci_dev *dev);
+ extern void pci_stop_bus_device(struct pci_dev *dev);
+ void pci_setup_cardbus(struct pci_bus *bus);
+ extern void pci_sort_breadthfirst(void);
++#define dev_is_pci(d) ((d)->bus == &pci_bus_type)
++#define dev_is_pf(d) ((dev_is_pci(d) ? to_pci_dev(d)->is_physfn : false))
++#define dev_num_vf(d) ((dev_is_pci(d) ? pci_num_vf(to_pci_dev(d)) : 0))
+
+ /* Generic PCI functions exported to card drivers */
+
+@@ -1124,6 +1127,9 @@ static inline struct pci_dev *pci_get_bus_and_slot(unsigned int bus,
+ unsigned int devfn)
+ { return NULL; }
+
++#define dev_is_pci(d) (false)
++#define dev_is_pf(d) (false)
++#define dev_num_vf(d) (0)
+ #endif /* CONFIG_PCI */
+
+ /* Include architecture-dependent settings and functions */
+@@ -1279,6 +1285,7 @@ void __iomem *pci_ioremap_bar(struct pci_dev *pdev, int bar);
+ extern int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+ extern void pci_disable_sriov(struct pci_dev *dev);
+ extern irqreturn_t pci_sriov_migration(struct pci_dev *dev);
++extern int pci_num_vf(struct pci_dev *dev);
+ #else
+ static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
+ {
+@@ -1291,6 +1298,10 @@ static inline irqreturn_t pci_sriov_migration(struct pci_dev *dev)
+ {
+ return IRQ_NONE;
+ }
++static inline int pci_num_vf(struct pci_dev *dev)
++{
++ return 0;
++}
+ #endif
+
+ #if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
+diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
+index 67325bf..c398cc3 100644
+--- a/include/linux/pci_ids.h
++++ b/include/linux/pci_ids.h
+@@ -2712,3 +2712,6 @@
+ #define PCI_DEVICE_ID_RME_DIGI32 0x9896
+ #define PCI_DEVICE_ID_RME_DIGI32_PRO 0x9897
+ #define PCI_DEVICE_ID_RME_DIGI32_8 0x9898
++
++#define PCI_VENDOR_ID_XEN 0x5853
++#define PCI_DEVICE_ID_XEN_PLATFORM 0x0001
+diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
+index 73b1f1c..113585a 100644
+--- a/include/linux/swiotlb.h
++++ b/include/linux/swiotlb.h
+@@ -7,6 +7,8 @@ struct device;
+ struct dma_attrs;
+ struct scatterlist;
+
++extern int swiotlb_force;
++
+ /*
+ * Maximum allowable number of contiguous slabs to map,
+ * must be a power of 2. What is the appropriate value ?
+@@ -20,9 +22,46 @@ struct scatterlist;
+ */
+ #define IO_TLB_SHIFT 11
+
+-extern void
+-swiotlb_init(void);
+-
++/* swiotlb-core.c */
++extern void swiotlb_init(int verbose);
++#ifdef CONFIG_SWIOTLB
++extern void __init swiotlb_free(void);
++#else
++static inline void swiotlb_free(void) { }
++#endif
++extern void swiotlb_print_info(void);
++
++/* swiotlb-core.c: Internal book-keeping functions.
++ * Must be linked against the library to take advantage of them. */
++#ifdef CONFIG_SWIOTLB
++/*
++ * Enumeration for sync targets
++ */
++enum dma_sync_target {
++ SYNC_FOR_CPU = 0,
++ SYNC_FOR_DEVICE = 1,
++};
++extern char *io_tlb_start;
++extern char *io_tlb_end;
++extern unsigned long io_tlb_nslabs;
++extern void *io_tlb_overflow_buffer;
++extern unsigned long io_tlb_overflow;
++extern int is_swiotlb_buffer(phys_addr_t paddr);
++extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
++ enum dma_data_direction dir);
++extern void *do_map_single(struct device *hwdev, phys_addr_t phys,
++ unsigned long start_dma_addr, size_t size, int dir);
++
++extern void do_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
++ int dir);
++
++extern void do_sync_single(struct device *hwdev, char *dma_addr, size_t size,
++ int dir, int target);
++extern void swiotlb_full(struct device *dev, size_t size, int dir, int do_panic);
++extern void __init swiotlb_init_early(size_t default_size, int verbose);
++#endif
++
++/* swiotlb.c: dma_ops functions. */
+ extern void
+ *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags);
+@@ -88,4 +127,74 @@ swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+ extern int
+ swiotlb_dma_supported(struct device *hwdev, u64 mask);
+
++/* swiotlb-xen.c: dma_ops functions. */
++extern void xen_swiotlb_init(int verbose);
++extern void
++*xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
++ dma_addr_t *dma_handle, gfp_t flags);
++
++extern void
++xen_swiotlb_free_coherent(struct device *hwdev, size_t size,
++ void *vaddr, dma_addr_t dma_handle);
++
++extern dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++extern void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++
++extern int
++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nents,
++ int direction);
++
++extern void
++xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
++ int direction);
++
++extern int
++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++
++extern void
++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs);
++
++extern void
++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir);
++
++extern void
++xen_swiotlb_sync_single_range_for_device(struct device *hwdev,
++ dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir);
++
++extern int
++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
++
++extern int
++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask);
++
++
+ #endif /* __LINUX_SWIOTLB_H */
+diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
+index 3c123c3..1a2ba21 100644
+--- a/include/linux/vmalloc.h
++++ b/include/linux/vmalloc.h
+@@ -7,6 +7,8 @@
+
+ struct vm_area_struct; /* vma defining user mapping in mm_types.h */
+
++extern bool vmap_lazy_unmap;
++
+ /* bits in flags of vmalloc's vm_struct below */
+ #define VM_IOREMAP 0x00000001 /* ioremap() and friends */
+ #define VM_ALLOC 0x00000002 /* vmalloc() */
+diff --git a/include/xen/Kbuild b/include/xen/Kbuild
+index 4e65c16..84ad8f0 100644
+--- a/include/xen/Kbuild
++++ b/include/xen/Kbuild
+@@ -1 +1,2 @@
+ header-y += evtchn.h
++header-y += privcmd.h
+diff --git a/include/xen/acpi.h b/include/xen/acpi.h
+new file mode 100644
+index 0000000..279142d
+--- /dev/null
++++ b/include/xen/acpi.h
+@@ -0,0 +1,106 @@
++#ifndef _XEN_ACPI_H
++#define _XEN_ACPI_H
++
++#include <linux/types.h>
++#include <acpi/acpi_drivers.h>
++#include <acpi/processor.h>
++#include <xen/xen.h>
++
++#ifdef CONFIG_XEN_S3
++#include <asm/xen/hypervisor.h>
++
++static inline bool xen_pv_acpi(void)
++{
++ return xen_pv_domain();
++}
++#else
++static inline bool xen_pv_acpi(void)
++{
++ return false;
++}
++#endif
++
++int acpi_notify_hypervisor_state(u8 sleep_state,
++ u32 pm1a_cnt, u32 pm1b_cnd);
++
++/*
++ * Following are interfaces for xen acpi processor control
++ */
++
++/* Events notified to xen */
++#define PROCESSOR_PM_INIT 1
++#define PROCESSOR_PM_CHANGE 2
++#define PROCESSOR_HOTPLUG 3
++
++/* Objects for the PM events */
++#define PM_TYPE_IDLE 0
++#define PM_TYPE_PERF 1
++#define PM_TYPE_THR 2
++#define PM_TYPE_MAX 3
++
++#define XEN_MAX_ACPI_ID 255
++
++/* Processor hotplug events */
++#define HOTPLUG_TYPE_ADD 0
++#define HOTPLUG_TYPE_REMOVE 1
++
++int xen_acpi_processor_init(void);
++void xen_acpi_processor_exit(void);
++
++int xen_acpi_processor_power_init(struct acpi_processor *pr,
++ struct acpi_device *device);
++int xen_acpi_processor_cst_has_changed(struct acpi_processor *pr);
++
++void xen_arch_acpi_processor_init_pdc(struct acpi_processor *pr);
++
++#ifdef CONFIG_CPU_FREQ
++int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr);
++int xen_acpi_processor_get_performance(struct acpi_processor *pr);
++#else
++static inline int xen_acpi_processor_ppc_has_changed(struct acpi_processor *pr)
++{
++ return acpi_processor_ppc_has_changed(pr);
++}
++static inline int xen_acpi_processor_get_performance(struct acpi_processor *pr)
++{
++ printk(KERN_WARNING
++ "Warning: xen_acpi_processor_get_performance not supported\n"
++ "Consider compiling CPUfreq support into your kernel.\n");
++ return 0;
++}
++#endif
++
++#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
++ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
++int xen_hotadd_memory(struct acpi_memory_device *mem_device);
++#endif
++
++#if defined(CONFIG_ACPI_PROCESSOR_XEN) || \
++defined(CONFIG_ACPI_PROCESSOR_XEN_MODULE)
++
++struct processor_cntl_xen_ops {
++ /* Transfer processor PM events to xen */
++int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event);
++ /* Notify physical processor status to xen */
++ int (*hotplug)(struct acpi_processor *pr, int type);
++};
++
++extern int processor_cntl_xen_notify(struct acpi_processor *pr,
++ int event, int type);
++extern int processor_cntl_xen_power_cache(int cpu, int cx,
++ struct acpi_power_register *reg);
++#else
++
++static inline int processor_cntl_xen_notify(struct acpi_processor *pr,
++ int event, int type)
++{
++ return 0;
++}
++static inline int processor_cntl_xen_power_cache(int cpu, int cx,
++ struct acpi_power_register *reg)
++{
++ return 0;
++}
++#endif /* CONFIG_ACPI_PROCESSOR_XEN */
++
++#endif /* _XEN_ACPI_H */
+diff --git a/include/xen/balloon.h b/include/xen/balloon.h
+new file mode 100644
+index 0000000..e751514
+--- /dev/null
++++ b/include/xen/balloon.h
+@@ -0,0 +1,8 @@
++#ifndef _XEN_BALLOON_H
++#define _XEN_BALLOON_H
++
++/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
++struct page **alloc_empty_pages_and_pagevec(int nr_pages);
++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
++
++#endif
+diff --git a/include/xen/blkif.h b/include/xen/blkif.h
+new file mode 100644
+index 0000000..7172081
+--- /dev/null
++++ b/include/xen/blkif.h
+@@ -0,0 +1,123 @@
++/*
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_BLKIF_H__
++#define __XEN_BLKIF_H__
++
++#include <xen/interface/xen.h>
++#include <xen/interface/io/ring.h>
++#include <xen/interface/io/blkif.h>
++#include <xen/interface/io/protocols.h>
++
++/* Not a real protocol. Used to generate ring structs which contain
++ * the elements common to all protocols only. This way we get a
++ * compiler-checkable way to use common struct elements, so we can
++ * avoid using switch(protocol) in a number of places. */
++struct blkif_common_request {
++ char dummy;
++};
++struct blkif_common_response {
++ char dummy;
++};
++
++/* i386 protocol version */
++#pragma pack(push, 4)
++struct blkif_x86_32_request {
++ uint8_t operation; /* BLKIF_OP_??? */
++ uint8_t nr_segments; /* number of segments */
++ blkif_vdev_t handle; /* only for read/write requests */
++ uint64_t id; /* private guest value, echoed in resp */
++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++};
++struct blkif_x86_32_response {
++ uint64_t id; /* copied from request */
++ uint8_t operation; /* copied from request */
++ int16_t status; /* BLKIF_RSP_??? */
++};
++typedef struct blkif_x86_32_request blkif_x86_32_request_t;
++typedef struct blkif_x86_32_response blkif_x86_32_response_t;
++#pragma pack(pop)
++
++/* x86_64 protocol version */
++struct blkif_x86_64_request {
++ uint8_t operation; /* BLKIF_OP_??? */
++ uint8_t nr_segments; /* number of segments */
++ blkif_vdev_t handle; /* only for read/write requests */
++ uint64_t __attribute__((__aligned__(8))) id;
++ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
++ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
++};
++struct blkif_x86_64_response {
++ uint64_t __attribute__((__aligned__(8))) id;
++ uint8_t operation; /* copied from request */
++ int16_t status; /* BLKIF_RSP_??? */
++};
++typedef struct blkif_x86_64_request blkif_x86_64_request_t;
++typedef struct blkif_x86_64_response blkif_x86_64_response_t;
++
++DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
++DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
++DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
++
++union blkif_back_rings {
++ struct blkif_back_ring native;
++ struct blkif_common_back_ring common;
++ struct blkif_x86_32_back_ring x86_32;
++ struct blkif_x86_64_back_ring x86_64;
++};
++
++enum blkif_protocol {
++ BLKIF_PROTOCOL_NATIVE = 1,
++ BLKIF_PROTOCOL_X86_32 = 2,
++ BLKIF_PROTOCOL_X86_64 = 3,
++};
++
++static void inline blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src)
++{
++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ dst->operation = src->operation;
++ dst->nr_segments = src->nr_segments;
++ dst->handle = src->handle;
++ dst->id = src->id;
++ dst->sector_number = src->sector_number;
++ barrier();
++ if (n > dst->nr_segments)
++ n = dst->nr_segments;
++ for (i = 0; i < n; i++)
++ dst->seg[i] = src->seg[i];
++}
++
++static void inline blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src)
++{
++ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
++ dst->operation = src->operation;
++ dst->nr_segments = src->nr_segments;
++ dst->handle = src->handle;
++ dst->id = src->id;
++ dst->sector_number = src->sector_number;
++ barrier();
++ if (n > dst->nr_segments)
++ n = dst->nr_segments;
++ for (i = 0; i < n; i++)
++ dst->seg[i] = src->seg[i];
++}
++
++#endif /* __XEN_BLKIF_H__ */
+diff --git a/include/xen/events.h b/include/xen/events.h
+index e68d59a..7e17e2a 100644
+--- a/include/xen/events.h
++++ b/include/xen/events.h
+@@ -12,6 +12,8 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname,
+ void *dev_id);
++int bind_virq_to_irq(unsigned int virq, unsigned int cpu);
++
+ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+ irq_handler_t handler,
+ unsigned long irqflags, const char *devname,
+@@ -22,6 +24,12 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id);
++int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
++ unsigned int remote_port,
++ irq_handler_t handler,
++ unsigned long irqflags,
++ const char *devname,
++ void *dev_id);
+
+ /*
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+@@ -53,7 +61,42 @@ bool xen_test_irq_pending(int irq);
+ irq will be disabled so it won't deliver an interrupt. */
+ void xen_poll_irq(int irq);
+
++/* Poll waiting for an irq to become pending with a timeout. In the usual case, the
++ irq will be disabled so it won't deliver an interrupt. */
++void xen_poll_irq_timeout(int irq, u64 timeout);
++
+ /* Determine the IRQ which is bound to an event channel */
+ unsigned irq_from_evtchn(unsigned int evtchn);
+
++/* Allocate an irq for a physical interrupt, given a gsi. "Legacy"
++ GSIs are identity mapped; others are dynamically allocated as
++ usual. */
++int xen_allocate_pirq(unsigned gsi, int shareable, char *name);
++
++/* De-allocates the above mentioned physical interrupt. */
++int xen_destroy_irq(int irq);
++
++/* Return vector allocated to pirq */
++int xen_vector_from_irq(unsigned pirq);
++
++/* Return gsi allocated to pirq */
++int xen_gsi_from_irq(unsigned pirq);
++
++#ifdef CONFIG_XEN_DOM0_PCI
++void xen_setup_pirqs(void);
++#else
++static inline void xen_setup_pirqs(void)
++{
++}
++#endif
++
++/* Determine whether to ignore this IRQ if passed to a guest. */
++int xen_ignore_irq(int irq);
++/* Xen HVM evtchn vector callback */
++extern void xen_hvm_callback_vector(void);
++extern int xen_have_vector_callback;
++int xen_set_callback_via(uint64_t via);
++void xen_evtchn_do_upcall(struct pt_regs *regs);
++void xen_hvm_evtchn_do_upcall(void);
++
+ #endif /* _XEN_EVENTS_H */
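As a usage sketch, a backend driver would typically call the new bind_interdomain_evtchn_to_irqhandler() with the frontend's domid and event-channel port (learned via xenbus) and keep the returned IRQ for later unbinding. Everything named demo_* below is illustrative, not part of the patch:

#include <linux/interrupt.h>
#include <xen/events.h>

/* Hypothetical handler invoked whenever the frontend sends a notification. */
static irqreturn_t demo_backend_interrupt(int irq, void *dev_id)
{
	/* schedule the real work here */
	return IRQ_HANDLED;
}

/* Returns the bound IRQ on success, or a negative errno. */
static int demo_backend_bind(unsigned int remote_domid,
			     unsigned int remote_port, void *ctx)
{
	return bind_interdomain_evtchn_to_irqhandler(remote_domid, remote_port,
						     demo_backend_interrupt, 0,
						     "demo-backend", ctx);
}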
+diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h
+new file mode 100644
+index 0000000..8bd1467
+--- /dev/null
++++ b/include/xen/gntdev.h
+@@ -0,0 +1,119 @@
++/******************************************************************************
++ * gntdev.h
++ *
++ * Interface to /dev/xen/gntdev.
++ *
++ * Copyright (c) 2007, D G Murray
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_GNTDEV_H__
++#define __LINUX_PUBLIC_GNTDEV_H__
++
++struct ioctl_gntdev_grant_ref {
++ /* The domain ID of the grant to be mapped. */
++ uint32_t domid;
++ /* The grant reference of the grant to be mapped. */
++ uint32_t ref;
++};
++
++/*
++ * Inserts the grant references into the mapping table of an instance
++ * of gntdev. N.B. This does not perform the mapping, which is deferred
++ * until mmap() is called with @index as the offset.
++ */
++#define IOCTL_GNTDEV_MAP_GRANT_REF \
++_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
++struct ioctl_gntdev_map_grant_ref {
++ /* IN parameters */
++ /* The number of grants to be mapped. */
++ uint32_t count;
++ uint32_t pad;
++ /* OUT parameters */
++ /* The offset to be used on a subsequent call to mmap(). */
++ uint64_t index;
++ /* Variable IN parameter. */
++ /* Array of grant references, of size @count. */
++ struct ioctl_gntdev_grant_ref refs[1];
++};
++
++/*
++ * Removes the grant references from the mapping table of an instance
++ * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
++ * before this ioctl is called, or an error will result.
++ */
++#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
++_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
++struct ioctl_gntdev_unmap_grant_ref {
++ /* IN parameters */
++ /* The offset returned by the corresponding map operation. */
++ uint64_t index;
++ /* The number of pages to be unmapped. */
++ uint32_t count;
++ uint32_t pad;
++};
++
++/*
++ * Returns the offset in the driver's address space that corresponds
++ * to @vaddr. This can be used to perform a munmap(), followed by an
++ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
++ * the caller. The number of pages that were allocated at the same time as
++ * @vaddr is returned in @count.
++ *
++ * N.B. Where more than one page has been mapped into a contiguous range, the
++ * supplied @vaddr must correspond to the start of the range; otherwise
++ * an error will result. It is only possible to munmap() the entire
++ * contiguously-allocated range at once, and not any subrange thereof.
++ */
++#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
++_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
++struct ioctl_gntdev_get_offset_for_vaddr {
++ /* IN parameters */
++ /* The virtual address of the first mapped page in a range. */
++ uint64_t vaddr;
++ /* OUT parameters */
++ /* The offset that was used in the initial mmap() operation. */
++ uint64_t offset;
++ /* The number of pages mapped in the VM area that begins at @vaddr. */
++ uint32_t count;
++ uint32_t pad;
++};
++
++/*
++ * Sets the maximum number of grants that may be mapped at once by this gntdev
++ * instance.
++ *
++ * N.B. This must be called before any other ioctl is performed on the device.
++ */
++#define IOCTL_GNTDEV_SET_MAX_GRANTS \
++_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
++struct ioctl_gntdev_set_max_grants {
++ /* IN parameter */
++ /* The maximum number of grants that may be mapped at once. */
++ uint32_t count;
++};
++
++#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
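Taken together, the ioctls above give user space a map/mmap/unmap life cycle. A rough user-space sketch, assuming the declarations above are available to the program and that the device node is /dev/xen/gntdev as stated in the header comment (error handling abbreviated):

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

static int demo_map_one_grant(uint32_t domid, uint32_t ref)
{
	struct ioctl_gntdev_map_grant_ref map;
	struct ioctl_gntdev_unmap_grant_ref unmap;
	void *addr;
	int fd = open("/dev/xen/gntdev", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&map, 0, sizeof(map));
	map.count = 1;
	map.refs[0].domid = domid;
	map.refs[0].ref = ref;
	if (ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) < 0) {
		close(fd);
		return -1;
	}

	/* The actual mapping happens here, at the offset returned above. */
	addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, map.index);
	if (addr != MAP_FAILED) {
		/* ... use the granted page ... */
		munmap(addr, 4096);
	}

	unmap.index = map.index;
	unmap.count = 1;
	ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap);
	close(fd);
	return 0;
}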
+diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
+index a40f1cd..871b553 100644
+--- a/include/xen/grant_table.h
++++ b/include/xen/grant_table.h
+@@ -37,10 +37,16 @@
+ #ifndef __ASM_GNTTAB_H__
+ #define __ASM_GNTTAB_H__
+
+-#include <asm/xen/hypervisor.h>
++#include <asm/page.h>
++
++#include <xen/interface/xen.h>
+ #include <xen/interface/grant_table.h>
++
++#include <asm/xen/hypervisor.h>
+ #include <asm/xen/grant_table.h>
+
++#include <xen/features.h>
++
+ /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */
+ #define NR_GRANT_FRAMES 4
+
+@@ -51,6 +57,9 @@ struct gnttab_free_callback {
+ u16 count;
+ };
+
++void gnttab_reset_grant_page(struct page *page);
++
++int gnttab_init(void);
+ int gnttab_suspend(void);
+ int gnttab_resume(void);
+
+@@ -80,6 +89,8 @@ unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
+
+ int gnttab_query_foreign_access(grant_ref_t ref);
+
++int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
++
+ /*
+ * operations on reserved batches of grant references
+ */
+@@ -106,12 +117,46 @@ void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+ void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
+ unsigned long pfn);
+
++static inline void
++gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
++ uint32_t flags, grant_ref_t ref, domid_t domid)
++{
++ if (flags & GNTMAP_contains_pte)
++ map->host_addr = addr;
++ else if (xen_feature(XENFEAT_auto_translated_physmap))
++ map->host_addr = __pa(addr);
++ else
++ map->host_addr = addr;
++
++ map->flags = flags;
++ map->ref = ref;
++ map->dom = domid;
++}
++
++static inline void
++gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr,
++ uint32_t flags, grant_handle_t handle)
++{
++ if (flags & GNTMAP_contains_pte)
++ unmap->host_addr = addr;
++ else if (xen_feature(XENFEAT_auto_translated_physmap))
++ unmap->host_addr = __pa(addr);
++ else
++ unmap->host_addr = addr;
++
++ unmap->handle = handle;
++ unmap->dev_bus_addr = 0;
++}
++
+ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ struct grant_entry **__shared);
+ void arch_gnttab_unmap_shared(struct grant_entry *shared,
+ unsigned long nr_gframes);
+
++extern unsigned long xen_hvm_resume_frames;
++unsigned int gnttab_max_grant_frames(void);
++
+ #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr))
+
+ #endif /* __ASM_GNTTAB_H__ */
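The new gnttab_set_map_op()/gnttab_set_unmap_op() helpers fill the hypercall arguments consistently for both PV and auto-translated guests. A small kernel-side sketch of mapping and unmapping one grant, assuming the HYPERVISOR_grant_table_op() wrapper and the GNTMAP_host_map/GNTST_okay definitions from the grant-table interface header:

#include <linux/errno.h>
#include <xen/grant_table.h>
#include <xen/interface/grant_table.h>
#include <asm/xen/hypercall.h>

static int demo_map_grant(unsigned long vaddr, grant_ref_t ref,
			  domid_t otherend, grant_handle_t *handle)
{
	struct gnttab_map_grant_ref op;

	gnttab_set_map_op(&op, vaddr, GNTMAP_host_map, ref, otherend);
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
		return -EFAULT;		/* hypercall itself failed */
	if (op.status != GNTST_okay)
		return -EINVAL;		/* grant could not be mapped */
	*handle = op.handle;
	return 0;
}

static void demo_unmap_grant(unsigned long vaddr, grant_handle_t handle)
{
	struct gnttab_unmap_grant_ref op;

	gnttab_set_unmap_op(&op, vaddr, GNTMAP_host_map, handle);
	HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
}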
+diff --git a/include/xen/hvm.h b/include/xen/hvm.h
+new file mode 100644
+index 0000000..b193fa2
+--- /dev/null
++++ b/include/xen/hvm.h
+@@ -0,0 +1,30 @@
++/* Simple wrappers around HVM functions */
++#ifndef XEN_HVM_H__
++#define XEN_HVM_H__
++
++#include <xen/interface/hvm/params.h>
++#include <asm/xen/hypercall.h>
++
++static inline int hvm_get_parameter(int idx, uint64_t *value)
++{
++ struct xen_hvm_param xhv;
++ int r;
++
++ xhv.domid = DOMID_SELF;
++ xhv.index = idx;
++ r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
++ if (r < 0) {
++ printk(KERN_ERR "Cannot get hvm parameter %d: %d!\n",
++ idx, r);
++ return r;
++ }
++ *value = xhv.value;
++ return r;
++}
++
++#define HVM_CALLBACK_VIA_TYPE_VECTOR 0x2
++#define HVM_CALLBACK_VIA_TYPE_SHIFT 56
++#define HVM_CALLBACK_VECTOR(x) (((uint64_t)HVM_CALLBACK_VIA_TYPE_VECTOR)<<\
++ HVM_CALLBACK_VIA_TYPE_SHIFT | (x))
++
++#endif /* XEN_HVM_H__ */
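hvm_get_parameter() above has an obvious "set" counterpart built from the same pieces; the sketch below shows it, and how the HVM_CALLBACK_VECTOR() encoding would be fed to HVM_PARAM_CALLBACK_IRQ. hvm_set_parameter() itself is hypothetical; only struct xen_hvm_param, HVMOP_set_param and the macros come from the patch:

#include <xen/interface/xen.h>		/* DOMID_SELF */
#include <xen/hvm.h>

static inline int hvm_set_parameter(int idx, uint64_t value)
{
	struct xen_hvm_param xhv;

	xhv.domid = DOMID_SELF;
	xhv.index = idx;
	xhv.value = value;
	return HYPERVISOR_hvm_op(HVMOP_set_param, &xhv);
}

/* For example, route CPU0 event-channel notifications to vector 0xf3:
 *	hvm_set_parameter(HVM_PARAM_CALLBACK_IRQ, HVM_CALLBACK_VECTOR(0xf3));
 */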
+diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h
+index f51b641..70d2563 100644
+--- a/include/xen/interface/features.h
++++ b/include/xen/interface/features.h
+@@ -41,6 +41,12 @@
+ /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
+ #define XENFEAT_mmu_pt_update_preserve_ad 5
+
++/* x86: Does this Xen host support the HVM callback vector type? */
++#define XENFEAT_hvm_callback_vector 8
++
++/* x86: pvclock algorithm is safe to use on HVM */
++#define XENFEAT_hvm_safe_pvclock 9
++
+ #define XENFEAT_NR_SUBMAPS 1
+
+ #endif /* __XEN_PUBLIC_FEATURES_H__ */
+diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h
+index 39da93c..c704fe5 100644
+--- a/include/xen/interface/grant_table.h
++++ b/include/xen/interface/grant_table.h
+@@ -28,6 +28,7 @@
+ #ifndef __XEN_PUBLIC_GRANT_TABLE_H__
+ #define __XEN_PUBLIC_GRANT_TABLE_H__
+
++#include <xen/interface/xen.h>
+
+ /***********************************
+ * GRANT TABLE REPRESENTATION
+@@ -321,6 +322,28 @@ struct gnttab_query_size {
+ DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
+
+ /*
++ * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
++ * tracked by <handle> but atomically replace the page table entry with one
++ * pointing to the machine address under <new_addr>. <new_addr> will be
++ * redirected to the null entry.
++ * NOTES:
++ * 1. The call may fail in an undefined manner if either mapping is not
++ * tracked by <handle>.
++ * 2. After executing a batch of unmaps, it is guaranteed that no stale
++ * mappings will remain in the device or host TLBs.
++ */
++#define GNTTABOP_unmap_and_replace 7
++struct gnttab_unmap_and_replace {
++ /* IN parameters. */
++ uint64_t host_addr;
++ uint64_t new_addr;
++ grant_handle_t handle;
++ /* OUT parameters. */
++ int16_t status; /* GNTST_* */
++};
++DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
++
++/*
+ * Bitfield values for update_pin_status.flags.
+ */
+ /* Map the grant entry for access by I/O devices. */
+diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h
+new file mode 100644
+index 0000000..a4827f4
+--- /dev/null
++++ b/include/xen/interface/hvm/hvm_op.h
+@@ -0,0 +1,46 @@
++/*
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
++#define __XEN_PUBLIC_HVM_HVM_OP_H__
++
++/* Get/set subcommands: the second argument of the hypercall is a
++ * pointer to a xen_hvm_param struct. */
++#define HVMOP_set_param 0
++#define HVMOP_get_param 1
++struct xen_hvm_param {
++ domid_t domid; /* IN */
++ uint32_t index; /* IN */
++ uint64_t value; /* IN/OUT */
++};
++DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param);
++
++/* Hint from PV drivers for pagetable destruction. */
++#define HVMOP_pagetable_dying 9
++struct xen_hvm_pagetable_dying {
++ /* Domain with a pagetable about to be destroyed. */
++ domid_t domid;
++ /* guest physical address of the toplevel pagetable dying */
++ aligned_u64 gpa;
++};
++typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t);
++
++#endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
+diff --git a/include/xen/interface/hvm/params.h b/include/xen/interface/hvm/params.h
+new file mode 100644
+index 0000000..1888d8c
+--- /dev/null
++++ b/include/xen/interface/hvm/params.h
+@@ -0,0 +1,95 @@
++/*
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef __XEN_PUBLIC_HVM_PARAMS_H__
++#define __XEN_PUBLIC_HVM_PARAMS_H__
++
++#include "hvm_op.h"
++
++/*
++ * Parameter space for HVMOP_{set,get}_param.
++ */
++
++/*
++ * How should CPU0 event-channel notifications be delivered?
++ * val[63:56] == 0: val[55:0] is a delivery GSI (Global System Interrupt).
++ * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows:
++ * Domain = val[47:32], Bus = val[31:16],
++ * DevFn = val[15: 8], IntX = val[ 1: 0]
++ * val[63:56] == 2: val[7:0] is a vector number.
++ * If val == 0 then CPU0 event-channel notifications are not delivered.
++ */
++#define HVM_PARAM_CALLBACK_IRQ 0
++
++#define HVM_PARAM_STORE_PFN 1
++#define HVM_PARAM_STORE_EVTCHN 2
++
++#define HVM_PARAM_PAE_ENABLED 4
++
++#define HVM_PARAM_IOREQ_PFN 5
++
++#define HVM_PARAM_BUFIOREQ_PFN 6
++
++/*
++ * Set mode for virtual timers (currently x86 only):
++ * delay_for_missed_ticks (default):
++ * Do not advance a vcpu's time beyond the correct delivery time for
++ * interrupts that have been missed due to preemption. Deliver missed
++ * interrupts when the vcpu is rescheduled and advance the vcpu's virtual
++ * time stepwise for each one.
++ * no_delay_for_missed_ticks:
++ * As above, missed interrupts are delivered, but guest time always tracks
++ * wallclock (i.e., real) time while doing so.
++ * no_missed_ticks_pending:
++ * No missed interrupts are held pending. Instead, to ensure ticks are
++ * delivered at some non-zero rate, if we detect missed ticks then the
++ * internal tick alarm is not disabled if the VCPU is preempted during the
++ * next tick period.
++ * one_missed_tick_pending:
++ * Missed interrupts are collapsed together and delivered as one 'late tick'.
++ * Guest time always tracks wallclock (i.e., real) time.
++ */
++#define HVM_PARAM_TIMER_MODE 10
++#define HVMPTM_delay_for_missed_ticks 0
++#define HVMPTM_no_delay_for_missed_ticks 1
++#define HVMPTM_no_missed_ticks_pending 2
++#define HVMPTM_one_missed_tick_pending 3
++
++/* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
++#define HVM_PARAM_HPET_ENABLED 11
++
++/* Identity-map page directory used by Intel EPT when CR0.PG=0. */
++#define HVM_PARAM_IDENT_PT 12
++
++/* Device Model domain, defaults to 0. */
++#define HVM_PARAM_DM_DOMAIN 13
++
++/* ACPI S state: currently support S0 and S3 on x86. */
++#define HVM_PARAM_ACPI_S_STATE 14
++
++/* TSS used on Intel when CR0.PE=0. */
++#define HVM_PARAM_VM86_TSS 15
++
++/* Boolean: Enable aligning all periodic vpts to reduce interrupts */
++#define HVM_PARAM_VPT_ALIGN 16
++
++#define HVM_NR_PARAMS 17
++
++#endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
+diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
+index c2d1fa4..68dd2b4 100644
+--- a/include/xen/interface/io/blkif.h
++++ b/include/xen/interface/io/blkif.h
+@@ -91,4 +91,25 @@ DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
+ #define VDISK_REMOVABLE 0x2
+ #define VDISK_READONLY 0x4
+
++/* Xen-defined major numbers for virtual disks; they look strangely
++ * familiar */
++#define XEN_IDE0_MAJOR 3
++#define XEN_IDE1_MAJOR 22
++#define XEN_SCSI_DISK0_MAJOR 8
++#define XEN_SCSI_DISK1_MAJOR 65
++#define XEN_SCSI_DISK2_MAJOR 66
++#define XEN_SCSI_DISK3_MAJOR 67
++#define XEN_SCSI_DISK4_MAJOR 68
++#define XEN_SCSI_DISK5_MAJOR 69
++#define XEN_SCSI_DISK6_MAJOR 70
++#define XEN_SCSI_DISK7_MAJOR 71
++#define XEN_SCSI_DISK8_MAJOR 128
++#define XEN_SCSI_DISK9_MAJOR 129
++#define XEN_SCSI_DISK10_MAJOR 130
++#define XEN_SCSI_DISK11_MAJOR 131
++#define XEN_SCSI_DISK12_MAJOR 132
++#define XEN_SCSI_DISK13_MAJOR 133
++#define XEN_SCSI_DISK14_MAJOR 134
++#define XEN_SCSI_DISK15_MAJOR 135
++
+ #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
+diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h
+index 518481c..8309344 100644
+--- a/include/xen/interface/io/netif.h
++++ b/include/xen/interface/io/netif.h
+@@ -131,6 +131,10 @@ struct xen_netif_rx_request {
+ #define _NETRXF_extra_info (3)
+ #define NETRXF_extra_info (1U<<_NETRXF_extra_info)
+
++/* GSO Prefix descriptor. */
++#define _NETRXF_gso_prefix (4)
++#define NETRXF_gso_prefix (1U<<_NETRXF_gso_prefix)
++
+ struct xen_netif_rx_response {
+ uint16_t id;
+ uint16_t offset; /* Offset in page of start of received packet */
+diff --git a/include/xen/interface/io/pciif.h b/include/xen/interface/io/pciif.h
+new file mode 100644
+index 0000000..c4177f3
+--- /dev/null
++++ b/include/xen/interface/io/pciif.h
+@@ -0,0 +1,124 @@
++/*
++ * PCI Backend/Frontend Common Data Structures & Macros
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
++ */
++#ifndef __XEN_PCI_COMMON_H__
++#define __XEN_PCI_COMMON_H__
++
++/* Be sure to bump this number if you change this file */
++#define XEN_PCI_MAGIC "7"
++
++/* xen_pci_sharedinfo flags */
++#define _XEN_PCIF_active (0)
++#define XEN_PCIF_active (1<<_XEN_PCIF_active)
++#define _XEN_PCIB_AERHANDLER (1)
++#define XEN_PCIB_AERHANDLER (1<<_XEN_PCIB_AERHANDLER)
++#define _XEN_PCIB_active (2)
++#define XEN_PCIB_active (1<<_XEN_PCIB_active)
++
++/* xen_pci_op commands */
++#define XEN_PCI_OP_conf_read (0)
++#define XEN_PCI_OP_conf_write (1)
++#define XEN_PCI_OP_enable_msi (2)
++#define XEN_PCI_OP_disable_msi (3)
++#define XEN_PCI_OP_enable_msix (4)
++#define XEN_PCI_OP_disable_msix (5)
++#define XEN_PCI_OP_aer_detected (6)
++#define XEN_PCI_OP_aer_resume (7)
++#define XEN_PCI_OP_aer_mmio (8)
++#define XEN_PCI_OP_aer_slotreset (9)
++
++/* xen_pci_op error numbers */
++#define XEN_PCI_ERR_success (0)
++#define XEN_PCI_ERR_dev_not_found (-1)
++#define XEN_PCI_ERR_invalid_offset (-2)
++#define XEN_PCI_ERR_access_denied (-3)
++#define XEN_PCI_ERR_not_implemented (-4)
++/* XEN_PCI_ERR_op_failed - backend failed to complete the operation */
++#define XEN_PCI_ERR_op_failed (-5)
++
++/*
++ * It should be (PAGE_SIZE - sizeof(struct xen_pci_op)) / sizeof(struct msix_entry).
++ * Should not exceed 128.
++ */
++#define SH_INFO_MAX_VEC 128
++
++struct xen_msix_entry {
++ uint16_t vector;
++ uint16_t entry;
++};
++struct xen_pci_op {
++ /* IN: what action to perform: XEN_PCI_OP_* */
++ uint32_t cmd;
++
++ /* OUT: will contain an error number (if any) from errno.h */
++ int32_t err;
++
++ /* IN: which device to touch */
++ uint32_t domain; /* PCI Domain/Segment */
++ uint32_t bus;
++ uint32_t devfn;
++
++ /* IN: which configuration registers to touch */
++ int32_t offset;
++ int32_t size;
++
++ /* IN/OUT: Contains the result after a READ or the value to WRITE */
++ uint32_t value;
++ /* IN: Contains extra info for this operation */
++ uint32_t info;
++ /* IN: param for MSI-X */
++ struct xen_msix_entry msix_entries[SH_INFO_MAX_VEC];
++};
++
++/* Used for PCIe AER handling */
++struct xen_pcie_aer_op
++{
++
++ /* IN: what action to perform: XEN_PCI_OP_* */
++ uint32_t cmd;
++ /*IN/OUT: return aer_op result or carry error_detected state as input*/
++ int32_t err;
++
++ /* IN: which device to touch */
++ uint32_t domain; /* PCI Domain/Segment*/
++ uint32_t bus;
++ uint32_t devfn;
++};
++struct xen_pci_sharedinfo {
++ /* flags - XEN_PCIF_* */
++ uint32_t flags;
++ struct xen_pci_op op;
++ struct xen_pcie_aer_op aer_op;
++};
++
++#endif /* __XEN_PCI_COMMON_H__ */
++
++/*
++ * Local variables:
++ * mode: C
++ * c-set-style: "BSD"
++ * c-basic-offset: 4
++ * tab-width: 4
++ * indent-tabs-mode: nil
++ * End:
++ */
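As an illustration of how the shared structure is meant to be used, a frontend fills sh->op and raises the active flag before notifying the backend. The sketch below prepares a config-space read; locking and the event-channel kick are omitted, and the function itself is illustrative:

static void demo_prepare_conf_read(struct xen_pci_sharedinfo *sh,
				   uint32_t domain, uint32_t bus,
				   uint32_t devfn, int32_t offset, int32_t size)
{
	struct xen_pci_op *op = &sh->op;

	op->cmd = XEN_PCI_OP_conf_read;
	op->domain = domain;
	op->bus = bus;
	op->devfn = devfn;
	op->offset = offset;
	op->size = size;
	op->value = 0;		/* backend writes the result here */

	/* tell the backend there is an active request */
	sh->flags |= XEN_PCIF_active;
}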
+diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h
+index e8cbf43..c9ba846 100644
+--- a/include/xen/interface/io/ring.h
++++ b/include/xen/interface/io/ring.h
+@@ -24,8 +24,15 @@ typedef unsigned int RING_IDX;
+ * A ring contains as many entries as will fit, rounded down to the nearest
+ * power of two (so we can mask with (size-1) to loop around).
+ */
+-#define __RING_SIZE(_s, _sz) \
+- (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
++#define __CONST_RING_SIZE(_s, _sz) \
++ (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \
++ sizeof(((struct _s##_sring *)0)->ring[0])))
++
++/*
++ * The same for passing in an actual pointer instead of a name tag.
++ */
++#define __RING_SIZE(_s, _sz) \
++ (__RD32(((_sz) - (long)&(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0])))
+
+ /*
+ * Macros to make the correct C datatypes for a new kind of ring.
+@@ -73,7 +80,16 @@ union __name##_sring_entry { \
+ struct __name##_sring { \
+ RING_IDX req_prod, req_event; \
+ RING_IDX rsp_prod, rsp_event; \
+- uint8_t pad[48]; \
++ union { \
++ struct { \
++ uint8_t smartpoll_active; \
++ } netif; \
++ struct { \
++ uint8_t msg; \
++ } tapif_user; \
++ uint8_t pvt_pad[4]; \
++ } private; \
++ uint8_t pad[44]; \
+ union __name##_sring_entry ring[1]; /* variable-length */ \
+ }; \
+ \
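The difference between the two forms: __CONST_RING_SIZE() only needs the ring's name tag, so it evaluates to a compile-time constant usable for static array sizes, while __RING_SIZE() still needs a pointer to a real shared ring. A sketch using the blkif ring types, assuming PAGE_SIZE-sized shared pages:

#include <linux/mm.h>			/* PAGE_SIZE */
#include <xen/interface/io/ring.h>
#include <xen/interface/io/blkif.h>

/* Compile-time: fine as an array bound. */
#define DEMO_BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
static unsigned long demo_pending_ids[DEMO_BLK_RING_SIZE];

/* Run-time: needs the mapped shared ring itself. */
static unsigned int demo_ring_entries(struct blkif_sring *sring)
{
	return __RING_SIZE(sring, PAGE_SIZE);
}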
+diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h
+index 46508c7..9fda532 100644
+--- a/include/xen/interface/io/xenbus.h
++++ b/include/xen/interface/io/xenbus.h
+@@ -27,8 +27,14 @@ enum xenbus_state
+ XenbusStateClosing = 5, /* The device is being closed
+ due to an error or an unplug
+ event. */
+- XenbusStateClosed = 6
++ XenbusStateClosed = 6,
+
++ /*
++ * Reconfiguring: The device is being reconfigured.
++ */
++ XenbusStateReconfiguring = 7,
++
++ XenbusStateReconfigured = 8
+ };
+
+ #endif /* _XEN_PUBLIC_IO_XENBUS_H */
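Backends that negotiate reconfiguration would extend their otherend-changed handler to cover the two new states. A bare switch sketch, with comments standing in for driver-specific work:

#include <xen/interface/io/xenbus.h>

static void demo_frontend_changed(enum xenbus_state frontend_state)
{
	switch (frontend_state) {
	case XenbusStateReconfiguring:
		/* frontend is rewriting its configuration: quiesce I/O */
		break;
	case XenbusStateReconfigured:
		/* new configuration is in place: resume I/O */
		break;
	case XenbusStateClosing:
	case XenbusStateClosed:
		/* tear down rings and event channels */
		break;
	default:
		/* existing states handled as before */
		break;
	}
}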
+diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
+index af36ead..aa4e368 100644
+--- a/include/xen/interface/memory.h
++++ b/include/xen/interface/memory.h
+@@ -9,6 +9,8 @@
+ #ifndef __XEN_PUBLIC_MEMORY_H__
+ #define __XEN_PUBLIC_MEMORY_H__
+
++#include <linux/spinlock.h>
++
+ /*
+ * Increase or decrease the specified domain's memory reservation. Returns a
+ * -ve errcode on failure, or the # extents successfully allocated or freed.
+@@ -53,6 +55,48 @@ struct xen_memory_reservation {
+ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
+
+ /*
++ * An atomic exchange of memory pages. If return code is zero then
++ * @out.extent_list provides GMFNs of the newly-allocated memory.
++ * Returns zero on complete success, otherwise a negative error code.
++ * On complete success then always @nr_exchanged == @in.nr_extents.
++ * On partial success @nr_exchanged indicates how much work was done.
++ */
++#define XENMEM_exchange 11
++struct xen_memory_exchange {
++ /*
++ * [IN] Details of memory extents to be exchanged (GMFN bases).
++ * Note that @in.address_bits is ignored and unused.
++ */
++ struct xen_memory_reservation in;
++
++ /*
++ * [IN/OUT] Details of new memory extents.
++ * We require that:
++ * 1. @in.domid == @out.domid
++ * 2. @in.nr_extents << @in.extent_order ==
++ * @out.nr_extents << @out.extent_order
++ * 3. @in.extent_start and @out.extent_start lists must not overlap
++ * 4. @out.extent_start lists GPFN bases to be populated
++ * 5. @out.extent_start is overwritten with allocated GMFN bases
++ */
++ struct xen_memory_reservation out;
++
++ /*
++ * [OUT] Number of input extents that were successfully exchanged:
++ * 1. The first @nr_exchanged input extents were successfully
++ * deallocated.
++ * 2. The corresponding first entries in the output extent list correctly
++ * indicate the GMFNs that were successfully exchanged.
++ * 3. All other input and output extents are untouched.
++ * 4. If not all input extents are exchanged then the return code of this
++ * command will be non-zero.
++ * 5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
++ */
++ unsigned long nr_exchanged;
++};
++
++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange);
++/*
+ * Returns the maximum machine frame number of mapped RAM in this system.
+ * This command always succeeds (it never returns an error code).
+ * arg == NULL.
+@@ -97,6 +141,19 @@ struct xen_machphys_mfn_list {
+ DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
+
+ /*
++ * Returns the location in virtual address space of the machine_to_phys
++ * mapping table. Architectures which do not have an m2p table, or which do not
++ * map it by default into guest address space, do not implement this command.
++ * arg == addr of xen_machphys_mapping_t.
++ */
++#define XENMEM_machphys_mapping 12
++struct xen_machphys_mapping {
++ unsigned long v_start, v_end; /* Start and end virtual addresses. */
++ unsigned long max_mfn; /* Maximum MFN that can be looked up. */
++};
++DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t);
++
++/*
+ * Sets the GPFN at which a particular page appears in the specified guest's
+ * pseudophysical address space.
+ * arg == addr of xen_add_to_physmap_t.
+@@ -142,4 +199,38 @@ struct xen_translate_gpfn_list {
+ };
+ DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
+
++/*
++ * Returns the pseudo-physical memory map as it was when the domain
++ * was started (specified by XENMEM_set_memory_map).
++ * arg == addr of struct xen_memory_map.
++ */
++#define XENMEM_memory_map 9
++struct xen_memory_map {
++ /*
++ * On call the number of entries which can be stored in buffer. On
++ * return the number of entries which have been stored in
++ * buffer.
++ */
++ unsigned int nr_entries;
++
++ /*
++ * Entries in the buffer are in the same format as returned by the
++ * BIOS INT 0x15 EAX=0xE820 call.
++ */
++ GUEST_HANDLE(void) buffer;
++};
++DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
++
++/*
++ * Returns the real physical memory map. Passes the same structure as
++ * XENMEM_memory_map.
++ * arg == addr of struct xen_memory_map.
++ */
++#define XENMEM_machine_memory_map 10
++
++/*
++ * Prevent the balloon driver from changing the memory reservation
++ * during a driver critical region.
++ */
++extern spinlock_t xen_reservation_lock;
+ #endif /* __XEN_PUBLIC_MEMORY_H__ */
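A dom0 kernel would typically retrieve the E820-style map through XENMEM_memory_map roughly as below; HYPERVISOR_memory_op(), set_xen_guest_handle() and struct e820entry are assumed from elsewhere in the tree and are not part of this hunk:

#include <asm/e820.h>
#include <asm/xen/hypercall.h>
#include <xen/interface/memory.h>

/* Fill @map (capacity @nr entries); returns the entry count or -errno. */
static int demo_get_memory_map(struct e820entry *map, unsigned int nr)
{
	struct xen_memory_map memmap;
	int rc;

	memmap.nr_entries = nr;
	set_xen_guest_handle(memmap.buffer, map);
	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
	if (rc < 0)
		return rc;
	return memmap.nr_entries;
}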
+diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
+index cd69391..0703ef6 100644
+--- a/include/xen/interface/physdev.h
++++ b/include/xen/interface/physdev.h
+@@ -39,6 +39,19 @@ struct physdev_eoi {
+ };
+
+ /*
++ * Register a shared page for the hypervisor to indicate whether the guest
++ * must issue PHYSDEVOP_eoi. The semantics of PHYSDEVOP_eoi change slightly
++ * once the guest has used this function, in that the associated event channel
++ * will automatically get unmasked. The page registered is used as a bit
++ * array indexed by Xen's PIRQ value.
++ */
++#define PHYSDEVOP_pirq_eoi_gmfn 17
++struct physdev_pirq_eoi_gmfn {
++ /* IN */
++ unsigned long gmfn;
++};
++
++/*
+ * Query the status of an IRQ line.
+ * @arg == pointer to physdev_irq_status_query structure.
+ */
+@@ -106,6 +119,64 @@ struct physdev_irq {
+ uint32_t vector;
+ };
+
++#define MAP_PIRQ_TYPE_MSI 0x0
++#define MAP_PIRQ_TYPE_GSI 0x1
++#define MAP_PIRQ_TYPE_UNKNOWN 0x2
++
++#define PHYSDEVOP_map_pirq 13
++struct physdev_map_pirq {
++ domid_t domid;
++ /* IN */
++ int type;
++ /* IN */
++ int index;
++ /* IN or OUT */
++ int pirq;
++ /* IN */
++ int bus;
++ /* IN */
++ int devfn;
++ /* IN */
++ int entry_nr;
++ /* IN */
++ uint64_t table_base;
++};
++
++#define PHYSDEVOP_unmap_pirq 14
++struct physdev_unmap_pirq {
++ domid_t domid;
++ /* IN */
++ int pirq;
++};
++
++#define PHYSDEVOP_manage_pci_add 15
++#define PHYSDEVOP_manage_pci_remove 16
++struct physdev_manage_pci {
++ /* IN */
++ uint8_t bus;
++ uint8_t devfn;
++};
++
++#define PHYSDEVOP_restore_msi 19
++struct physdev_restore_msi {
++ /* IN */
++ uint8_t bus;
++ uint8_t devfn;
++};
++
++#define PHYSDEVOP_manage_pci_add_ext 20
++struct physdev_manage_pci_ext {
++ /* IN */
++ uint8_t bus;
++ uint8_t devfn;
++ unsigned is_extfn;
++ unsigned is_virtfn;
++ struct {
++ uint8_t bus;
++ uint8_t devfn;
++ } physfn;
++};
++
+ /*
+ * Argument to physdev_op_compat() hypercall. Superseded by new physdev_op()
+ * hypercall since 0x00030202.
+@@ -121,6 +192,16 @@ struct physdev_op {
+ } u;
+ };
+
++#define PHYSDEVOP_setup_gsi 21
++struct physdev_setup_gsi {
++ int gsi;
++ /* IN */
++ uint8_t triggering;
++ /* IN */
++ uint8_t polarity;
++ /* IN */
++};
++
+ /*
+ * Notify that some PIRQ-bound event channels have been unmasked.
+ * ** This command is obsolete since interface version 0x00030202 and is **
+diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h
+new file mode 100644
+index 0000000..17ae622
+--- /dev/null
++++ b/include/xen/interface/platform.h
+@@ -0,0 +1,381 @@
++/******************************************************************************
++ * platform.h
++ *
++ * Hardware platform operations. Intended for use by domain-0 kernel.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ * Copyright (c) 2002-2006, K Fraser
++ */
++
++#ifndef __XEN_PUBLIC_PLATFORM_H__
++#define __XEN_PUBLIC_PLATFORM_H__
++
++#include "xen.h"
++
++#define XENPF_INTERFACE_VERSION 0x03000001
++
++/*
++ * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC,
++ * 1 January, 1970 if the current system time was <system_time>.
++ */
++#define XENPF_settime 17
++struct xenpf_settime {
++ /* IN variables. */
++ uint32_t secs;
++ uint32_t nsecs;
++ uint64_t system_time;
++};
++typedef struct xenpf_settime xenpf_settime_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t);
++
++/*
++ * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
++ * On x86, @type is an architecture-defined MTRR memory type.
++ * On success, returns the MTRR that was used (@reg) and a handle that can
++ * be passed to XENPF_DEL_MEMTYPE to accurately tear down the new setting.
++ * (x86-specific).
++ */
++#define XENPF_add_memtype 31
++struct xenpf_add_memtype {
++ /* IN variables. */
++ unsigned long mfn;
++ uint64_t nr_mfns;
++ uint32_t type;
++ /* OUT variables. */
++ uint32_t handle;
++ uint32_t reg;
++};
++typedef struct xenpf_add_memtype xenpf_add_memtype_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_add_memtype_t);
++
++/*
++ * Tear down an existing memory-range type. If @handle is remembered then it
++ * should be passed in to accurately tear down the correct setting (in case
++ * of overlapping memory regions with differing types). If it is not known
++ * then @handle should be set to zero. In all cases @reg must be set.
++ * (x86-specific).
++ */
++#define XENPF_del_memtype 32
++struct xenpf_del_memtype {
++ /* IN variables. */
++ uint32_t handle;
++ uint32_t reg;
++};
++typedef struct xenpf_del_memtype xenpf_del_memtype_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_del_memtype_t);
++
++/* Read current type of an MTRR (x86-specific). */
++#define XENPF_read_memtype 33
++struct xenpf_read_memtype {
++ /* IN variables. */
++ uint32_t reg;
++ /* OUT variables. */
++ unsigned long mfn;
++ uint64_t nr_mfns;
++ uint32_t type;
++};
++typedef struct xenpf_read_memtype xenpf_read_memtype_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_read_memtype_t);
++
++#define XENPF_microcode_update 35
++struct xenpf_microcode_update {
++ /* IN variables. */
++ GUEST_HANDLE(void) data; /* Pointer to microcode data */
++ uint32_t length; /* Length of microcode data. */
++};
++typedef struct xenpf_microcode_update xenpf_microcode_update_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_microcode_update_t);
++
++#define XENPF_platform_quirk 39
++#define QUIRK_NOIRQBALANCING 1 /* Do not restrict IO-APIC RTE targets */
++#define QUIRK_IOAPIC_BAD_REGSEL 2 /* IO-APIC REGSEL forgets its value */
++#define QUIRK_IOAPIC_GOOD_REGSEL 3 /* IO-APIC REGSEL behaves properly */
++struct xenpf_platform_quirk {
++ /* IN variables. */
++ uint32_t quirk_id;
++};
++typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_platform_quirk_t);
++
++#define XENPF_firmware_info 50
++#define XEN_FW_DISK_INFO 1 /* from int 13 AH=08/41/48 */
++#define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */
++#define XEN_FW_VBEDDC_INFO 3 /* from int 10 AX=4f15 */
++struct xenpf_firmware_info {
++ /* IN variables. */
++ uint32_t type;
++ uint32_t index;
++ /* OUT variables. */
++ union {
++ struct {
++ /* Int13, Fn48: Check Extensions Present. */
++ uint8_t device; /* %dl: bios device number */
++ uint8_t version; /* %ah: major version */
++ uint16_t interface_support; /* %cx: support bitmap */
++ /* Int13, Fn08: Legacy Get Device Parameters. */
++ uint16_t legacy_max_cylinder; /* %cl[7:6]:%ch: max cyl # */
++ uint8_t legacy_max_head; /* %dh: max head # */
++ uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector # */
++ /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */
++ /* NB. First uint16_t of buffer must be set to buffer size. */
++ GUEST_HANDLE(void) edd_params;
++ } disk_info; /* XEN_FW_DISK_INFO */
++ struct {
++ uint8_t device; /* bios device number */
++ uint32_t mbr_signature; /* offset 0x1b8 in mbr */
++ } disk_mbr_signature; /* XEN_FW_DISK_MBR_SIGNATURE */
++ struct {
++ /* Int10, AX=4F15: Get EDID info. */
++ uint8_t capabilities;
++ uint8_t edid_transfer_time;
++ /* must refer to 128-byte buffer */
++ GUEST_HANDLE(uchar) edid;
++ } vbeddc_info; /* XEN_FW_VBEDDC_INFO */
++ } u;
++};
++typedef struct xenpf_firmware_info xenpf_firmware_info_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_firmware_info_t);
++
++#define XENPF_enter_acpi_sleep 51
++struct xenpf_enter_acpi_sleep {
++ /* IN variables */
++ uint16_t pm1a_cnt_val; /* PM1a control value. */
++ uint16_t pm1b_cnt_val; /* PM1b control value. */
++ uint32_t sleep_state; /* Which state to enter (Sn). */
++ uint32_t flags; /* Must be zero. */
++};
++typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_enter_acpi_sleep_t);
++
++#define XENPF_change_freq 52
++struct xenpf_change_freq {
++ /* IN variables */
++ uint32_t flags; /* Must be zero. */
++ uint32_t cpu; /* Physical cpu. */
++ uint64_t freq; /* New frequency (Hz). */
++};
++typedef struct xenpf_change_freq xenpf_change_freq_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_change_freq_t);
++
++/*
++ * Get idle times (nanoseconds since boot) for physical CPUs specified in the
++ * @cpumap_bitmap with range [0..@cpumap_nr_cpus-1]. The @idletime array is
++ * indexed by CPU number; only entries with the corresponding @cpumap_bitmap
++ * bit set are written to. On return, @cpumap_bitmap is modified so that any
++ * non-existent CPUs are cleared. Such CPUs have their @idletime array entry
++ * cleared.
++ */
++#define XENPF_getidletime 53
++struct xenpf_getidletime {
++ /* IN/OUT variables */
++ /* IN: CPUs to interrogate; OUT: subset of IN which are present */
++ GUEST_HANDLE(uchar) cpumap_bitmap;
++ /* IN variables */
++ /* Size of cpumap bitmap. */
++ uint32_t cpumap_nr_cpus;
++ /* Must be indexable for every cpu in cpumap_bitmap. */
++ GUEST_HANDLE(uint64_t) idletime;
++ /* OUT variables */
++ /* System time when the idletime snapshots were taken. */
++ uint64_t now;
++};
++typedef struct xenpf_getidletime xenpf_getidletime_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t);
++
++#define XENPF_set_processor_pminfo 54
++
++/* ability bits */
++#define XEN_PROCESSOR_PM_CX 1
++#define XEN_PROCESSOR_PM_PX 2
++#define XEN_PROCESSOR_PM_TX 4
++
++/* cmd type */
++#define XEN_PM_CX 0
++#define XEN_PM_PX 1
++#define XEN_PM_TX 2
++
++/* Px sub info type */
++#define XEN_PX_PCT 1
++#define XEN_PX_PSS 2
++#define XEN_PX_PPC 4
++#define XEN_PX_PSD 8
++
++struct xen_power_register {
++ uint32_t space_id;
++ uint32_t bit_width;
++ uint32_t bit_offset;
++ uint32_t access_size;
++ uint64_t address;
++};
++
++struct xen_processor_csd {
++ uint32_t domain; /* domain number of one dependent group */
++ uint32_t coord_type; /* coordination type */
++ uint32_t num; /* number of processors in same domain */
++};
++typedef struct xen_processor_csd xen_processor_csd_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_csd);
++
++struct xen_processor_cx {
++ struct xen_power_register reg; /* GAS for Cx trigger register */
++ uint8_t type; /* cstate value, c0: 0, c1: 1, ... */
++ uint32_t latency; /* worst latency (ms) to enter/exit this cstate */
++ uint32_t power; /* average power consumption(mW) */
++ uint32_t dpcnt; /* number of dependency entries */
++ GUEST_HANDLE(xen_processor_csd) dp; /* NULL if no dependency */
++};
++typedef struct xen_processor_cx xen_processor_cx_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_cx);
++
++struct xen_processor_flags {
++ uint32_t bm_control:1;
++ uint32_t bm_check:1;
++ uint32_t has_cst:1;
++ uint32_t power_setup_done:1;
++ uint32_t bm_rld_set:1;
++};
++
++struct xen_processor_power {
++ uint32_t count; /* number of C state entries in array below */
++ struct xen_processor_flags flags; /* global flags of this processor */
++ GUEST_HANDLE(xen_processor_cx) states; /* supported c states */
++};
++
++struct xen_pct_register {
++ uint8_t descriptor;
++ uint16_t length;
++ uint8_t space_id;
++ uint8_t bit_width;
++ uint8_t bit_offset;
++ uint8_t reserved;
++ uint64_t address;
++};
++
++struct xen_processor_px {
++ uint64_t core_frequency; /* megahertz */
++ uint64_t power; /* milliWatts */
++ uint64_t transition_latency; /* microseconds */
++ uint64_t bus_master_latency; /* microseconds */
++ uint64_t control; /* control value */
++ uint64_t status; /* success indicator */
++};
++typedef struct xen_processor_px xen_processor_px_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_px);
++
++struct xen_psd_package {
++ uint64_t num_entries;
++ uint64_t revision;
++ uint64_t domain;
++ uint64_t coord_type;
++ uint64_t num_processors;
++};
++
++struct xen_processor_performance {
++ uint32_t flags; /* flag for Px sub info type */
++ uint32_t platform_limit; /* Platform limitation on freq usage */
++ struct xen_pct_register control_register;
++ struct xen_pct_register status_register;
++ uint32_t state_count; /* total available performance states */
++ GUEST_HANDLE(xen_processor_px) states;
++ struct xen_psd_package domain_info;
++ uint32_t shared_type; /* coordination type of this processor */
++};
++typedef struct xen_processor_performance xen_processor_performance_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_processor_performance);
++
++struct xenpf_set_processor_pminfo {
++ /* IN variables */
++ uint32_t id; /* ACPI CPU ID */
++ uint32_t type; /* {XEN_PM_CX, XEN_PM_PX} */
++ union {
++ struct xen_processor_power power;/* Cx: _CST/_CSD */
++ struct xen_processor_performance perf; /* Px: _PPC/_PCT/_PSS/_PSD */
++ };
++};
++typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_set_processor_pminfo);
++
++#define XENPF_get_cpuinfo 55
++struct xenpf_pcpuinfo {
++ /* IN */
++ uint32_t xen_cpuid;
++ /* OUT */
++ /* The maximum cpu_id that is present */
++ uint32_t max_present;
++#define XEN_PCPU_FLAGS_ONLINE 1
++ /* Corresponding xen_cpuid is not present */
++#define XEN_PCPU_FLAGS_INVALID 2
++ uint32_t flags;
++ uint32_t apic_id;
++ uint32_t acpi_id;
++};
++typedef struct xenpf_pcpuinfo xenpf_pcpuinfo_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo_t);
++
++#define XENPF_cpu_online 56
++#define XENPF_cpu_offline 57
++struct xenpf_cpu_ol {
++ uint32_t cpuid;
++};
++typedef struct xenpf_cpu_ol xenpf_cpu_ol_t;
++DEFINE_GUEST_HANDLE_STRUCT(xenpf_cpu_ol_t);
++
++#define XENPF_cpu_hotadd 58
++struct xenpf_cpu_hotadd {
++ uint32_t apic_id;
++ uint32_t acpi_id;
++ uint32_t pxm;
++};
++
++
++#define XENPF_mem_hotadd 59
++struct xenpf_mem_hotadd {
++ uint64_t spfn;
++ uint64_t epfn;
++ uint32_t pxm;
++ uint32_t flags;
++};
++
++struct xen_platform_op {
++ uint32_t cmd;
++ uint32_t interface_version; /* XENPF_INTERFACE_VERSION */
++ union {
++ struct xenpf_settime settime;
++ struct xenpf_add_memtype add_memtype;
++ struct xenpf_del_memtype del_memtype;
++ struct xenpf_read_memtype read_memtype;
++ struct xenpf_microcode_update microcode;
++ struct xenpf_platform_quirk platform_quirk;
++ struct xenpf_firmware_info firmware_info;
++ struct xenpf_enter_acpi_sleep enter_acpi_sleep;
++ struct xenpf_change_freq change_freq;
++ struct xenpf_getidletime getidletime;
++ struct xenpf_set_processor_pminfo set_pminfo;
++ struct xenpf_pcpuinfo pcpu_info;
++ struct xenpf_cpu_ol cpu_ol;
++ struct xenpf_cpu_hotadd cpu_add;
++ struct xenpf_mem_hotadd mem_add;
++ uint8_t pad[128];
++ } u;
++};
++typedef struct xen_platform_op xen_platform_op_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_platform_op_t);
++
++#endif /* __XEN_PUBLIC_PLATFORM_H__ */
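All of these sub-ops go through the single struct xen_platform_op, whose cmd field selects the union member. A sketch of issuing XENPF_settime, assuming a HYPERVISOR_platform_op()-style hypercall wrapper (the wrapper itself is not part of this header and is declared here only to make the assumption explicit):

#include <xen/interface/platform.h>

/* Assumed wrapper; the real dom0 hypercall helper lives elsewhere. */
extern int HYPERVISOR_platform_op(struct xen_platform_op *op);

static int demo_settime(uint32_t secs, uint32_t nsecs, uint64_t system_time)
{
	struct xen_platform_op op = {
		.cmd = XENPF_settime,
		.interface_version = XENPF_INTERFACE_VERSION,
	};

	op.u.settime.secs = secs;
	op.u.settime.nsecs = nsecs;
	op.u.settime.system_time = system_time;
	return HYPERVISOR_platform_op(&op);
}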
+diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h
+index 5fec575..dd55dac 100644
+--- a/include/xen/interface/sched.h
++++ b/include/xen/interface/sched.h
+@@ -65,6 +65,39 @@ struct sched_poll {
+ DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
+
+ /*
++ * Declare a shutdown for another domain. The main use of this function is
++ * in interpreting shutdown requests and reasons for fully-virtualized
++ * domains. A para-virtualized domain may use SCHEDOP_shutdown directly.
++ * @arg == pointer to sched_remote_shutdown structure.
++ */
++#define SCHEDOP_remote_shutdown 4
++struct sched_remote_shutdown {
++ domid_t domain_id; /* Remote domain ID */
++ unsigned int reason; /* SHUTDOWN_xxx reason */
++};
++
++/*
++ * Latch a shutdown code, so that when the domain later shuts down it
++ * reports this code to the control tools.
++ * @arg == as for SCHEDOP_shutdown.
++ */
++#define SCHEDOP_shutdown_code 5
++
++/*
++ * Setup, poke and destroy a domain watchdog timer.
++ * @arg == pointer to sched_watchdog structure.
++ * With id == 0, setup a domain watchdog timer to cause domain shutdown
++ * after timeout, returns watchdog id.
++ * With id != 0 and timeout == 0, destroy domain watchdog timer.
++ * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
++ */
++#define SCHEDOP_watchdog 6
++struct sched_watchdog {
++ uint32_t id; /* watchdog ID */
++ uint32_t timeout; /* timeout */
++};
++
++/*
+ * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
+ * software to determine the appropriate action. For the most part, Xen does
+ * not care about the shutdown code.
+@@ -73,5 +106,6 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
+ #define SHUTDOWN_reboot 1 /* Clean up, kill, and then restart. */
+ #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
+ #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
++#define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */
+
+ #endif /* __XEN_PUBLIC_SCHED_H__ */
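The watchdog sub-op doubles as both setup and poke depending on the id/timeout combination described above. A sketch, assuming the standard HYPERVISOR_sched_op() wrapper:

#include <xen/interface/sched.h>
#include <asm/xen/hypercall.h>

/* id == 0: create a watchdog; returns the new watchdog id (or -errno). */
static int demo_watchdog_setup(uint32_t timeout_secs)
{
	struct sched_watchdog wd = { .id = 0, .timeout = timeout_secs };

	return HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);
}

/* id != 0, timeout != 0: poke the watchdog and set a new timeout. */
static int demo_watchdog_poke(uint32_t id, uint32_t timeout_secs)
{
	struct sched_watchdog wd = { .id = id, .timeout = timeout_secs };

	return HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);
}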
+diff --git a/include/xen/interface/xen-mca.h b/include/xen/interface/xen-mca.h
+new file mode 100644
+index 0000000..f31fdab
+--- /dev/null
++++ b/include/xen/interface/xen-mca.h
+@@ -0,0 +1,429 @@
++/******************************************************************************
++ * arch-x86/mca.h
++ *
++ * Contributed by Advanced Micro Devices, Inc.
++ * Author: Christoph Egger <Christoph.Egger@amd.com>
++ *
++ * Guest OS machine check interface to x86 Xen.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this software and associated documentation files (the "Software"), to
++ * deal in the Software without restriction, including without limitation the
++ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
++ * sell copies of the Software, and to permit persons to whom the Software is
++ * furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ */
++
++/* Full MCA functionality has the following Usecases from the guest side:
++ *
++ * Must have's:
++ * 1. Dom0 and DomU register machine check trap callback handlers
++ * (already done via "set_trap_table" hypercall)
++ * 2. Dom0 registers machine check event callback handler
++ * (doable via EVTCHNOP_bind_virq)
++ * 3. Dom0 and DomU fetches machine check data
++ * 4. Dom0 wants Xen to notify a DomU
++ * 5. Dom0 gets DomU ID from physical address
++ * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy")
++ *
++ * Nice to have's:
++ * 7. Dom0 wants Xen to deactivate a physical CPU
++ * This is better done as separate task, physical CPU hotplugging,
++ * and hypercall(s) should be sysctl's
++ * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to
++ * move a DomU (or Dom0 itself) away from a malicious page
++ * producing correctable errors.
++ * 9. offlining physical page:
++ * Xen free's and never re-uses a certain physical page.
++ * 10. Testfacility: Allow Dom0 to write values into machine check MSR's
++ * and tell Xen to trigger a machine check
++ */
++
++#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
++#define __XEN_PUBLIC_ARCH_X86_MCA_H__
++
++/* Hypercall */
++#define __HYPERVISOR_mca __HYPERVISOR_arch_0
++
++/*
++ * The xen-unstable repo has interface version 0x03000001; our interface
++ * is incompatible with that and any future minor revisions, so we
++ * choose a different version number range that is numerically less
++ * than that used in xen-unstable.
++ */
++#define XEN_MCA_INTERFACE_VERSION 0x01ecc003
++
++/* IN: Dom0 calls hypercall to retrieve nonurgent error log entry */
++#define XEN_MC_NONURGENT 0x0001
++/* IN: Dom0/DomU calls hypercall to retrieve urgent error log entry */
++#define XEN_MC_URGENT 0x0002
++/* IN: Dom0 acknowledges previously-fetched error log entry */
++#define XEN_MC_ACK 0x0004
++
++/* OUT: All is ok */
++#define XEN_MC_OK 0x0
++/* OUT: Domain could not fetch data. */
++#define XEN_MC_FETCHFAILED 0x1
++/* OUT: There was no machine check data to fetch. */
++#define XEN_MC_NODATA 0x2
++/* OUT: Between notification time and this hypercall another
++ * (most likely) correctable error happened. The fetched data
++ * does not match the original machine check data. */
++#define XEN_MC_NOMATCH 0x4
++
++/* OUT: DomU did not register MC NMI handler. Try something else. */
++#define XEN_MC_CANNOTHANDLE 0x8
++/* OUT: Notifying DomU failed. Retry later or try something else. */
++#define XEN_MC_NOTDELIVERED 0x10
++/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
++
++
++#ifndef __ASSEMBLY__
++
++#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
++
++/*
++ * Machine Check Architecture:
++ * structs are read-only and used to report all kinds of
++ * correctable and uncorrectable errors detected by the HW.
++ * Dom0 and DomU: register a handler to get notified.
++ * Dom0 only: Correctable errors are reported via VIRQ_MCA
++ */
++#define MC_TYPE_GLOBAL 0
++#define MC_TYPE_BANK 1
++#define MC_TYPE_EXTENDED 2
++#define MC_TYPE_RECOVERY 3
++
++struct mcinfo_common {
++ uint16_t type; /* structure type */
++ uint16_t size; /* size of this struct in bytes */
++};
++
++
++#define MC_FLAG_CORRECTABLE (1 << 0)
++#define MC_FLAG_UNCORRECTABLE (1 << 1)
++#define MC_FLAG_RECOVERABLE (1 << 2)
++#define MC_FLAG_POLLED (1 << 3)
++#define MC_FLAG_RESET (1 << 4)
++#define MC_FLAG_CMCI (1 << 5)
++#define MC_FLAG_MCE (1 << 6)
++/* contains global x86 mc information */
++struct mcinfo_global {
++ struct mcinfo_common common;
++
++ /* domain running at the time of the error (most likely
++ * the impacted one) */
++ uint16_t mc_domid;
++ uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
++ uint32_t mc_socketid; /* physical socket of the physical core */
++ uint16_t mc_coreid; /* physical impacted core */
++ uint16_t mc_core_threadid; /* core thread of physical core */
++ uint32_t mc_apicid;
++ uint32_t mc_flags;
++ uint64_t mc_gstatus; /* global status */
++};
++
++/* contains bank local x86 mc information */
++struct mcinfo_bank {
++ struct mcinfo_common common;
++
++ uint16_t mc_bank; /* bank nr */
++ uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on
++ * privileged pv-ops dom and if mc_addr is valid.
++ * Never valid on DomU. */
++ uint64_t mc_status; /* bank status */
++ uint64_t mc_addr; /* bank address, only valid
++ * if addr bit is set in mc_status */
++ uint64_t mc_misc;
++ uint64_t mc_ctrl2;
++ uint64_t mc_tsc;
++};
++
++
++struct mcinfo_msr {
++ uint64_t reg; /* MSR */
++ uint64_t value; /* MSR value */
++};
++
++/* contains mc information from other
++ * or additional mc MSRs */
++struct mcinfo_extended {
++ struct mcinfo_common common;
++
++ /* You can fill up to five registers.
++ * If you need more, then use this structure
++ * multiple times. */
++
++ uint32_t mc_msrs; /* Number of MSRs with valid values. */
++ /*
++ * Currently the Intel extended MSRs (32/64) include all GP registers
++ * and E(R)FLAGS, E(R)IP, E(R)MISC; up to 11/19 of them might be
++ * useful at present. So expand this array to 16/32 to leave room.
++ */
++ struct mcinfo_msr mc_msr[sizeof(void *) * 4];
++};
++
++/* Recovery Action flags. Giving recovery result information to DOM0 */
++
++/* Xen takes successful recovery action, the error is recovered */
++#define REC_ACTION_RECOVERED (0x1 << 0)
++/* No action is performed by XEN */
++#define REC_ACTION_NONE (0x1 << 1)
++/* It's possible DOM0 might take action ownership in some cases */
++#define REC_ACTION_NEED_RESET (0x1 << 2)
++
++/* Different Recovery Action types, if the action is performed successfully,
++ * REC_ACTION_RECOVERED flag will be returned.
++ */
++
++/* Page Offline Action */
++#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
++/* CPU offline Action */
++#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
++/* L3 cache disable Action */
++#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
++
++/* The interface below is used between XEN/DOM0 for passing XEN's recovery
++ * action information to DOM0.
++ * Usage scenario: After offlining a broken page, XEN might pass its page
++ * offline recovery action result to DOM0. DOM0 will save the information in
++ * non-volatile memory for further proactive actions, such as offlining the
++ * easily-broken page earlier at the next reboot.
++*/
++struct page_offline_action {
++ /* Params for passing the offlined page number to DOM0 */
++ uint64_t mfn;
++ uint64_t status;
++};
++
++struct cpu_offline_action {
++ /* Params for passing the identity of the offlined CPU to DOM0 */
++ uint32_t mc_socketid;
++ uint16_t mc_coreid;
++ uint16_t mc_core_threadid;
++};
++
++#define MAX_UNION_SIZE 16
++struct mcinfo_recovery {
++ struct mcinfo_common common;
++ uint16_t mc_bank; /* bank nr */
++ /* Recovery Action Flags defined above, such as REC_ACTION_RECOVERED */
++ uint8_t action_flags;
++ /* Recovery Action types defined above such as MC_ACTION_PAGE_OFFLINE */
++ uint8_t action_types;
++ /* In future if more than one recovery action permitted per error bank,
++ * a mcinfo_recovery data array will be returned
++ */
++ union {
++ struct page_offline_action page_retire;
++ struct cpu_offline_action cpu_offline;
++ uint8_t pad[MAX_UNION_SIZE];
++ } action_info;
++};
++
++
++#define MCINFO_HYPERCALLSIZE 1024
++#define MCINFO_MAXSIZE 768
++
++struct mc_info {
++ /* Number of mcinfo_* entries in mi_data */
++ uint32_t mi_nentries;
++ uint32_t _pad0;
++ uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8];
++};
++typedef struct mc_info mc_info_t;
++DEFINE_GUEST_HANDLE_STRUCT(mc_info);
++
++#define __MC_MSR_ARRAYSIZE 8
++#define __MC_NMSRS 1
++#define MC_NCAPS 7 /* 7 CPU feature flag words */
++#define MC_CAPS_STD_EDX 0 /* cpuid level 0x00000001 (%edx) */
++#define MC_CAPS_AMD_EDX 1 /* cpuid level 0x80000001 (%edx) */
++#define MC_CAPS_TM 2 /* cpuid level 0x80860001 (TransMeta) */
++#define MC_CAPS_LINUX 3 /* Linux-defined */
++#define MC_CAPS_STD_ECX 4 /* cpuid level 0x00000001 (%ecx) */
++#define MC_CAPS_VIA 5 /* cpuid level 0xc0000001 */
++#define MC_CAPS_AMD_ECX 6 /* cpuid level 0x80000001 (%ecx) */
++
++struct mcinfo_logical_cpu {
++ uint32_t mc_cpunr;
++ uint32_t mc_chipid;
++ uint16_t mc_coreid;
++ uint16_t mc_threadid;
++ uint32_t mc_apicid;
++ uint32_t mc_clusterid;
++ uint32_t mc_ncores;
++ uint32_t mc_ncores_active;
++ uint32_t mc_nthreads;
++ int32_t mc_cpuid_level;
++ uint32_t mc_family;
++ uint32_t mc_vendor;
++ uint32_t mc_model;
++ uint32_t mc_step;
++ char mc_vendorid[16];
++ char mc_brandid[64];
++ uint32_t mc_cpu_caps[MC_NCAPS];
++ uint32_t mc_cache_size;
++ uint32_t mc_cache_alignment;
++ int32_t mc_nmsrvals;
++ struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
++};
++typedef struct mcinfo_logical_cpu mcinfo_logical_cpu_t;
++DEFINE_GUEST_HANDLE_STRUCT(mcinfo_logical_cpu);
++
++
++/*
++ * OS's should use these instead of writing their own lookup function
++ * each with its own bugs and drawbacks.
++ * We use macros instead of static inline functions to allow guests
++ * to include this header in assembly files (*.S).
++ */
++/* Prototype:
++ * uint32_t x86_mcinfo_nentries(struct mc_info *mi);
++ */
++#define x86_mcinfo_nentries(_mi) \
++ ((_mi)->mi_nentries)
++/* Prototype:
++ * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
++ */
++#define x86_mcinfo_first(_mi) \
++ ((struct mcinfo_common *)(_mi)->mi_data)
++/* Prototype:
++ * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
++ */
++#define x86_mcinfo_next(_mic) \
++ ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
++
++/* Prototype:
++ * void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
++ */
++
++static inline void x86_mcinfo_lookup
++ (struct mcinfo_common **ret, struct mc_info *mi, uint16_t type)
++{
++ uint32_t found = 0, i;
++ struct mcinfo_common *mic;
++
++ *ret = NULL;
++ if (!mi)
++ return;
++ mic = x86_mcinfo_first(mi);
++
++ for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
++ if (mic->type == type) {
++ found = 1;
++ break;
++ }
++ mic = x86_mcinfo_next(mic);
++ }
++
++ *ret = found ? mic : NULL;
++}
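++
++/* Illustrative usage (added for clarity, not part of the original header):
++ * given a struct mc_info *mi returned by a XEN_MC_fetch hypercall, a
++ * caller can pick out a single telemetry entry by type:
++ *
++ *     struct mcinfo_common *mic;
++ *
++ *     x86_mcinfo_lookup(&mic, mi, type);
++ *     if (mic)
++ *             ... handle the entry of the requested type ...
++ */
++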
++/* Usecase 1
++ * Register machine check trap callback handler
++ * (already done via "set_trap_table" hypercall)
++ */
++
++/* Usecase 2
++ * Dom0 registers machine check event callback handler
++ * done by EVTCHNOP_bind_virq
++ */
++
++/* Usecase 3
++ * Fetch machine check data from hypervisor.
++ * Note, this hypercall is special, because both Dom0 and DomU must use this.
++ */
++#define XEN_MC_fetch 1
++struct xen_mc_fetch {
++ /* IN/OUT variables.
++ * IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
++ * XEN_MC_ACK if ack'king an earlier fetch
++ * OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
++ * XEN_MC_NODATA, XEN_MC_NOMATCH
++ */
++ uint32_t flags;
++ uint32_t _pad0;
++ /* OUT: id for ack, IN: id we are ack'ing */
++ uint64_t fetch_id;
++
++ /* OUT variables. */
++ GUEST_HANDLE(mc_info) data;
++};
++typedef struct xen_mc_fetch xen_mc_fetch_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_mc_fetch);
++
++
++/* Usecase 4
++ * This tells the hypervisor to notify a DomU about the machine check error
++ */
++#define XEN_MC_notifydomain 2
++struct xen_mc_notifydomain {
++ /* IN variables. */
++ uint16_t mc_domid;/* The unprivileged domain to notify. */
++ uint16_t mc_vcpuid;/* The vcpu in mc_domid to notify.
++ * Usually echo'd value from the fetch hypercall. */
++
++ /* IN/OUT variables. */
++ uint32_t flags;
++
++/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
++};
++typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_mc_notifydomain);
++
++#define XEN_MC_physcpuinfo 3
++struct xen_mc_physcpuinfo {
++ /* IN/OUT */
++ uint32_t ncpus;
++ uint32_t _pad0;
++ /* OUT */
++ GUEST_HANDLE(mcinfo_logical_cpu) info;
++};
++
++#define XEN_MC_msrinject 4
++#define MC_MSRINJ_MAXMSRS 8
++struct xen_mc_msrinject {
++ /* IN */
++ uint32_t mcinj_cpunr;/* target processor id */
++ uint32_t mcinj_flags;/* see MC_MSRINJ_F_* below */
++ uint32_t mcinj_count;/* 0 .. count-1 in array are valid */
++ uint32_t _pad0;
++ struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
++};
++
++/* Flags for mcinj_flags above; bits 16-31 are reserved */
++#define MC_MSRINJ_F_INTERPOSE 0x1
++
++#define XEN_MC_mceinject 5
++struct xen_mc_mceinject {
++ unsigned int mceinj_cpunr; /* target processor id */
++};
++
++struct xen_mc {
++ uint32_t cmd;
++ uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
++ union {
++ struct xen_mc_fetch mc_fetch;
++ struct xen_mc_notifydomain mc_notifydomain;
++ struct xen_mc_physcpuinfo mc_physcpuinfo;
++ struct xen_mc_msrinject mc_msrinject;
++ struct xen_mc_mceinject mc_mceinject;
++ } u;
++};
++typedef struct xen_mc xen_mc_t;
++DEFINE_GUEST_HANDLE_STRUCT(xen_mc);
++
++#endif /* __ASSEMBLY__ */
++
++#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
+diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
+index 2befa3e..9ffaee0 100644
+--- a/include/xen/interface/xen.h
++++ b/include/xen/interface/xen.h
+@@ -79,6 +79,7 @@
+ #define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
+ #define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
+ #define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
++#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */
+
+ /* Architecture-specific VIRQ definitions. */
+ #define VIRQ_ARCH_0 16
+@@ -184,6 +185,8 @@
+ #define MMUEXT_NEW_USER_BASEPTR 15
+
+ #ifndef __ASSEMBLY__
++#include <linux/types.h>
++
+ struct mmuext_op {
+ unsigned int cmd;
+ union {
+@@ -449,9 +452,49 @@ struct start_info {
+ int8_t cmd_line[MAX_GUEST_CMDLINE];
+ };
+
++struct dom0_vga_console_info {
++ uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
++#define XEN_VGATYPE_TEXT_MODE_3 0x03
++#define XEN_VGATYPE_VESA_LFB 0x23
++
++ union {
++ struct {
++ /* Font height, in pixels. */
++ uint16_t font_height;
++ /* Cursor location (column, row). */
++ uint16_t cursor_x, cursor_y;
++ /* Number of rows and columns (dimensions in characters). */
++ uint16_t rows, columns;
++ } text_mode_3;
++
++ struct {
++ /* Width and height, in pixels. */
++ uint16_t width, height;
++ /* Bytes per scan line. */
++ uint16_t bytes_per_line;
++ /* Bits per pixel. */
++ uint16_t bits_per_pixel;
++ /* LFB physical address, and size (in units of 64kB). */
++ uint32_t lfb_base;
++ uint32_t lfb_size;
++ /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
++ uint8_t red_pos, red_size;
++ uint8_t green_pos, green_size;
++ uint8_t blue_pos, blue_size;
++ uint8_t rsvd_pos, rsvd_size;
++
++ /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
++ uint32_t gbl_caps;
++ /* Mode attributes (offset 0x0, VESA command 0x4f01). */
++ uint16_t mode_attrs;
++ } vesa_lfb;
++ } u;
++};
++
+ /* These flags are passed in the 'flags' field of start_info_t. */
+ #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
+ #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
++#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
+
+ typedef uint64_t cpumap_t;
+
+@@ -461,6 +504,8 @@ typedef uint8_t xen_domain_handle_t[16];
+ #define __mk_unsigned_long(x) x ## UL
+ #define mk_unsigned_long(x) __mk_unsigned_long(x)
+
++DEFINE_GUEST_HANDLE(uint64_t);
++
+ #else /* __ASSEMBLY__ */
+
+ /* In assembly code we cannot use C numeric constant suffixes. */
+diff --git a/include/xen/page.h b/include/xen/page.h
+index eaf85fa..0be36b9 100644
+--- a/include/xen/page.h
++++ b/include/xen/page.h
+@@ -1 +1,8 @@
++#ifndef _XEN_PAGE_H
++#define _XEN_PAGE_H
++
+ #include <asm/xen/page.h>
++
++extern phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
++
++#endif /* _XEN_PAGE_H */
+diff --git a/include/xen/pcpu.h b/include/xen/pcpu.h
+new file mode 100644
+index 0000000..7e8f9d1
+--- /dev/null
++++ b/include/xen/pcpu.h
+@@ -0,0 +1,32 @@
++#ifndef _XEN_PCPU_H
++#define _XEN_PCPU_H
++
++#include <xen/interface/platform.h>
++#include <linux/sysdev.h>
++
++extern int xen_pcpu_hotplug(int type, uint32_t apic_id);
++#define XEN_PCPU_ONLINE 0x01
++#define XEN_PCPU_OFFLINE 0x02
++#define XEN_PCPU_ADD 0x04
++#define XEN_PCPU_REMOVE 0x08
++
++struct pcpu {
++ struct list_head pcpu_list;
++ struct sys_device sysdev;
++ uint32_t xen_id;
++ uint32_t apic_id;
++ uint32_t acpi_id;
++ uint32_t flags;
++};
++
++static inline int xen_pcpu_online(uint32_t flags)
++{
++ return !!(flags & XEN_PCPU_FLAGS_ONLINE);
++}
++
++extern int register_xen_pcpu_notifier(struct notifier_block *nb);
++
++extern void unregister_xen_pcpu_notifier(struct notifier_block *nb);
++
++extern int xen_pcpu_index(uint32_t acpi_id, int is_acpiid);
++#endif
+diff --git a/include/xen/platform_pci.h b/include/xen/platform_pci.h
+new file mode 100644
+index 0000000..a785a3b
+--- /dev/null
++++ b/include/xen/platform_pci.h
+@@ -0,0 +1,53 @@
++#ifndef _XEN_PLATFORM_PCI_H
++#define _XEN_PLATFORM_PCI_H
++
++#define XEN_IOPORT_MAGIC_VAL 0x49d2
++#define XEN_IOPORT_LINUX_PRODNUM 0x0003
++#define XEN_IOPORT_LINUX_DRVVER 0x0001
++
++#define XEN_IOPORT_BASE 0x10
++
++#define XEN_IOPORT_PLATFLAGS (XEN_IOPORT_BASE + 0) /* 1 byte access (R/W) */
++#define XEN_IOPORT_MAGIC (XEN_IOPORT_BASE + 0) /* 2 byte access (R) */
++#define XEN_IOPORT_UNPLUG (XEN_IOPORT_BASE + 0) /* 2 byte access (W) */
++#define XEN_IOPORT_DRVVER (XEN_IOPORT_BASE + 0) /* 4 byte access (W) */
++
++#define XEN_IOPORT_SYSLOG (XEN_IOPORT_BASE + 2) /* 1 byte access (W) */
++#define XEN_IOPORT_PROTOVER (XEN_IOPORT_BASE + 2) /* 1 byte access (R) */
++#define XEN_IOPORT_PRODNUM (XEN_IOPORT_BASE + 2) /* 2 byte access (W) */
++
++#define XEN_UNPLUG_ALL_IDE_DISKS (1<<0)
++#define XEN_UNPLUG_ALL_NICS (1<<1)
++#define XEN_UNPLUG_AUX_IDE_DISKS (1<<2)
++#define XEN_UNPLUG_ALL (XEN_UNPLUG_ALL_IDE_DISKS|\
++ XEN_UNPLUG_ALL_NICS|\
++ XEN_UNPLUG_AUX_IDE_DISKS)
++
++#define XEN_UNPLUG_UNNECESSARY (1<<16)
++#define XEN_UNPLUG_NEVER (1<<17)
++
++static inline int xen_must_unplug_nics(void) {
++#if (defined(CONFIG_XEN_NETDEV_FRONTEND) || \
++ defined(CONFIG_XEN_NETDEV_FRONTEND_MODULE)) && \
++ (defined(CONFIG_XEN_PLATFORM_PCI) || \
++ defined(CONFIG_XEN_PLATFORM_PCI_MODULE))
++ return 1;
++#else
++ return 0;
++#endif
++}
++
++static inline int xen_must_unplug_disks(void) {
++#if (defined(CONFIG_XEN_BLKDEV_FRONTEND) || \
++ defined(CONFIG_XEN_BLKDEV_FRONTEND_MODULE)) && \
++ (defined(CONFIG_XEN_PLATFORM_PCI) || \
++ defined(CONFIG_XEN_PLATFORM_PCI_MODULE))
++ return 1;
++#else
++ return 0;
++#endif
++}
++
++extern int xen_platform_pci_unplug;
++
++#endif /* _XEN_PLATFORM_PCI_H */
+diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h
+new file mode 100644
+index 0000000..b42cdfd
+--- /dev/null
++++ b/include/xen/privcmd.h
+@@ -0,0 +1,80 @@
++/******************************************************************************
++ * privcmd.h
++ *
++ * Interface to /proc/xen/privcmd.
++ *
++ * Copyright (c) 2003-2005, K A Fraser
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#ifndef __LINUX_PUBLIC_PRIVCMD_H__
++#define __LINUX_PUBLIC_PRIVCMD_H__
++
++#include <linux/types.h>
++
++typedef unsigned long xen_pfn_t;
++
++#ifndef __user
++#define __user
++#endif
++
++struct privcmd_hypercall {
++ __u64 op;
++ __u64 arg[5];
++};
++
++struct privcmd_mmap_entry {
++ __u64 va;
++ __u64 mfn;
++ __u64 npages;
++};
++
++struct privcmd_mmap {
++ int num;
++ domid_t dom; /* target domain */
++ struct privcmd_mmap_entry __user *entry;
++};
++
++struct privcmd_mmapbatch {
++ int num; /* number of pages to populate */
++ domid_t dom; /* target domain */
++ __u64 addr; /* virtual address */
++ xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
++};
++
++/*
++ * @cmd: IOCTL_PRIVCMD_HYPERCALL
++ * @arg: &privcmd_hypercall_t
++ * Return: Value returned from execution of the specified hypercall.
++ */
++#define IOCTL_PRIVCMD_HYPERCALL \
++ _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall))
++#define IOCTL_PRIVCMD_MMAP \
++ _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap))
++#define IOCTL_PRIVCMD_MMAPBATCH \
++ _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch))
++
++#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
+diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
+index 883a21b..7058f8a 100644
+--- a/include/xen/xen-ops.h
++++ b/include/xen/xen-ops.h
+@@ -7,6 +7,7 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
+
+ void xen_pre_suspend(void);
+ void xen_post_suspend(int suspend_cancelled);
++void xen_hvm_post_suspend(int suspend_cancelled);
+
+ void xen_mm_pin_all(void);
+ void xen_mm_unpin_all(void);
+@@ -14,4 +15,16 @@ void xen_mm_unpin_all(void);
+ void xen_timer_resume(void);
+ void xen_arch_resume(void);
+
++int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
++ unsigned long addr,
++ unsigned long mfn, int nr,
++ pgprot_t prot, unsigned domid);
++
++extern unsigned long *xen_contiguous_bitmap;
++int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
++ unsigned int address_bits);
++
++void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order);
++int xen_setup_shutdown_event(void);
++
+ #endif /* INCLUDE_XEN_OPS_H */
+diff --git a/include/xen/xen.h b/include/xen/xen.h
+new file mode 100644
+index 0000000..77604ed
+--- /dev/null
++++ b/include/xen/xen.h
+@@ -0,0 +1,34 @@
++#ifndef _XEN_XEN_H
++#define _XEN_XEN_H
++
++enum xen_domain_type {
++ XEN_NATIVE, /* running on bare hardware */
++ XEN_PV_DOMAIN, /* running in a PV domain */
++ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
++};
++
++#ifdef CONFIG_XEN
++extern enum xen_domain_type xen_domain_type;
++extern void xen_hvm_guest_init(void);
++#else
++#define xen_domain_type XEN_NATIVE
++#define xen_hvm_guest_init() do { } while (0)
++#endif
++
++#define xen_domain() (xen_domain_type != XEN_NATIVE)
++#define xen_pv_domain() (xen_domain() && \
++ xen_domain_type == XEN_PV_DOMAIN)
++#define xen_hvm_domain() (xen_domain() && \
++ xen_domain_type == XEN_HVM_DOMAIN)
++
++#ifdef CONFIG_XEN_DOM0
++#include <xen/interface/xen.h>
++#include <asm/xen/hypervisor.h>
++
++#define xen_initial_domain() (xen_pv_domain() && \
++ xen_start_info->flags & SIF_INITDOMAIN)
++#else /* !CONFIG_XEN_DOM0 */
++#define xen_initial_domain() (0)
++#endif /* CONFIG_XEN_DOM0 */
++
++#endif /* _XEN_XEN_H */
+diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
+index b9763ba..542ca7c 100644
+--- a/include/xen/xenbus.h
++++ b/include/xen/xenbus.h
+@@ -93,7 +93,7 @@ struct xenbus_driver {
+ int (*remove)(struct xenbus_device *dev);
+ int (*suspend)(struct xenbus_device *dev, pm_message_t state);
+ int (*resume)(struct xenbus_device *dev);
+- int (*uevent)(struct xenbus_device *, char **, int, char *, int);
++ int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *);
+ struct device_driver driver;
+ int (*read_otherend_details)(struct xenbus_device *dev);
+ int (*is_ready)(struct xenbus_device *dev);
+diff --git a/lib/Makefile b/lib/Makefile
+index 452f188..001e918 100644
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -77,7 +77,8 @@ obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
+ obj-$(CONFIG_SMP) += percpu_counter.o
+ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
+
+-obj-$(CONFIG_SWIOTLB) += swiotlb.o
++obj-$(CONFIG_SWIOTLB) += swiotlb-core.o swiotlb.o
++obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
+ obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
+ obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
+
+diff --git a/lib/swiotlb-core.c b/lib/swiotlb-core.c
+new file mode 100644
+index 0000000..a17c89e
+--- /dev/null
++++ b/lib/swiotlb-core.c
+@@ -0,0 +1,572 @@
++/*
++ * Dynamic DMA mapping support.
++ *
++ * This implementation is a fallback for platforms that do not support
++ * I/O TLBs (aka DMA address translation hardware).
++ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
++ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
++ * Copyright (C) 2000, 2003 Hewlett-Packard Co
++ * David Mosberger-Tang <davidm@hpl.hp.com>
++ *
++ * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
++ * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
++ * unnecessary i-cache flushing.
++ * 04/07/.. ak Better overflow handling. Assorted fixes.
++ * 05/09/10 linville Add support for syncing ranges, support syncing for
++ * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
++ * 08/12/11 beckyb Add highmem support
++ */
++
++#include <linux/cache.h>
++#include <linux/dma-mapping.h>
++#include <linux/mm.h>
++#include <linux/module.h>
++#include <linux/spinlock.h>
++#include <linux/string.h>
++#include <linux/swiotlb.h>
++#include <linux/pfn.h>
++#include <linux/types.h>
++#include <linux/ctype.h>
++#include <linux/highmem.h>
++
++#include <linux/io.h>
++#include <asm/dma.h>
++#include <linux/scatterlist.h>
++
++#include <linux/init.h>
++#include <linux/bootmem.h>
++#include <linux/iommu-helper.h>
++
++#define OFFSET(val, align) ((unsigned long) ((val) & ((align) - 1)))
++
++#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
++
++/*
++ * Minimum IO TLB size to bother booting with. Systems with mainly
++ * 64bit capable cards will only lightly use the swiotlb. If we can't
++ * allocate a contiguous 1MB, we're probably in trouble anyway.
++ */
++#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
++
++int swiotlb_force;
++
++/*
++ * Used to do a quick range check in do_unmap_single and
++ * do_sync_single_*, to see if the memory was in fact allocated by this
++ * API.
++ */
++char *io_tlb_start, *io_tlb_end;
++
++/*
++ * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
++ * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
++ */
++unsigned long io_tlb_nslabs;
++
++/*
++ * When the IOMMU overflows we return a fallback buffer. This sets the size.
++ */
++unsigned long io_tlb_overflow = 32*1024;
++
++void *io_tlb_overflow_buffer;
++
++/*
++ * This is a free list describing the number of free entries available from
++ * each index
++ */
++static unsigned int *io_tlb_list;
++static unsigned int io_tlb_index;
++
++/*
++ * We need to save away the original address corresponding to a mapped entry
++ * for the sync operations.
++ */
++static phys_addr_t *io_tlb_orig_addr;
++
++/*
++ * Protect the above data structures in the map and unmap calls
++ */
++static DEFINE_SPINLOCK(io_tlb_lock);
++
++static int late_alloc;
++
++static int __init
++setup_io_tlb_npages(char *str)
++{
++ int get_value(const char *token, char *str, char **endp)
++ {
++ ssize_t len;
++ int val = 0;
++
++ len = strlen(token);
++ if (!strncmp(str, token, len)) {
++ str += len;
++ if (*str == '=')
++ ++str;
++ if (*str != '\0')
++ val = simple_strtoul(str, endp, 0);
++ }
++ *endp = str;
++ return val;
++ }
++
++ int val;
++
++ while (*str) {
++ /* The old syntax */
++ if (isdigit(*str)) {
++ io_tlb_nslabs = simple_strtoul(str, &str, 0);
++ /* avoid tail segment of size < IO_TLB_SEGSIZE */
++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++ }
++ if (!strncmp(str, "force", 5))
++ swiotlb_force = 1;
++ /* The new syntax: swiotlb=nslabs=16384,overflow=32768,force */
++ val = get_value("nslabs", str, &str);
++ if (val)
++ io_tlb_nslabs = ALIGN(val, IO_TLB_SEGSIZE);
++
++ val = get_value("overflow", str, &str);
++ if (val)
++ io_tlb_overflow = val;
++ str = strpbrk(str, ",");
++ if (!str)
++ break;
++ str++; /* skip ',' */
++ }
++ return 1;
++}
++__setup("swiotlb=", setup_io_tlb_npages);
++
++void swiotlb_print_info(void)
++{
++ unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++ phys_addr_t pstart, pend;
++
++ pstart = virt_to_phys(io_tlb_start);
++ pend = virt_to_phys(io_tlb_end);
++
++ printk(KERN_INFO "DMA: Placing %luMB software IO TLB between %p - %p\n",
++ bytes >> 20, io_tlb_start, io_tlb_end);
++ printk(KERN_INFO "DMA: software IO TLB at phys %#llx - %#llx\n",
++ (unsigned long long)pstart,
++ (unsigned long long)pend);
++}
++
++/*
++ * Statically reserve bounce buffer space and initialize bounce buffer data
++ * structures for the software IO TLB used to implement the DMA API.
++ */
++void __init
++swiotlb_init_early(size_t default_size, int verbose)
++{
++ unsigned long i, bytes;
++
++ if (!io_tlb_nslabs) {
++ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++ }
++
++ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++
++ /*
++ * Get IO TLB memory from the low pages
++ */
++ io_tlb_start = alloc_bootmem_low_pages(bytes);
++ if (!io_tlb_start)
++ panic("DMA: Cannot allocate SWIOTLB buffer");
++ io_tlb_end = io_tlb_start + bytes;
++
++ /*
++ * Allocate and initialize the free list array. This array is used
++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
++ * between io_tlb_start and io_tlb_end.
++ */
++ io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
++ for (i = 0; i < io_tlb_nslabs; i++)
++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
++ io_tlb_index = 0;
++ io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
++
++ /*
++ * Get the overflow emergency buffer
++ */
++ io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
++ if (!io_tlb_overflow_buffer)
++ panic("DMA: Cannot allocate SWIOTLB overflow buffer!\n");
++ if (verbose)
++ swiotlb_print_info();
++}
++
++void __init
++swiotlb_init(int verbose)
++{
++ swiotlb_init_early(64 * (1<<20), verbose); /* default to 64MB */
++}
++
++/*
++ * Systems with larger DMA zones (those that don't support ISA) can
++ * initialize the swiotlb later using the slab allocator if needed.
++ * This should be just like above, but with some error catching.
++ */
++int
++swiotlb_init_late(size_t default_size)
++{
++ unsigned long i, bytes, req_nslabs = io_tlb_nslabs;
++ unsigned int order;
++
++ if (!io_tlb_nslabs) {
++ io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
++ io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
++ }
++
++ /*
++ * Get IO TLB memory from the low pages
++ */
++ order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
++ io_tlb_nslabs = SLABS_PER_PAGE << order;
++ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++
++ while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
++ io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
++ order);
++ if (io_tlb_start)
++ break;
++ order--;
++ }
++
++ if (!io_tlb_start)
++ goto cleanup1;
++
++ if (order != get_order(bytes)) {
++ printk(KERN_WARNING "DMA: Warning: only able to allocate %ld MB"
++ " for software IO TLB\n", (PAGE_SIZE << order) >> 20);
++ io_tlb_nslabs = SLABS_PER_PAGE << order;
++ bytes = io_tlb_nslabs << IO_TLB_SHIFT;
++ }
++ io_tlb_end = io_tlb_start + bytes;
++ memset(io_tlb_start, 0, bytes);
++
++ /*
++ * Allocate and initialize the free list array. This array is used
++ * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
++ * between io_tlb_start and io_tlb_end.
++ */
++ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
++ get_order(io_tlb_nslabs * sizeof(int)));
++ if (!io_tlb_list)
++ goto cleanup2;
++
++ for (i = 0; i < io_tlb_nslabs; i++)
++ io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
++ io_tlb_index = 0;
++
++ io_tlb_orig_addr = (phys_addr_t *) __get_free_pages(GFP_KERNEL,
++ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
++ if (!io_tlb_orig_addr)
++ goto cleanup3;
++
++ memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
++
++ /*
++ * Get the overflow emergency buffer
++ */
++ io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
++ get_order(io_tlb_overflow));
++ if (!io_tlb_overflow_buffer)
++ goto cleanup4;
++
++ swiotlb_print_info();
++
++ late_alloc = 1;
++
++ return 0;
++
++cleanup4:
++ free_pages((unsigned long)io_tlb_orig_addr,
++ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
++ io_tlb_orig_addr = NULL;
++cleanup3:
++ free_pages((unsigned long)io_tlb_list,
++ get_order(io_tlb_nslabs * sizeof(int)));
++ io_tlb_list = NULL;
++cleanup2:
++ io_tlb_end = NULL;
++ free_pages((unsigned long)io_tlb_start, order);
++ io_tlb_start = NULL;
++cleanup1:
++ io_tlb_nslabs = req_nslabs;
++ return -ENOMEM;
++}
++
++void __init swiotlb_free(void)
++{
++ if (!io_tlb_overflow_buffer)
++ return;
++
++ if (late_alloc) {
++ free_pages((unsigned long)io_tlb_overflow_buffer,
++ get_order(io_tlb_overflow));
++ free_pages((unsigned long)io_tlb_orig_addr,
++ get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
++ free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
++ sizeof(int)));
++ free_pages((unsigned long)io_tlb_start,
++ get_order(io_tlb_nslabs << IO_TLB_SHIFT));
++ } else {
++ free_bootmem_late(__pa(io_tlb_overflow_buffer),
++ io_tlb_overflow);
++ free_bootmem_late(__pa(io_tlb_orig_addr),
++ io_tlb_nslabs * sizeof(phys_addr_t));
++ free_bootmem_late(__pa(io_tlb_list),
++ io_tlb_nslabs * sizeof(int));
++ free_bootmem_late(__pa(io_tlb_start),
++ io_tlb_nslabs << IO_TLB_SHIFT);
++ }
++}
++
++int is_swiotlb_buffer(phys_addr_t paddr)
++{
++ return paddr >= virt_to_phys(io_tlb_start) &&
++ paddr < virt_to_phys(io_tlb_end);
++}
++
++/*
++ * Bounce: copy the swiotlb buffer back to the original dma location
++ */
++void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
++ enum dma_data_direction dir)
++{
++ unsigned long pfn = PFN_DOWN(phys);
++
++ if (PageHighMem(pfn_to_page(pfn))) {
++ /* The buffer does not have a mapping. Map it in and copy */
++ unsigned int offset = phys & ~PAGE_MASK;
++ char *buffer;
++ unsigned int sz = 0;
++ unsigned long flags;
++
++ while (size) {
++ sz = min_t(size_t, PAGE_SIZE - offset, size);
++
++ local_irq_save(flags);
++ buffer = kmap_atomic(pfn_to_page(pfn),
++ KM_BOUNCE_READ);
++ if (dir == DMA_TO_DEVICE)
++ memcpy(dma_addr, buffer + offset, sz);
++ else
++ memcpy(buffer + offset, dma_addr, sz);
++ kunmap_atomic(buffer, KM_BOUNCE_READ);
++ local_irq_restore(flags);
++
++ size -= sz;
++ pfn++;
++ dma_addr += sz;
++ offset = 0;
++ }
++ } else {
++ if (dir == DMA_TO_DEVICE)
++ memcpy(dma_addr, phys_to_virt(phys), size);
++ else
++ memcpy(phys_to_virt(phys), dma_addr, size);
++ }
++}
++
++/*
++ * Allocates bounce buffer and returns its kernel virtual address.
++ */
++void *
++do_map_single(struct device *hwdev, phys_addr_t phys,
++ unsigned long start_dma_addr, size_t size, int dir)
++{
++ unsigned long flags;
++ char *dma_addr;
++ unsigned int nslots, stride, index, wrap;
++ int i;
++ unsigned long mask;
++ unsigned long offset_slots;
++ unsigned long max_slots;
++
++ mask = dma_get_seg_boundary(hwdev);
++ start_dma_addr = start_dma_addr & mask;
++ offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++
++ /*
++ * Carefully handle integer overflow which can occur when mask == ~0UL.
++ */
++ max_slots = mask + 1
++ ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
++ : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
++
++ /*
++ * For mappings greater than a page, we limit the stride (and
++ * hence alignment) to a page size.
++ */
++ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++ if (size > PAGE_SIZE)
++ stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
++ else
++ stride = 1;
++
++ BUG_ON(!nslots);
++
++ /*
++ * Find suitable number of IO TLB entries size that will fit this
++ * request and allocate a buffer from that IO TLB pool.
++ */
++ spin_lock_irqsave(&io_tlb_lock, flags);
++ index = ALIGN(io_tlb_index, stride);
++ if (index >= io_tlb_nslabs)
++ index = 0;
++ wrap = index;
++
++ do {
++ while (iommu_is_span_boundary(index, nslots, offset_slots,
++ max_slots)) {
++ index += stride;
++ if (index >= io_tlb_nslabs)
++ index = 0;
++ if (index == wrap)
++ goto not_found;
++ }
++
++ /*
++ * If we find a slot that indicates we have 'nslots' number of
++ * contiguous buffers, we allocate the buffers from that slot
++ * and mark the entries as '0' indicating unavailable.
++ */
++ if (io_tlb_list[index] >= nslots) {
++ int count = 0;
++
++ for (i = index; i < (int) (index + nslots); i++)
++ io_tlb_list[i] = 0;
++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE)
++ != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
++ io_tlb_list[i] = ++count;
++ dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
++
++ /*
++ * Update the indices to avoid searching in the next
++ * round.
++ */
++ io_tlb_index = ((index + nslots) < io_tlb_nslabs
++ ? (index + nslots) : 0);
++
++ goto found;
++ }
++ index += stride;
++ if (index >= io_tlb_nslabs)
++ index = 0;
++ } while (index != wrap);
++
++not_found:
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++ return NULL;
++found:
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++
++ /*
++ * Save away the mapping from the original address to the DMA address.
++ * This is needed when we sync the memory. Then we sync the buffer if
++ * needed.
++ */
++ for (i = 0; i < nslots; i++)
++ io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
++ if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
++ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
++
++ return dma_addr;
++}
++
++/*
++ * dma_addr is the kernel virtual address of the bounce buffer to unmap.
++ */
++void
++do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
++{
++ unsigned long flags;
++ int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
++ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
++ phys_addr_t phys = io_tlb_orig_addr[index];
++
++ /*
++ * First, sync the memory before unmapping the entry
++ */
++ if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
++ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
++
++ /*
++ * Return the buffer to the free list by setting the corresponding
++ * entries to indicate the number of contiguous entries available.
++ * While returning the entries to the free list, we merge the entries
++ * with slots below and above the pool being returned.
++ */
++ spin_lock_irqsave(&io_tlb_lock, flags);
++ {
++ count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
++ io_tlb_list[index + nslots] : 0);
++ /*
++ * Step 1: return the slots to the free list, merging the
++ * slots with succeeding slots
++ */
++ for (i = index + nslots - 1; i >= index; i--)
++ io_tlb_list[i] = ++count;
++ /*
++ * Step 2: merge the returned slots with the preceding slots,
++ * if available (non zero)
++ */
++ for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) !=
++ IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
++ io_tlb_list[i] = ++count;
++ }
++ spin_unlock_irqrestore(&io_tlb_lock, flags);
++}
++
++void
++do_sync_single(struct device *hwdev, char *dma_addr, size_t size,
++ int dir, int target)
++{
++ int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
++ phys_addr_t phys = io_tlb_orig_addr[index];
++
++ phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
++
++ switch (target) {
++ case SYNC_FOR_CPU:
++ if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
++ swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
++ else
++ BUG_ON(dir != DMA_TO_DEVICE);
++ break;
++ case SYNC_FOR_DEVICE:
++ if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
++ swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
++ else
++ BUG_ON(dir != DMA_FROM_DEVICE);
++ break;
++ default:
++ BUG();
++ }
++}
++void
++swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
++{
++ /*
++ * Ran out of IOMMU space for this operation. This is very bad.
++ * Unfortunately the drivers cannot handle this operation properly,
++ * unless they check for dma_mapping_error (most don't).
++ * When the mapping is small enough return a static buffer to limit
++ * the damage, or panic when the transfer is too big.
++ */
++ dev_err(dev, "DMA: Out of SW-IOMMU space for %zu bytes.", size);
++
++ if (size <= io_tlb_overflow || !do_panic)
++ return;
++
++ if (dir == DMA_BIDIRECTIONAL)
++ panic("DMA: Random memory could be DMA accessed\n");
++ if (dir == DMA_FROM_DEVICE)
++ panic("DMA: Random memory could be DMA written\n");
++ if (dir == DMA_TO_DEVICE)
++ panic("DMA: Random memory could be DMA read\n");
++}
+diff --git a/lib/swiotlb-xen.c b/lib/swiotlb-xen.c
+new file mode 100644
+index 0000000..bee577f
+--- /dev/null
++++ b/lib/swiotlb-xen.c
+@@ -0,0 +1,504 @@
++/* A software-based IOMMU that utilizes the swiotlb-core functionality.
++ * It can function on Xen when there are PCI devices present. */
++
++
++#include <linux/dma-mapping.h>
++#include <linux/io.h>
++#include <asm/dma.h>
++#include <linux/scatterlist.h>
++#include <xen/interface/xen.h>
++#include <xen/grant_table.h>
++
++#include <asm/xen/page.h>
++#include <xen/page.h>
++#include <xen/xen-ops.h>
++
++static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
++{
++ return phys_to_machine(XPADDR(paddr)).maddr;
++}
++
++static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
++{
++ return machine_to_phys(XMADDR(baddr)).paddr;
++}
++
++static dma_addr_t xen_virt_to_bus(void *address)
++{
++ return xen_phys_to_bus(virt_to_phys(address));
++}
++
++static int check_pages_physically_contiguous(unsigned long pfn,
++ unsigned int offset,
++ size_t length)
++{
++ unsigned long next_mfn;
++ int i;
++ int nr_pages;
++
++ next_mfn = pfn_to_mfn(pfn);
++ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
++
++ for (i = 1; i < nr_pages; i++) {
++ if (pfn_to_mfn(++pfn) != ++next_mfn)
++ return 0;
++ }
++ return 1;
++}
++
++static int range_straddles_page_boundary(phys_addr_t p, size_t size)
++{
++ unsigned long pfn = PFN_DOWN(p);
++ unsigned int offset = p & ~PAGE_MASK;
++
++ if (offset + size <= PAGE_SIZE)
++ return 0;
++ if (check_pages_physically_contiguous(pfn, offset, size))
++ return 0;
++ return 1;
++}
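++
++/* Note (added for clarity): under Xen, pages that are contiguous in the
++ * guest's pseudo-physical address space are not necessarily contiguous in
++ * machine address space. A buffer that crosses a page boundary is therefore
++ * only considered DMA-safe here if the underlying machine frames are
++ * contiguous as well; otherwise it must be bounced through the swiotlb.
++ */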
++
++
++bool xen_dma_capable(struct device *dev, dma_addr_t dev_addr,
++ phys_addr_t phys, size_t size)
++{
++ int rc = 0;
++
++ rc = dma_capable(dev, dev_addr, size) &&
++ !range_straddles_page_boundary(phys, size);
++ return rc;
++}
++
++static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
++{
++ unsigned long mfn = PFN_DOWN(dma_addr);
++ unsigned long pfn = mfn_to_local_pfn(mfn);
++
++ /* If the address is outside our domain, it CAN have the same virtual
++ * address as another address in our domain. Hence only check addresses
++ * within our domain. */
++ if (pfn_valid(pfn))
++ return is_swiotlb_buffer(PFN_PHYS(pfn));
++
++ return 0;
++}
++void *
++xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
++ dma_addr_t *dma_handle, gfp_t flags)
++{
++ void *ret;
++ int order = get_order(size);
++ u64 dma_mask = DMA_BIT_MASK(32);
++ unsigned long vstart;
++
++ /*
++ * Ignore region specifiers - the kernel's ideas of
++ * pseudo-phys memory layout has nothing to do with the
++ * machine physical layout. We can't allocate highmem
++ * because we can't return a pointer to it.
++ */
++ flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
++
++ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
++ return ret;
++
++ vstart = __get_free_pages(flags, order);
++ ret = (void *)vstart;
++
++ if (hwdev && hwdev->coherent_dma_mask)
++ dma_mask = dma_alloc_coherent_mask(hwdev, flags);
++
++ if (ret) {
++ if (xen_create_contiguous_region(vstart, order,
++ fls64(dma_mask)) != 0) {
++ free_pages(vstart, order);
++ return NULL;
++ }
++ memset(ret, 0, size);
++ *dma_handle = virt_to_machine(ret).maddr;
++ }
++ return ret;
++}
++EXPORT_SYMBOL(xen_swiotlb_alloc_coherent);
++
++void
++xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
++ dma_addr_t dev_addr)
++{
++ int order = get_order(size);
++
++ if (dma_release_from_coherent(hwdev, order, vaddr))
++ return;
++
++ xen_destroy_contiguous_region((unsigned long)vaddr, order);
++ free_pages((unsigned long)vaddr, order);
++}
++EXPORT_SYMBOL(xen_swiotlb_free_coherent);
++
++
++static int max_dma_bits = 32;
++
++static int
++xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
++{
++ int i, rc;
++ int dma_bits;
++
++ printk(KERN_INFO "xen_swiotlb_fixup: buf=%p size=%zu\n",
++ buf, size);
++
++ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
++
++ i = 0;
++ do {
++ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);
++
++ do {
++ rc = xen_create_contiguous_region(
++ (unsigned long)buf + (i << IO_TLB_SHIFT),
++ get_order(slabs << IO_TLB_SHIFT),
++ dma_bits);
++ } while (rc && dma_bits++ < max_dma_bits);
++ if (rc)
++ return rc;
++
++ i += slabs;
++ } while (i < nslabs);
++ return 0;
++}
++
++void __init xen_swiotlb_init(int verbose)
++{
++ int rc = 0;
++
++ swiotlb_init_early(64 * (1<<20), verbose);
++
++ if ((rc = xen_swiotlb_fixup(io_tlb_start,
++ io_tlb_nslabs << IO_TLB_SHIFT,
++ io_tlb_nslabs)))
++ goto error;
++
++ if ((rc = xen_swiotlb_fixup(io_tlb_overflow_buffer,
++ io_tlb_overflow,
++ io_tlb_overflow >> IO_TLB_SHIFT)))
++ goto error;
++
++ return;
++error:
++ panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\
++ "We either don't have the permission or you do not have enough"\
++ "free memory under 4GB!\n", rc);
++}
++
++/*
++ * Map a single buffer of the indicated size for DMA in streaming mode. The
++ * physical address to use is returned.
++ *
++ * Once the device is given the dma address, the device owns this memory until
++ * either xen_swiotlb_unmap_page or one of the xen_swiotlb_sync_single_* calls is performed.
++ */
++dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir,
++ struct dma_attrs *attrs)
++{
++ unsigned long start_dma_addr;
++ phys_addr_t phys = page_to_phys(page) + offset;
++ dma_addr_t dev_addr = xen_phys_to_bus(phys);
++ void *map;
++
++ BUG_ON(dir == DMA_NONE);
++ /*
++ * If the address happens to be in the device's DMA window,
++ * we can safely return the device addr and not worry about bounce
++ * buffering it.
++ */
++ if (dma_capable(dev, dev_addr, size) &&
++ !range_straddles_page_boundary(phys, size) && !swiotlb_force)
++ return dev_addr;
++
++ /*
++ * Oh well, have to allocate and map a bounce buffer.
++ */
++ start_dma_addr = xen_virt_to_bus(io_tlb_start);
++ map = do_map_single(dev, phys, start_dma_addr, size, dir);
++ if (!map) {
++ swiotlb_full(dev, size, dir, 1);
++ map = io_tlb_overflow_buffer;
++ }
++
++ dev_addr = xen_virt_to_bus(map);
++
++ /*
++ * Ensure that the address returned is DMA'ble
++ */
++ if (!dma_capable(dev, dev_addr, size))
++ panic("DMA: xen_swiotlb_map_single: bounce buffer is not " \
++ "DMA'ble\n");
++ return dev_addr;
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
++
++/*
++ * Unmap a single streaming mode DMA translation. The dma_addr and size must
++ * match what was provided for in a previous xen_swiotlb_map_page call. All
++ * other usages are undefined.
++ *
++ * After this call, reads by the cpu to the buffer are guaranteed to see
++ * whatever the device wrote there.
++ */
++static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, int dir)
++{
++ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
++
++ BUG_ON(dir == DMA_NONE);
++
++ /* NOTE: We use dev_addr here, not paddr! */
++ if (is_xen_swiotlb_buffer(dev_addr)) {
++ do_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
++ return;
++ }
++
++ if (dir != DMA_FROM_DEVICE)
++ return;
++
++ /*
++ * phys_to_virt doesn't work with a highmem page but we could
++ * call dma_mark_clean() with a highmem page here. However, we
++ * are fine since dma_mark_clean() is null on POWERPC. We can
++ * make dma_mark_clean() take a physical address if necessary.
++ */
++ dma_mark_clean(phys_to_virt(paddr), size);
++}
++
++void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
++{
++ unmap_single(hwdev, dev_addr, size, dir);
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
++
++/*
++ * Make physical memory consistent for a single streaming mode DMA translation
++ * after a transfer.
++ *
++ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
++ * using the cpu, yet do not wish to teardown the dma mapping, you must
++ * call this function before doing so. At the next point you give the dma
++ * address back to the card, you must first perform a
++ * xen_swiotlb_sync_single_for_device, and then the device again owns the buffer
++ */
++static void
++xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, int dir, int target)
++{
++ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
++
++ BUG_ON(dir == DMA_NONE);
++
++ if (is_xen_swiotlb_buffer(dev_addr)) {
++ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
++ return;
++ }
++
++ if (dir != DMA_FROM_DEVICE)
++ return;
++
++ dma_mark_clean(phys_to_virt(paddr), size);
++}
++
++void
++xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_cpu);
++
++void
++xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
++ size_t size, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_single_for_device);
++
++/*
++ * Same as above, but for a sub-range of the mapping.
++ */
++static void
++xen_swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ int dir, int target)
++{
++ xen_swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
++}
++
++void
++xen_swiotlb_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
++ SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_cpu);
++
++void
++xen_swiotlb_sync_single_range_for_device(struct device *hwdev,
++ dma_addr_t dev_addr,
++ unsigned long offset, size_t size,
++ enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_single_range(hwdev, dev_addr, offset, size, dir,
++ SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_range_for_device);
++
++/*
++ * Map a set of buffers described by scatterlist in streaming mode for DMA.
++ * This is the scatter-gather version of the above xen_swiotlb_map_page
++ * interface. Here the scatter gather list elements are each tagged with the
++ * appropriate dma address and length. They are obtained via
++ * sg_dma_{address,length}(SG).
++ *
++ * NOTE: An implementation may be able to use a smaller number of
++ * DMA address/length pairs than there are SG table elements.
++ * (for example via virtual mapping capabilities)
++ * The routine returns the number of addr/length pairs actually
++ * used, at most nents.
++ *
++ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
++ * same here.
++ */
++int
++xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
++{
++ unsigned long start_dma_addr;
++ struct scatterlist *sg;
++ int i;
++ BUG_ON(dir == DMA_NONE);
++
++ start_dma_addr = xen_virt_to_bus(io_tlb_start);
++ for_each_sg(sgl, sg, nelems, i) {
++ phys_addr_t paddr = sg_phys(sg);
++ dma_addr_t dev_addr = xen_phys_to_bus(paddr);
++
++ if (swiotlb_force ||
++ !dma_capable(hwdev, dev_addr, sg->length) ||
++ range_straddles_page_boundary(paddr, sg->length)) {
++ void *map = do_map_single(hwdev, sg_phys(sg),
++ start_dma_addr,
++ sg->length, dir);
++ if (!map) {
++ /* Don't panic here, we expect map_sg users
++ to do proper error handling. */
++ swiotlb_full(hwdev, sg->length, dir, 0);
++ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
++ attrs);
++ sgl[0].dma_length = 0;
++ return 0;
++ }
++ sg->dma_address = xen_virt_to_bus(map);
++ } else
++ sg->dma_address = dev_addr;
++ sg->dma_length = sg->length;
++ }
++ return nelems;
++}
++EXPORT_SYMBOL(xen_swiotlb_map_sg_attrs);
++
++int
++xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
++ int dir)
++{
++ return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
++}
++EXPORT_SYMBOL(xen_swiotlb_map_sg);
++
++/*
++ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
++ * concerning calls here are the same as for xen_swiotlb_unmap_page() above.
++ */
++void
++xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
++{
++ struct scatterlist *sg;
++ int i;
++
++ BUG_ON(dir == DMA_NONE);
++
++ for_each_sg(sgl, sg, nelems, i)
++ unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
++
++}
++EXPORT_SYMBOL(xen_swiotlb_unmap_sg_attrs);
++
++void
++xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
++ int dir)
++{
++ return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
++}
++EXPORT_SYMBOL(xen_swiotlb_unmap_sg);
++
++/*
++ * Make physical memory consistent for a set of streaming mode DMA translations
++ * after a transfer.
++ *
++ * The same as xen_swiotlb_sync_single_* but for a scatter-gather list,
++ * same rules and usage.
++ */
++static void
++xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
++ int nelems, int dir, int target)
++{
++ struct scatterlist *sg;
++ int i;
++
++ for_each_sg(sgl, sg, nelems, i)
++ xen_swiotlb_sync_single(hwdev, sg->dma_address,
++ sg->dma_length, dir, target);
++}
++
++void
++xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_cpu);
++
++void
++xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
++ int nelems, enum dma_data_direction dir)
++{
++ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
++}
++EXPORT_SYMBOL(xen_swiotlb_sync_sg_for_device);
++
++int
++xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
++{
++ return (dma_addr == xen_virt_to_bus(io_tlb_overflow_buffer));
++}
++EXPORT_SYMBOL(xen_swiotlb_dma_mapping_error);
++
++/*
++ * Return whether the given device DMA address mask can be supported
++ * properly. For example, if your device can only drive the low 24-bits
++ * during bus mastering, then you would pass 0x00ffffff as the mask to
++ * this function.
++ */
++int
++xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
++{
++ return xen_virt_to_bus(io_tlb_end - 1) <= mask;
++}
++EXPORT_SYMBOL(xen_swiotlb_dma_supported);
+diff --git a/lib/swiotlb.c b/lib/swiotlb.c
+index ac25cd2..f6bbcd1 100644
+--- a/lib/swiotlb.c
++++ b/lib/swiotlb.c
+@@ -1,118 +1,11 @@
+-/*
+- * Dynamic DMA mapping support.
+- *
+- * This implementation is a fallback for platforms that do not support
+- * I/O TLBs (aka DMA address translation hardware).
+- * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+- * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+- * Copyright (C) 2000, 2003 Hewlett-Packard Co
+- * David Mosberger-Tang <davidm@hpl.hp.com>
+- *
+- * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API.
+- * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid
+- * unnecessary i-cache flushing.
+- * 04/07/.. ak Better overflow handling. Assorted fixes.
+- * 05/09/10 linville Add support for syncing ranges, support syncing for
+- * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup.
+- * 08/12/11 beckyb Add highmem support
+- */
+
+-#include <linux/cache.h>
+ #include <linux/dma-mapping.h>
+-#include <linux/mm.h>
+ #include <linux/module.h>
+-#include <linux/spinlock.h>
+-#include <linux/string.h>
+ #include <linux/swiotlb.h>
+-#include <linux/pfn.h>
+-#include <linux/types.h>
+-#include <linux/ctype.h>
+-#include <linux/highmem.h>
+
+-#include <asm/io.h>
+-#include <asm/dma.h>
+ #include <asm/scatterlist.h>
+-
+-#include <linux/init.h>
+-#include <linux/bootmem.h>
+ #include <linux/iommu-helper.h>
+
+-#define OFFSET(val,align) ((unsigned long) \
+- ( (val) & ( (align) - 1)))
+-
+-#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
+-
+-/*
+- * Minimum IO TLB size to bother booting with. Systems with mainly
+- * 64bit capable cards will only lightly use the swiotlb. If we can't
+- * allocate a contiguous 1MB, we're probably in trouble anyway.
+- */
+-#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
+-
+-/*
+- * Enumeration for sync targets
+- */
+-enum dma_sync_target {
+- SYNC_FOR_CPU = 0,
+- SYNC_FOR_DEVICE = 1,
+-};
+-
+-int swiotlb_force;
+-
+-/*
+- * Used to do a quick range check in unmap_single and
+- * sync_single_*, to see if the memory was in fact allocated by this
+- * API.
+- */
+-static char *io_tlb_start, *io_tlb_end;
+-
+-/*
+- * The number of IO TLB blocks (in groups of 64) betweeen io_tlb_start and
+- * io_tlb_end. This is command line adjustable via setup_io_tlb_npages.
+- */
+-static unsigned long io_tlb_nslabs;
+-
+-/*
+- * When the IOMMU overflows we return a fallback buffer. This sets the size.
+- */
+-static unsigned long io_tlb_overflow = 32*1024;
+-
+-void *io_tlb_overflow_buffer;
+-
+-/*
+- * This is a free list describing the number of free entries available from
+- * each index
+- */
+-static unsigned int *io_tlb_list;
+-static unsigned int io_tlb_index;
+-
+-/*
+- * We need to save away the original address corresponding to a mapped entry
+- * for the sync operations.
+- */
+-static phys_addr_t *io_tlb_orig_addr;
+-
+-/*
+- * Protect the above data structures in the map and unmap calls
+- */
+-static DEFINE_SPINLOCK(io_tlb_lock);
+-
+-static int __init
+-setup_io_tlb_npages(char *str)
+-{
+- if (isdigit(*str)) {
+- io_tlb_nslabs = simple_strtoul(str, &str, 0);
+- /* avoid tail segment of size < IO_TLB_SEGSIZE */
+- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+- }
+- if (*str == ',')
+- ++str;
+- if (!strcmp(str, "force"))
+- swiotlb_force = 1;
+- return 1;
+-}
+-__setup("swiotlb=", setup_io_tlb_npages);
+-/* make io_tlb_overflow tunable too? */
+
+ /* Note that this doesn't work with highmem page */
+ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
+@@ -120,390 +13,6 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
+ {
+ return phys_to_dma(hwdev, virt_to_phys(address));
+ }
+-
+-static void swiotlb_print_info(unsigned long bytes)
+-{
+- phys_addr_t pstart, pend;
+-
+- pstart = virt_to_phys(io_tlb_start);
+- pend = virt_to_phys(io_tlb_end);
+-
+- printk(KERN_INFO "Placing %luMB software IO TLB between %p - %p\n",
+- bytes >> 20, io_tlb_start, io_tlb_end);
+- printk(KERN_INFO "software IO TLB at phys %#llx - %#llx\n",
+- (unsigned long long)pstart,
+- (unsigned long long)pend);
+-}
+-
+-/*
+- * Statically reserve bounce buffer space and initialize bounce buffer data
+- * structures for the software IO TLB used to implement the DMA API.
+- */
+-void __init
+-swiotlb_init_with_default_size(size_t default_size)
+-{
+- unsigned long i, bytes;
+-
+- if (!io_tlb_nslabs) {
+- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+- }
+-
+- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+-
+- /*
+- * Get IO TLB memory from the low pages
+- */
+- io_tlb_start = alloc_bootmem_low_pages(bytes);
+- if (!io_tlb_start)
+- panic("Cannot allocate SWIOTLB buffer");
+- io_tlb_end = io_tlb_start + bytes;
+-
+- /*
+- * Allocate and initialize the free list array. This array is used
+- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+- * between io_tlb_start and io_tlb_end.
+- */
+- io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int));
+- for (i = 0; i < io_tlb_nslabs; i++)
+- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+- io_tlb_index = 0;
+- io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t));
+-
+- /*
+- * Get the overflow emergency buffer
+- */
+- io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow);
+- if (!io_tlb_overflow_buffer)
+- panic("Cannot allocate SWIOTLB overflow buffer!\n");
+-
+- swiotlb_print_info(bytes);
+-}
+-
+-void __init
+-swiotlb_init(void)
+-{
+- swiotlb_init_with_default_size(64 * (1<<20)); /* default to 64MB */
+-}
+-
+-/*
+- * Systems with larger DMA zones (those that don't support ISA) can
+- * initialize the swiotlb later using the slab allocator if needed.
+- * This should be just like above, but with some error catching.
+- */
+-int
+-swiotlb_late_init_with_default_size(size_t default_size)
+-{
+- unsigned long i, bytes, req_nslabs = io_tlb_nslabs;
+- unsigned int order;
+-
+- if (!io_tlb_nslabs) {
+- io_tlb_nslabs = (default_size >> IO_TLB_SHIFT);
+- io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+- }
+-
+- /*
+- * Get IO TLB memory from the low pages
+- */
+- order = get_order(io_tlb_nslabs << IO_TLB_SHIFT);
+- io_tlb_nslabs = SLABS_PER_PAGE << order;
+- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+-
+- while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
+- io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+- order);
+- if (io_tlb_start)
+- break;
+- order--;
+- }
+-
+- if (!io_tlb_start)
+- goto cleanup1;
+-
+- if (order != get_order(bytes)) {
+- printk(KERN_WARNING "Warning: only able to allocate %ld MB "
+- "for software IO TLB\n", (PAGE_SIZE << order) >> 20);
+- io_tlb_nslabs = SLABS_PER_PAGE << order;
+- bytes = io_tlb_nslabs << IO_TLB_SHIFT;
+- }
+- io_tlb_end = io_tlb_start + bytes;
+- memset(io_tlb_start, 0, bytes);
+-
+- /*
+- * Allocate and initialize the free list array. This array is used
+- * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
+- * between io_tlb_start and io_tlb_end.
+- */
+- io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
+- get_order(io_tlb_nslabs * sizeof(int)));
+- if (!io_tlb_list)
+- goto cleanup2;
+-
+- for (i = 0; i < io_tlb_nslabs; i++)
+- io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+- io_tlb_index = 0;
+-
+- io_tlb_orig_addr = (phys_addr_t *)
+- __get_free_pages(GFP_KERNEL,
+- get_order(io_tlb_nslabs *
+- sizeof(phys_addr_t)));
+- if (!io_tlb_orig_addr)
+- goto cleanup3;
+-
+- memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
+-
+- /*
+- * Get the overflow emergency buffer
+- */
+- io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+- get_order(io_tlb_overflow));
+- if (!io_tlb_overflow_buffer)
+- goto cleanup4;
+-
+- swiotlb_print_info(bytes);
+-
+- return 0;
+-
+-cleanup4:
+- free_pages((unsigned long)io_tlb_orig_addr,
+- get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
+- io_tlb_orig_addr = NULL;
+-cleanup3:
+- free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
+- sizeof(int)));
+- io_tlb_list = NULL;
+-cleanup2:
+- io_tlb_end = NULL;
+- free_pages((unsigned long)io_tlb_start, order);
+- io_tlb_start = NULL;
+-cleanup1:
+- io_tlb_nslabs = req_nslabs;
+- return -ENOMEM;
+-}
+-
+-static int is_swiotlb_buffer(phys_addr_t paddr)
+-{
+- return paddr >= virt_to_phys(io_tlb_start) &&
+- paddr < virt_to_phys(io_tlb_end);
+-}
+-
+-/*
+- * Bounce: copy the swiotlb buffer back to the original dma location
+- */
+-static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+- enum dma_data_direction dir)
+-{
+- unsigned long pfn = PFN_DOWN(phys);
+-
+- if (PageHighMem(pfn_to_page(pfn))) {
+- /* The buffer does not have a mapping. Map it in and copy */
+- unsigned int offset = phys & ~PAGE_MASK;
+- char *buffer;
+- unsigned int sz = 0;
+- unsigned long flags;
+-
+- while (size) {
+- sz = min_t(size_t, PAGE_SIZE - offset, size);
+-
+- local_irq_save(flags);
+- buffer = kmap_atomic(pfn_to_page(pfn),
+- KM_BOUNCE_READ);
+- if (dir == DMA_TO_DEVICE)
+- memcpy(dma_addr, buffer + offset, sz);
+- else
+- memcpy(buffer + offset, dma_addr, sz);
+- kunmap_atomic(buffer, KM_BOUNCE_READ);
+- local_irq_restore(flags);
+-
+- size -= sz;
+- pfn++;
+- dma_addr += sz;
+- offset = 0;
+- }
+- } else {
+- if (dir == DMA_TO_DEVICE)
+- memcpy(dma_addr, phys_to_virt(phys), size);
+- else
+- memcpy(phys_to_virt(phys), dma_addr, size);
+- }
+-}
+-
+-/*
+- * Allocates bounce buffer and returns its kernel virtual address.
+- */
+-static void *
+-map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
+-{
+- unsigned long flags;
+- char *dma_addr;
+- unsigned int nslots, stride, index, wrap;
+- int i;
+- unsigned long start_dma_addr;
+- unsigned long mask;
+- unsigned long offset_slots;
+- unsigned long max_slots;
+-
+- mask = dma_get_seg_boundary(hwdev);
+- start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start) & mask;
+-
+- offset_slots = ALIGN(start_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+-
+- /*
+- * Carefully handle integer overflow which can occur when mask == ~0UL.
+- */
+- max_slots = mask + 1
+- ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+- : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+-
+- /*
+- * For mappings greater than a page, we limit the stride (and
+- * hence alignment) to a page size.
+- */
+- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+- if (size > PAGE_SIZE)
+- stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+- else
+- stride = 1;
+-
+- BUG_ON(!nslots);
+-
+- /*
+- * Find suitable number of IO TLB entries size that will fit this
+- * request and allocate a buffer from that IO TLB pool.
+- */
+- spin_lock_irqsave(&io_tlb_lock, flags);
+- index = ALIGN(io_tlb_index, stride);
+- if (index >= io_tlb_nslabs)
+- index = 0;
+- wrap = index;
+-
+- do {
+- while (iommu_is_span_boundary(index, nslots, offset_slots,
+- max_slots)) {
+- index += stride;
+- if (index >= io_tlb_nslabs)
+- index = 0;
+- if (index == wrap)
+- goto not_found;
+- }
+-
+- /*
+- * If we find a slot that indicates we have 'nslots' number of
+- * contiguous buffers, we allocate the buffers from that slot
+- * and mark the entries as '0' indicating unavailable.
+- */
+- if (io_tlb_list[index] >= nslots) {
+- int count = 0;
+-
+- for (i = index; i < (int) (index + nslots); i++)
+- io_tlb_list[i] = 0;
+- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+- io_tlb_list[i] = ++count;
+- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+-
+- /*
+- * Update the indices to avoid searching in the next
+- * round.
+- */
+- io_tlb_index = ((index + nslots) < io_tlb_nslabs
+- ? (index + nslots) : 0);
+-
+- goto found;
+- }
+- index += stride;
+- if (index >= io_tlb_nslabs)
+- index = 0;
+- } while (index != wrap);
+-
+-not_found:
+- spin_unlock_irqrestore(&io_tlb_lock, flags);
+- return NULL;
+-found:
+- spin_unlock_irqrestore(&io_tlb_lock, flags);
+-
+- /*
+- * Save away the mapping from the original address to the DMA address.
+- * This is needed when we sync the memory. Then we sync the buffer if
+- * needed.
+- */
+- for (i = 0; i < nslots; i++)
+- io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
+- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+-
+- return dma_addr;
+-}
+-
+-/*
+- * dma_addr is the kernel virtual address of the bounce buffer to unmap.
+- */
+-static void
+-do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+-{
+- unsigned long flags;
+- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+- phys_addr_t phys = io_tlb_orig_addr[index];
+-
+- /*
+- * First, sync the memory before unmapping the entry
+- */
+- if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+-
+- /*
+- * Return the buffer to the free list by setting the corresponding
+- * entries to indicate the number of contigous entries available.
+- * While returning the entries to the free list, we merge the entries
+- * with slots below and above the pool being returned.
+- */
+- spin_lock_irqsave(&io_tlb_lock, flags);
+- {
+- count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+- io_tlb_list[index + nslots] : 0);
+- /*
+- * Step 1: return the slots to the free list, merging the
+- * slots with superceeding slots
+- */
+- for (i = index + nslots - 1; i >= index; i--)
+- io_tlb_list[i] = ++count;
+- /*
+- * Step 2: merge the returned slots with the preceding slots,
+- * if available (non zero)
+- */
+- for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
+- io_tlb_list[i] = ++count;
+- }
+- spin_unlock_irqrestore(&io_tlb_lock, flags);
+-}
+-
+-static void
+-sync_single(struct device *hwdev, char *dma_addr, size_t size,
+- int dir, int target)
+-{
+- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+- phys_addr_t phys = io_tlb_orig_addr[index];
+-
+- phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
+-
+- switch (target) {
+- case SYNC_FOR_CPU:
+- if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+- else
+- BUG_ON(dir != DMA_TO_DEVICE);
+- break;
+- case SYNC_FOR_DEVICE:
+- if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+- else
+- BUG_ON(dir != DMA_FROM_DEVICE);
+- break;
+- default:
+- BUG();
+- }
+-}
+-
+ void *
+ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+@@ -512,12 +21,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ void *ret;
+ int order = get_order(size);
+ u64 dma_mask = DMA_BIT_MASK(32);
++ unsigned long start_dma_addr;
+
+ if (hwdev && hwdev->coherent_dma_mask)
+ dma_mask = hwdev->coherent_dma_mask;
+
+ ret = (void *)__get_free_pages(flags, order);
+- if (ret && swiotlb_virt_to_bus(hwdev, ret) + size > dma_mask) {
++ if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) {
+ /*
+ * The allocated memory isn't reachable by the device.
+ */
+@@ -527,10 +37,12 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ if (!ret) {
+ /*
+ * We are either out of memory or the device can't DMA
+- * to GFP_DMA memory; fall back on map_single(), which
++ * to GFP_DMA memory; fall back on do_map_single(), which
+ * will grab memory from the lowest available address range.
+ */
+- ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
++ ret = do_map_single(hwdev, 0, start_dma_addr, size,
++ DMA_FROM_DEVICE);
+ if (!ret)
+ return NULL;
+ }
+@@ -539,12 +51,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dev_addr = swiotlb_virt_to_bus(hwdev, ret);
+
+ /* Confirm address can be DMA'd by device */
+- if (dev_addr + size > dma_mask) {
+- printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
++ if (dev_addr + size - 1 > dma_mask) {
++ dev_err(hwdev, "DMA: hwdev DMA mask = 0x%016Lx, " \
++ "dev_addr = 0x%016Lx\n",
+ (unsigned long long)dma_mask,
+ (unsigned long long)dev_addr);
+
+- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */
+ do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
+ return NULL;
+ }
+@@ -563,35 +76,11 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+ if (!is_swiotlb_buffer(paddr))
+ free_pages((unsigned long)vaddr, get_order(size));
+ else
+- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
++ /* DMA_TO_DEVICE to avoid memcpy in do_unmap_single */
+ do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
+ }
+ EXPORT_SYMBOL(swiotlb_free_coherent);
+
+-static void
+-swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
+-{
+- /*
+- * Ran out of IOMMU space for this operation. This is very bad.
+- * Unfortunately the drivers cannot handle this operation properly.
+- * unless they check for dma_mapping_error (most don't)
+- * When the mapping is small enough return a static buffer to limit
+- * the damage, or panic when the transfer is too big.
+- */
+- printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at "
+- "device %s\n", size, dev ? dev_name(dev) : "?");
+-
+- if (size <= io_tlb_overflow || !do_panic)
+- return;
+-
+- if (dir == DMA_BIDIRECTIONAL)
+- panic("DMA: Random memory could be DMA accessed\n");
+- if (dir == DMA_FROM_DEVICE)
+- panic("DMA: Random memory could be DMA written\n");
+- if (dir == DMA_TO_DEVICE)
+- panic("DMA: Random memory could be DMA read\n");
+-}
+-
+ /*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * physical address to use is returned.
+@@ -604,6 +93,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+ {
++ unsigned long start_dma_addr;
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dev_addr = phys_to_dma(dev, phys);
+ void *map;
+@@ -620,7 +110,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ /*
+ * Oh well, have to allocate and map a bounce buffer.
+ */
+- map = map_single(dev, phys, size, dir);
++ start_dma_addr = swiotlb_virt_to_bus(dev, io_tlb_start);
++ map = do_map_single(dev, phys, start_dma_addr, size, dir);
+ if (!map) {
+ swiotlb_full(dev, size, dir, 1);
+ map = io_tlb_overflow_buffer;
+@@ -632,7 +123,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+ * Ensure that the address returned is DMA'ble
+ */
+ if (!dma_capable(dev, dev_addr, size))
+- panic("map_single: bounce buffer is not DMA'ble");
++ panic("DMA: swiotlb_map_single: bounce buffer is not DMA'ble");
+
+ return dev_addr;
+ }
+@@ -697,7 +188,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+ BUG_ON(dir == DMA_NONE);
+
+ if (is_swiotlb_buffer(paddr)) {
+- sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
++ do_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
+ return;
+ }
+
+@@ -774,19 +265,22 @@ int
+ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+ {
++ unsigned long start_dma_addr;
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
++ start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
+ for_each_sg(sgl, sg, nelems, i) {
+ phys_addr_t paddr = sg_phys(sg);
+ dma_addr_t dev_addr = phys_to_dma(hwdev, paddr);
+
+ if (swiotlb_force ||
+ !dma_capable(hwdev, dev_addr, sg->length)) {
+- void *map = map_single(hwdev, sg_phys(sg),
+- sg->length, dir);
++ void *map = do_map_single(hwdev, sg_phys(sg),
++ start_dma_addr,
++ sg->length, dir);
+ if (!map) {
+ /* Don't panic here, we expect map_sg users
+ to do proper error handling. */
+@@ -819,7 +313,8 @@ EXPORT_SYMBOL(swiotlb_map_sg);
+ */
+ void
+ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+- int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
++ int nelems, enum dma_data_direction dir,
++ struct dma_attrs *attrs)
+ {
+ struct scatterlist *sg;
+ int i;
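The swiotlb hunks above drop the generic pool setup and slot allocator from lib/swiotlb.c (the pvops series reworks them so a Xen-aware variant can call do_map_single() with an explicit start_dma_addr). The heart of the removed allocator is the free-list encoding: io_tlb_list[i] holds the number of contiguous free slots starting at slot i, never counting past an IO_TLB_SEGSIZE boundary, and claiming a run zeroes those entries and renumbers the free slots just below it. A minimal userspace sketch of that bookkeeping, with made-up sizes and without the stride and DMA-boundary handling of the real code:

/* Userspace sketch of the swiotlb free-list encoding: list[i] holds the
 * number of contiguous free slots starting at i, never crossing a
 * SEG_SIZE boundary.  Hypothetical demo, not the kernel code. */
#include <stdio.h>

#define NSLABS   32
#define SEG_SIZE 8                        /* stand-in for IO_TLB_SEGSIZE */
#define OFFSET(i) ((i) % SEG_SIZE)

static int list[NSLABS];

static void init_list(void)
{
	for (int i = 0; i < NSLABS; i++)
		list[i] = SEG_SIZE - OFFSET(i);   /* free run to end of segment */
}

/* Claim nslots starting at index, then fix up the counts of the free
 * slots just below so they still describe the run up to the claimed area. */
static void claim(int index, int nslots)
{
	int count = 0;

	for (int i = index; i < index + nslots; i++)
		list[i] = 0;                      /* 0 means "in use" */
	for (int i = index - 1;
	     i >= 0 && OFFSET(i) != SEG_SIZE - 1 && list[i]; i--)
		list[i] = ++count;
}

/* Linear search for a run of at least nslots free slots. */
static int alloc_slots(int nslots)
{
	for (int i = 0; i < NSLABS; i++) {
		if (list[i] >= nslots) {
			claim(i, nslots);
			return i;
		}
	}
	return -1;                                /* pool exhausted */
}

int main(void)
{
	init_list();
	printf("first 4-slot run at %d\n", alloc_slots(4));
	printf("next 6-slot run at %d\n", alloc_slots(6));
	return 0;
}

The real allocator additionally aligns the search index to a per-request stride and rejects runs that would cross the device's DMA segment boundary via iommu_is_span_boundary().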
+diff --git a/mm/bootmem.c b/mm/bootmem.c
+index 555d5d2..d1dc23c 100644
+--- a/mm/bootmem.c
++++ b/mm/bootmem.c
+@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
+ return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
+ }
+
++/*
++ * free_bootmem_late - free bootmem pages directly to page allocator
++ * @addr: starting address of the range
++ * @size: size of the range in bytes
++ *
++ * This is only useful when the bootmem allocator has already been torn
++ * down, but we are still initializing the system. Pages are given directly
++ * to the page allocator, no bootmem metadata is updated because it is gone.
++ */
++void __init free_bootmem_late(unsigned long addr, unsigned long size)
++{
++ unsigned long cursor, end;
++
++ kmemleak_free_part(__va(addr), size);
++
++ cursor = PFN_UP(addr);
++ end = PFN_DOWN(addr + size);
++
++ for (; cursor < end; cursor++) {
++ __free_pages_bootmem(pfn_to_page(cursor), 0);
++ totalram_pages++;
++ }
++}
++
+ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
+ {
+ int aligned;
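free_bootmem_late() above releases pages straight to the page allocator after bootmem has been torn down, so only page frames completely inside [addr, addr + size) are freed: the start address is rounded up with PFN_UP() and the end rounded down with PFN_DOWN(). A small standalone illustration of that rounding (PAGE_SIZE and the helpers are re-declared here purely for the demo):

/* Illustration of the PFN_UP/PFN_DOWN rounding used by free_bootmem_late();
 * standalone demo, not kernel code. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long addr = 0x1800;   /* range starts mid-page */
	unsigned long size = 0x3000;   /* 12 KiB                */
	unsigned long cursor = PFN_UP(addr);
	unsigned long end = PFN_DOWN(addr + size);

	/* Only pfns 2 and 3 are fully covered and would be freed. */
	for (; cursor < end; cursor++)
		printf("would free pfn %lu\n", cursor);
	return 0;
}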
+diff --git a/mm/memory.c b/mm/memory.c
+index 53c1da0..c8741df 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -553,6 +553,13 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ if (is_zero_pfn(pfn))
+ return NULL;
+ check_pfn:
++
++#if defined(CONFIG_XEN) && defined(CONFIG_X86)
++ /* XEN: Covers user-space grant mappings (even of local pages). */
++ if (unlikely(vma->vm_flags & VM_FOREIGN))
++ return NULL;
++#endif
++
+ if (unlikely(pfn > highest_memmap_pfn)) {
+ print_bad_pte(vma, addr, pte, NULL);
+ return NULL;
+@@ -839,8 +846,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ page->index > details->last_index))
+ continue;
+ }
+- ptent = ptep_get_and_clear_full(mm, addr, pte,
+- tlb->fullmm);
++ if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte))
++ ptent = vma->vm_ops->zap_pte(vma, addr, pte,
++ tlb->fullmm);
++ else
++ ptent = ptep_get_and_clear_full(mm, addr, pte,
++ tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ if (unlikely(!page))
+ continue;
+@@ -1100,6 +1111,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
+ tlb_finish_mmu(tlb, address, end);
+ return end;
+ }
++EXPORT_SYMBOL_GPL(zap_page_range);
+
+ /**
+ * zap_vma_ptes - remove ptes mapping the vma
+@@ -1306,6 +1318,29 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ continue;
+ }
+
++#ifdef CONFIG_XEN
++ if (vma && (vma->vm_flags & VM_FOREIGN)) {
++ struct vm_foreign_map *foreign_map =
++ vma->vm_private_data;
++ struct page **map = foreign_map->map;
++ int offset = (start - vma->vm_start) >> PAGE_SHIFT;
++ if (map[offset] != NULL) {
++ if (pages) {
++ struct page *page = map[offset];
++
++ pages[i] = page;
++ get_page(page);
++ }
++ if (vmas)
++ vmas[i] = vma;
++ i++;
++ start += PAGE_SIZE;
++ nr_pages--;
++ continue;
++ }
++ }
++#endif
++
+ if (!vma ||
+ (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
+@@ -1781,6 +1816,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
++#ifdef CONFIG_XEN
++ vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
+ err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
+ if (err) {
+ /*
+@@ -1896,11 +1935,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ {
+ pgd_t *pgd;
+ unsigned long next;
+- unsigned long start = addr, end = addr + size;
++ unsigned long end = addr + size;
+ int err;
+
+ BUG_ON(addr >= end);
+- mmu_notifier_invalidate_range_start(mm, start, end);
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+@@ -1908,7 +1946,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+- mmu_notifier_invalidate_range_end(mm, start, end);
++
+ return err;
+ }
+ EXPORT_SYMBOL_GPL(apply_to_page_range);
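The mm/memory.c hunks teach vm_normal_page() and __get_user_pages() about VM_FOREIGN areas: for a grant-mapped VMA the backing struct page is looked up in the per-VMA vm_foreign_map array, indexed by the page offset into the VMA, instead of being derived from the page tables. A simplified userspace sketch of that lookup (the struct definitions are stand-ins, not the kernel ones):

/* Sketch of the VM_FOREIGN lookup added to __get_user_pages(): for a
 * foreign (grant-mapped) VMA the backing page comes from a per-VMA
 * array rather than the page tables.  Types are simplified stand-ins. */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12

struct page { int id; };

struct vm_foreign_map { struct page **map; };

struct vm_area_struct {
	unsigned long vm_start;
	struct vm_foreign_map *foreign;   /* stands in for vm_private_data */
};

static struct page *foreign_lookup(struct vm_area_struct *vma,
				   unsigned long addr)
{
	size_t offset = (addr - vma->vm_start) >> PAGE_SHIFT;
	return vma->foreign->map[offset];  /* NULL: fall back to the normal path */
}

int main(void)
{
	struct page p0 = { 0 }, p1 = { 1 };
	struct page *map[2] = { &p0, &p1 };
	struct vm_foreign_map fm = { map };
	struct vm_area_struct vma = { 0x100000, &fm };

	struct page *pg = foreign_lookup(&vma, 0x101000);
	printf("addr 0x101000 -> page %d\n", pg ? pg->id : -1);
	return 0;
}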
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 902e5fc..101715c 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -594,6 +594,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
+ if (bad)
+ return;
+
++#ifdef CONFIG_XEN
++ if (PageForeign(page)) {
++ PageForeignDestructor(page, order);
++ return;
++ }
++#endif
++
+ if (!PageHighMem(page)) {
+ debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ debug_check_no_obj_freed(page_address(page),
+@@ -1088,6 +1095,13 @@ static void free_hot_cold_page(struct page *page, int cold)
+
+ kmemcheck_free_shadow(page, 0);
+
++#ifdef CONFIG_XEN
++ if (PageForeign(page)) {
++ PageForeignDestructor(page, 0);
++ return;
++ }
++#endif
++
+ if (PageAnon(page))
+ page->mapping = NULL;
+ if (free_pages_check(page))
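Both free paths in mm/page_alloc.c now divert pages marked PageForeign to PageForeignDestructor(), so memory granted by another domain is handed back through a callback instead of being returned to the buddy allocator. A toy model of that destructor-hook pattern (names and types are invented for the demo):

/* Sketch of the "foreign page destructor" idea from the page_alloc.c
 * hunks: pages owned by another domain carry a release callback and
 * bypass the normal free path.  Standalone illustration only. */
#include <stdio.h>
#include <stdbool.h>

struct page {
	bool foreign;
	void (*destructor)(struct page *page, unsigned int order);
};

static void grant_release(struct page *page, unsigned int order)
{
	printf("returning foreign page (order %u) to its owner\n", order);
}

static void free_page_demo(struct page *page, unsigned int order)
{
	if (page->foreign) {
		page->destructor(page, order);   /* PageForeignDestructor() */
		return;                          /* never reaches the buddy free */
	}
	printf("normal free of order-%u page\n", order);
}

int main(void)
{
	struct page normal = { false, NULL };
	struct page granted = { true, grant_release };

	free_page_demo(&normal, 0);
	free_page_demo(&granted, 0);
	return 0;
}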
+diff --git a/mm/vmalloc.c b/mm/vmalloc.c
+index c228731..cb459fb 100644
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -31,6 +31,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/shmparam.h>
+
++bool vmap_lazy_unmap __read_mostly = true;
+
+ /*** Page table manipulation functions ***/
+
+@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
+ {
+ unsigned int log;
+
++ if (!vmap_lazy_unmap)
++ return 0;
++
+ log = fls(num_online_cpus());
+
+ return log * (32UL * 1024 * 1024 / PAGE_SIZE);
+@@ -561,8 +565,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
+ }
+ rcu_read_unlock();
+
+- if (nr)
++ if (nr) {
+ atomic_sub(nr, &vmap_lazy_nr);
++ }
+
+ if (nr || force_flush)
+ flush_tlb_kernel_range(*start, *end);
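The mm/vmalloc.c change adds a vmap_lazy_unmap switch: when it is cleared, lazy_max_pages() returns 0 and every vunmap is flushed immediately, which the Xen code relies on. The normal budget is fls(num_online_cpus()) multiplied by 32 MiB worth of pages; the sketch below just evaluates that formula for a couple of cases (fls() is re-implemented here for the demo):

/* Demo of the lazy_max_pages() budget: fls(cpus) * (32 MiB / PAGE_SIZE),
 * or 0 when lazy unmapping is disabled, as the Xen patch can arrange. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static int fls_demo(unsigned int x)        /* index of highest set bit, 1-based */
{
	int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}

static unsigned long lazy_max_pages_demo(unsigned int cpus, int lazy_enabled)
{
	if (!lazy_enabled)
		return 0;                      /* flush on every vunmap */
	return fls_demo(cpus) * (32UL * 1024 * 1024 / PAGE_SIZE);
}

int main(void)
{
	printf("4 cpus, lazy on : %lu pages\n", lazy_max_pages_demo(4, 1));
	printf("4 cpus, lazy off: %lu pages\n", lazy_max_pages_demo(4, 0));
	return 0;
}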
+diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
+index d4fd895..4ab8c97 100644
+--- a/net/core/rtnetlink.c
++++ b/net/core/rtnetlink.c
+@@ -35,6 +35,7 @@
+ #include <linux/security.h>
+ #include <linux/mutex.h>
+ #include <linux/if_addr.h>
++#include <linux/pci.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -582,6 +583,22 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
+ a->tx_compressed = b->tx_compressed;
+ };
+
++/* All VF info */
++static inline int rtnl_vfinfo_size(const struct net_device *dev)
++{
++ if (dev->dev.parent && dev_is_pci(dev->dev.parent)) {
++
++ int num_vfs = dev_num_vf(dev->dev.parent);
++ size_t size = nlmsg_total_size(sizeof(struct nlattr));
++ size += nlmsg_total_size(num_vfs * sizeof(struct nlattr));
++ size += num_vfs * (sizeof(struct ifla_vf_mac) +
++ sizeof(struct ifla_vf_vlan) +
++ sizeof(struct ifla_vf_tx_rate));
++ return size;
++ } else
++ return 0;
++}
++
+ static inline size_t if_nlmsg_size(const struct net_device *dev)
+ {
+ return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+@@ -599,6 +616,8 @@ static inline size_t if_nlmsg_size(const struct net_device *dev)
+ + nla_total_size(4) /* IFLA_MASTER */
+ + nla_total_size(1) /* IFLA_OPERSTATE */
+ + nla_total_size(1) /* IFLA_LINKMODE */
++ + nla_total_size(4) /* IFLA_NUM_VF */
++ + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */
+ + rtnl_link_get_size(dev); /* IFLA_LINKINFO */
+ }
+
+@@ -667,6 +686,40 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
+ stats = dev_get_stats(dev);
+ copy_rtnl_link_stats(nla_data(attr), stats);
+
++ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) {
++ int i;
++
++ struct nlattr *vfinfo, *vf;
++ int num_vfs = dev_num_vf(dev->dev.parent);
++
++ NLA_PUT_U32(skb, IFLA_NUM_VF, num_vfs);
++ vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
++ if (!vfinfo)
++ goto nla_put_failure;
++ for (i = 0; i < num_vfs; i++) {
++ struct ifla_vf_info ivi;
++ struct ifla_vf_mac vf_mac;
++ struct ifla_vf_vlan vf_vlan;
++ struct ifla_vf_tx_rate vf_tx_rate;
++ if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
++ break;
++ vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
++ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
++ vf_vlan.vlan = ivi.vlan;
++ vf_vlan.qos = ivi.qos;
++ vf_tx_rate.rate = ivi.tx_rate;
++ vf = nla_nest_start(skb, IFLA_VF_INFO);
++ if (!vf) {
++ nla_nest_cancel(skb, vfinfo);
++ goto nla_put_failure;
++ }
++ NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
++ NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
++ NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
++ nla_nest_end(skb, vf);
++ }
++ nla_nest_end(skb, vfinfo);
++ }
+ if (dev->rtnl_link_ops) {
+ if (rtnl_link_fill(skb, dev) < 0)
+ goto nla_put_failure;
+@@ -716,6 +769,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_LINKINFO] = { .type = NLA_NESTED },
+ [IFLA_NET_NS_PID] = { .type = NLA_U32 },
+ [IFLA_IFALIAS] = { .type = NLA_STRING, .len = IFALIASZ-1 },
++ [IFLA_VFINFO_LIST] = {. type = NLA_NESTED },
+ };
+
+ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+@@ -723,6 +777,33 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
+ [IFLA_INFO_DATA] = { .type = NLA_NESTED },
+ };
+
++static const struct nla_policy ifla_vfinfo_policy[IFLA_VF_INFO_MAX+1] = {
++ [IFLA_VF_INFO] = { .type = NLA_NESTED },
++};
++
++static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
++ [IFLA_VF_MAC] = { .type = NLA_BINARY,
++ .len = sizeof(struct ifla_vf_mac) },
++ [IFLA_VF_VLAN] = { .type = NLA_BINARY,
++ .len = sizeof(struct ifla_vf_vlan) },
++ [IFLA_VF_TX_RATE] = { .type = NLA_BINARY,
++ .len = sizeof(struct ifla_vf_tx_rate) },
++};
++
++struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
++{
++ struct net *net;
++ /* Examine the link attributes and figure out which
++ * network namespace we are talking about.
++ */
++ if (tb[IFLA_NET_NS_PID])
++ net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
++ else
++ net = get_net(src_net);
++ return net;
++}
++EXPORT_SYMBOL(rtnl_link_get_net);
++
+ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+ {
+ if (dev) {
+@@ -738,6 +819,52 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+ return 0;
+ }
+
++static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
++{
++ int rem, err = -EINVAL;
++ struct nlattr *vf;
++ const struct net_device_ops *ops = dev->netdev_ops;
++
++ nla_for_each_nested(vf, attr, rem) {
++ switch (nla_type(vf)) {
++ case IFLA_VF_MAC: {
++ struct ifla_vf_mac *ivm;
++ ivm = nla_data(vf);
++ err = -EOPNOTSUPP;
++ if (ops->ndo_set_vf_mac)
++ err = ops->ndo_set_vf_mac(dev, ivm->vf,
++ ivm->mac);
++ break;
++ }
++ case IFLA_VF_VLAN: {
++ struct ifla_vf_vlan *ivv;
++ ivv = nla_data(vf);
++ err = -EOPNOTSUPP;
++ if (ops->ndo_set_vf_vlan)
++ err = ops->ndo_set_vf_vlan(dev, ivv->vf,
++ ivv->vlan,
++ ivv->qos);
++ break;
++ }
++ case IFLA_VF_TX_RATE: {
++ struct ifla_vf_tx_rate *ivt;
++ ivt = nla_data(vf);
++ err = -EOPNOTSUPP;
++ if (ops->ndo_set_vf_tx_rate)
++ err = ops->ndo_set_vf_tx_rate(dev, ivt->vf,
++ ivt->rate);
++ break;
++ }
++ default:
++ err = -EINVAL;
++ break;
++ }
++ if (err)
++ break;
++ }
++ return err;
++}
++
+ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
+ struct nlattr **tb, char *ifname, int modified)
+ {
+@@ -875,6 +1002,18 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
+ write_unlock_bh(&dev_base_lock);
+ }
+
++ if (tb[IFLA_VFINFO_LIST]) {
++ struct nlattr *attr;
++ int rem;
++ nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
++ if (nla_type(attr) != IFLA_VF_INFO)
++ goto errout;
++ err = do_setvfinfo(dev, attr);
++ if (err < 0)
++ goto errout;
++ modified = 1;
++ }
++ }
+ err = 0;
+
+ errout:
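The rtnetlink.c additions reserve room for the SR-IOV attributes (rtnl_vfinfo_size()) and then emit one nested IFLA_VF_INFO per virtual function inside IFLA_VFINFO_LIST. The sketch below only models the size accounting, nested attribute headers plus 4-byte-aligned payloads per VF; the struct layouts in it are simplified stand-ins, not the uapi definitions:

/* Rough model of rtnl_vfinfo_size(): per-VF attribute payloads plus
 * netlink attribute headers, all rounded to 4-byte alignment.  The
 * struct layouts below are simplified stand-ins, not the uapi ones. */
#include <stdio.h>
#include <stdint.h>

#define NLA_ALIGN(len) (((len) + 3) & ~(size_t)3)
#define NLA_HDRLEN     NLA_ALIGN(4)             /* attribute header, illustrative */

struct vf_mac     { uint32_t vf; uint8_t mac[6]; };  /* stand-in */
struct vf_vlan    { uint32_t vf, vlan, qos; };       /* stand-in */
struct vf_tx_rate { uint32_t vf, rate; };            /* stand-in */

static size_t nla_total(size_t payload)
{
	return NLA_HDRLEN + NLA_ALIGN(payload);
}

static size_t vfinfo_size(unsigned int num_vfs)
{
	size_t per_vf = nla_total(sizeof(struct vf_mac)) +
			nla_total(sizeof(struct vf_vlan)) +
			nla_total(sizeof(struct vf_tx_rate));

	/* one IFLA_VFINFO_LIST nest plus one IFLA_VF_INFO nest per VF */
	return nla_total(0) + num_vfs * (nla_total(0) + per_vf);
}

int main(void)
{
	for (unsigned int vfs = 0; vfs <= 4; vfs += 2)
		printf("%u VFs -> reserve about %zu bytes\n", vfs, vfinfo_size(vfs));
	return 0;
}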
+diff --git a/net/sched/Kconfig b/net/sched/Kconfig
+index 929218a..956cd0a 100644
+--- a/net/sched/Kconfig
++++ b/net/sched/Kconfig
+@@ -215,6 +215,26 @@ config NET_SCH_INGRESS
+ To compile this code as a module, choose M here: the
+ module will be called sch_ingress.
+
++config NET_SCH_PLUG
++ tristate "Plug network traffic until release"
++ ---help---
++ Say Y here if you are using this kernel for Xen dom0 and
++ want to protect Xen guests with Remus.
++
++ This queueing discipline is controlled by netlink. When it receives an
++ enqueue command it inserts a plug into the outbound queue that causes
++ following packets to enqueue until a dequeue command arrives over
++ netlink, releasing packets up to the plug for delivery.
++
++ Its intention is to support speculative execution by allowing generated
++ network traffic to be rolled back. It is used to provide network
++ protection for the Remus high availability project.
++
++ If unsure, say N.
++
++ To compile this code as a module, choose M here: the
++ module will be called sch_plug.
++
+ comment "Classification"
+
+ config NET_CLS
+diff --git a/net/sched/Makefile b/net/sched/Makefile
+index f14e71b..61ef5f7 100644
+--- a/net/sched/Makefile
++++ b/net/sched/Makefile
+@@ -31,6 +31,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
+ obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
+ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
+ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
++obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
+ obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
+ obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
+ obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
+diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c
+new file mode 100644
+index 0000000..86c3ee1
+--- /dev/null
++++ b/net/sched/sch_plug.c
+@@ -0,0 +1,156 @@
++/*
++ * sch_plug.c Queue traffic until an explicit release command
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ *
++ * The operation of the buffer is as follows:
++ * When a checkpoint begins, a plug is inserted into the
++ * network queue by a netlink request (it operates by storing
++ * a pointer to the next packet which arrives and blocking dequeue
++ * when that packet is at the head of the queue).
++ * When a checkpoint completes (the backup acknowledges receipt),
++ * currently-queued packets are released.
++ * So it supports two operations, plug and unplug.
++ */
++
++#include <linux/module.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/errno.h>
++#include <linux/netdevice.h>
++#include <linux/skbuff.h>
++#include <net/pkt_sched.h>
++
++#define FIFO_BUF (10*1024*1024)
++
++#define TCQ_PLUG 0
++#define TCQ_UNPLUG 1
++
++struct plug_sched_data {
++ /*
++ * This packet is the first packet which should not be
++ * delivered. If it is NULL, plug_enqueue will set it to the
++ * next packet it sees.
++ */
++ struct sk_buff *stop;
++};
++
++struct tc_plug_qopt {
++ /* 0: reset stop packet pointer
++ * 1: dequeue to stop pointer */
++ int action;
++};
++
++static int skb_remove_foreign_references(struct sk_buff *skb)
++{
++ return !skb_linearize(skb);
++}
++
++static int plug_enqueue(struct sk_buff *skb, struct Qdisc* sch)
++{
++ struct plug_sched_data *q = qdisc_priv(sch);
++
++ if (likely(sch->qstats.backlog + skb->len <= FIFO_BUF)) {
++ if (!q->stop)
++ q->stop = skb;
++
++ if (!skb_remove_foreign_references(skb)) {
++ printk(KERN_DEBUG "error removing foreign ref\n");
++ return qdisc_reshape_fail(skb, sch);
++ }
++
++ return qdisc_enqueue_tail(skb, sch);
++ }
++ printk(KERN_WARNING "queue reported full: %d,%d\n",
++ sch->qstats.backlog, skb->len);
++
++ return qdisc_reshape_fail(skb, sch);
++}
++
++/* dequeue doesn't actually dequeue until the release command is
++ * received. */
++static struct sk_buff *plug_dequeue(struct Qdisc* sch)
++{
++ struct plug_sched_data *q = qdisc_priv(sch);
++ struct sk_buff *peek;
++
++ if (sch->flags & TCQ_F_THROTTLED)
++ return NULL;
++
++ peek = (struct sk_buff *)((sch->q).next);
++
++ /* this pointer comparison may be shady */
++ if (peek == q->stop) {
++ /*
++ * This is the tail of the last round. Release it and
++ * block the queue
++ */
++ sch->flags |= TCQ_F_THROTTLED;
++ return NULL;
++ }
++
++ return qdisc_dequeue_head(sch);
++}
++
++static int plug_init(struct Qdisc *sch, struct nlattr *opt)
++{
++ sch->flags |= TCQ_F_THROTTLED;
++
++ return 0;
++}
++
++/*
++ * receives two messages:
++ * 0: checkpoint queue (set stop to next packet)
++ * 1: dequeue until stop
++ */
++static int plug_change(struct Qdisc *sch, struct nlattr *opt)
++{
++ struct plug_sched_data *q = qdisc_priv(sch);
++ struct tc_plug_qopt *msg;
++
++ if (!opt || nla_len(opt) < sizeof(*msg))
++ return -EINVAL;
++
++ msg = nla_data(opt);
++
++ if (msg->action == TCQ_PLUG) {
++ /* reset stop */
++ q->stop = NULL;
++ } else if (msg->action == TCQ_UNPLUG) {
++ /* dequeue */
++ sch->flags &= ~TCQ_F_THROTTLED;
++ netif_schedule_queue(sch->dev_queue);
++ } else {
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++struct Qdisc_ops plug_qdisc_ops = {
++ .id = "plug",
++ .priv_size = sizeof(struct plug_sched_data),
++ .enqueue = plug_enqueue,
++ .dequeue = plug_dequeue,
++ .peek = qdisc_peek_head,
++ .init = plug_init,
++ .change = plug_change,
++ .owner = THIS_MODULE,
++};
++
++static int __init plug_module_init(void)
++{
++ return register_qdisc(&plug_qdisc_ops);
++}
++
++static void __exit plug_module_exit(void)
++{
++ unregister_qdisc(&plug_qdisc_ops);
++}
++module_init(plug_module_init)
++module_exit(plug_module_exit)
++MODULE_LICENSE("GPL");
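sch_plug above buffers a domain's outbound traffic between Remus checkpoints: enqueue records the first packet of the not-yet-acknowledged epoch in q->stop, TCQ_PLUG resets that marker when a new checkpoint starts, and TCQ_UNPLUG un-throttles the queue so dequeue runs until it reaches the marker again. A userspace model of that state machine on a plain array queue (the byte limit and netlink plumbing are left out):

/* Userspace model of the sch_plug buffer: packets queue freely, but
 * dequeue stops at the "stop" marker until an unplug releases the
 * checkpointed epoch.  Illustrative only; not the qdisc code. */
#include <stdio.h>

#define QLEN 8

static int queue[QLEN];
static int head, tail;
static int stop = -1;        /* index of first packet that must not leave yet */
static int throttled = 1;    /* starts plugged, as plug_init() does */

static void enqueue(int pkt)
{
	if (stop < 0)
		stop = tail;           /* remember where the new epoch begins */
	queue[tail++] = pkt;
}

static void plug(void)   { stop = -1; }       /* TCQ_PLUG: start a new epoch  */
static void unplug(void) { throttled = 0; }   /* TCQ_UNPLUG: release packets  */

static int dequeue(int *pkt)
{
	if (throttled || head == tail)
		return 0;
	if (head == stop) {            /* hit the epoch boundary: block again */
		throttled = 1;
		return 0;
	}
	*pkt = queue[head++];
	return 1;
}

int main(void)
{
	int pkt;

	enqueue(1); enqueue(2);        /* epoch A, still plugged            */
	plug();                        /* checkpoint: epoch B starts next   */
	enqueue(3);
	unplug();                      /* backup acknowledged epoch A       */
	while (dequeue(&pkt))
		printf("released packet %d\n", pkt);   /* 1 and 2, not 3 */
	return 0;
}

In the real qdisc the marker is a pointer comparison against the head skb (flagged as possibly shady in the code itself), and enqueue is additionally capped at FIFO_BUF bytes of backlog.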
diff --git a/xen.pvops.post.patch b/xen.pvops.post.patch
new file mode 100644
index 0000000..495a81a
--- /dev/null
+++ b/xen.pvops.post.patch
@@ -0,0 +1,68 @@
+Reapply and merge in Fedora changes
+
+--- a/drivers/pci/pci.h 2009-02-25 20:16:13.000000000 +0000
++++ b/drivers/pci/pci.h 2009-02-25 20:40:21.000000000 +0000
+@@ -111,9 +111,11 @@
+
+ #ifdef CONFIG_PCI_MSI
+ void pci_no_msi(void);
++void pci_yes_msi(void);
+ extern void pci_msi_init_pci_dev(struct pci_dev *dev);
+ #else
+ static inline void pci_no_msi(void) { }
++static inline void pci_yes_msi(void) { }
+ static inline void pci_msi_init_pci_dev(struct pci_dev *dev) { }
+ #endif
+
+diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
+index 2202b62..f371fe8 100644
+--- a/arch/x86/pci/common.c
++++ b/arch/x86/pci/common.c
+@@ -432,6 +432,22 @@ int __init pcibios_init(void)
+ pci_cache_line_size = 64 >> 2; /* K7 & K8 */
+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
+ pci_cache_line_size = 128 >> 2; /* P4 */
++ if (c->x86_clflush_size != (pci_cache_line_size <<2))
++ printk(KERN_DEBUG "PCI: old code would have set cacheline "
++ "size to %d bytes, but clflush_size = %d\n",
++ pci_cache_line_size << 2,
++ c->x86_clflush_size);
++
++ /* Once we know this logic works, all the above code can be deleted. */
++ if (c->x86_clflush_size > 0) {
++ pci_cache_line_size = c->x86_clflush_size >> 2;
++ printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
++ pci_cache_line_size << 2);
++ } else {
++ pci_cache_line_size = 32 >> 2;
++ printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
++ }
++
+ }
+
+ int __init pcibios_init(void)
+--- a/arch/x86/include/asm/mmu.h
++++ b/arch/x86/include/asm/mmu.h
+@@ -7,6 +7,9 @@
+ /*
+ * The x86 doesn't have a mmu context, but
+ * we put the segment information here.
++ *
++ * exec_limit is used to track the range PROT_EXEC
++ * mappings span.
+ */
+ typedef struct {
+ void *ldt;
+--- a/arch/x86/include/asm/mmu.h
++++ b/arch/x86/include/asm/mmu.h
+@@ -16,6 +19,10 @@
+ #ifdef CONFIG_XEN
+ int has_foreign_mappings;
+ #endif
++#ifdef CONFIG_X86_32
++ struct desc_struct user_cs;
++ unsigned long exec_limit;
++#endif
+ } mm_context_t;
+
+ #ifdef CONFIG_SMP
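xen.pvops.post.patch re-applies Fedora's cacheline sizing on top of the pvops tree: pci_cache_line_size is derived from the CPU's reported clflush size, and it is stored in dword units, which is why the code shifts by two in both directions. A tiny standalone sketch of that conversion:

/* Sketch of the clflush-based cacheline sizing the post patch restores:
 * pci_cache_line_size is kept in dword (4-byte) units, so a 64-byte
 * clflush size becomes 16.  Standalone illustration only. */
#include <stdio.h>

static unsigned int cache_line_size_from_clflush(unsigned int clflush_bytes)
{
	if (clflush_bytes > 0)
		return clflush_bytes >> 2;   /* bytes -> dwords */
	return 32 >> 2;                      /* unknown: fall back to 32 bytes */
}

int main(void)
{
	unsigned int dwords = cache_line_size_from_clflush(64);
	printf("clflush 64 bytes -> pci_cache_line_size %u (= %u bytes)\n",
	       dwords, dwords << 2);
	return 0;
}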
diff --git a/xen.pvops.pre.patch b/xen.pvops.pre.patch
new file mode 100644
index 0000000..a978beb
--- /dev/null
+++ b/xen.pvops.pre.patch
@@ -0,0 +1,69 @@
+Temporarily revert various Fedora changes so that the pvops patch applies cleanly.
+Affected patches:
+linux-2.6-defaults-pci_no_msi.patch - drivers/pci/pci.h
+linux-2.6-pci-cacheline-sizing.patch - arch/x86/pci/common.c
+linux-2.6-execshield.patch - arch/x86/include/asm/mmu.h
+
+--- a/drivers/pci/pci.h 2009-04-24 20:46:50.000000000 +0100
++++ b/drivers/pci/pci.h 2009-04-23 20:13:43.000000000 +0100
+@@ -112,11 +112,9 @@
+
+ #ifdef CONFIG_PCI_MSI
+ void pci_no_msi(void);
+-void pci_yes_msi(void);
+ extern void pci_msi_init_pci_dev(struct pci_dev *dev);
+ #else
+ static inline void pci_no_msi(void) { }
+-static inline void pci_yes_msi(void) { }
+ static inline void pci_msi_init_pci_dev(struct pci_dev *dev) { }
+ #endif
+
+diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
+index 2202b62..f371fe8 100644
+--- a/arch/x86/pci/common.c
++++ b/arch/x86/pci/common.c
+@@ -432,22 +432,6 @@ int __init pcibios_init(void)
+ else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
+ pci_cache_line_size = 128 >> 2; /* P4 */
+
+- if (c->x86_clflush_size != (pci_cache_line_size <<2))
+- printk(KERN_DEBUG "PCI: old code would have set cacheline "
+- "size to %d bytes, but clflush_size = %d\n",
+- pci_cache_line_size << 2,
+- c->x86_clflush_size);
+-
+- /* Once we know this logic works, all the above code can be deleted. */
+- if (c->x86_clflush_size > 0) {
+- pci_cache_line_size = c->x86_clflush_size >> 2;
+- printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
+- pci_cache_line_size << 2);
+- } else {
+- pci_cache_line_size = 32 >> 2;
+- printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
+- }
+-
+ pcibios_resource_survey();
+
+ if (pci_bf_sort >= pci_force_bf)
+--- a/arch/x86/include/asm/mmu.h
++++ b/arch/x86/include/asm/mmu.h
+@@ -7,19 +7,12 @@
+ /*
+ * The x86 doesn't have a mmu context, but
+ * we put the segment information here.
+- *
+- * exec_limit is used to track the range PROT_EXEC
+- * mappings span.
+ */
+ typedef struct {
+ void *ldt;
+ int size;
+ struct mutex lock;
+ void *vdso;
+-#ifdef CONFIG_X86_32
+- struct desc_struct user_cs;
+- unsigned long exec_limit;
+-#endif
+ } mm_context_t;
+
+ #ifdef CONFIG_SMP