summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/platforms/pseries
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig131
-rw-r--r--arch/powerpc/platforms/pseries/Makefile29
-rw-r--r--arch/powerpc/platforms/pseries/cmm.c742
-rw-r--r--arch/powerpc/platforms/pseries/dlpar.c565
-rw-r--r--arch/powerpc/platforms/pseries/dtl.c396
-rw-r--r--arch/powerpc/platforms/pseries/eeh.c1346
-rw-r--r--arch/powerpc/platforms/pseries/eeh_cache.c308
-rw-r--r--arch/powerpc/platforms/pseries/eeh_driver.c511
-rw-r--r--arch/powerpc/platforms/pseries/eeh_event.c158
-rw-r--r--arch/powerpc/platforms/pseries/eeh_sysfs.c87
-rw-r--r--arch/powerpc/platforms/pseries/event_sources.c86
-rw-r--r--arch/powerpc/platforms/pseries/firmware.c87
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-cpu.c415
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-memory.c236
-rw-r--r--arch/powerpc/platforms/pseries/hvCall.S270
-rw-r--r--arch/powerpc/platforms/pseries/hvCall_inst.c165
-rw-r--r--arch/powerpc/platforms/pseries/hvconsole.c81
-rw-r--r--arch/powerpc/platforms/pseries/hvcserver.c251
-rw-r--r--arch/powerpc/platforms/pseries/io_event_irq.c229
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c1299
-rw-r--r--arch/powerpc/platforms/pseries/kexec.c76
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c640
-rw-r--r--arch/powerpc/platforms/pseries/mobility.c363
-rw-r--r--arch/powerpc/platforms/pseries/msi.c491
-rw-r--r--arch/powerpc/platforms/pseries/nvram.c679
-rw-r--r--arch/powerpc/platforms/pseries/offline_states.h37
-rw-r--r--arch/powerpc/platforms/pseries/pci.c109
-rw-r--r--arch/powerpc/platforms/pseries/pci_dlpar.c209
-rw-r--r--arch/powerpc/platforms/pseries/phyp_dump.c513
-rw-r--r--arch/powerpc/platforms/pseries/plpar_wrappers.h276
-rw-r--r--arch/powerpc/platforms/pseries/power.c81
-rw-r--r--arch/powerpc/platforms/pseries/processor_idle.c329
-rw-r--r--arch/powerpc/platforms/pseries/pseries.h63
-rw-r--r--arch/powerpc/platforms/pseries/pseries_energy.c323
-rw-r--r--arch/powerpc/platforms/pseries/ras.c347
-rw-r--r--arch/powerpc/platforms/pseries/reconfig.c564
-rw-r--r--arch/powerpc/platforms/pseries/scanlog.c214
-rw-r--r--arch/powerpc/platforms/pseries/setup.c653
-rw-r--r--arch/powerpc/platforms/pseries/smp.c259
-rw-r--r--arch/powerpc/platforms/pseries/suspend.c220
40 files changed, 13838 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
new file mode 100644
index 00000000000..31f22c1f657
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -0,0 +1,131 @@
+config PPC_PSERIES
+ depends on PPC64 && PPC_BOOK3S
+ bool "IBM pSeries & new (POWER5-based) iSeries"
+ select HAVE_PCSPKR_PLATFORM
+ select MPIC
+ select PCI_MSI
+ select PPC_XICS
+ select PPC_ICP_NATIVE
+ select PPC_ICP_HV
+ select PPC_ICS_RTAS
+ select PPC_I8259
+ select PPC_RTAS
+ select PPC_RTAS_DAEMON
+ select RTAS_ERROR_LOGGING
+ select PPC_UDBG_16550
+ select PPC_NATIVE
+ select PPC_PCI_CHOICE if EXPERT
+ select ZLIB_DEFLATE
+ default y
+
+config PPC_SPLPAR
+ depends on PPC_PSERIES
+ bool "Support for shared-processor logical partitions"
+ default n
+ help
+ Enabling this option will make the kernel run more efficiently
+ on logically-partitioned pSeries systems which use shared
+ processors, that is, which share physical processors between
+ two or more partitions.
+
+config EEH
+ bool "PCI Extended Error Handling (EEH)" if EXPERT
+ depends on PPC_PSERIES && PCI
+ default y if !EXPERT
+
+config PSERIES_MSI
+ bool
+ depends on PCI_MSI && EEH
+ default y
+
+config PSERIES_ENERGY
+ tristate "pSeries energy management capabilities driver"
+ depends on PPC_PSERIES
+ default y
+ help
+ Provides interface to platform energy management capabilities
+ on supported PSERIES platforms.
+ Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list
+ and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint
+
+config SCANLOG
+ tristate "Scanlog dump interface"
+ depends on RTAS_PROC && PPC_PSERIES
+
+config IO_EVENT_IRQ
+ bool "IO Event Interrupt support"
+ depends on PPC_PSERIES
+ default y
+ help
+ Select this option, if you want to enable support for IO Event
+ interrupts. IO event interrupt is a mechanism provided by RTAS
+ to return information about hardware error and non-error events
+ which may need OS attention. RTAS returns events for multiple
+ event types and scopes. Device drivers can register their handlers
+ to receive events.
+
+ This option will only enable the IO event platform code. You
+ will still need to enable or compile the actual drivers
+ that use this infrastruture to handle IO event interrupts.
+
+ Say Y if you are unsure.
+
+config LPARCFG
+ bool "LPAR Configuration Data"
+ depends on PPC_PSERIES || PPC_ISERIES
+ help
+ Provide system capacity information via human readable
+ <key word>=<value> pairs through a /proc/ppc64/lparcfg interface.
+
+config PPC_PSERIES_DEBUG
+ depends on PPC_PSERIES && PPC_EARLY_DEBUG
+ bool "Enable extra debug logging in platforms/pseries"
+ help
+ Say Y here if you want the pseries core to produce a bunch of
+ debug messages to the system log. Select this if you are having a
+ problem with the pseries core and want to see more of what is
+ going on. This does not enable debugging in lpar.c, which must
+ be manually done due to its verbosity.
+ default y
+
+config PPC_SMLPAR
+ bool "Support for shared-memory logical partitions"
+ depends on PPC_PSERIES
+ select LPARCFG
+ default n
+ help
+ Select this option to enable shared memory partition support.
+ With this option a system running in an LPAR can be given more
+ memory than physically available and will allow firmware to
+ balance memory across many LPARs.
+
+config CMM
+ tristate "Collaborative memory management"
+ depends on PPC_SMLPAR
+ default y
+ help
+ Select this option, if you want to enable the kernel interface
+ to reduce the memory size of the system. This is accomplished
+ by allocating pages of memory and put them "on hold". This only
+ makes sense for a system running in an LPAR where the unused pages
+ will be reused for other LPARs. The interface allows firmware to
+ balance memory across many LPARs.
+
+config DTL
+ bool "Dispatch Trace Log"
+ depends on PPC_SPLPAR && DEBUG_FS
+ help
+ SPLPAR machines can log hypervisor preempt & dispatch events to a
+ kernel buffer. Saying Y here will enable logging these events,
+ which are accessible through a debugfs file.
+
+ Say N if you are unsure.
+
+config PSERIES_IDLE
+ bool "Cpuidle driver for pSeries platforms"
+ depends on CPU_IDLE
+ depends on PPC_PSERIES
+ default y
+ help
+ Select this option to enable processor idle state management
+ through cpuidle subsystem.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
new file mode 100644
index 00000000000..236db46b407
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -0,0 +1,29 @@
+ccflags-$(CONFIG_PPC64) := -mno-minimal-toc
+ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG
+
+obj-y := lpar.o hvCall.o nvram.o reconfig.o \
+ setup.o iommu.o event_sources.o ras.o \
+ firmware.o power.o dlpar.o mobility.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SCANLOG) += scanlog.o
+obj-$(CONFIG_EEH) += eeh.o eeh_cache.o eeh_driver.o eeh_event.o eeh_sysfs.o
+obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_PCI) += pci.o pci_dlpar.o
+obj-$(CONFIG_PSERIES_MSI) += msi.o
+obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o
+
+obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o
+obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o
+
+obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o
+obj-$(CONFIG_HVCS) += hvcserver.o
+obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o
+obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o
+obj-$(CONFIG_CMM) += cmm.o
+obj-$(CONFIG_DTL) += dtl.o
+obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o
+obj-$(CONFIG_PSERIES_IDLE) += processor_idle.o
+
+ifeq ($(CONFIG_PPC_PSERIES),y)
+obj-$(CONFIG_SUSPEND) += suspend.o
+endif
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
new file mode 100644
index 00000000000..c638535753d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -0,0 +1,742 @@
+/*
+ * Collaborative memory management interface.
+ *
+ * Copyright (C) 2008 IBM Corporation
+ * Author(s): Brian King (brking@linux.vnet.ibm.com),
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/oom.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/stringify.h>
+#include <linux/swap.h>
+#include <linux/device.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/mmu.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <linux/memory.h>
+
+#include "plpar_wrappers.h"
+
+#define CMM_DRIVER_VERSION "1.0.0"
+#define CMM_DEFAULT_DELAY 1
+#define CMM_HOTPLUG_DELAY 5
+#define CMM_DEBUG 0
+#define CMM_DISABLE 0
+#define CMM_OOM_KB 1024
+#define CMM_MIN_MEM_MB 256
+#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10))
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+/*
+ * The priority level tries to ensure that this notifier is called as
+ * late as possible to reduce thrashing in the shared memory pool.
+ */
+#define CMM_MEM_HOTPLUG_PRI 1
+#define CMM_MEM_ISOLATE_PRI 15
+
+static unsigned int delay = CMM_DEFAULT_DELAY;
+static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY;
+static unsigned int oom_kb = CMM_OOM_KB;
+static unsigned int cmm_debug = CMM_DEBUG;
+static unsigned int cmm_disabled = CMM_DISABLE;
+static unsigned long min_mem_mb = CMM_MIN_MEM_MB;
+static struct device cmm_dev;
+
+MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(CMM_DRIVER_VERSION);
+
+module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. "
+ "[Default=" __stringify(CMM_DEFAULT_DELAY) "]");
+module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove "
+ "before loaning resumes. "
+ "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]");
+module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. "
+ "[Default=" __stringify(CMM_OOM_KB) "]");
+module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. "
+ "[Default=" __stringify(CMM_MIN_MEM_MB) "]");
+module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. "
+ "[Default=" __stringify(CMM_DEBUG) "]");
+
+#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long))
+
+#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); }
+
+struct cmm_page_array {
+ struct cmm_page_array *next;
+ unsigned long index;
+ unsigned long page[CMM_NR_PAGES];
+};
+
+static unsigned long loaned_pages;
+static unsigned long loaned_pages_target;
+static unsigned long oom_freed_pages;
+
+static struct cmm_page_array *cmm_page_list;
+static DEFINE_SPINLOCK(cmm_lock);
+
+static DEFINE_MUTEX(hotplug_mutex);
+static int hotplug_occurred; /* protected by the hotplug mutex */
+
+static struct task_struct *cmm_thread_ptr;
+
+/**
+ * cmm_alloc_pages - Allocate pages and mark them as loaned
+ * @nr: number of pages to allocate
+ *
+ * Return value:
+ * number of pages requested to be allocated which were not
+ **/
+static long cmm_alloc_pages(long nr)
+{
+ struct cmm_page_array *pa, *npa;
+ unsigned long addr;
+ long rc;
+
+ cmm_dbg("Begin request for %ld pages\n", nr);
+
+ while (nr) {
+ /* Exit if a hotplug operation is in progress or occurred */
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ mutex_unlock(&hotplug_mutex);
+ break;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ break;
+ }
+
+ addr = __get_free_page(GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
+ if (!addr)
+ break;
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ if (!pa || pa->index >= CMM_NR_PAGES) {
+ /* Need a new page for the page list. */
+ spin_unlock(&cmm_lock);
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
+ if (!npa) {
+ pr_info("%s: Can not allocate new page list\n", __func__);
+ free_page(addr);
+ break;
+ }
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+
+ if (!pa || pa->index >= CMM_NR_PAGES) {
+ npa->next = pa;
+ npa->index = 0;
+ pa = npa;
+ cmm_page_list = pa;
+ } else
+ free_page((unsigned long) npa);
+ }
+
+ if ((rc = plpar_page_set_loaned(__pa(addr)))) {
+ pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, rc);
+ spin_unlock(&cmm_lock);
+ free_page(addr);
+ break;
+ }
+
+ pa->page[pa->index++] = addr;
+ loaned_pages++;
+ totalram_pages--;
+ spin_unlock(&cmm_lock);
+ nr--;
+ }
+
+ cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+ return nr;
+}
+
+/**
+ * cmm_free_pages - Free pages and mark them as active
+ * @nr: number of pages to free
+ *
+ * Return value:
+ * number of pages requested to be freed which were not
+ **/
+static long cmm_free_pages(long nr)
+{
+ struct cmm_page_array *pa;
+ unsigned long addr;
+
+ cmm_dbg("Begin free of %ld pages.\n", nr);
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (nr) {
+ if (!pa || pa->index <= 0)
+ break;
+ addr = pa->page[--pa->index];
+
+ if (pa->index == 0) {
+ pa = pa->next;
+ free_page((unsigned long) cmm_page_list);
+ cmm_page_list = pa;
+ }
+
+ plpar_page_set_active(__pa(addr));
+ free_page(addr);
+ loaned_pages--;
+ nr--;
+ totalram_pages++;
+ }
+ spin_unlock(&cmm_lock);
+ cmm_dbg("End request with %ld pages unfulfilled\n", nr);
+ return nr;
+}
+
+/**
+ * cmm_oom_notify - OOM notifier
+ * @self: notifier block struct
+ * @dummy: not used
+ * @parm: returned - number of pages freed
+ *
+ * Return value:
+ * NOTIFY_OK
+ **/
+static int cmm_oom_notify(struct notifier_block *self,
+ unsigned long dummy, void *parm)
+{
+ unsigned long *freed = parm;
+ long nr = KB2PAGES(oom_kb);
+
+ cmm_dbg("OOM processing started\n");
+ nr = cmm_free_pages(nr);
+ loaned_pages_target = loaned_pages;
+ *freed += KB2PAGES(oom_kb) - nr;
+ oom_freed_pages += KB2PAGES(oom_kb) - nr;
+ cmm_dbg("OOM processing complete\n");
+ return NOTIFY_OK;
+}
+
+/**
+ * cmm_get_mpp - Read memory performance parameters
+ *
+ * Makes hcall to query the current page loan request from the hypervisor.
+ *
+ * Return value:
+ * nothing
+ **/
+static void cmm_get_mpp(void)
+{
+ int rc;
+ struct hvcall_mpp_data mpp_data;
+ signed long active_pages_target, page_loan_request, target;
+ signed long total_pages = totalram_pages + loaned_pages;
+ signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
+
+ rc = h_get_mpp(&mpp_data);
+
+ if (rc != H_SUCCESS)
+ return;
+
+ page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
+ target = page_loan_request + (signed long)loaned_pages;
+
+ if (target < 0 || total_pages < min_mem_pages)
+ target = 0;
+
+ if (target > oom_freed_pages)
+ target -= oom_freed_pages;
+ else
+ target = 0;
+
+ active_pages_target = total_pages - target;
+
+ if (min_mem_pages > active_pages_target)
+ target = total_pages - min_mem_pages;
+
+ if (target < 0)
+ target = 0;
+
+ loaned_pages_target = target;
+
+ cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
+ page_loan_request, loaned_pages, loaned_pages_target,
+ oom_freed_pages, totalram_pages);
+}
+
+static struct notifier_block cmm_oom_nb = {
+ .notifier_call = cmm_oom_notify
+};
+
+/**
+ * cmm_thread - CMM task thread
+ * @dummy: not used
+ *
+ * Return value:
+ * 0
+ **/
+static int cmm_thread(void *dummy)
+{
+ unsigned long timeleft;
+
+ while (1) {
+ timeleft = msleep_interruptible(delay * 1000);
+
+ if (kthread_should_stop() || timeleft)
+ break;
+
+ if (mutex_trylock(&hotplug_mutex)) {
+ if (hotplug_occurred) {
+ hotplug_occurred = 0;
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Hotplug operation has occurred, "
+ "loaning activity suspended "
+ "for %d seconds.\n",
+ hotplug_delay);
+ timeleft = msleep_interruptible(hotplug_delay *
+ 1000);
+ if (kthread_should_stop() || timeleft)
+ break;
+ continue;
+ }
+ mutex_unlock(&hotplug_mutex);
+ } else {
+ cmm_dbg("Hotplug operation in progress, activity "
+ "suspended\n");
+ continue;
+ }
+
+ cmm_get_mpp();
+
+ if (loaned_pages_target > loaned_pages) {
+ if (cmm_alloc_pages(loaned_pages_target - loaned_pages))
+ loaned_pages_target = loaned_pages;
+ } else if (loaned_pages_target < loaned_pages)
+ cmm_free_pages(loaned_pages - loaned_pages_target);
+ }
+ return 0;
+}
+
+#define CMM_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages));
+CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target));
+
+static ssize_t show_oom_pages(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages));
+}
+
+static ssize_t store_oom_pages(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long val = simple_strtoul (buf, NULL, 10);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (val != 0)
+ return -EBADMSG;
+
+ oom_freed_pages = 0;
+ return count;
+}
+
+static DEVICE_ATTR(oom_freed_kb, S_IWUSR | S_IRUGO,
+ show_oom_pages, store_oom_pages);
+
+static struct device_attribute *cmm_attrs[] = {
+ &dev_attr_loaned_kb,
+ &dev_attr_loaned_target_kb,
+ &dev_attr_oom_freed_kb,
+};
+
+static struct bus_type cmm_subsys = {
+ .name = "cmm",
+ .dev_name = "cmm",
+};
+
+/**
+ * cmm_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int cmm_sysfs_register(struct device *dev)
+{
+ int i, rc;
+
+ if ((rc = subsys_system_register(&cmm_subsys, NULL)))
+ return rc;
+
+ dev->id = 0;
+ dev->bus = &cmm_subsys;
+
+ if ((rc = device_register(dev)))
+ goto subsys_unregister;
+
+ for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) {
+ if ((rc = device_create_file(dev, cmm_attrs[i])))
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ while (--i >= 0)
+ device_remove_file(dev, cmm_attrs[i]);
+ device_unregister(dev);
+subsys_unregister:
+ bus_unregister(&cmm_subsys);
+ return rc;
+}
+
+/**
+ * cmm_unregister_sysfs - Unregister from sysfs
+ *
+ **/
+static void cmm_unregister_sysfs(struct device *dev)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++)
+ device_remove_file(dev, cmm_attrs[i]);
+ device_unregister(dev);
+ bus_unregister(&cmm_subsys);
+}
+
+/**
+ * cmm_reboot_notifier - Make sure pages are not still marked as "loaned"
+ *
+ **/
+static int cmm_reboot_notifier(struct notifier_block *nb,
+ unsigned long action, void *unused)
+{
+ if (action == SYS_RESTART) {
+ if (cmm_thread_ptr)
+ kthread_stop(cmm_thread_ptr);
+ cmm_thread_ptr = NULL;
+ cmm_free_pages(loaned_pages);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block cmm_reboot_nb = {
+ .notifier_call = cmm_reboot_notifier,
+};
+
+/**
+ * cmm_count_pages - Count the number of pages loaned in a particular range.
+ *
+ * @arg: memory_isolate_notify structure with address range and count
+ *
+ * Return value:
+ * 0 on success
+ **/
+static unsigned long cmm_count_pages(void *arg)
+{
+ struct memory_isolate_notify *marg = arg;
+ struct cmm_page_array *pa;
+ unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end = start + (marg->nr_pages << PAGE_SHIFT);
+ unsigned long idx;
+
+ spin_lock(&cmm_lock);
+ pa = cmm_page_list;
+ while (pa) {
+ if ((unsigned long)pa >= start && (unsigned long)pa < end)
+ marg->pages_found++;
+ for (idx = 0; idx < pa->index; idx++)
+ if (pa->page[idx] >= start && pa->page[idx] < end)
+ marg->pages_found++;
+ pa = pa->next;
+ }
+ spin_unlock(&cmm_lock);
+ return 0;
+}
+
+/**
+ * cmm_memory_isolate_cb - Handle memory isolation notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_isolate_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ **/
+static int cmm_memory_isolate_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ if (action == MEM_ISOLATE_COUNT)
+ ret = cmm_count_pages(arg);
+
+ return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_isolate_nb = {
+ .notifier_call = cmm_memory_isolate_cb,
+ .priority = CMM_MEM_ISOLATE_PRI
+};
+
+/**
+ * cmm_mem_going_offline - Unloan pages where memory is to be removed
+ * @arg: memory_notify structure with page range to be offlined
+ *
+ * Return value:
+ * 0 on success
+ **/
+static int cmm_mem_going_offline(void *arg)
+{
+ struct memory_notify *marg = arg;
+ unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn);
+ unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT);
+ struct cmm_page_array *pa_curr, *pa_last, *npa;
+ unsigned long idx;
+ unsigned long freed = 0;
+
+ cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n",
+ start_page, marg->nr_pages);
+ spin_lock(&cmm_lock);
+
+ /* Search the page list for pages in the range to be offlined */
+ pa_last = pa_curr = cmm_page_list;
+ while (pa_curr) {
+ for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) {
+ if ((pa_curr->page[idx] < start_page) ||
+ (pa_curr->page[idx] >= end_page))
+ continue;
+
+ plpar_page_set_active(__pa(pa_curr->page[idx]));
+ free_page(pa_curr->page[idx]);
+ freed++;
+ loaned_pages--;
+ totalram_pages++;
+ pa_curr->page[idx] = pa_last->page[--pa_last->index];
+ if (pa_last->index == 0) {
+ if (pa_curr == pa_last)
+ pa_curr = pa_last->next;
+ pa_last = pa_last->next;
+ free_page((unsigned long)cmm_page_list);
+ cmm_page_list = pa_last;
+ continue;
+ }
+ }
+ pa_curr = pa_curr->next;
+ }
+
+ /* Search for page list structures in the range to be offlined */
+ pa_last = NULL;
+ pa_curr = cmm_page_list;
+ while (pa_curr) {
+ if (((unsigned long)pa_curr >= start_page) &&
+ ((unsigned long)pa_curr < end_page)) {
+ npa = (struct cmm_page_array *)__get_free_page(
+ GFP_NOIO | __GFP_NOWARN |
+ __GFP_NORETRY | __GFP_NOMEMALLOC);
+ if (!npa) {
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Failed to allocate memory for list "
+ "management. Memory hotplug "
+ "failed.\n");
+ return ENOMEM;
+ }
+ memcpy(npa, pa_curr, PAGE_SIZE);
+ if (pa_curr == cmm_page_list)
+ cmm_page_list = npa;
+ if (pa_last)
+ pa_last->next = npa;
+ free_page((unsigned long) pa_curr);
+ freed++;
+ pa_curr = npa;
+ }
+
+ pa_last = pa_curr;
+ pa_curr = pa_curr->next;
+ }
+
+ spin_unlock(&cmm_lock);
+ cmm_dbg("Released %ld pages in the search range.\n", freed);
+
+ return 0;
+}
+
+/**
+ * cmm_memory_cb - Handle memory hotplug notifier calls
+ * @self: notifier block struct
+ * @action: action to take
+ * @arg: struct memory_notify data for handler
+ *
+ * Return value:
+ * NOTIFY_OK or notifier error based on subfunction return value
+ *
+ **/
+static int cmm_memory_cb(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_OFFLINE:
+ mutex_lock(&hotplug_mutex);
+ hotplug_occurred = 1;
+ ret = cmm_mem_going_offline(arg);
+ break;
+ case MEM_OFFLINE:
+ case MEM_CANCEL_OFFLINE:
+ mutex_unlock(&hotplug_mutex);
+ cmm_dbg("Memory offline operation complete.\n");
+ break;
+ case MEM_GOING_ONLINE:
+ case MEM_ONLINE:
+ case MEM_CANCEL_ONLINE:
+ break;
+ }
+
+ return notifier_from_errno(ret);
+}
+
+static struct notifier_block cmm_mem_nb = {
+ .notifier_call = cmm_memory_cb,
+ .priority = CMM_MEM_HOTPLUG_PRI
+};
+
+/**
+ * cmm_init - Module initialization
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int cmm_init(void)
+{
+ int rc = -ENOMEM;
+
+ if (!firmware_has_feature(FW_FEATURE_CMO))
+ return -EOPNOTSUPP;
+
+ if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0)
+ return rc;
+
+ if ((rc = register_reboot_notifier(&cmm_reboot_nb)))
+ goto out_oom_notifier;
+
+ if ((rc = cmm_sysfs_register(&cmm_dev)))
+ goto out_reboot_notifier;
+
+ if (register_memory_notifier(&cmm_mem_nb) ||
+ register_memory_isolate_notifier(&cmm_mem_isolate_nb))
+ goto out_unregister_notifier;
+
+ if (cmm_disabled)
+ return rc;
+
+ cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+ if (IS_ERR(cmm_thread_ptr)) {
+ rc = PTR_ERR(cmm_thread_ptr);
+ goto out_unregister_notifier;
+ }
+
+ return rc;
+
+out_unregister_notifier:
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
+ cmm_unregister_sysfs(&cmm_dev);
+out_reboot_notifier:
+ unregister_reboot_notifier(&cmm_reboot_nb);
+out_oom_notifier:
+ unregister_oom_notifier(&cmm_oom_nb);
+ return rc;
+}
+
+/**
+ * cmm_exit - Module exit
+ *
+ * Return value:
+ * nothing
+ **/
+static void cmm_exit(void)
+{
+ if (cmm_thread_ptr)
+ kthread_stop(cmm_thread_ptr);
+ unregister_oom_notifier(&cmm_oom_nb);
+ unregister_reboot_notifier(&cmm_reboot_nb);
+ unregister_memory_notifier(&cmm_mem_nb);
+ unregister_memory_isolate_notifier(&cmm_mem_isolate_nb);
+ cmm_free_pages(loaned_pages);
+ cmm_unregister_sysfs(&cmm_dev);
+}
+
+/**
+ * cmm_set_disable - Disable/Enable CMM
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int cmm_set_disable(const char *val, struct kernel_param *kp)
+{
+ int disable = simple_strtoul(val, NULL, 10);
+
+ if (disable != 0 && disable != 1)
+ return -EINVAL;
+
+ if (disable && !cmm_disabled) {
+ if (cmm_thread_ptr)
+ kthread_stop(cmm_thread_ptr);
+ cmm_thread_ptr = NULL;
+ cmm_free_pages(loaned_pages);
+ } else if (!disable && cmm_disabled) {
+ cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
+ if (IS_ERR(cmm_thread_ptr))
+ return PTR_ERR(cmm_thread_ptr);
+ }
+
+ cmm_disabled = disable;
+ return 0;
+}
+
+module_param_call(disable, cmm_set_disable, param_get_uint,
+ &cmm_disabled, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. "
+ "[Default=" __stringify(CMM_DISABLE) "]");
+
+module_init(cmm_init);
+module_exit(cmm_exit);
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
new file mode 100644
index 00000000000..0f1b706506e
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -0,0 +1,565 @@
+/*
+ * Support for dynamic reconfiguration for PCI, Memory, and CPU
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms.
+ *
+ * Copyright (C) 2009 Nathan Fontenot
+ * Copyright (C) 2009 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include "offline_states.h"
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+
+struct cc_workarea {
+ u32 drc_index;
+ u32 zero;
+ u32 name_offset;
+ u32 prop_length;
+ u32 prop_offset;
+};
+
+void dlpar_free_cc_property(struct property *prop)
+{
+ kfree(prop->name);
+ kfree(prop->value);
+ kfree(prop);
+}
+
+static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa)
+{
+ struct property *prop;
+ char *name;
+ char *value;
+
+ prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+ if (!prop)
+ return NULL;
+
+ name = (char *)ccwa + ccwa->name_offset;
+ prop->name = kstrdup(name, GFP_KERNEL);
+
+ prop->length = ccwa->prop_length;
+ value = (char *)ccwa + ccwa->prop_offset;
+ prop->value = kmemdup(value, prop->length, GFP_KERNEL);
+ if (!prop->value) {
+ dlpar_free_cc_property(prop);
+ return NULL;
+ }
+
+ return prop;
+}
+
+static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa)
+{
+ struct device_node *dn;
+ char *name;
+
+ dn = kzalloc(sizeof(*dn), GFP_KERNEL);
+ if (!dn)
+ return NULL;
+
+ /* The configure connector reported name does not contain a
+ * preceding '/', so we allocate a buffer large enough to
+ * prepend this to the full_name.
+ */
+ name = (char *)ccwa + ccwa->name_offset;
+ dn->full_name = kasprintf(GFP_KERNEL, "/%s", name);
+ if (!dn->full_name) {
+ kfree(dn);
+ return NULL;
+ }
+
+ return dn;
+}
+
+static void dlpar_free_one_cc_node(struct device_node *dn)
+{
+ struct property *prop;
+
+ while (dn->properties) {
+ prop = dn->properties;
+ dn->properties = prop->next;
+ dlpar_free_cc_property(prop);
+ }
+
+ kfree(dn->full_name);
+ kfree(dn);
+}
+
+void dlpar_free_cc_nodes(struct device_node *dn)
+{
+ if (dn->child)
+ dlpar_free_cc_nodes(dn->child);
+
+ if (dn->sibling)
+ dlpar_free_cc_nodes(dn->sibling);
+
+ dlpar_free_one_cc_node(dn);
+}
+
+#define COMPLETE 0
+#define NEXT_SIBLING 1
+#define NEXT_CHILD 2
+#define NEXT_PROPERTY 3
+#define PREV_PARENT 4
+#define MORE_MEMORY 5
+#define CALL_AGAIN -2
+#define ERR_CFG_USE -9003
+
+struct device_node *dlpar_configure_connector(u32 drc_index)
+{
+ struct device_node *dn;
+ struct device_node *first_dn = NULL;
+ struct device_node *last_dn = NULL;
+ struct property *property;
+ struct property *last_property = NULL;
+ struct cc_workarea *ccwa;
+ char *data_buf;
+ int cc_token;
+ int rc = -1;
+
+ cc_token = rtas_token("ibm,configure-connector");
+ if (cc_token == RTAS_UNKNOWN_SERVICE)
+ return NULL;
+
+ data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!data_buf)
+ return NULL;
+
+ ccwa = (struct cc_workarea *)&data_buf[0];
+ ccwa->drc_index = drc_index;
+ ccwa->zero = 0;
+
+ do {
+ /* Since we release the rtas_data_buf lock between configure
+ * connector calls we want to re-populate the rtas_data_buffer
+ * with the contents of the previous call.
+ */
+ spin_lock(&rtas_data_buf_lock);
+
+ memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE);
+ rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
+ memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+ spin_unlock(&rtas_data_buf_lock);
+
+ switch (rc) {
+ case COMPLETE:
+ break;
+
+ case NEXT_SIBLING:
+ dn = dlpar_parse_cc_node(ccwa);
+ if (!dn)
+ goto cc_error;
+
+ dn->parent = last_dn->parent;
+ last_dn->sibling = dn;
+ last_dn = dn;
+ break;
+
+ case NEXT_CHILD:
+ dn = dlpar_parse_cc_node(ccwa);
+ if (!dn)
+ goto cc_error;
+
+ if (!first_dn)
+ first_dn = dn;
+ else {
+ dn->parent = last_dn;
+ if (last_dn)
+ last_dn->child = dn;
+ }
+
+ last_dn = dn;
+ break;
+
+ case NEXT_PROPERTY:
+ property = dlpar_parse_cc_property(ccwa);
+ if (!property)
+ goto cc_error;
+
+ if (!last_dn->properties)
+ last_dn->properties = property;
+ else
+ last_property->next = property;
+
+ last_property = property;
+ break;
+
+ case PREV_PARENT:
+ last_dn = last_dn->parent;
+ break;
+
+ case CALL_AGAIN:
+ break;
+
+ case MORE_MEMORY:
+ case ERR_CFG_USE:
+ default:
+ printk(KERN_ERR "Unexpected Error (%d) "
+ "returned from configure-connector\n", rc);
+ goto cc_error;
+ }
+ } while (rc);
+
+cc_error:
+ kfree(data_buf);
+
+ if (rc) {
+ if (first_dn)
+ dlpar_free_cc_nodes(first_dn);
+
+ return NULL;
+ }
+
+ return first_dn;
+}
+
+static struct device_node *derive_parent(const char *path)
+{
+ struct device_node *parent;
+ char *last_slash;
+
+ last_slash = strrchr(path, '/');
+ if (last_slash == path) {
+ parent = of_find_node_by_path("/");
+ } else {
+ char *parent_path;
+ int parent_path_len = last_slash - path + 1;
+ parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+ if (!parent_path)
+ return NULL;
+
+ strlcpy(parent_path, path, parent_path_len);
+ parent = of_find_node_by_path(parent_path);
+ kfree(parent_path);
+ }
+
+ return parent;
+}
+
+int dlpar_attach_node(struct device_node *dn)
+{
+#ifdef CONFIG_PROC_DEVICETREE
+ struct proc_dir_entry *ent;
+#endif
+ int rc;
+
+ of_node_set_flag(dn, OF_DYNAMIC);
+ kref_init(&dn->kref);
+ dn->parent = derive_parent(dn->full_name);
+ if (!dn->parent)
+ return -ENOMEM;
+
+ rc = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, dn);
+ if (rc) {
+ printk(KERN_ERR "Failed to add device node %s\n",
+ dn->full_name);
+ return rc;
+ }
+
+ of_attach_node(dn);
+
+#ifdef CONFIG_PROC_DEVICETREE
+ ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde);
+ if (ent)
+ proc_device_tree_add_node(dn, ent);
+#endif
+
+ of_node_put(dn->parent);
+ return 0;
+}
+
+int dlpar_detach_node(struct device_node *dn)
+{
+#ifdef CONFIG_PROC_DEVICETREE
+ struct device_node *parent = dn->parent;
+ struct property *prop = dn->properties;
+
+ while (prop) {
+ remove_proc_entry(prop->name, dn->pde);
+ prop = prop->next;
+ }
+
+ if (dn->pde)
+ remove_proc_entry(dn->pde->name, parent->pde);
+#endif
+
+ pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, dn);
+ of_detach_node(dn);
+ of_node_put(dn); /* Must decrement the refcount */
+
+ return 0;
+}
+
+#define DR_ENTITY_SENSE 9003
+#define DR_ENTITY_PRESENT 1
+#define DR_ENTITY_UNUSABLE 2
+#define ALLOCATION_STATE 9003
+#define ALLOC_UNUSABLE 0
+#define ALLOC_USABLE 1
+#define ISOLATION_STATE 9001
+#define ISOLATE 0
+#define UNISOLATE 1
+
+int dlpar_acquire_drc(u32 drc_index)
+{
+ int dr_status, rc;
+
+ rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+ DR_ENTITY_SENSE, drc_index);
+ if (rc || dr_status != DR_ENTITY_UNUSABLE)
+ return -1;
+
+ rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
+ if (rc)
+ return rc;
+
+ rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+ if (rc) {
+ rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+ return rc;
+ }
+
+ return 0;
+}
+
+int dlpar_release_drc(u32 drc_index)
+{
+ int dr_status, rc;
+
+ rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+ DR_ENTITY_SENSE, drc_index);
+ if (rc || dr_status != DR_ENTITY_PRESENT)
+ return -1;
+
+ rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
+ if (rc)
+ return rc;
+
+ rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+ if (rc) {
+ rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+ return rc;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+
+static int dlpar_online_cpu(struct device_node *dn)
+{
+ int rc = 0;
+ unsigned int cpu;
+ int len, nthreads, i;
+ const u32 *intserv;
+
+ intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return -EINVAL;
+
+ nthreads = len / sizeof(u32);
+
+ cpu_maps_update_begin();
+ for (i = 0; i < nthreads; i++) {
+ for_each_present_cpu(cpu) {
+ if (get_hard_smp_processor_id(cpu) != intserv[i])
+ continue;
+ BUG_ON(get_cpu_current_state(cpu)
+ != CPU_STATE_OFFLINE);
+ cpu_maps_update_done();
+ rc = cpu_up(cpu);
+ if (rc)
+ goto out;
+ cpu_maps_update_begin();
+
+ break;
+ }
+ if (cpu == num_possible_cpus())
+ printk(KERN_WARNING "Could not find cpu to online "
+ "with physical id 0x%x\n", intserv[i]);
+ }
+ cpu_maps_update_done();
+
+out:
+ return rc;
+
+}
+
+static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
+{
+ struct device_node *dn;
+ unsigned long drc_index;
+ char *cpu_name;
+ int rc;
+
+ cpu_hotplug_driver_lock();
+ rc = strict_strtoul(buf, 0, &drc_index);
+ if (rc) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ dn = dlpar_configure_connector(drc_index);
+ if (!dn) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* configure-connector reports cpus as living in the base
+ * directory of the device tree. CPUs actually live in the
+ * cpus directory so we need to fixup the full_name.
+ */
+ cpu_name = kasprintf(GFP_KERNEL, "/cpus%s", dn->full_name);
+ if (!cpu_name) {
+ dlpar_free_cc_nodes(dn);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ kfree(dn->full_name);
+ dn->full_name = cpu_name;
+
+ rc = dlpar_acquire_drc(drc_index);
+ if (rc) {
+ dlpar_free_cc_nodes(dn);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = dlpar_attach_node(dn);
+ if (rc) {
+ dlpar_release_drc(drc_index);
+ dlpar_free_cc_nodes(dn);
+ goto out;
+ }
+
+ rc = dlpar_online_cpu(dn);
+out:
+ cpu_hotplug_driver_unlock();
+
+ return rc ? rc : count;
+}
+
+static int dlpar_offline_cpu(struct device_node *dn)
+{
+ int rc = 0;
+ unsigned int cpu;
+ int len, nthreads, i;
+ const u32 *intserv;
+
+ intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return -EINVAL;
+
+ nthreads = len / sizeof(u32);
+
+ cpu_maps_update_begin();
+ for (i = 0; i < nthreads; i++) {
+ for_each_present_cpu(cpu) {
+ if (get_hard_smp_processor_id(cpu) != intserv[i])
+ continue;
+
+ if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
+ break;
+
+ if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
+ set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+ cpu_maps_update_done();
+ rc = cpu_down(cpu);
+ if (rc)
+ goto out;
+ cpu_maps_update_begin();
+ break;
+
+ }
+
+ /*
+ * The cpu is in CPU_STATE_INACTIVE.
+ * Upgrade it's state to CPU_STATE_OFFLINE.
+ */
+ set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+ BUG_ON(plpar_hcall_norets(H_PROD, intserv[i])
+ != H_SUCCESS);
+ __cpu_die(cpu);
+ break;
+ }
+ if (cpu == num_possible_cpus())
+ printk(KERN_WARNING "Could not find cpu to offline "
+ "with physical id 0x%x\n", intserv[i]);
+ }
+ cpu_maps_update_done();
+
+out:
+ return rc;
+
+}
+
+static ssize_t dlpar_cpu_release(const char *buf, size_t count)
+{
+ struct device_node *dn;
+ const u32 *drc_index;
+ int rc;
+
+ dn = of_find_node_by_path(buf);
+ if (!dn)
+ return -EINVAL;
+
+ drc_index = of_get_property(dn, "ibm,my-drc-index", NULL);
+ if (!drc_index) {
+ of_node_put(dn);
+ return -EINVAL;
+ }
+
+ cpu_hotplug_driver_lock();
+ rc = dlpar_offline_cpu(dn);
+ if (rc) {
+ of_node_put(dn);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = dlpar_release_drc(*drc_index);
+ if (rc) {
+ of_node_put(dn);
+ goto out;
+ }
+
+ rc = dlpar_detach_node(dn);
+ if (rc) {
+ dlpar_acquire_drc(*drc_index);
+ goto out;
+ }
+
+ of_node_put(dn);
+out:
+ cpu_hotplug_driver_unlock();
+ return rc ? rc : count;
+}
+
+static int __init pseries_dlpar_init(void)
+{
+ ppc_md.cpu_probe = dlpar_cpu_probe;
+ ppc_md.cpu_release = dlpar_cpu_release;
+
+ return 0;
+}
+machine_device_initcall(pseries, pseries_dlpar_init);
+
+#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
new file mode 100644
index 00000000000..0e865637006
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -0,0 +1,396 @@
+/*
+ * Virtual Processor Dispatch Trace Log
+ *
+ * (C) Copyright IBM Corporation 2009
+ *
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/spinlock.h>
+#include <asm/smp.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/firmware.h>
+#include <asm/lppaca.h>
+
+#include "plpar_wrappers.h"
+
+struct dtl {
+ struct dtl_entry *buf;
+ struct dentry *file;
+ int cpu;
+ int buf_entries;
+ u64 last_idx;
+ spinlock_t lock;
+};
+static DEFINE_PER_CPU(struct dtl, cpu_dtl);
+
+/*
+ * Dispatch trace log event mask:
+ * 0x7: 0x1: voluntary virtual processor waits
+ * 0x2: time-slice preempts
+ * 0x4: virtual partition memory page faults
+ */
+static u8 dtl_event_mask = 0x7;
+
+
+/*
+ * Size of per-cpu log buffers. Firmware requires that the buffer does
+ * not cross a 4k boundary.
+ */
+static int dtl_buf_entries = N_DISPATCH_LOG;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+struct dtl_ring {
+ u64 write_index;
+ struct dtl_entry *write_ptr;
+ struct dtl_entry *buf;
+ struct dtl_entry *buf_end;
+ u8 saved_dtl_mask;
+};
+
+static DEFINE_PER_CPU(struct dtl_ring, dtl_rings);
+
+static atomic_t dtl_count;
+
+/*
+ * The cpu accounting code controls the DTL ring buffer, and we get
+ * given entries as they are processed.
+ */
+static void consume_dtle(struct dtl_entry *dtle, u64 index)
+{
+ struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings);
+ struct dtl_entry *wp = dtlr->write_ptr;
+ struct lppaca *vpa = local_paca->lppaca_ptr;
+
+ if (!wp)
+ return;
+
+ *wp = *dtle;
+ barrier();
+
+ /* check for hypervisor ring buffer overflow, ignore this entry if so */
+ if (index + N_DISPATCH_LOG < vpa->dtl_idx)
+ return;
+
+ ++wp;
+ if (wp == dtlr->buf_end)
+ wp = dtlr->buf;
+ dtlr->write_ptr = wp;
+
+ /* incrementing write_index makes the new entry visible */
+ smp_wmb();
+ ++dtlr->write_index;
+}
+
+static int dtl_start(struct dtl *dtl)
+{
+ struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+ dtlr->buf = dtl->buf;
+ dtlr->buf_end = dtl->buf + dtl->buf_entries;
+ dtlr->write_index = 0;
+
+ /* setting write_ptr enables logging into our buffer */
+ smp_wmb();
+ dtlr->write_ptr = dtl->buf;
+
+ /* enable event logging */
+ dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask;
+ lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask;
+
+ dtl_consumer = consume_dtle;
+ atomic_inc(&dtl_count);
+ return 0;
+}
+
+static void dtl_stop(struct dtl *dtl)
+{
+ struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu);
+
+ dtlr->write_ptr = NULL;
+ smp_wmb();
+
+ dtlr->buf = NULL;
+
+ /* restore dtl_enable_mask */
+ lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask;
+
+ if (atomic_dec_and_test(&dtl_count))
+ dtl_consumer = NULL;
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+ return per_cpu(dtl_rings, dtl->cpu).write_index;
+}
+
+#else /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+static int dtl_start(struct dtl *dtl)
+{
+ unsigned long addr;
+ int ret, hwcpu;
+
+ /* Register our dtl buffer with the hypervisor. The HV expects the
+ * buffer size to be passed in the second word of the buffer */
+ ((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES;
+
+ hwcpu = get_hard_smp_processor_id(dtl->cpu);
+ addr = __pa(dtl->buf);
+ ret = register_dtl(hwcpu, addr);
+ if (ret) {
+ printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) "
+ "failed with %d\n", __func__, dtl->cpu, hwcpu, ret);
+ return -EIO;
+ }
+
+ /* set our initial buffer indices */
+ lppaca_of(dtl->cpu).dtl_idx = 0;
+
+ /* ensure that our updates to the lppaca fields have occurred before
+ * we actually enable the logging */
+ smp_wmb();
+
+ /* enable event logging */
+ lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask;
+
+ return 0;
+}
+
+static void dtl_stop(struct dtl *dtl)
+{
+ int hwcpu = get_hard_smp_processor_id(dtl->cpu);
+
+ lppaca_of(dtl->cpu).dtl_enable_mask = 0x0;
+
+ unregister_dtl(hwcpu);
+}
+
+static u64 dtl_current_index(struct dtl *dtl)
+{
+ return lppaca_of(dtl->cpu).dtl_idx;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+static int dtl_enable(struct dtl *dtl)
+{
+ long int n_entries;
+ long int rc;
+ struct dtl_entry *buf = NULL;
+
+ if (!dtl_cache)
+ return -ENOMEM;
+
+ /* only allow one reader */
+ if (dtl->buf)
+ return -EBUSY;
+
+ n_entries = dtl_buf_entries;
+ buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu));
+ if (!buf) {
+ printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n",
+ __func__, dtl->cpu);
+ return -ENOMEM;
+ }
+
+ spin_lock(&dtl->lock);
+ rc = -EBUSY;
+ if (!dtl->buf) {
+ /* store the original allocation size for use during read */
+ dtl->buf_entries = n_entries;
+ dtl->buf = buf;
+ dtl->last_idx = 0;
+ rc = dtl_start(dtl);
+ if (rc)
+ dtl->buf = NULL;
+ }
+ spin_unlock(&dtl->lock);
+
+ if (rc)
+ kmem_cache_free(dtl_cache, buf);
+ return rc;
+}
+
+static void dtl_disable(struct dtl *dtl)
+{
+ spin_lock(&dtl->lock);
+ dtl_stop(dtl);
+ kmem_cache_free(dtl_cache, dtl->buf);
+ dtl->buf = NULL;
+ dtl->buf_entries = 0;
+ spin_unlock(&dtl->lock);
+}
+
+/* file interface */
+
+static int dtl_file_open(struct inode *inode, struct file *filp)
+{
+ struct dtl *dtl = inode->i_private;
+ int rc;
+
+ rc = dtl_enable(dtl);
+ if (rc)
+ return rc;
+
+ filp->private_data = dtl;
+ return 0;
+}
+
+static int dtl_file_release(struct inode *inode, struct file *filp)
+{
+ struct dtl *dtl = inode->i_private;
+ dtl_disable(dtl);
+ return 0;
+}
+
+static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
+ loff_t *pos)
+{
+ long int rc, n_read, n_req, read_size;
+ struct dtl *dtl;
+ u64 cur_idx, last_idx, i;
+
+ if ((len % sizeof(struct dtl_entry)) != 0)
+ return -EINVAL;
+
+ dtl = filp->private_data;
+
+ /* requested number of entries to read */
+ n_req = len / sizeof(struct dtl_entry);
+
+ /* actual number of entries read */
+ n_read = 0;
+
+ spin_lock(&dtl->lock);
+
+ cur_idx = dtl_current_index(dtl);
+ last_idx = dtl->last_idx;
+
+ if (last_idx + dtl->buf_entries <= cur_idx)
+ last_idx = cur_idx - dtl->buf_entries + 1;
+
+ if (last_idx + n_req > cur_idx)
+ n_req = cur_idx - last_idx;
+
+ if (n_req > 0)
+ dtl->last_idx = last_idx + n_req;
+
+ spin_unlock(&dtl->lock);
+
+ if (n_req <= 0)
+ return 0;
+
+ i = last_idx % dtl->buf_entries;
+
+ /* read the tail of the buffer if we've wrapped */
+ if (i + n_req > dtl->buf_entries) {
+ read_size = dtl->buf_entries - i;
+
+ rc = copy_to_user(buf, &dtl->buf[i],
+ read_size * sizeof(struct dtl_entry));
+ if (rc)
+ return -EFAULT;
+
+ i = 0;
+ n_req -= read_size;
+ n_read += read_size;
+ buf += read_size * sizeof(struct dtl_entry);
+ }
+
+ /* .. and now the head */
+ rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry));
+ if (rc)
+ return -EFAULT;
+
+ n_read += n_req;
+
+ return n_read * sizeof(struct dtl_entry);
+}
+
+static const struct file_operations dtl_fops = {
+ .open = dtl_file_open,
+ .release = dtl_file_release,
+ .read = dtl_file_read,
+ .llseek = no_llseek,
+};
+
+static struct dentry *dtl_dir;
+
+static int dtl_setup_file(struct dtl *dtl)
+{
+ char name[10];
+
+ sprintf(name, "cpu-%d", dtl->cpu);
+
+ dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops);
+ if (!dtl->file)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int dtl_init(void)
+{
+ struct dentry *event_mask_file, *buf_entries_file;
+ int rc, i;
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+ return -ENODEV;
+
+ /* set up common debugfs structure */
+
+ rc = -ENOMEM;
+ dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root);
+ if (!dtl_dir) {
+ printk(KERN_WARNING "%s: can't create dtl root dir\n",
+ __func__);
+ goto err;
+ }
+
+ event_mask_file = debugfs_create_x8("dtl_event_mask", 0600,
+ dtl_dir, &dtl_event_mask);
+ buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400,
+ dtl_dir, &dtl_buf_entries);
+
+ if (!event_mask_file || !buf_entries_file) {
+ printk(KERN_WARNING "%s: can't create dtl files\n", __func__);
+ goto err_remove_dir;
+ }
+
+ /* set up the per-cpu log structures */
+ for_each_possible_cpu(i) {
+ struct dtl *dtl = &per_cpu(cpu_dtl, i);
+ spin_lock_init(&dtl->lock);
+ dtl->cpu = i;
+
+ rc = dtl_setup_file(dtl);
+ if (rc)
+ goto err_remove_dir;
+ }
+
+ return 0;
+
+err_remove_dir:
+ debugfs_remove_recursive(dtl_dir);
+err:
+ return rc;
+}
+arch_initcall(dtl_init);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
new file mode 100644
index 00000000000..c0b40af4ce4
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -0,0 +1,1346 @@
+/*
+ * eeh.c
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/sched.h> /* for init_mm */
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+
+#include <linux/atomic.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+
+/** Overview:
+ * EEH, or "Extended Error Handling" is a PCI bridge technology for
+ * dealing with PCI bus errors that can't be dealt with within the
+ * usual PCI framework, except by check-stopping the CPU. Systems
+ * that are designed for high-availability/reliability cannot afford
+ * to crash due to a "mere" PCI error, thus the need for EEH.
+ * An EEH-capable bridge operates by converting a detected error
+ * into a "slot freeze", taking the PCI adapter off-line, making
+ * the slot behave, from the OS'es point of view, as if the slot
+ * were "empty": all reads return 0xff's and all writes are silently
+ * ignored. EEH slot isolation events can be triggered by parity
+ * errors on the address or data busses (e.g. during posted writes),
+ * which in turn might be caused by low voltage on the bus, dust,
+ * vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ * Note, however, that one of the leading causes of EEH slot
+ * freeze events are buggy device drivers, buggy device microcode,
+ * or buggy device hardware. This is because any attempt by the
+ * device to bus-master data to a memory address that is not
+ * assigned to the device will trigger a slot freeze. (The idea
+ * is to prevent devices-gone-wild from corrupting system memory).
+ * Buggy hardware/drivers will have a miserable time co-existing
+ * with EEH.
+ *
+ * Ideally, a PCI device driver, when suspecting that an isolation
+ * event has occurred (e.g. by reading 0xff's), will then ask EEH
+ * whether this is the case, and then take appropriate steps to
+ * reset the PCI slot, the PCI device, and then resume operations.
+ * However, until that day, the checking is done here, with the
+ * eeh_check_failure() routine embedded in the MMIO macros. If
+ * the slot is found to be isolated, an "EEH Event" is synthesized
+ * and sent out for processing.
+ */
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event, it might be broken.
+ * This sets the threshold for how many read attempts we allow
+ * before printing an error message.
+ */
+#define EEH_MAX_FAILS 2100000
+
+/* Time to wait for a PCI slot to report status, in milliseconds */
+#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
+
+/* RTAS tokens */
+static int ibm_set_eeh_option;
+static int ibm_set_slot_reset;
+static int ibm_read_slot_reset_state;
+static int ibm_read_slot_reset_state2;
+static int ibm_slot_error_detail;
+static int ibm_get_config_addr_info;
+static int ibm_get_config_addr_info2;
+static int ibm_configure_bridge;
+static int ibm_configure_pe;
+
+int eeh_subsystem_enabled;
+EXPORT_SYMBOL(eeh_subsystem_enabled);
+
+/* Lock to avoid races due to multiple reports of an error */
+static DEFINE_RAW_SPINLOCK(confirm_error_lock);
+
+/* Buffer for reporting slot-error-detail rtas calls. Its here
+ * in BSS, and not dynamically alloced, so that it ends up in
+ * RMO where RTAS can access it.
+ */
+static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(slot_errbuf_lock);
+static int eeh_error_buf_size;
+
+/* Buffer for reporting pci register dumps. Its here in BSS, and
+ * not dynamically alloced, so that it ends up in RMO where RTAS
+ * can access it.
+ */
+#define EEH_PCI_REGS_LOG_LEN 4096
+static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
+
+/* System monitoring statistics */
+static unsigned long no_device;
+static unsigned long no_dn;
+static unsigned long no_cfg_addr;
+static unsigned long ignored_check;
+static unsigned long total_mmio_ffs;
+static unsigned long false_positives;
+static unsigned long slot_resets;
+
+#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
+
+/* --------------------------------------------------------------- */
+/* Below lies the EEH event infrastructure */
+
+static void rtas_slot_error_detail(struct pci_dn *pdn, int severity,
+ char *driver_log, size_t loglen)
+{
+ int config_addr;
+ unsigned long flags;
+ int rc;
+
+ /* Log the error with the rtas logger */
+ spin_lock_irqsave(&slot_errbuf_lock, flags);
+ memset(slot_errbuf, 0, eeh_error_buf_size);
+
+ /* Use PE configuration address, if present */
+ config_addr = pdn->eeh_config_addr;
+ if (pdn->eeh_pe_config_addr)
+ config_addr = pdn->eeh_pe_config_addr;
+
+ rc = rtas_call(ibm_slot_error_detail,
+ 8, 1, NULL, config_addr,
+ BUID_HI(pdn->phb->buid),
+ BUID_LO(pdn->phb->buid),
+ virt_to_phys(driver_log), loglen,
+ virt_to_phys(slot_errbuf),
+ eeh_error_buf_size,
+ severity);
+
+ if (rc == 0)
+ log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+ spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+}
+
+/**
+ * gather_pci_data - copy assorted PCI config space registers to buff
+ * @pdn: device to report data for
+ * @buf: point to buffer in which to log
+ * @len: amount of room in buffer
+ *
+ * This routine captures assorted PCI configuration space data,
+ * and puts them into a buffer for RTAS error logging.
+ */
+static size_t gather_pci_data(struct pci_dn *pdn, char * buf, size_t len)
+{
+ struct pci_dev *dev = pdn->pcidev;
+ u32 cfg;
+ int cap, i;
+ int n = 0;
+
+ n += scnprintf(buf+n, len-n, "%s\n", pdn->node->full_name);
+ printk(KERN_WARNING "EEH: of node=%s\n", pdn->node->full_name);
+
+ rtas_read_config(pdn, PCI_VENDOR_ID, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg);
+
+ rtas_read_config(pdn, PCI_COMMAND, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg);
+
+ if (!dev) {
+ printk(KERN_WARNING "EEH: no PCI device for this of node\n");
+ return n;
+ }
+
+ /* Gather bridge-specific registers */
+ if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
+ rtas_read_config(pdn, PCI_SEC_STATUS, 2, &cfg);
+ n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
+ printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg);
+
+ rtas_read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg);
+ n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
+ printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg);
+ }
+
+ /* Dump out the PCI-X command and status regs */
+ cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+ if (cap) {
+ rtas_read_config(pdn, cap, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg);
+
+ rtas_read_config(pdn, cap+4, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg);
+ }
+
+ /* If PCI-E capable, dump PCI-E cap 10, and the AER */
+ cap = pci_find_capability(dev, PCI_CAP_ID_EXP);
+ if (cap) {
+ n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
+ printk(KERN_WARNING
+ "EEH: PCI-E capabilities and status follow:\n");
+
+ for (i=0; i<=8; i++) {
+ rtas_read_config(pdn, cap+4*i, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+ printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg);
+ }
+
+ cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+ if (cap) {
+ n += scnprintf(buf+n, len-n, "pci-e AER:\n");
+ printk(KERN_WARNING
+ "EEH: PCI-E AER capability register set follows:\n");
+
+ for (i=0; i<14; i++) {
+ rtas_read_config(pdn, cap+4*i, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+ printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg);
+ }
+ }
+ }
+
+ /* Gather status on devices under the bridge */
+ if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
+ struct device_node *dn;
+
+ for_each_child_of_node(pdn->node, dn) {
+ pdn = PCI_DN(dn);
+ if (pdn)
+ n += gather_pci_data(pdn, buf+n, len-n);
+ }
+ }
+
+ return n;
+}
+
+void eeh_slot_error_detail(struct pci_dn *pdn, int severity)
+{
+ size_t loglen = 0;
+ pci_regs_buf[0] = 0;
+
+ rtas_pci_enable(pdn, EEH_THAW_MMIO);
+ rtas_configure_bridge(pdn);
+ eeh_restore_bars(pdn);
+ loglen = gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN);
+
+ rtas_slot_error_detail(pdn, severity, pci_regs_buf, loglen);
+}
+
+/**
+ * read_slot_reset_state - Read the reset state of a device node's slot
+ * @dn: device node to read
+ * @rets: array to return results in
+ */
+static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
+{
+ int token, outputs;
+ int config_addr;
+
+ if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
+ token = ibm_read_slot_reset_state2;
+ outputs = 4;
+ } else {
+ token = ibm_read_slot_reset_state;
+ rets[2] = 0; /* fake PE Unavailable info */
+ outputs = 3;
+ }
+
+ /* Use PE configuration address, if present */
+ config_addr = pdn->eeh_config_addr;
+ if (pdn->eeh_pe_config_addr)
+ config_addr = pdn->eeh_pe_config_addr;
+
+ return rtas_call(token, 3, outputs, rets, config_addr,
+ BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid));
+}
+
+/**
+ * eeh_wait_for_slot_status - returns error status of slot
+ * @pdn pci device node
+ * @max_wait_msecs maximum number to millisecs to wait
+ *
+ * Return negative value if a permanent error, else return
+ * Partition Endpoint (PE) status value.
+ *
+ * If @max_wait_msecs is positive, then this routine will
+ * sleep until a valid status can be obtained, or until
+ * the max allowed wait time is exceeded, in which case
+ * a -2 is returned.
+ */
+int
+eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs)
+{
+ int rc;
+ int rets[3];
+ int mwait;
+
+ while (1) {
+ rc = read_slot_reset_state(pdn, rets);
+ if (rc) return rc;
+ if (rets[1] == 0) return -1; /* EEH is not supported */
+
+ if (rets[0] != 5) return rets[0]; /* return actual status */
+
+ if (rets[2] == 0) return -1; /* permanently unavailable */
+
+ if (max_wait_msecs <= 0) break;
+
+ mwait = rets[2];
+ if (mwait <= 0) {
+ printk (KERN_WARNING
+ "EEH: Firmware returned bad wait value=%d\n", mwait);
+ mwait = 1000;
+ } else if (mwait > 300*1000) {
+ printk (KERN_WARNING
+ "EEH: Firmware is taking too long, time=%d\n", mwait);
+ mwait = 300*1000;
+ }
+ max_wait_msecs -= mwait;
+ msleep (mwait);
+ }
+
+ printk(KERN_WARNING "EEH: Timed out waiting for slot status\n");
+ return -2;
+}
+
+/**
+ * eeh_token_to_phys - convert EEH address token to phys address
+ * @token i/o token, should be address in the form 0xA....
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+ pte_t *ptep;
+ unsigned long pa;
+
+ ptep = find_linux_pte(init_mm.pgd, token);
+ if (!ptep)
+ return token;
+ pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+ return pa | (token & (PAGE_SIZE-1));
+}
+
+/**
+ * Return the "partitionable endpoint" (pe) under which this device lies
+ */
+struct device_node * find_device_pe(struct device_node *dn)
+{
+ while ((dn->parent) && PCI_DN(dn->parent) &&
+ (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
+ dn = dn->parent;
+ }
+ return dn;
+}
+
+/** Mark all devices that are children of this device as failed.
+ * Mark the device driver too, so that it can see the failure
+ * immediately; this is critical, since some drivers poll
+ * status registers in interrupts ... If a driver is polling,
+ * and the slot is frozen, then the driver can deadlock in
+ * an interrupt context, which is bad.
+ */
+
+static void __eeh_mark_slot(struct device_node *parent, int mode_flag)
+{
+ struct device_node *dn;
+
+ for_each_child_of_node(parent, dn) {
+ if (PCI_DN(dn)) {
+ /* Mark the pci device driver too */
+ struct pci_dev *dev = PCI_DN(dn)->pcidev;
+
+ PCI_DN(dn)->eeh_mode |= mode_flag;
+
+ if (dev && dev->driver)
+ dev->error_state = pci_channel_io_frozen;
+
+ __eeh_mark_slot(dn, mode_flag);
+ }
+ }
+}
+
+void eeh_mark_slot (struct device_node *dn, int mode_flag)
+{
+ struct pci_dev *dev;
+ dn = find_device_pe (dn);
+
+ /* Back up one, since config addrs might be shared */
+ if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
+ dn = dn->parent;
+
+ PCI_DN(dn)->eeh_mode |= mode_flag;
+
+ /* Mark the pci device too */
+ dev = PCI_DN(dn)->pcidev;
+ if (dev)
+ dev->error_state = pci_channel_io_frozen;
+
+ __eeh_mark_slot(dn, mode_flag);
+}
+
+static void __eeh_clear_slot(struct device_node *parent, int mode_flag)
+{
+ struct device_node *dn;
+
+ for_each_child_of_node(parent, dn) {
+ if (PCI_DN(dn)) {
+ PCI_DN(dn)->eeh_mode &= ~mode_flag;
+ PCI_DN(dn)->eeh_check_count = 0;
+ __eeh_clear_slot(dn, mode_flag);
+ }
+ }
+}
+
+void eeh_clear_slot (struct device_node *dn, int mode_flag)
+{
+ unsigned long flags;
+ raw_spin_lock_irqsave(&confirm_error_lock, flags);
+
+ dn = find_device_pe (dn);
+
+ /* Back up one, since config addrs might be shared */
+ if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
+ dn = dn->parent;
+
+ PCI_DN(dn)->eeh_mode &= ~mode_flag;
+ PCI_DN(dn)->eeh_check_count = 0;
+ __eeh_clear_slot(dn, mode_flag);
+ raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+}
+
+void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset)
+{
+ struct device_node *dn;
+
+ for_each_child_of_node(parent, dn) {
+ if (PCI_DN(dn)) {
+
+ struct pci_dev *dev = PCI_DN(dn)->pcidev;
+
+ if (dev && dev->driver)
+ *freset |= dev->needs_freset;
+
+ __eeh_set_pe_freset(dn, freset);
+ }
+ }
+}
+
+void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset)
+{
+ struct pci_dev *dev;
+ dn = find_device_pe(dn);
+
+ /* Back up one, since config addrs might be shared */
+ if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
+ dn = dn->parent;
+
+ dev = PCI_DN(dn)->pcidev;
+ if (dev)
+ *freset |= dev->needs_freset;
+
+ __eeh_set_pe_freset(dn, freset);
+}
+
+/**
+ * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
+ * @dn device node
+ * @dev pci device, if known
+ *
+ * Check for an EEH failure for the given device node. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze. This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
+{
+ int ret;
+ int rets[3];
+ unsigned long flags;
+ struct pci_dn *pdn;
+ int rc = 0;
+ const char *location;
+
+ total_mmio_ffs++;
+
+ if (!eeh_subsystem_enabled)
+ return 0;
+
+ if (!dn) {
+ no_dn++;
+ return 0;
+ }
+ dn = find_device_pe(dn);
+ pdn = PCI_DN(dn);
+
+ /* Access to IO BARs might get this far and still not want checking. */
+ if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
+ pdn->eeh_mode & EEH_MODE_NOCHECK) {
+ ignored_check++;
+ pr_debug("EEH: Ignored check (%x) for %s %s\n",
+ pdn->eeh_mode, eeh_pci_name(dev), dn->full_name);
+ return 0;
+ }
+
+ if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) {
+ no_cfg_addr++;
+ return 0;
+ }
+
+ /* If we already have a pending isolation event for this
+ * slot, we know it's bad already, we don't need to check.
+ * Do this checking under a lock; as multiple PCI devices
+ * in one slot might report errors simultaneously, and we
+ * only want one error recovery routine running.
+ */
+ raw_spin_lock_irqsave(&confirm_error_lock, flags);
+ rc = 1;
+ if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
+ pdn->eeh_check_count ++;
+ if (pdn->eeh_check_count % EEH_MAX_FAILS == 0) {
+ location = of_get_property(dn, "ibm,loc-code", NULL);
+ printk (KERN_ERR "EEH: %d reads ignored for recovering device at "
+ "location=%s driver=%s pci addr=%s\n",
+ pdn->eeh_check_count, location,
+ eeh_driver_name(dev), eeh_pci_name(dev));
+ printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+ eeh_driver_name(dev));
+ dump_stack();
+ }
+ goto dn_unlock;
+ }
+
+ /*
+ * Now test for an EEH failure. This is VERY expensive.
+ * Note that the eeh_config_addr may be a parent device
+ * in the case of a device behind a bridge, or it may be
+ * function zero of a multi-function device.
+ * In any case they must share a common PHB.
+ */
+ ret = read_slot_reset_state(pdn, rets);
+
+ /* If the call to firmware failed, punt */
+ if (ret != 0) {
+ printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
+ ret, dn->full_name);
+ false_positives++;
+ pdn->eeh_false_positives ++;
+ rc = 0;
+ goto dn_unlock;
+ }
+
+ /* Note that config-io to empty slots may fail;
+ * they are empty when they don't have children. */
+ if ((rets[0] == 5) && (rets[2] == 0) && (dn->child == NULL)) {
+ false_positives++;
+ pdn->eeh_false_positives ++;
+ rc = 0;
+ goto dn_unlock;
+ }
+
+ /* If EEH is not supported on this device, punt. */
+ if (rets[1] != 1) {
+ printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
+ ret, dn->full_name);
+ false_positives++;
+ pdn->eeh_false_positives ++;
+ rc = 0;
+ goto dn_unlock;
+ }
+
+ /* If not the kind of error we know about, punt. */
+ if (rets[0] != 1 && rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
+ false_positives++;
+ pdn->eeh_false_positives ++;
+ rc = 0;
+ goto dn_unlock;
+ }
+
+ slot_resets++;
+
+ /* Avoid repeated reports of this failure, including problems
+ * with other functions on this device, and functions under
+ * bridges. */
+ eeh_mark_slot (dn, EEH_MODE_ISOLATED);
+ raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+
+ eeh_send_failure_event (dn, dev);
+
+ /* Most EEH events are due to device driver bugs. Having
+ * a stack trace will help the device-driver authors figure
+ * out what happened. So print that out. */
+ dump_stack();
+ return 1;
+
+dn_unlock:
+ raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
+ return rc;
+}
+
+EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
+
+/**
+ * eeh_check_failure - check if all 1's data is due to EEH slot freeze
+ * @token i/o token, should be address in the form 0xA....
+ * @val value, should be all 1's (XXX why do we need this arg??)
+ *
+ * Check for an EEH failure at the given token address. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event. This routine
+ * will query firmware for the EEH status.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
+{
+ unsigned long addr;
+ struct pci_dev *dev;
+ struct device_node *dn;
+
+ /* Finding the phys addr + pci device; this is pretty quick. */
+ addr = eeh_token_to_phys((unsigned long __force) token);
+ dev = pci_get_device_by_addr(addr);
+ if (!dev) {
+ no_device++;
+ return val;
+ }
+
+ dn = pci_device_to_OF_node(dev);
+ eeh_dn_check_failure (dn, dev);
+
+ pci_dev_put(dev);
+ return val;
+}
+
+EXPORT_SYMBOL(eeh_check_failure);
+
+/* ------------------------------------------------------------- */
+/* The code below deals with error recovery */
+
+/**
+ * rtas_pci_enable - enable MMIO or DMA transfers for this slot
+ * @pdn pci device node
+ */
+
+int
+rtas_pci_enable(struct pci_dn *pdn, int function)
+{
+ int config_addr;
+ int rc;
+
+ /* Use PE configuration address, if present */
+ config_addr = pdn->eeh_config_addr;
+ if (pdn->eeh_pe_config_addr)
+ config_addr = pdn->eeh_pe_config_addr;
+
+ rc = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+ config_addr,
+ BUID_HI(pdn->phb->buid),
+ BUID_LO(pdn->phb->buid),
+ function);
+
+ if (rc)
+ printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n",
+ function, rc, pdn->node->full_name);
+
+ rc = eeh_wait_for_slot_status (pdn, PCI_BUS_RESET_WAIT_MSEC);
+ if ((rc == 4) && (function == EEH_THAW_MMIO))
+ return 0;
+
+ return rc;
+}
+
+/**
+ * rtas_pci_slot_reset - raises/lowers the pci #RST line
+ * @pdn pci device node
+ * @state: 1/0 to raise/lower the #RST
+ *
+ * Clear the EEH-frozen condition on a slot. This routine
+ * asserts the PCI #RST line if the 'state' argument is '1',
+ * and drops the #RST line if 'state is '0'. This routine is
+ * safe to call in an interrupt context.
+ *
+ */
+
+static void
+rtas_pci_slot_reset(struct pci_dn *pdn, int state)
+{
+ int config_addr;
+ int rc;
+
+ BUG_ON (pdn==NULL);
+
+ if (!pdn->phb) {
+ printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n",
+ pdn->node->full_name);
+ return;
+ }
+
+ /* Use PE configuration address, if present */
+ config_addr = pdn->eeh_config_addr;
+ if (pdn->eeh_pe_config_addr)
+ config_addr = pdn->eeh_pe_config_addr;
+
+ rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+ config_addr,
+ BUID_HI(pdn->phb->buid),
+ BUID_LO(pdn->phb->buid),
+ state);
+
+ /* Fundamental-reset not supported on this PE, try hot-reset */
+ if (rc == -8 && state == 3) {
+ rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+ config_addr,
+ BUID_HI(pdn->phb->buid),
+ BUID_LO(pdn->phb->buid), 1);
+ if (rc)
+ printk(KERN_WARNING
+ "EEH: Unable to reset the failed slot,"
+ " #RST=%d dn=%s\n",
+ rc, pdn->node->full_name);
+ }
+}
+
+/**
+ * pcibios_set_pcie_slot_reset - Set PCI-E reset state
+ * @dev: pci device struct
+ * @state: reset state to enter
+ *
+ * Return value:
+ * 0 if success
+ **/
+int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+{
+ struct device_node *dn = pci_device_to_OF_node(dev);
+ struct pci_dn *pdn = PCI_DN(dn);
+
+ switch (state) {
+ case pcie_deassert_reset:
+ rtas_pci_slot_reset(pdn, 0);
+ break;
+ case pcie_hot_reset:
+ rtas_pci_slot_reset(pdn, 1);
+ break;
+ case pcie_warm_reset:
+ rtas_pci_slot_reset(pdn, 3);
+ break;
+ default:
+ return -EINVAL;
+ };
+
+ return 0;
+}
+
+/**
+ * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
+ * @pdn: pci device node to be reset.
+ */
+
+static void __rtas_set_slot_reset(struct pci_dn *pdn)
+{
+ unsigned int freset = 0;
+
+ /* Determine type of EEH reset required for
+ * Partitionable Endpoint, a hot-reset (1)
+ * or a fundamental reset (3).
+ * A fundamental reset required by any device under
+ * Partitionable Endpoint trumps hot-reset.
+ */
+ eeh_set_pe_freset(pdn->node, &freset);
+
+ if (freset)
+ rtas_pci_slot_reset(pdn, 3);
+ else
+ rtas_pci_slot_reset(pdn, 1);
+
+ /* The PCI bus requires that the reset be held high for at least
+ * a 100 milliseconds. We wait a bit longer 'just in case'. */
+
+#define PCI_BUS_RST_HOLD_TIME_MSEC 250
+ msleep (PCI_BUS_RST_HOLD_TIME_MSEC);
+
+ /* We might get hit with another EEH freeze as soon as the
+ * pci slot reset line is dropped. Make sure we don't miss
+ * these, and clear the flag now. */
+ eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED);
+
+ rtas_pci_slot_reset (pdn, 0);
+
+ /* After a PCI slot has been reset, the PCI Express spec requires
+ * a 1.5 second idle time for the bus to stabilize, before starting
+ * up traffic. */
+#define PCI_BUS_SETTLE_TIME_MSEC 1800
+ msleep (PCI_BUS_SETTLE_TIME_MSEC);
+}
+
+int rtas_set_slot_reset(struct pci_dn *pdn)
+{
+ int i, rc;
+
+ /* Take three shots at resetting the bus */
+ for (i=0; i<3; i++) {
+ __rtas_set_slot_reset(pdn);
+
+ rc = eeh_wait_for_slot_status(pdn, PCI_BUS_RESET_WAIT_MSEC);
+ if (rc == 0)
+ return 0;
+
+ if (rc < 0) {
+ printk(KERN_ERR "EEH: unrecoverable slot failure %s\n",
+ pdn->node->full_name);
+ return -1;
+ }
+ printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n",
+ i+1, pdn->node->full_name, rc);
+ }
+
+ return -1;
+}
+
+/* ------------------------------------------------------- */
+/** Save and restore of PCI BARs
+ *
+ * Although firmware will set up BARs during boot, it doesn't
+ * set up device BAR's after a device reset, although it will,
+ * if requested, set up bridge configuration. Thus, we need to
+ * configure the PCI devices ourselves.
+ */
+
+/**
+ * __restore_bars - Restore the Base Address Registers
+ * @pdn: pci device node
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static inline void __restore_bars (struct pci_dn *pdn)
+{
+ int i;
+ u32 cmd;
+
+ if (NULL==pdn->phb) return;
+ for (i=4; i<10; i++) {
+ rtas_write_config(pdn, i*4, 4, pdn->config_space[i]);
+ }
+
+ /* 12 == Expansion ROM Address */
+ rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]);
+
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)])
+
+ rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1,
+ SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+
+ rtas_write_config (pdn, PCI_LATENCY_TIMER, 1,
+ SAVED_BYTE(PCI_LATENCY_TIMER));
+
+ /* max latency, min grant, interrupt pin and line */
+ rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]);
+
+ /* Restore PERR & SERR bits, some devices require it,
+ don't touch the other command bits */
+ rtas_read_config(pdn, PCI_COMMAND, 4, &cmd);
+ if (pdn->config_space[1] & PCI_COMMAND_PARITY)
+ cmd |= PCI_COMMAND_PARITY;
+ else
+ cmd &= ~PCI_COMMAND_PARITY;
+ if (pdn->config_space[1] & PCI_COMMAND_SERR)
+ cmd |= PCI_COMMAND_SERR;
+ else
+ cmd &= ~PCI_COMMAND_SERR;
+ rtas_write_config(pdn, PCI_COMMAND, 4, cmd);
+}
+
+/**
+ * eeh_restore_bars - restore the PCI config space info
+ *
+ * This routine performs a recursive walk to the children
+ * of this device as well.
+ */
+void eeh_restore_bars(struct pci_dn *pdn)
+{
+ struct device_node *dn;
+ if (!pdn)
+ return;
+
+ if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code))
+ __restore_bars (pdn);
+
+ for_each_child_of_node(pdn->node, dn)
+ eeh_restore_bars (PCI_DN(dn));
+}
+
+/**
+ * eeh_save_bars - save device bars
+ *
+ * Save the values of the device bars. Unlike the restore
+ * routine, this routine is *not* recursive. This is because
+ * PCI devices are added individually; but, for the restore,
+ * an entire slot is reset at a time.
+ */
+static void eeh_save_bars(struct pci_dn *pdn)
+{
+ int i;
+
+ if (!pdn )
+ return;
+
+ for (i = 0; i < 16; i++)
+ rtas_read_config(pdn, i * 4, 4, &pdn->config_space[i]);
+}
+
+void
+rtas_configure_bridge(struct pci_dn *pdn)
+{
+ int config_addr;
+ int rc;
+ int token;
+
+ /* Use PE configuration address, if present */
+ config_addr = pdn->eeh_config_addr;
+ if (pdn->eeh_pe_config_addr)
+ config_addr = pdn->eeh_pe_config_addr;
+
+ /* Use new configure-pe function, if supported */
+ if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE)
+ token = ibm_configure_pe;
+ else
+ token = ibm_configure_bridge;
+
+ rc = rtas_call(token, 3, 1, NULL,
+ config_addr,
+ BUID_HI(pdn->phb->buid),
+ BUID_LO(pdn->phb->buid));
+ if (rc) {
+ printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n",
+ rc, pdn->node->full_name);
+ }
+}
+
+/* ------------------------------------------------------------- */
+/* The code below deals with enabling EEH for devices during the
+ * early boot sequence. EEH must be enabled before any PCI probing
+ * can be done.
+ */
+
+#define EEH_ENABLE 1
+
+struct eeh_early_enable_info {
+ unsigned int buid_hi;
+ unsigned int buid_lo;
+};
+
+static int get_pe_addr (int config_addr,
+ struct eeh_early_enable_info *info)
+{
+ unsigned int rets[3];
+ int ret;
+
+ /* Use latest config-addr token on power6 */
+ if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+ /* Make sure we have a PE in hand */
+ ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets,
+ config_addr, info->buid_hi, info->buid_lo, 1);
+ if (ret || (rets[0]==0))
+ return 0;
+
+ ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets,
+ config_addr, info->buid_hi, info->buid_lo, 0);
+ if (ret)
+ return 0;
+ return rets[0];
+ }
+
+ /* Use older config-addr token on power5 */
+ if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+ ret = rtas_call (ibm_get_config_addr_info, 4, 2, rets,
+ config_addr, info->buid_hi, info->buid_lo, 0);
+ if (ret)
+ return 0;
+ return rets[0];
+ }
+ return 0;
+}
+
+/* Enable eeh for the given device node. */
+static void *early_enable_eeh(struct device_node *dn, void *data)
+{
+ unsigned int rets[3];
+ struct eeh_early_enable_info *info = data;
+ int ret;
+ const u32 *class_code = of_get_property(dn, "class-code", NULL);
+ const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL);
+ const u32 *device_id = of_get_property(dn, "device-id", NULL);
+ const u32 *regs;
+ int enable;
+ struct pci_dn *pdn = PCI_DN(dn);
+
+ pdn->class_code = 0;
+ pdn->eeh_mode = 0;
+ pdn->eeh_check_count = 0;
+ pdn->eeh_freeze_count = 0;
+ pdn->eeh_false_positives = 0;
+
+ if (!of_device_is_available(dn))
+ return NULL;
+
+ /* Ignore bad nodes. */
+ if (!class_code || !vendor_id || !device_id)
+ return NULL;
+
+ /* There is nothing to check on PCI to ISA bridges */
+ if (dn->type && !strcmp(dn->type, "isa")) {
+ pdn->eeh_mode |= EEH_MODE_NOCHECK;
+ return NULL;
+ }
+ pdn->class_code = *class_code;
+
+ /* Ok... see if this device supports EEH. Some do, some don't,
+ * and the only way to find out is to check each and every one. */
+ regs = of_get_property(dn, "reg", NULL);
+ if (regs) {
+ /* First register entry is addr (00BBSS00) */
+ /* Try to enable eeh */
+ ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
+ regs[0], info->buid_hi, info->buid_lo,
+ EEH_ENABLE);
+
+ enable = 0;
+ if (ret == 0) {
+ pdn->eeh_config_addr = regs[0];
+
+ /* If the newer, better, ibm,get-config-addr-info is supported,
+ * then use that instead. */
+ pdn->eeh_pe_config_addr = get_pe_addr(pdn->eeh_config_addr, info);
+
+ /* Some older systems (Power4) allow the
+ * ibm,set-eeh-option call to succeed even on nodes
+ * where EEH is not supported. Verify support
+ * explicitly. */
+ ret = read_slot_reset_state(pdn, rets);
+ if ((ret == 0) && (rets[1] == 1))
+ enable = 1;
+ }
+
+ if (enable) {
+ eeh_subsystem_enabled = 1;
+ pdn->eeh_mode |= EEH_MODE_SUPPORTED;
+
+ pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n",
+ dn->full_name, pdn->eeh_config_addr,
+ pdn->eeh_pe_config_addr);
+ } else {
+
+ /* This device doesn't support EEH, but it may have an
+ * EEH parent, in which case we mark it as supported. */
+ if (dn->parent && PCI_DN(dn->parent)
+ && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
+ /* Parent supports EEH. */
+ pdn->eeh_mode |= EEH_MODE_SUPPORTED;
+ pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr;
+ return NULL;
+ }
+ }
+ } else {
+ printk(KERN_WARNING "EEH: %s: unable to get reg property.\n",
+ dn->full_name);
+ }
+
+ eeh_save_bars(pdn);
+ return NULL;
+}
+
+/*
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check. If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ */
+void __init eeh_init(void)
+{
+ struct device_node *phb, *np;
+ struct eeh_early_enable_info info;
+
+ raw_spin_lock_init(&confirm_error_lock);
+ spin_lock_init(&slot_errbuf_lock);
+
+ np = of_find_node_by_path("/rtas");
+ if (np == NULL)
+ return;
+
+ ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
+ ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
+ ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
+ ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
+ ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
+ ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
+ ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
+ ibm_configure_bridge = rtas_token ("ibm,configure-bridge");
+ ibm_configure_pe = rtas_token("ibm,configure-pe");
+
+ if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)
+ return;
+
+ eeh_error_buf_size = rtas_token("rtas-error-log-max");
+ if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) {
+ eeh_error_buf_size = 1024;
+ }
+ if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) {
+ printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated "
+ "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX);
+ eeh_error_buf_size = RTAS_ERROR_LOG_MAX;
+ }
+
+ /* Enable EEH for all adapters. Note that eeh requires buid's */
+ for (phb = of_find_node_by_name(NULL, "pci"); phb;
+ phb = of_find_node_by_name(phb, "pci")) {
+ unsigned long buid;
+
+ buid = get_phb_buid(phb);
+ if (buid == 0 || PCI_DN(phb) == NULL)
+ continue;
+
+ info.buid_lo = BUID_LO(buid);
+ info.buid_hi = BUID_HI(buid);
+ traverse_pci_devices(phb, early_enable_eeh, &info);
+ }
+
+ if (eeh_subsystem_enabled)
+ printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n");
+ else
+ printk(KERN_WARNING "EEH: No capable adapters found\n");
+}
+
+/**
+ * eeh_add_device_early - enable EEH for the indicated device_node
+ * @dn: device node for which to set up EEH
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ * This routine must be called before any i/o is performed to the
+ * adapter (inluding any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ */
+static void eeh_add_device_early(struct device_node *dn)
+{
+ struct pci_controller *phb;
+ struct eeh_early_enable_info info;
+
+ if (!dn || !PCI_DN(dn))
+ return;
+ phb = PCI_DN(dn)->phb;
+
+ /* USB Bus children of PCI devices will not have BUID's */
+ if (NULL == phb || 0 == phb->buid)
+ return;
+
+ info.buid_hi = BUID_HI(phb->buid);
+ info.buid_lo = BUID_LO(phb->buid);
+ early_enable_eeh(dn, &info);
+}
+
+void eeh_add_device_tree_early(struct device_node *dn)
+{
+ struct device_node *sib;
+
+ for_each_child_of_node(dn, sib)
+ eeh_add_device_tree_early(sib);
+ eeh_add_device_early(dn);
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
+
+/**
+ * eeh_add_device_late - perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * This routine must be used to complete EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ */
+static void eeh_add_device_late(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct pci_dn *pdn;
+
+ if (!dev || !eeh_subsystem_enabled)
+ return;
+
+ pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+ dn = pci_device_to_OF_node(dev);
+ pdn = PCI_DN(dn);
+ if (pdn->pcidev == dev) {
+ pr_debug("EEH: Already referenced !\n");
+ return;
+ }
+ WARN_ON(pdn->pcidev);
+
+ pci_dev_get (dev);
+ pdn->pcidev = dev;
+
+ pci_addr_cache_insert_device(dev);
+ eeh_sysfs_add_device(dev);
+}
+
+void eeh_add_device_tree_late(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ eeh_add_device_late(dev);
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ struct pci_bus *subbus = dev->subordinate;
+ if (subbus)
+ eeh_add_device_tree_late(subbus);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
+
+/**
+ * eeh_remove_device - undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ *
+ * This routine should be called when a device is removed from
+ * a running system (e.g. by hotplug or dlpar). It unregisters
+ * the PCI device from the EEH subsystem. I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * i/o errors affecting this slot may leave this device unusable.
+ */
+static void eeh_remove_device(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ if (!dev || !eeh_subsystem_enabled)
+ return;
+
+ /* Unregister the device with the EEH/PCI address search system */
+ pr_debug("EEH: Removing device %s\n", pci_name(dev));
+
+ dn = pci_device_to_OF_node(dev);
+ if (PCI_DN(dn)->pcidev == NULL) {
+ pr_debug("EEH: Not referenced !\n");
+ return;
+ }
+ PCI_DN(dn)->pcidev = NULL;
+ pci_dev_put (dev);
+
+ pci_addr_cache_remove_device(dev);
+ eeh_sysfs_remove_device(dev);
+}
+
+void eeh_remove_bus_device(struct pci_dev *dev)
+{
+ struct pci_bus *bus = dev->subordinate;
+ struct pci_dev *child, *tmp;
+
+ eeh_remove_device(dev);
+
+ if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
+ eeh_remove_bus_device(child);
+ }
+}
+EXPORT_SYMBOL_GPL(eeh_remove_bus_device);
+
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+ if (0 == eeh_subsystem_enabled) {
+ seq_printf(m, "EEH Subsystem is globally disabled\n");
+ seq_printf(m, "eeh_total_mmio_ffs=%ld\n", total_mmio_ffs);
+ } else {
+ seq_printf(m, "EEH Subsystem is enabled\n");
+ seq_printf(m,
+ "no device=%ld\n"
+ "no device node=%ld\n"
+ "no config address=%ld\n"
+ "check not wanted=%ld\n"
+ "eeh_total_mmio_ffs=%ld\n"
+ "eeh_false_positives=%ld\n"
+ "eeh_slot_resets=%ld\n",
+ no_device, no_dn, no_cfg_addr,
+ ignored_check, total_mmio_ffs,
+ false_positives,
+ slot_resets);
+ }
+
+ return 0;
+}
+
+static int proc_eeh_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_eeh_show, NULL);
+}
+
+static const struct file_operations proc_eeh_operations = {
+ .open = proc_eeh_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init eeh_init_proc(void)
+{
+ if (machine_is(pseries))
+ proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations);
+ return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c
new file mode 100644
index 00000000000..fc5ae767989
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_cache.c
@@ -0,0 +1,308 @@
+/*
+ * eeh_cache.c
+ * PCI address cache; allows the lookup of PCI devices based on I/O address
+ *
+ * Copyright IBM Corporation 2004
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+/**
+ * The pci address cache subsystem. This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range
+{
+ struct rb_node rb_node;
+ unsigned long addr_lo;
+ unsigned long addr_hi;
+ struct pci_dev *pcidev;
+ unsigned int flags;
+};
+
+static struct pci_io_addr_cache
+{
+ struct rb_root rb_root;
+ spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr)
+{
+ struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+ if (addr < piar->addr_lo) {
+ n = n->rb_left;
+ } else {
+ if (addr > piar->addr_hi) {
+ n = n->rb_right;
+ } else {
+ pci_dev_get(piar->pcidev);
+ return piar->pcidev;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * pci_get_device_by_addr - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address. Be sure to pci_dev_put the device
+ * when finished. I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+struct pci_dev *pci_get_device_by_addr(unsigned long addr)
+{
+ struct pci_dev *dev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ dev = __pci_get_device_by_addr(addr);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+ return dev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void pci_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+ struct rb_node *n;
+ int cnt = 0;
+
+ n = rb_first(&cache->rb_root);
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+ printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n",
+ (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+ piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+ cnt++;
+ n = rb_next(n);
+ }
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+ unsigned long ahi, unsigned int flags)
+{
+ struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct pci_io_addr_range *piar;
+
+ /* Walk tree, find a place to insert into tree */
+ while (*p) {
+ parent = *p;
+ piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+ if (ahi < piar->addr_lo) {
+ p = &parent->rb_left;
+ } else if (alo > piar->addr_hi) {
+ p = &parent->rb_right;
+ } else {
+ if (dev != piar->pcidev ||
+ alo != piar->addr_lo || ahi != piar->addr_hi) {
+ printk(KERN_WARNING "PIAR: overlapping address range\n");
+ }
+ return piar;
+ }
+ }
+ piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+ if (!piar)
+ return NULL;
+
+ pci_dev_get(dev);
+ piar->addr_lo = alo;
+ piar->addr_hi = ahi;
+ piar->pcidev = dev;
+ piar->flags = flags;
+
+#ifdef DEBUG
+ printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n",
+ alo, ahi, pci_name (dev));
+#endif
+
+ rb_link_node(&piar->rb_node, parent, p);
+ rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+ return piar;
+}
+
+static void __pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct pci_dn *pdn;
+ int i;
+
+ dn = pci_device_to_OF_node(dev);
+ if (!dn) {
+ printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev));
+ return;
+ }
+
+ /* Skip any devices for which EEH is not enabled. */
+ pdn = PCI_DN(dn);
+ if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
+ pdn->eeh_mode & EEH_MODE_NOCHECK) {
+#ifdef DEBUG
+ printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n",
+ pci_name(dev), pdn->node->full_name);
+#endif
+ return;
+ }
+
+ /* Walk resources on this device, poke them into the tree */
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ unsigned long start = pci_resource_start(dev,i);
+ unsigned long end = pci_resource_end(dev,i);
+ unsigned int flags = pci_resource_flags(dev,i);
+
+ /* We are interested only bus addresses, not dma or other stuff */
+ if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+ continue;
+ if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+ continue;
+ pci_addr_cache_insert(dev, start, end, flags);
+ }
+}
+
+/**
+ * pci_addr_cache_insert_device - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void pci_addr_cache_insert_device(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ /* Ignore PCI bridges */
+ if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+ return;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __pci_addr_cache_insert_device(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+ struct rb_node *n;
+
+restart:
+ n = rb_first(&pci_io_addr_cache_root.rb_root);
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+ if (piar->pcidev == dev) {
+ rb_erase(n, &pci_io_addr_cache_root.rb_root);
+ pci_dev_put(piar->pcidev);
+ kfree(piar);
+ goto restart;
+ }
+ n = rb_next(n);
+ }
+}
+
+/**
+ * pci_addr_cache_remove_device - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void pci_addr_cache_remove_device(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __pci_addr_cache_remove_device(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * pci_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses. This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void __init pci_addr_cache_build(void)
+{
+ struct device_node *dn;
+ struct pci_dev *dev = NULL;
+
+ spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+ for_each_pci_dev(dev) {
+ pci_addr_cache_insert_device(dev);
+
+ dn = pci_device_to_OF_node(dev);
+ if (!dn)
+ continue;
+ pci_dev_get(dev); /* matching put is in eeh_remove_device() */
+ PCI_DN(dn)->pcidev = dev;
+
+ eeh_sysfs_add_device(dev);
+ }
+
+#ifdef DEBUG
+ /* Verify tree built up above, echo back the list of addrs. */
+ pci_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
+
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
new file mode 100644
index 00000000000..1b6cb10589e
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -0,0 +1,511 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+
+static inline const char * pcid_name (struct pci_dev *pdev)
+{
+ if (pdev && pdev->dev.driver)
+ return pdev->dev.driver->name;
+ return "";
+}
+
+#if 0
+static void print_device_node_tree(struct pci_dn *pdn, int dent)
+{
+ int i;
+ struct device_node *pc;
+
+ if (!pdn)
+ return;
+ for (i = 0; i < dent; i++)
+ printk(" ");
+ printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
+ pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
+ pdn->eeh_pe_config_addr, pdn->node->full_name);
+ dent += 3;
+ pc = pdn->node->child;
+ while (pc) {
+ print_device_node_tree(PCI_DN(pc), dent);
+ pc = pc->sibling;
+ }
+}
+#endif
+
+/**
+ * eeh_disable_irq - disable interrupt for the recovering device
+ */
+static void eeh_disable_irq(struct pci_dev *dev)
+{
+ struct device_node *dn = pci_device_to_OF_node(dev);
+
+ /* Don't disable MSI and MSI-X interrupts. They are
+ * effectively disabled by the DMA Stopped state
+ * when an EEH error occurs.
+ */
+ if (dev->msi_enabled || dev->msix_enabled)
+ return;
+
+ if (!irq_has_action(dev->irq))
+ return;
+
+ PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;
+ disable_irq_nosync(dev->irq);
+}
+
+/**
+ * eeh_enable_irq - enable interrupt for the recovering device
+ */
+static void eeh_enable_irq(struct pci_dev *dev)
+{
+ struct device_node *dn = pci_device_to_OF_node(dev);
+
+ if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) {
+ PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED;
+ enable_irq(dev->irq);
+ }
+}
+
+/* ------------------------------------------------------- */
+/**
+ * eeh_report_error - report pci error to each device driver
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
+ * passed back in "userdata".
+ */
+
+static int eeh_report_error(struct pci_dev *dev, void *userdata)
+{
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver = dev->driver;
+
+ dev->error_state = pci_channel_io_frozen;
+
+ if (!driver)
+ return 0;
+
+ eeh_disable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->error_detected)
+ return 0;
+
+ rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen);
+
+ /* A driver that needs a reset trumps all others */
+ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+ return 0;
+}
+
+/**
+ * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled
+ *
+ * Tells each device driver that IO ports, MMIO and config space I/O
+ * are now enabled. Collects up and merges the device driver responses.
+ * Cumulative response passed back in "userdata".
+ */
+
+static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata)
+{
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver = dev->driver;
+
+ if (!driver ||
+ !driver->err_handler ||
+ !driver->err_handler->mmio_enabled)
+ return 0;
+
+ rc = driver->err_handler->mmio_enabled (dev);
+
+ /* A driver that needs a reset trumps all others */
+ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+ return 0;
+}
+
+/**
+ * eeh_report_reset - tell device that slot has been reset
+ */
+
+static int eeh_report_reset(struct pci_dev *dev, void *userdata)
+{
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver = dev->driver;
+
+ if (!driver)
+ return 0;
+
+ dev->error_state = pci_channel_io_normal;
+
+ eeh_enable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->slot_reset)
+ return 0;
+
+ rc = driver->err_handler->slot_reset(dev);
+ if ((*res == PCI_ERS_RESULT_NONE) ||
+ (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
+ if (*res == PCI_ERS_RESULT_DISCONNECT &&
+ rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+
+ return 0;
+}
+
+/**
+ * eeh_report_resume - tell device to resume normal operations
+ */
+
+static int eeh_report_resume(struct pci_dev *dev, void *userdata)
+{
+ struct pci_driver *driver = dev->driver;
+
+ dev->error_state = pci_channel_io_normal;
+
+ if (!driver)
+ return 0;
+
+ eeh_enable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->resume)
+ return 0;
+
+ driver->err_handler->resume(dev);
+
+ return 0;
+}
+
+/**
+ * eeh_report_failure - tell device driver that device is dead.
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+
+static int eeh_report_failure(struct pci_dev *dev, void *userdata)
+{
+ struct pci_driver *driver = dev->driver;
+
+ dev->error_state = pci_channel_io_perm_failure;
+
+ if (!driver)
+ return 0;
+
+ eeh_disable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->error_detected)
+ return 0;
+
+ driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
+
+ return 0;
+}
+
+/* ------------------------------------------------------- */
+/**
+ * handle_eeh_events -- reset a PCI device after hard lockup.
+ *
+ * pSeries systems will isolate a PCI slot if the PCI-Host
+ * bridge detects address or data parity errors, DMA's
+ * occurring to wild addresses (which usually happen due to
+ * bugs in device drivers or in PCI adapter firmware).
+ * Slot isolations also occur if #SERR, #PERR or other misc
+ * PCI-related errors are detected.
+ *
+ * Recovery process consists of unplugging the device driver
+ * (which generated hotplug events to userspace), then issuing
+ * a PCI #RST to the device, then reconfiguring the PCI config
+ * space for all bridges & devices under this slot, and then
+ * finally restarting the device drivers (which cause a second
+ * set of hotplug events to go out to userspace).
+ */
+
+/**
+ * eeh_reset_device() -- perform actual reset of a pci slot
+ * @bus: pointer to the pci bus structure corresponding
+ * to the isolated slot. A non-null value will
+ * cause all devices under the bus to be removed
+ * and then re-added.
+ * @pe_dn: pointer to a "Partionable Endpoint" device node.
+ * This is the top-level structure on which pci
+ * bus resets can be performed.
+ */
+
+static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
+{
+ struct device_node *dn;
+ int cnt, rc;
+
+ /* pcibios will clear the counter; save the value */
+ cnt = pe_dn->eeh_freeze_count;
+
+ if (bus)
+ pcibios_remove_pci_devices(bus);
+
+ /* Reset the pci controller. (Asserts RST#; resets config space).
+ * Reconfigure bridges and devices. Don't try to bring the system
+ * up if the reset failed for some reason. */
+ rc = rtas_set_slot_reset(pe_dn);
+ if (rc)
+ return rc;
+
+ /* Walk over all functions on this device. */
+ dn = pe_dn->node;
+ if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
+ dn = dn->parent->child;
+
+ while (dn) {
+ struct pci_dn *ppe = PCI_DN(dn);
+ /* On Power4, always true because eeh_pe_config_addr=0 */
+ if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) {
+ rtas_configure_bridge(ppe);
+ eeh_restore_bars(ppe);
+ }
+ dn = dn->sibling;
+ }
+
+ /* Give the system 5 seconds to finish running the user-space
+ * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
+ * this is a hack, but if we don't do this, and try to bring
+ * the device up before the scripts have taken it down,
+ * potentially weird things happen.
+ */
+ if (bus) {
+ ssleep (5);
+ pcibios_add_pci_devices(bus);
+ }
+ pe_dn->eeh_freeze_count = cnt;
+
+ return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 150
+
+struct pci_dn * handle_eeh_events (struct eeh_event *event)
+{
+ struct device_node *frozen_dn;
+ struct pci_dn *frozen_pdn;
+ struct pci_bus *frozen_bus;
+ int rc = 0;
+ enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+ const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;
+
+ frozen_dn = find_device_pe(event->dn);
+ if (!frozen_dn) {
+
+ location = of_get_property(event->dn, "ibm,loc-code", NULL);
+ location = location ? location : "unknown";
+ printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
+ "for location=%s pci addr=%s\n",
+ location, eeh_pci_name(event->dev));
+ return NULL;
+ }
+
+ frozen_bus = pcibios_find_pci_bus(frozen_dn);
+ location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
+ location = location ? location : "unknown";
+
+ /* There are two different styles for coming up with the PE.
+ * In the old style, it was the highest EEH-capable device
+ * which was always an EADS pci bridge. In the new style,
+ * there might not be any EADS bridges, and even when there are,
+ * the firmware marks them as "EEH incapable". So another
+ * two-step is needed to find the pci bus.. */
+ if (!frozen_bus)
+ frozen_bus = pcibios_find_pci_bus (frozen_dn->parent);
+
+ if (!frozen_bus) {
+ printk(KERN_ERR "EEH: Cannot find PCI bus "
+ "for location=%s dn=%s\n",
+ location, frozen_dn->full_name);
+ return NULL;
+ }
+
+ frozen_pdn = PCI_DN(frozen_dn);
+ frozen_pdn->eeh_freeze_count++;
+
+ pci_str = eeh_pci_name(event->dev);
+ drv_str = pcid_name(event->dev);
+
+ if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES)
+ goto excess_failures;
+
+ printk(KERN_WARNING
+ "EEH: This PCI device has failed %d times in the last hour:\n",
+ frozen_pdn->eeh_freeze_count);
+
+ if (frozen_pdn->pcidev) {
+ bus_pci_str = pci_name(frozen_pdn->pcidev);
+ bus_drv_str = pcid_name(frozen_pdn->pcidev);
+ printk(KERN_WARNING
+ "EEH: Bus location=%s driver=%s pci addr=%s\n",
+ location, bus_drv_str, bus_pci_str);
+ }
+
+ printk(KERN_WARNING
+ "EEH: Device location=%s driver=%s pci addr=%s\n",
+ location, drv_str, pci_str);
+
+ /* Walk the various device drivers attached to this slot through
+ * a reset sequence, giving each an opportunity to do what it needs
+ * to accomplish the reset. Each child gets a report of the
+ * status ... if any child can't handle the reset, then the entire
+ * slot is dlpar removed and added.
+ */
+ pci_walk_bus(frozen_bus, eeh_report_error, &result);
+
+ /* Get the current PCI slot state. This can take a long time,
+ * sometimes over 3 seconds for certain systems. */
+ rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000);
+ if (rc < 0) {
+ printk(KERN_WARNING "EEH: Permanent failure\n");
+ goto hard_fail;
+ }
+
+ /* Since rtas may enable MMIO when posting the error log,
+ * don't post the error log until after all dev drivers
+ * have been informed.
+ */
+ eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE);
+
+ /* If all device drivers were EEH-unaware, then shut
+ * down all of the device drivers, and hope they
+ * go down willingly, without panicing the system.
+ */
+ if (result == PCI_ERS_RESULT_NONE) {
+ rc = eeh_reset_device(frozen_pdn, frozen_bus);
+ if (rc) {
+ printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
+ goto hard_fail;
+ }
+ }
+
+ /* If all devices reported they can proceed, then re-enable MMIO */
+ if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+ rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO);
+
+ if (rc < 0)
+ goto hard_fail;
+ if (rc) {
+ result = PCI_ERS_RESULT_NEED_RESET;
+ } else {
+ result = PCI_ERS_RESULT_NONE;
+ pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
+ }
+ }
+
+ /* If all devices reported they can proceed, then re-enable DMA */
+ if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+ rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA);
+
+ if (rc < 0)
+ goto hard_fail;
+ if (rc)
+ result = PCI_ERS_RESULT_NEED_RESET;
+ else
+ result = PCI_ERS_RESULT_RECOVERED;
+ }
+
+ /* If any device has a hard failure, then shut off everything. */
+ if (result == PCI_ERS_RESULT_DISCONNECT) {
+ printk(KERN_WARNING "EEH: Device driver gave up\n");
+ goto hard_fail;
+ }
+
+ /* If any device called out for a reset, then reset the slot */
+ if (result == PCI_ERS_RESULT_NEED_RESET) {
+ rc = eeh_reset_device(frozen_pdn, NULL);
+ if (rc) {
+ printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
+ goto hard_fail;
+ }
+ result = PCI_ERS_RESULT_NONE;
+ pci_walk_bus(frozen_bus, eeh_report_reset, &result);
+ }
+
+ /* All devices should claim they have recovered by now. */
+ if ((result != PCI_ERS_RESULT_RECOVERED) &&
+ (result != PCI_ERS_RESULT_NONE)) {
+ printk(KERN_WARNING "EEH: Not recovered\n");
+ goto hard_fail;
+ }
+
+ /* Tell all device drivers that they can resume operations */
+ pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
+
+ return frozen_pdn;
+
+excess_failures:
+ /*
+ * About 90% of all real-life EEH failures in the field
+ * are due to poorly seated PCI cards. Only 10% or so are
+ * due to actual, failed cards.
+ */
+ printk(KERN_ERR
+ "EEH: PCI device at location=%s driver=%s pci addr=%s\n"
+ "has failed %d times in the last hour "
+ "and has been permanently disabled.\n"
+ "Please try reseating this device or replacing it.\n",
+ location, drv_str, pci_str, frozen_pdn->eeh_freeze_count);
+ goto perm_error;
+
+hard_fail:
+ printk(KERN_ERR
+ "EEH: Unable to recover from failure of PCI device "
+ "at location=%s driver=%s pci addr=%s\n"
+ "Please try reseating this device or replacing it.\n",
+ location, drv_str, pci_str);
+
+perm_error:
+ eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE);
+
+ /* Notify all devices that they're about to go down. */
+ pci_walk_bus(frozen_bus, eeh_report_failure, NULL);
+
+ /* Shut down the device drivers for good. */
+ pcibios_remove_pci_devices(frozen_bus);
+
+ return NULL;
+}
+
+/* ---------- end of file ---------- */
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
new file mode 100644
index 00000000000..d2383cfb6df
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -0,0 +1,158 @@
+/*
+ * eeh_event.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+
+/** Overview:
+ * EEH error states may be detected within exception handlers;
+ * however, the recovery processing needs to occur asynchronously
+ * in a normal kernel context and not an interrupt context.
+ * This pair of routines creates an event and queues it onto a
+ * work-queue, where a worker thread can drive recovery.
+ */
+
+/* EEH event workqueue setup. */
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+LIST_HEAD(eeh_eventlist);
+static void eeh_thread_launcher(struct work_struct *);
+DECLARE_WORK(eeh_event_wq, eeh_thread_launcher);
+
+/* Serialize reset sequences for a given pci device */
+DEFINE_MUTEX(eeh_event_mutex);
+
+/**
+ * eeh_event_handler - dispatch EEH events.
+ * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it. The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
+ */
+static int eeh_event_handler(void * dummy)
+{
+ unsigned long flags;
+ struct eeh_event *event;
+ struct pci_dn *pdn;
+
+ daemonize ("eehd");
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ event = NULL;
+
+ /* Unqueue the event, get ready to process. */
+ if (!list_empty(&eeh_eventlist)) {
+ event = list_entry(eeh_eventlist.next, struct eeh_event, list);
+ list_del(&event->list);
+ }
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+ if (event == NULL)
+ return 0;
+
+ /* Serialize processing of EEH events */
+ mutex_lock(&eeh_event_mutex);
+ eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
+
+ printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+ eeh_pci_name(event->dev));
+
+ pdn = handle_eeh_events(event);
+
+ eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
+ pci_dev_put(event->dev);
+ kfree(event);
+ mutex_unlock(&eeh_event_mutex);
+
+ /* If there are no new errors after an hour, clear the counter. */
+ if (pdn && pdn->eeh_freeze_count>0) {
+ msleep_interruptible (3600*1000);
+ if (pdn->eeh_freeze_count>0)
+ pdn->eeh_freeze_count--;
+ }
+
+ return 0;
+}
+
+/**
+ * eeh_thread_launcher
+ * @dummy - unused
+ */
+static void eeh_thread_launcher(struct work_struct *dummy)
+{
+ if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0)
+ printk(KERN_ERR "Failed to start EEH daemon\n");
+}
+
+/**
+ * eeh_send_failure_event - generate a PCI error event
+ * @dev pci device
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ */
+int eeh_send_failure_event (struct device_node *dn,
+ struct pci_dev *dev)
+{
+ unsigned long flags;
+ struct eeh_event *event;
+ const char *location;
+
+ if (!mem_init_done) {
+ printk(KERN_ERR "EEH: event during early boot not handled\n");
+ location = of_get_property(dn, "ibm,loc-code", NULL);
+ printk(KERN_ERR "EEH: device node = %s\n", dn->full_name);
+ printk(KERN_ERR "EEH: PCI location = %s\n", location);
+ return 1;
+ }
+ event = kmalloc(sizeof(*event), GFP_ATOMIC);
+ if (event == NULL) {
+ printk (KERN_ERR "EEH: out of memory, event not handled\n");
+ return 1;
+ }
+
+ if (dev)
+ pci_dev_get(dev);
+
+ event->dn = dn;
+ event->dev = dev;
+
+ /* We may or may not be called in an interrupt context */
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ list_add(&event->list, &eeh_eventlist);
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+ schedule_work(&eeh_event_wq);
+
+ return 0;
+}
+
+/********************** END OF FILE ******************************/
diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c
new file mode 100644
index 00000000000..eb744ee234d
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_sysfs.c
@@ -0,0 +1,87 @@
+/*
+ * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
+ * Copyright IBM Corporation 2007
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+
+/**
+ * EEH_SHOW_ATTR -- create sysfs entry for eeh statistic
+ * @_name: name of file in sysfs directory
+ * @_memb: name of member in struct pci_dn to access
+ * @_format: printf format for display
+ *
+ * All of the attributes look very similar, so just
+ * auto-gen a cut-n-paste routine to display them.
+ */
+#define EEH_SHOW_ATTR(_name,_memb,_format) \
+static ssize_t eeh_show_##_name(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct pci_dev *pdev = to_pci_dev(dev); \
+ struct device_node *dn = pci_device_to_OF_node(pdev); \
+ struct pci_dn *pdn; \
+ \
+ if (!dn || PCI_DN(dn) == NULL) \
+ return 0; \
+ \
+ pdn = PCI_DN(dn); \
+ return sprintf(buf, _format "\n", pdn->_memb); \
+} \
+static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
+
+
+EEH_SHOW_ATTR(eeh_mode, eeh_mode, "0x%x");
+EEH_SHOW_ATTR(eeh_config_addr, eeh_config_addr, "0x%x");
+EEH_SHOW_ATTR(eeh_pe_config_addr, eeh_pe_config_addr, "0x%x");
+EEH_SHOW_ATTR(eeh_check_count, eeh_check_count, "%d");
+EEH_SHOW_ATTR(eeh_freeze_count, eeh_freeze_count, "%d");
+EEH_SHOW_ATTR(eeh_false_positives, eeh_false_positives, "%d");
+
+void eeh_sysfs_add_device(struct pci_dev *pdev)
+{
+ int rc=0;
+
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count);
+
+ if (rc)
+ printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
+}
+
+void eeh_sysfs_remove_device(struct pci_dev *pdev)
+{
+ device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_check_count);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count);
+}
+
diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c
new file mode 100644
index 00000000000..2605c310166
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/event_sources.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/prom.h>
+
+#include "pseries.h"
+
+void request_event_sources_irqs(struct device_node *np,
+ irq_handler_t handler,
+ const char *name)
+{
+ int i, index, count = 0;
+ struct of_irq oirq;
+ const u32 *opicprop;
+ unsigned int opicplen;
+ unsigned int virqs[16];
+
+ /* Check for obsolete "open-pic-interrupt" property. If present, then
+ * map those interrupts using the default interrupt host and default
+ * trigger
+ */
+ opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
+ if (opicprop) {
+ opicplen /= sizeof(u32);
+ for (i = 0; i < opicplen; i++) {
+ if (count > 15)
+ break;
+ virqs[count] = irq_create_mapping(NULL, *(opicprop++));
+ if (virqs[count] == NO_IRQ) {
+ pr_err("event-sources: Unable to allocate "
+ "interrupt number for %s\n",
+ np->full_name);
+ WARN_ON(1);
+ }
+ else
+ count++;
+
+ }
+ }
+ /* Else use normal interrupt tree parsing */
+ else {
+ /* First try to do a proper OF tree parsing */
+ for (index = 0; of_irq_map_one(np, index, &oirq) == 0;
+ index++) {
+ if (count > 15)
+ break;
+ virqs[count] = irq_create_of_mapping(oirq.controller,
+ oirq.specifier,
+ oirq.size);
+ if (virqs[count] == NO_IRQ) {
+ pr_err("event-sources: Unable to allocate "
+ "interrupt number for %s\n",
+ np->full_name);
+ WARN_ON(1);
+ }
+ else
+ count++;
+ }
+ }
+
+ /* Now request them */
+ for (i = 0; i < count; i++) {
+ if (request_irq(virqs[i], handler, 0, name, NULL)) {
+ pr_err("event-sources: Unable to request interrupt "
+ "%d for %s\n", virqs[i], np->full_name);
+ WARN_ON(1);
+ return;
+ }
+ }
+}
+
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
new file mode 100644
index 00000000000..0b0eff0cce3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -0,0 +1,87 @@
+/*
+ * pSeries firmware setup code.
+ *
+ * Portions from arch/powerpc/platforms/pseries/setup.c:
+ * Copyright (C) 1995 Linus Torvalds
+ * Adapted from 'alpha' version by Gary Thomas
+ * Modified by Cort Dougan (cort@cs.nmt.edu)
+ * Modified by PPC64 Team, IBM Corp
+ *
+ * Portions from arch/powerpc/kernel/firmware.c
+ * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org)
+ * Modifications for ppc64:
+ * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com>
+ * Copyright (C) 2005 Stephen Rothwell, IBM Corporation
+ *
+ * Copyright 2006 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <asm/firmware.h>
+#include <asm/prom.h>
+#include <asm/udbg.h>
+
+#include "pseries.h"
+
+typedef struct {
+ unsigned long val;
+ char * name;
+} firmware_feature_t;
+
+static __initdata firmware_feature_t
+firmware_features_table[FIRMWARE_MAX_FEATURES] = {
+ {FW_FEATURE_PFT, "hcall-pft"},
+ {FW_FEATURE_TCE, "hcall-tce"},
+ {FW_FEATURE_SPRG0, "hcall-sprg0"},
+ {FW_FEATURE_DABR, "hcall-dabr"},
+ {FW_FEATURE_COPY, "hcall-copy"},
+ {FW_FEATURE_ASR, "hcall-asr"},
+ {FW_FEATURE_DEBUG, "hcall-debug"},
+ {FW_FEATURE_PERF, "hcall-perf"},
+ {FW_FEATURE_DUMP, "hcall-dump"},
+ {FW_FEATURE_INTERRUPT, "hcall-interrupt"},
+ {FW_FEATURE_MIGRATE, "hcall-migrate"},
+ {FW_FEATURE_PERFMON, "hcall-perfmon"},
+ {FW_FEATURE_CRQ, "hcall-crq"},
+ {FW_FEATURE_VIO, "hcall-vio"},
+ {FW_FEATURE_RDMA, "hcall-rdma"},
+ {FW_FEATURE_LLAN, "hcall-lLAN"},
+ {FW_FEATURE_BULK_REMOVE, "hcall-bulk"},
+ {FW_FEATURE_XDABR, "hcall-xdabr"},
+ {FW_FEATURE_MULTITCE, "hcall-multi-tce"},
+ {FW_FEATURE_SPLPAR, "hcall-splpar"},
+ {FW_FEATURE_VPHN, "hcall-vphn"},
+};
+
+/* Build up the firmware features bitmask using the contents of
+ * device-tree/ibm,hypertas-functions. Ultimately this functionality may
+ * be moved into prom.c prom_init().
+ */
+void __init fw_feature_init(const char *hypertas, unsigned long len)
+{
+ const char *s;
+ int i;
+
+ pr_debug(" -> fw_feature_init()\n");
+
+ for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) {
+ for (i = 0; i < FIRMWARE_MAX_FEATURES; i++) {
+ /* check value against table of strings */
+ if (!firmware_features_table[i].name ||
+ strcmp(firmware_features_table[i].name, s))
+ continue;
+
+ /* we have a match */
+ powerpc_firmware_features |=
+ firmware_features_table[i].val;
+ break;
+ }
+ }
+
+ pr_debug(" <- fw_feature_init()\n");
+}
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
new file mode 100644
index 00000000000..c986d08d080
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -0,0 +1,415 @@
+/*
+ * pseries CPU Hotplug infrastructure.
+ *
+ * Split out from arch/powerpc/platforms/pseries/setup.c
+ * arch/powerpc/kernel/rtas.c, and arch/powerpc/platforms/pseries/smp.c
+ *
+ * Peter Bergner, IBM March 2001.
+ * Copyright (C) 2001 IBM.
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ * Plus various changes from other IBM teams...
+ *
+ * Copyright (C) 2006 Michael Ellerman, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/sched.h> /* for idle_task_exit */
+#include <linux/cpu.h>
+#include <asm/system.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/vdso_datapage.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/xics.h>
+#include "plpar_wrappers.h"
+#include "offline_states.h"
+
+/* This version can't take the spinlock, because it never returns */
+static struct rtas_args rtas_stop_self_args = {
+ .token = RTAS_UNKNOWN_SERVICE,
+ .nargs = 0,
+ .nret = 1,
+ .rets = &rtas_stop_self_args.args[0],
+};
+
+static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) =
+ CPU_STATE_OFFLINE;
+static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
+
+static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
+
+static int cede_offline_enabled __read_mostly = 1;
+
+/*
+ * Enable/disable cede_offline when available.
+ */
+static int __init setup_cede_offline(char *str)
+{
+ if (!strcmp(str, "off"))
+ cede_offline_enabled = 0;
+ else if (!strcmp(str, "on"))
+ cede_offline_enabled = 1;
+ else
+ return 0;
+ return 1;
+}
+
+__setup("cede_offline=", setup_cede_offline);
+
+enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+ return per_cpu(current_state, cpu);
+}
+
+void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+ per_cpu(current_state, cpu) = state;
+}
+
+enum cpu_state_vals get_preferred_offline_state(int cpu)
+{
+ return per_cpu(preferred_offline_state, cpu);
+}
+
+void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+ per_cpu(preferred_offline_state, cpu) = state;
+}
+
+void set_default_offline_state(int cpu)
+{
+ per_cpu(preferred_offline_state, cpu) = default_offline_state;
+}
+
+static void rtas_stop_self(void)
+{
+ struct rtas_args *args = &rtas_stop_self_args;
+
+ local_irq_disable();
+
+ BUG_ON(args->token == RTAS_UNKNOWN_SERVICE);
+
+ printk("cpu %u (hwid %u) Ready to die...\n",
+ smp_processor_id(), hard_smp_processor_id());
+ enter_rtas(__pa(args));
+
+ panic("Alas, I survived.\n");
+}
+
+static void pseries_mach_cpu_die(void)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned int hwcpu = hard_smp_processor_id();
+ u8 cede_latency_hint = 0;
+
+ local_irq_disable();
+ idle_task_exit();
+ xics_teardown_cpu();
+
+ if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+ set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
+ if (ppc_md.suspend_disable_cpu)
+ ppc_md.suspend_disable_cpu();
+
+ cede_latency_hint = 2;
+
+ get_lppaca()->idle = 1;
+ if (!get_lppaca()->shared_proc)
+ get_lppaca()->donate_dedicated_cpu = 1;
+
+ while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+ extended_cede_processor(cede_latency_hint);
+ }
+
+ if (!get_lppaca()->shared_proc)
+ get_lppaca()->donate_dedicated_cpu = 0;
+ get_lppaca()->idle = 0;
+
+ if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
+ unregister_slb_shadow(hwcpu);
+
+ /*
+ * Call to start_secondary_resume() will not return.
+ * Kernel stack will be reset and start_secondary()
+ * will be called to continue the online operation.
+ */
+ start_secondary_resume();
+ }
+ }
+
+ /* Requested state is CPU_STATE_OFFLINE at this point */
+ WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE);
+
+ set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
+ unregister_slb_shadow(hwcpu);
+ rtas_stop_self();
+
+ /* Should never get here... */
+ BUG();
+ for(;;);
+}
+
+static int pseries_cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ set_cpu_online(cpu, false);
+ vdso_data->processorCount--;
+
+ /*fix boot_cpuid here*/
+ if (cpu == boot_cpuid)
+ boot_cpuid = cpumask_any(cpu_online_mask);
+
+ /* FIXME: abstract this to not be platform specific later on */
+ xics_migrate_irqs_away();
+ return 0;
+}
+
+/*
+ * pseries_cpu_die: Wait for the cpu to die.
+ * @cpu: logical processor id of the CPU whose death we're awaiting.
+ *
+ * This function is called from the context of the thread which is performing
+ * the cpu-offline. Here we wait for long enough to allow the cpu in question
+ * to self-destroy so that the cpu-offline thread can send the CPU_DEAD
+ * notifications.
+ *
+ * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to
+ * self-destruct.
+ */
+static void pseries_cpu_die(unsigned int cpu)
+{
+ int tries;
+ int cpu_status = 1;
+ unsigned int pcpu = get_hard_smp_processor_id(cpu);
+
+ if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+ cpu_status = 1;
+ for (tries = 0; tries < 5000; tries++) {
+ if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) {
+ cpu_status = 0;
+ break;
+ }
+ msleep(1);
+ }
+ } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
+
+ for (tries = 0; tries < 25; tries++) {
+ cpu_status = smp_query_cpu_stopped(pcpu);
+ if (cpu_status == QCSS_STOPPED ||
+ cpu_status == QCSS_HARDWARE_ERROR)
+ break;
+ cpu_relax();
+ }
+ }
+
+ if (cpu_status != 0) {
+ printk("Querying DEAD? cpu %i (%i) shows %i\n",
+ cpu, pcpu, cpu_status);
+ }
+
+ /* Isolation and deallocation are definitely done by
+ * drslot_chrp_cpu. If they were not they would be
+ * done here. Change isolate state to Isolate and
+ * change allocation-state to Unusable.
+ */
+ paca[cpu].cpu_start = 0;
+}
+
+/*
+ * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle
+ * here is that a cpu device node may represent up to two logical cpus
+ * in the SMT case. We must honor the assumption in other code that
+ * the logical ids for sibling SMT threads x and y are adjacent, such
+ * that x^1 == y and y^1 == x.
+ */
+static int pseries_add_processor(struct device_node *np)
+{
+ unsigned int cpu;
+ cpumask_var_t candidate_mask, tmp;
+ int err = -ENOSPC, len, nthreads, i;
+ const u32 *intserv;
+
+ intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return 0;
+
+ zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&tmp, GFP_KERNEL);
+
+ nthreads = len / sizeof(u32);
+ for (i = 0; i < nthreads; i++)
+ cpumask_set_cpu(i, tmp);
+
+ cpu_maps_update_begin();
+
+ BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
+
+ /* Get a bitmap of unoccupied slots. */
+ cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
+ if (cpumask_empty(candidate_mask)) {
+ /* If we get here, it most likely means that NR_CPUS is
+ * less than the partition's max processors setting.
+ */
+ printk(KERN_ERR "Cannot add cpu %s; this system configuration"
+ " supports %d logical cpus.\n", np->full_name,
+ cpumask_weight(cpu_possible_mask));
+ goto out_unlock;
+ }
+
+ while (!cpumask_empty(tmp))
+ if (cpumask_subset(tmp, candidate_mask))
+ /* Found a range where we can insert the new cpu(s) */
+ break;
+ else
+ cpumask_shift_left(tmp, tmp, nthreads);
+
+ if (cpumask_empty(tmp)) {
+ printk(KERN_ERR "Unable to find space in cpu_present_mask for"
+ " processor %s with %d thread(s)\n", np->name,
+ nthreads);
+ goto out_unlock;
+ }
+
+ for_each_cpu(cpu, tmp) {
+ BUG_ON(cpu_present(cpu));
+ set_cpu_present(cpu, true);
+ set_hard_smp_processor_id(cpu, *intserv++);
+ }
+ err = 0;
+out_unlock:
+ cpu_maps_update_done();
+ free_cpumask_var(candidate_mask);
+ free_cpumask_var(tmp);
+ return err;
+}
+
+/*
+ * Update the present map for a cpu node which is going away, and set
+ * the hard id in the paca(s) to -1 to be consistent with boot time
+ * convention for non-present cpus.
+ */
+static void pseries_remove_processor(struct device_node *np)
+{
+ unsigned int cpu;
+ int len, nthreads, i;
+ const u32 *intserv;
+
+ intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
+ if (!intserv)
+ return;
+
+ nthreads = len / sizeof(u32);
+
+ cpu_maps_update_begin();
+ for (i = 0; i < nthreads; i++) {
+ for_each_present_cpu(cpu) {
+ if (get_hard_smp_processor_id(cpu) != intserv[i])
+ continue;
+ BUG_ON(cpu_online(cpu));
+ set_cpu_present(cpu, false);
+ set_hard_smp_processor_id(cpu, -1);
+ break;
+ }
+ if (cpu >= nr_cpu_ids)
+ printk(KERN_WARNING "Could not find cpu to remove "
+ "with physical id 0x%x\n", intserv[i]);
+ }
+ cpu_maps_update_done();
+}
+
+static int pseries_smp_notifier(struct notifier_block *nb,
+ unsigned long action, void *node)
+{
+ int err = 0;
+
+ switch (action) {
+ case PSERIES_RECONFIG_ADD:
+ err = pseries_add_processor(node);
+ break;
+ case PSERIES_RECONFIG_REMOVE:
+ pseries_remove_processor(node);
+ break;
+ }
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block pseries_smp_nb = {
+ .notifier_call = pseries_smp_notifier,
+};
+
+#define MAX_CEDE_LATENCY_LEVELS 4
+#define CEDE_LATENCY_PARAM_LENGTH 10
+#define CEDE_LATENCY_PARAM_MAX_LENGTH \
+ (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char))
+#define CEDE_LATENCY_TOKEN 45
+
+static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH];
+
+static int parse_cede_parameters(void)
+{
+ memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH);
+ return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL,
+ CEDE_LATENCY_TOKEN,
+ __pa(cede_parameters),
+ CEDE_LATENCY_PARAM_MAX_LENGTH);
+}
+
+static int __init pseries_cpu_hotplug_init(void)
+{
+ struct device_node *np;
+ const char *typep;
+ int cpu;
+ int qcss_tok;
+
+ for_each_node_by_name(np, "interrupt-controller") {
+ typep = of_get_property(np, "compatible", NULL);
+ if (strstr(typep, "open-pic")) {
+ of_node_put(np);
+
+ printk(KERN_INFO "CPU Hotplug not supported on "
+ "systems using MPIC\n");
+ return 0;
+ }
+ }
+
+ rtas_stop_self_args.token = rtas_token("stop-self");
+ qcss_tok = rtas_token("query-cpu-stopped-state");
+
+ if (rtas_stop_self_args.token == RTAS_UNKNOWN_SERVICE ||
+ qcss_tok == RTAS_UNKNOWN_SERVICE) {
+ printk(KERN_INFO "CPU Hotplug not supported by firmware "
+ "- disabling.\n");
+ return 0;
+ }
+
+ ppc_md.cpu_die = pseries_mach_cpu_die;
+ smp_ops->cpu_disable = pseries_cpu_disable;
+ smp_ops->cpu_die = pseries_cpu_die;
+
+ /* Processors can be added/removed only on LPAR */
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ pSeries_reconfig_notifier_register(&pseries_smp_nb);
+ cpu_maps_update_begin();
+ if (cede_offline_enabled && parse_cede_parameters() == 0) {
+ default_offline_state = CPU_STATE_INACTIVE;
+ for_each_online_cpu(cpu)
+ set_default_offline_state(cpu);
+ }
+ cpu_maps_update_done();
+ }
+
+ return 0;
+}
+arch_initcall(pseries_cpu_hotplug_init);
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
new file mode 100644
index 00000000000..11d8e0544ac
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -0,0 +1,236 @@
+/*
+ * pseries Memory Hotplug infrastructure.
+ *
+ * Copyright (C) 2008 Badari Pulavarty, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/vmalloc.h>
+#include <linux/memory.h>
+
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/sparsemem.h>
+
+static unsigned long get_memblock_size(void)
+{
+ struct device_node *np;
+ unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE;
+ struct resource r;
+
+ np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (np) {
+ const __be64 *size;
+
+ size = of_get_property(np, "ibm,lmb-size", NULL);
+ if (size)
+ memblock_size = be64_to_cpup(size);
+ of_node_put(np);
+ } else if (machine_is(pseries)) {
+ /* This fallback really only applies to pseries */
+ unsigned int memzero_size = 0;
+
+ np = of_find_node_by_path("/memory@0");
+ if (np) {
+ if (!of_address_to_resource(np, 0, &r))
+ memzero_size = resource_size(&r);
+ of_node_put(np);
+ }
+
+ if (memzero_size) {
+ /* We now know the size of memory@0, use this to find
+ * the first memoryblock and get its size.
+ */
+ char buf[64];
+
+ sprintf(buf, "/memory@%x", memzero_size);
+ np = of_find_node_by_path(buf);
+ if (np) {
+ if (!of_address_to_resource(np, 0, &r))
+ memblock_size = resource_size(&r);
+ of_node_put(np);
+ }
+ }
+ }
+ return memblock_size;
+}
+
+/* WARNING: This is going to override the generic definition whenever
+ * pseries is built-in regardless of what platform is active at boot
+ * time. This is fine for now as this is the only "option" and it
+ * should work everywhere. If not, we'll have to turn this into a
+ * ppc_md. callback
+ */
+unsigned long memory_block_size_bytes(void)
+{
+ return get_memblock_size();
+}
+
+static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
+{
+ unsigned long start, start_pfn;
+ struct zone *zone;
+ int ret;
+
+ start_pfn = base >> PAGE_SHIFT;
+
+ if (!pfn_valid(start_pfn)) {
+ memblock_remove(base, memblock_size);
+ return 0;
+ }
+
+ zone = page_zone(pfn_to_page(start_pfn));
+
+ /*
+ * Remove section mappings and sysfs entries for the
+ * section of the memory we are removing.
+ *
+ * NOTE: Ideally, this should be done in generic code like
+ * remove_memory(). But remove_memory() gets called by writing
+ * to sysfs "state" file and we can't remove sysfs entries
+ * while writing to it. So we have to defer it to here.
+ */
+ ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT);
+ if (ret)
+ return ret;
+
+ /*
+ * Update memory regions for memory remove
+ */
+ memblock_remove(base, memblock_size);
+
+ /*
+ * Remove htab bolted mappings for this section of memory
+ */
+ start = (unsigned long)__va(base);
+ ret = remove_section_mapping(start, start + memblock_size);
+
+ /* Ensure all vmalloc mappings are flushed in case they also
+ * hit that section of memory
+ */
+ vm_unmap_aliases();
+
+ return ret;
+}
+
+static int pseries_remove_memory(struct device_node *np)
+{
+ const char *type;
+ const unsigned int *regs;
+ unsigned long base;
+ unsigned int lmb_size;
+ int ret = -EINVAL;
+
+ /*
+ * Check to see if we are actually removing memory
+ */
+ type = of_get_property(np, "device_type", NULL);
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /*
+ * Find the bae address and size of the memblock
+ */
+ regs = of_get_property(np, "reg", NULL);
+ if (!regs)
+ return ret;
+
+ base = *(unsigned long *)regs;
+ lmb_size = regs[3];
+
+ ret = pseries_remove_memblock(base, lmb_size);
+ return ret;
+}
+
+static int pseries_add_memory(struct device_node *np)
+{
+ const char *type;
+ const unsigned int *regs;
+ unsigned long base;
+ unsigned int lmb_size;
+ int ret = -EINVAL;
+
+ /*
+ * Check to see if we are actually adding memory
+ */
+ type = of_get_property(np, "device_type", NULL);
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /*
+ * Find the base and size of the memblock
+ */
+ regs = of_get_property(np, "reg", NULL);
+ if (!regs)
+ return ret;
+
+ base = *(unsigned long *)regs;
+ lmb_size = regs[3];
+
+ /*
+ * Update memory region to represent the memory add
+ */
+ ret = memblock_add(base, lmb_size);
+ return (ret < 0) ? -EINVAL : 0;
+}
+
+static int pseries_drconf_memory(unsigned long *base, unsigned int action)
+{
+ unsigned long memblock_size;
+ int rc;
+
+ memblock_size = get_memblock_size();
+ if (!memblock_size)
+ return -EINVAL;
+
+ if (action == PSERIES_DRCONF_MEM_ADD) {
+ rc = memblock_add(*base, memblock_size);
+ rc = (rc < 0) ? -EINVAL : 0;
+ } else if (action == PSERIES_DRCONF_MEM_REMOVE) {
+ rc = pseries_remove_memblock(*base, memblock_size);
+ } else {
+ rc = -EINVAL;
+ }
+
+ return rc;
+}
+
+static int pseries_memory_notifier(struct notifier_block *nb,
+ unsigned long action, void *node)
+{
+ int err = 0;
+
+ switch (action) {
+ case PSERIES_RECONFIG_ADD:
+ err = pseries_add_memory(node);
+ break;
+ case PSERIES_RECONFIG_REMOVE:
+ err = pseries_remove_memory(node);
+ break;
+ case PSERIES_DRCONF_MEM_ADD:
+ case PSERIES_DRCONF_MEM_REMOVE:
+ err = pseries_drconf_memory(node, action);
+ break;
+ }
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block pseries_mem_nb = {
+ .notifier_call = pseries_memory_notifier,
+};
+
+static int __init pseries_memory_hotplug_init(void)
+{
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ pSeries_reconfig_notifier_register(&pseries_mem_nb);
+
+ return 0;
+}
+machine_device_initcall(pseries, pseries_memory_hotplug_init);
diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S
new file mode 100644
index 00000000000..3ce73d0052b
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvCall.S
@@ -0,0 +1,270 @@
+/*
+ * This file contains the generic code to perform a call to the
+ * pSeries LPAR hypervisor.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <asm/hvcall.h>
+#include <asm/processor.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ptrace.h>
+
+#define STK_PARM(i) (48 + ((i)-3)*8)
+
+#ifdef CONFIG_TRACEPOINTS
+
+ .section ".toc","aw"
+
+ .globl hcall_tracepoint_refcount
+hcall_tracepoint_refcount:
+ .llong 0
+
+ .section ".text"
+
+/*
+ * precall must preserve all registers. use unused STK_PARM()
+ * areas to save snapshots and opcode. We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
+ */
+#define HCALL_INST_PRECALL(FIRST_REG) \
+BEGIN_FTR_SECTION; \
+ b 1f; \
+END_FTR_SECTION(0, 1); \
+ ld r12,hcall_tracepoint_refcount@toc(r2); \
+ std r12,32(r1); \
+ cmpdi r12,0; \
+ beq+ 1f; \
+ mflr r0; \
+ std r3,STK_PARM(r3)(r1); \
+ std r4,STK_PARM(r4)(r1); \
+ std r5,STK_PARM(r5)(r1); \
+ std r6,STK_PARM(r6)(r1); \
+ std r7,STK_PARM(r7)(r1); \
+ std r8,STK_PARM(r8)(r1); \
+ std r9,STK_PARM(r9)(r1); \
+ std r10,STK_PARM(r10)(r1); \
+ std r0,16(r1); \
+ addi r4,r1,STK_PARM(FIRST_REG); \
+ stdu r1,-STACK_FRAME_OVERHEAD(r1); \
+ bl .__trace_hcall_entry; \
+ addi r1,r1,STACK_FRAME_OVERHEAD; \
+ ld r0,16(r1); \
+ ld r3,STK_PARM(r3)(r1); \
+ ld r4,STK_PARM(r4)(r1); \
+ ld r5,STK_PARM(r5)(r1); \
+ ld r6,STK_PARM(r6)(r1); \
+ ld r7,STK_PARM(r7)(r1); \
+ ld r8,STK_PARM(r8)(r1); \
+ ld r9,STK_PARM(r9)(r1); \
+ ld r10,STK_PARM(r10)(r1); \
+ mtlr r0; \
+1:
+
+/*
+ * postcall is performed immediately before function return which
+ * allows liberal use of volatile registers. We branch around this
+ * in early init (eg when populating the MMU hashtable) by using an
+ * unconditional cpu feature.
+ */
+#define __HCALL_INST_POSTCALL \
+BEGIN_FTR_SECTION; \
+ b 1f; \
+END_FTR_SECTION(0, 1); \
+ ld r12,32(r1); \
+ cmpdi r12,0; \
+ beq+ 1f; \
+ mflr r0; \
+ ld r6,STK_PARM(r3)(r1); \
+ std r3,STK_PARM(r3)(r1); \
+ mr r4,r3; \
+ mr r3,r6; \
+ std r0,16(r1); \
+ stdu r1,-STACK_FRAME_OVERHEAD(r1); \
+ bl .__trace_hcall_exit; \
+ addi r1,r1,STACK_FRAME_OVERHEAD; \
+ ld r0,16(r1); \
+ ld r3,STK_PARM(r3)(r1); \
+ mtlr r0; \
+1:
+
+#define HCALL_INST_POSTCALL_NORETS \
+ li r5,0; \
+ __HCALL_INST_POSTCALL
+
+#define HCALL_INST_POSTCALL(BUFREG) \
+ mr r5,BUFREG; \
+ __HCALL_INST_POSTCALL
+
+#else
+#define HCALL_INST_PRECALL(FIRST_ARG)
+#define HCALL_INST_POSTCALL_NORETS
+#define HCALL_INST_POSTCALL(BUFREG)
+#endif
+
+ .text
+
+_GLOBAL(plpar_hcall_norets)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ HCALL_INST_PRECALL(r4)
+
+ HVSC /* invoke the hypervisor */
+
+ HCALL_INST_POSTCALL_NORETS
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+ blr /* return r3 = status */
+
+_GLOBAL(plpar_hcall)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ HCALL_INST_PRECALL(r5)
+
+ std r4,STK_PARM(r4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+
+ HVSC /* invoke the hypervisor */
+
+ ld r12,STK_PARM(r4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+
+ HCALL_INST_POSTCALL(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
+
+/*
+ * plpar_hcall_raw can be called in real mode. kexec/kdump need some
+ * hypervisor calls to be executed in real mode. So plpar_hcall_raw
+ * does not access the per cpu hypervisor call statistics variables,
+ * since these variables may not be present in the RMO region.
+ */
+_GLOBAL(plpar_hcall_raw)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ std r4,STK_PARM(r4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+
+ HVSC /* invoke the hypervisor */
+
+ ld r12,STK_PARM(r4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
+
+_GLOBAL(plpar_hcall9)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ HCALL_INST_PRECALL(r5)
+
+ std r4,STK_PARM(r4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+ ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */
+ ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */
+ ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */
+
+ HVSC /* invoke the hypervisor */
+
+ mr r0,r12
+ ld r12,STK_PARM(r4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+ std r8, 32(r12)
+ std r9, 40(r12)
+ std r10,48(r12)
+ std r11,56(r12)
+ std r0, 64(r12)
+
+ HCALL_INST_POSTCALL(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
+
+/* See plpar_hcall_raw to see why this is needed */
+_GLOBAL(plpar_hcall9_raw)
+ HMT_MEDIUM
+
+ mfcr r0
+ stw r0,8(r1)
+
+ std r4,STK_PARM(r4)(r1) /* Save ret buffer */
+
+ mr r4,r5
+ mr r5,r6
+ mr r6,r7
+ mr r7,r8
+ mr r8,r9
+ mr r9,r10
+ ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */
+ ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */
+ ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */
+
+ HVSC /* invoke the hypervisor */
+
+ mr r0,r12
+ ld r12,STK_PARM(r4)(r1)
+ std r4, 0(r12)
+ std r5, 8(r12)
+ std r6, 16(r12)
+ std r7, 24(r12)
+ std r8, 32(r12)
+ std r9, 40(r12)
+ std r10,48(r12)
+ std r11,56(r12)
+ std r0, 64(r12)
+
+ lwz r0,8(r1)
+ mtcrf 0xff,r0
+
+ blr /* return r3 = status */
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
new file mode 100644
index 00000000000..c9311cfdfca
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (C) 2006 Mike Kravetz IBM Corporation
+ *
+ * Hypervisor Call Instrumentation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <asm/hvcall.h>
+#include <asm/firmware.h>
+#include <asm/cputable.h>
+#include <asm/trace.h>
+
+DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
+
+/*
+ * Routines for displaying the statistics in debugfs
+ */
+static void *hc_start(struct seq_file *m, loff_t *pos)
+{
+ if ((int)*pos < (HCALL_STAT_ARRAY_SIZE-1))
+ return (void *)(unsigned long)(*pos + 1);
+
+ return NULL;
+}
+
+static void *hc_next(struct seq_file *m, void *p, loff_t * pos)
+{
+ ++*pos;
+
+ return hc_start(m, pos);
+}
+
+static void hc_stop(struct seq_file *m, void *p)
+{
+}
+
+static int hc_show(struct seq_file *m, void *p)
+{
+ unsigned long h_num = (unsigned long)p;
+ struct hcall_stats *hs = m->private;
+
+ if (hs[h_num].num_calls) {
+ if (cpu_has_feature(CPU_FTR_PURR))
+ seq_printf(m, "%lu %lu %lu %lu\n", h_num<<2,
+ hs[h_num].num_calls,
+ hs[h_num].tb_total,
+ hs[h_num].purr_total);
+ else
+ seq_printf(m, "%lu %lu %lu\n", h_num<<2,
+ hs[h_num].num_calls,
+ hs[h_num].tb_total);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations hcall_inst_seq_ops = {
+ .start = hc_start,
+ .next = hc_next,
+ .stop = hc_stop,
+ .show = hc_show
+};
+
+static int hcall_inst_seq_open(struct inode *inode, struct file *file)
+{
+ int rc;
+ struct seq_file *seq;
+
+ rc = seq_open(file, &hcall_inst_seq_ops);
+ seq = file->private_data;
+ seq->private = file->f_path.dentry->d_inode->i_private;
+
+ return rc;
+}
+
+static const struct file_operations hcall_inst_seq_fops = {
+ .open = hcall_inst_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+#define HCALL_ROOT_DIR "hcall_inst"
+#define CPU_NAME_BUF_SIZE 32
+
+
+static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args)
+{
+ struct hcall_stats *h;
+
+ if (opcode > MAX_HCALL_OPCODE)
+ return;
+
+ h = &__get_cpu_var(hcall_stats)[opcode / 4];
+ h->tb_start = mftb();
+ h->purr_start = mfspr(SPRN_PURR);
+}
+
+static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long retval,
+ unsigned long *retbuf)
+{
+ struct hcall_stats *h;
+
+ if (opcode > MAX_HCALL_OPCODE)
+ return;
+
+ h = &__get_cpu_var(hcall_stats)[opcode / 4];
+ h->num_calls++;
+ h->tb_total += mftb() - h->tb_start;
+ h->purr_total += mfspr(SPRN_PURR) - h->purr_start;
+}
+
+static int __init hcall_inst_init(void)
+{
+ struct dentry *hcall_root;
+ struct dentry *hcall_file;
+ char cpu_name_buf[CPU_NAME_BUF_SIZE];
+ int cpu;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ return 0;
+
+ if (register_trace_hcall_entry(probe_hcall_entry, NULL))
+ return -EINVAL;
+
+ if (register_trace_hcall_exit(probe_hcall_exit, NULL)) {
+ unregister_trace_hcall_entry(probe_hcall_entry, NULL);
+ return -EINVAL;
+ }
+
+ hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL);
+ if (!hcall_root)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu);
+ hcall_file = debugfs_create_file(cpu_name_buf, S_IRUGO,
+ hcall_root,
+ per_cpu(hcall_stats, cpu),
+ &hcall_inst_seq_fops);
+ if (!hcall_file)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+__initcall(hcall_inst_init);
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
new file mode 100644
index 00000000000..b344f94b040
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -0,0 +1,81 @@
+/*
+ * hvconsole.c
+ * Copyright (C) 2004 Hollis Blanchard, IBM Corporation
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Additional Author(s):
+ * Ryan S. Arnold <rsa@us.ibm.com>
+ *
+ * LPAR console support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/errno.h>
+#include <asm/hvcall.h>
+#include <asm/hvconsole.h>
+#include "plpar_wrappers.h"
+
+/**
+ * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper
+ * @vtermno: The vtermno or unit_address of the adapter from which to fetch the
+ * data.
+ * @buf: The character buffer into which to put the character data fetched from
+ * firmware.
+ * @count: not used?
+ */
+int hvc_get_chars(uint32_t vtermno, char *buf, int count)
+{
+ unsigned long got;
+
+ if (plpar_get_term_char(vtermno, &got, buf) == H_SUCCESS)
+ return got;
+
+ return 0;
+}
+
+EXPORT_SYMBOL(hvc_get_chars);
+
+
+/**
+ * hvc_put_chars: send characters to firmware for denoted vterm adapter
+ * @vtermno: The vtermno or unit_address of the adapter from which the data
+ * originated.
+ * @buf: The character buffer that contains the character data to send to
+ * firmware.
+ * @count: Send this number of characters.
+ */
+int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
+{
+ unsigned long *lbuf = (unsigned long *) buf;
+ long ret;
+
+
+ /* hcall will ret H_PARAMETER if 'count' exceeds firmware max.*/
+ if (count > MAX_VIO_PUT_CHARS)
+ count = MAX_VIO_PUT_CHARS;
+
+ ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0],
+ lbuf[1]);
+ if (ret == H_SUCCESS)
+ return count;
+ if (ret == H_BUSY)
+ return -EAGAIN;
+ return -EIO;
+}
+
+EXPORT_SYMBOL(hvc_put_chars);
diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c
new file mode 100644
index 00000000000..fcf4b4cbeaf
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/hvcserver.c
@@ -0,0 +1,251 @@
+/*
+ * hvcserver.c
+ * Copyright (C) 2004 Ryan S Arnold, IBM Corporation
+ *
+ * PPC64 virtual I/O console server support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <asm/hvcall.h>
+#include <asm/hvcserver.h>
+#include <asm/io.h>
+
+#define HVCS_ARCH_VERSION "1.0.0"
+
+MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>");
+MODULE_DESCRIPTION("IBM hvcs ppc64 API");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HVCS_ARCH_VERSION);
+
+/*
+ * Convert arch specific return codes into relevant errnos. The hvcs
+ * functions aren't performance sensitive, so this conversion isn't an
+ * issue.
+ */
+static int hvcs_convert(long to_convert)
+{
+ switch (to_convert) {
+ case H_SUCCESS:
+ return 0;
+ case H_PARAMETER:
+ return -EINVAL;
+ case H_HARDWARE:
+ return -EIO;
+ case H_BUSY:
+ case H_LONG_BUSY_ORDER_1_MSEC:
+ case H_LONG_BUSY_ORDER_10_MSEC:
+ case H_LONG_BUSY_ORDER_100_MSEC:
+ case H_LONG_BUSY_ORDER_1_SEC:
+ case H_LONG_BUSY_ORDER_10_SEC:
+ case H_LONG_BUSY_ORDER_100_SEC:
+ return -EBUSY;
+ case H_FUNCTION: /* fall through */
+ default:
+ return -EPERM;
+ }
+}
+
+/**
+ * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info
+ * @head: list_head pointer for an allocated list of partner info structs to
+ * free.
+ *
+ * This function is used to free the partner info list that was returned by
+ * calling hvcs_get_partner_info().
+ */
+int hvcs_free_partner_info(struct list_head *head)
+{
+ struct hvcs_partner_info *pi;
+ struct list_head *element;
+
+ if (!head)
+ return -EINVAL;
+
+ while (!list_empty(head)) {
+ element = head->next;
+ pi = list_entry(element, struct hvcs_partner_info, node);
+ list_del(element);
+ kfree(pi);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(hvcs_free_partner_info);
+
+/* Helper function for hvcs_get_partner_info */
+static int hvcs_next_partner(uint32_t unit_address,
+ unsigned long last_p_partition_ID,
+ unsigned long last_p_unit_address, unsigned long *pi_buff)
+
+{
+ long retval;
+ retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address,
+ last_p_partition_ID,
+ last_p_unit_address, virt_to_phys(pi_buff));
+ return hvcs_convert(retval);
+}
+
+/**
+ * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter
+ * @unit_address: The unit_address of the vty-server adapter for which this
+ * function is fetching partner info.
+ * @head: An initialized list_head pointer to an empty list to use to return the
+ * list of partner info fetched from the hypervisor to the caller.
+ * @pi_buff: A page sized buffer pre-allocated prior to calling this function
+ * that is to be used to be used by firmware as an iterator to keep track
+ * of the partner info retrieval.
+ *
+ * This function returns non-zero on success, or if there is no partner info.
+ *
+ * The pi_buff is pre-allocated prior to calling this function because this
+ * function may be called with a spin_lock held and kmalloc of a page is not
+ * recommended as GFP_ATOMIC.
+ *
+ * The first long of this buffer is used to store a partner unit address. The
+ * second long is used to store a partner partition ID and starting at
+ * pi_buff[2] is the 79 character Converged Location Code (diff size than the
+ * unsigned longs, hence the casting mumbo jumbo you see later).
+ *
+ * Invocation of this function should always be followed by an invocation of
+ * hvcs_free_partner_info() using a pointer to the SAME list head instance
+ * that was passed as a parameter to this function.
+ */
+int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
+ unsigned long *pi_buff)
+{
+ /*
+ * Dealt with as longs because of the hcall interface even though the
+ * values are uint32_t.
+ */
+ unsigned long last_p_partition_ID;
+ unsigned long last_p_unit_address;
+ struct hvcs_partner_info *next_partner_info = NULL;
+ int more = 1;
+ int retval;
+
+ memset(pi_buff, 0x00, PAGE_SIZE);
+ /* invalid parameters */
+ if (!head || !pi_buff)
+ return -EINVAL;
+
+ last_p_partition_ID = last_p_unit_address = ~0UL;
+ INIT_LIST_HEAD(head);
+
+ do {
+ retval = hvcs_next_partner(unit_address, last_p_partition_ID,
+ last_p_unit_address, pi_buff);
+ if (retval) {
+ /*
+ * Don't indicate that we've failed if we have
+ * any list elements.
+ */
+ if (!list_empty(head))
+ return 0;
+ return retval;
+ }
+
+ last_p_partition_ID = pi_buff[0];
+ last_p_unit_address = pi_buff[1];
+
+ /* This indicates that there are no further partners */
+ if (last_p_partition_ID == ~0UL
+ && last_p_unit_address == ~0UL)
+ break;
+
+ /* This is a very small struct and will be freed soon in
+ * hvcs_free_partner_info(). */
+ next_partner_info = kmalloc(sizeof(struct hvcs_partner_info),
+ GFP_ATOMIC);
+
+ if (!next_partner_info) {
+ printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to"
+ " allocate partner info struct.\n");
+ hvcs_free_partner_info(head);
+ return -ENOMEM;
+ }
+
+ next_partner_info->unit_address
+ = (unsigned int)last_p_unit_address;
+ next_partner_info->partition_ID
+ = (unsigned int)last_p_partition_ID;
+
+ /* copy the Null-term char too */
+ strncpy(&next_partner_info->location_code[0],
+ (char *)&pi_buff[2],
+ strlen((char *)&pi_buff[2]) + 1);
+
+ list_add_tail(&(next_partner_info->node), head);
+ next_partner_info = NULL;
+
+ } while (more);
+
+ return 0;
+}
+EXPORT_SYMBOL(hvcs_get_partner_info);
+
+/**
+ * hvcs_register_connection - establish a connection between this vty-server and
+ * a vty.
+ * @unit_address: The unit address of the vty-server adapter that is to be
+ * establish a connection.
+ * @p_partition_ID: The partition ID of the vty adapter that is to be connected.
+ * @p_unit_address: The unit address of the vty adapter to which the vty-server
+ * is to be connected.
+ *
+ * If this function is called once and -EINVAL is returned it may
+ * indicate that the partner info needs to be refreshed for the
+ * target unit address at which point the caller must invoke
+ * hvcs_get_partner_info() and then call this function again. If,
+ * for a second time, -EINVAL is returned then it indicates that
+ * there is probably already a partner connection registered to a
+ * different vty-server adapter. It is also possible that a second
+ * -EINVAL may indicate that one of the parms is not valid, for
+ * instance if the link was removed between the vty-server adapter
+ * and the vty adapter that you are trying to open. Don't shoot the
+ * messenger. Firmware implemented it this way.
+ */
+int hvcs_register_connection( uint32_t unit_address,
+ uint32_t p_partition_ID, uint32_t p_unit_address)
+{
+ long retval;
+ retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address,
+ p_partition_ID, p_unit_address);
+ return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_register_connection);
+
+/**
+ * hvcs_free_connection - free the connection between a vty-server and vty
+ * @unit_address: The unit address of the vty-server that is to have its
+ * connection severed.
+ *
+ * This function is used to free the partner connection between a vty-server
+ * adapter and a vty adapter.
+ *
+ * If -EBUSY is returned continue to call this function until 0 is returned.
+ */
+int hvcs_free_connection(uint32_t unit_address)
+{
+ long retval;
+ retval = plpar_hcall_norets(H_FREE_VTERM, unit_address);
+ return hvcs_convert(retval);
+}
+EXPORT_SYMBOL(hvcs_free_connection);
diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c
new file mode 100644
index 00000000000..1a709bc48ce
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/io_event_irq.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/list.h>
+#include <linux/notifier.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/irq.h>
+#include <asm/io_event_irq.h>
+
+#include "pseries.h"
+
+/*
+ * IO event interrupt is a mechanism provided by RTAS to return
+ * information about hardware error and non-error events. Device
+ * drivers can register their event handlers to receive events.
+ * Device drivers are expected to use atomic_notifier_chain_register()
+ * and atomic_notifier_chain_unregister() to register and unregister
+ * their event handlers. Since multiple IO event types and scopes
+ * share an IO event interrupt, the event handlers are called one
+ * by one until the IO event is claimed by one of the handlers.
+ * The event handlers are expected to return NOTIFY_OK if the
+ * event is handled by the event handler or NOTIFY_DONE if the
+ * event does not belong to the handler.
+ *
+ * Usage:
+ *
+ * Notifier function:
+ * #include <asm/io_event_irq.h>
+ * int event_handler(struct notifier_block *nb, unsigned long val, void *data) {
+ * p = (struct pseries_io_event_sect_data *) data;
+ * if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE;
+ * :
+ * :
+ * return NOTIFY_OK;
+ * }
+ * struct notifier_block event_nb = {
+ * .notifier_call = event_handler,
+ * }
+ *
+ * Registration:
+ * atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb);
+ *
+ * Unregistration:
+ * atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb);
+ */
+
+ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list);
+EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list);
+
+static int ioei_check_exception_token;
+
+/* pSeries event log format */
+
+/* Two bytes ASCII section IDs */
+#define PSERIES_ELOG_SECT_ID_PRIV_HDR (('P' << 8) | 'H')
+#define PSERIES_ELOG_SECT_ID_USER_HDR (('U' << 8) | 'H')
+#define PSERIES_ELOG_SECT_ID_PRIMARY_SRC (('P' << 8) | 'S')
+#define PSERIES_ELOG_SECT_ID_EXTENDED_UH (('E' << 8) | 'H')
+#define PSERIES_ELOG_SECT_ID_FAILING_MTMS (('M' << 8) | 'T')
+#define PSERIES_ELOG_SECT_ID_SECONDARY_SRC (('S' << 8) | 'S')
+#define PSERIES_ELOG_SECT_ID_DUMP_LOCATOR (('D' << 8) | 'H')
+#define PSERIES_ELOG_SECT_ID_FW_ERROR (('S' << 8) | 'W')
+#define PSERIES_ELOG_SECT_ID_IMPACT_PART_ID (('L' << 8) | 'P')
+#define PSERIES_ELOG_SECT_ID_LOGIC_RESOURCE_ID (('L' << 8) | 'R')
+#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M')
+#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P')
+#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E')
+#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I')
+#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H')
+#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D')
+
+/* Vendor specific Platform Event Log Format, Version 6, section header */
+struct pseries_elog_section {
+ uint16_t id; /* 0x00 2-byte ASCII section ID */
+ uint16_t length; /* 0x02 Section length in bytes */
+ uint8_t version; /* 0x04 Section version */
+ uint8_t subtype; /* 0x05 Section subtype */
+ uint16_t creator_component; /* 0x06 Creator component ID */
+ uint8_t data[]; /* 0x08 Start of section data */
+};
+
+static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
+
+/**
+ * Find data portion of a specific section in RTAS extended event log.
+ * @elog: RTAS error/event log.
+ * @sect_id: secsion ID.
+ *
+ * Return:
+ * pointer to the section data of the specified section
+ * NULL if not found
+ */
+static struct pseries_elog_section *find_xelog_section(struct rtas_error_log *elog,
+ uint16_t sect_id)
+{
+ struct rtas_ext_event_log_v6 *xelog =
+ (struct rtas_ext_event_log_v6 *) elog->buffer;
+ struct pseries_elog_section *sect;
+ unsigned char *p, *log_end;
+
+ /* Check that we understand the format */
+ if (elog->extended_log_length < sizeof(struct rtas_ext_event_log_v6) ||
+ xelog->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG ||
+ xelog->company_id != RTAS_V6EXT_COMPANY_ID_IBM)
+ return NULL;
+
+ log_end = elog->buffer + elog->extended_log_length;
+ p = xelog->vendor_log;
+ while (p < log_end) {
+ sect = (struct pseries_elog_section *)p;
+ if (sect->id == sect_id)
+ return sect;
+ p += sect->length;
+ }
+ return NULL;
+}
+
+/**
+ * Find the data portion of an IO Event section from event log.
+ * @elog: RTAS error/event log.
+ *
+ * Return:
+ * pointer to a valid IO event section data. NULL if not found.
+ */
+static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog)
+{
+ struct pseries_elog_section *sect;
+
+ /* We should only ever get called for io-event interrupts, but if
+ * we do get called for another type then something went wrong so
+ * make some noise about it.
+ * RTAS_TYPE_IO only exists in extended event log version 6 or later.
+ * No need to check event log version.
+ */
+ if (unlikely(elog->type != RTAS_TYPE_IO)) {
+ printk_once(KERN_WARNING "io_event_irq: Unexpected event type %d",
+ elog->type);
+ return NULL;
+ }
+
+ sect = find_xelog_section(elog, PSERIES_ELOG_SECT_ID_IO_EVENT);
+ if (unlikely(!sect)) {
+ printk_once(KERN_WARNING "io_event_irq: RTAS extended event "
+ "log does not contain an IO Event section. "
+ "Could be a bug in system firmware!\n");
+ return NULL;
+ }
+ return (struct pseries_io_event *) &sect->data;
+}
+
+/*
+ * PAPR:
+ * - check-exception returns the first found error or event and clear that
+ * error or event so it is reported once.
+ * - Each interrupt returns one event. If a plateform chooses to report
+ * multiple events through a single interrupt, it must ensure that the
+ * interrupt remains asserted until check-exception has been used to
+ * process all out-standing events for that interrupt.
+ *
+ * Implementation notes:
+ * - Events must be processed in the order they are returned. Hence,
+ * sequential in nature.
+ * - The owner of an event is determined by combinations of scope,
+ * event type, and sub-type. There is no easy way to pre-sort clients
+ * by scope or event type alone. For example, Torrent ISR route change
+ * event is reported with scope 0x00 (Not Applicatable) rather than
+ * 0x3B (Torrent-hub). It is better to let the clients to identify
+ * who owns the the event.
+ */
+
+static irqreturn_t ioei_interrupt(int irq, void *dev_id)
+{
+ struct pseries_io_event *event;
+ int rtas_rc;
+
+ for (;;) {
+ rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT,
+ virq_to_hw(irq),
+ RTAS_IO_EVENTS, 1 /* Time Critical */,
+ __pa(ioei_rtas_buf),
+ RTAS_DATA_BUF_SIZE);
+ if (rtas_rc != 0)
+ break;
+
+ event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf);
+ if (!event)
+ continue;
+
+ atomic_notifier_call_chain(&pseries_ioei_notifier_list,
+ 0, event);
+ }
+ return IRQ_HANDLED;
+}
+
+static int __init ioei_init(void)
+{
+ struct device_node *np;
+
+ ioei_check_exception_token = rtas_token("check-exception");
+ if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ np = of_find_node_by_path("/event-sources/ibm,io-events");
+ if (np) {
+ request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT");
+ pr_info("IBM I/O event interrupts enabled\n");
+ of_node_put(np);
+ } else {
+ return -ENODEV;
+ }
+ return 0;
+}
+machine_subsys_initcall(pseries, ioei_init);
+
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
new file mode 100644
index 00000000000..c442f2b1980
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -0,0 +1,1299 @@
+/*
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
+ *
+ * Rewrite, cleanup:
+ *
+ * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation
+ * Copyright (C) 2006 Olof Johansson <olof@lixom.net>
+ *
+ * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h> /* for show_stack */
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/crash_dump.h>
+#include <linux/memory.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/iommu.h>
+#include <asm/pci-bridge.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/firmware.h>
+#include <asm/tce.h>
+#include <asm/ppc-pci.h>
+#include <asm/udbg.h>
+#include <asm/mmzone.h>
+
+#include "plpar_wrappers.h"
+
+
+static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
+ u64 *startp, u64 *endp)
+{
+ u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
+ unsigned long start, end, inc;
+
+ start = __pa(startp);
+ end = __pa(endp);
+ inc = L1_CACHE_BYTES; /* invalidate a cacheline of TCEs at a time */
+
+ /* If this is non-zero, change the format. We shift the
+ * address and or in the magic from the device tree. */
+ if (tbl->it_busno) {
+ start <<= 12;
+ end <<= 12;
+ inc <<= 12;
+ start |= tbl->it_busno;
+ end |= tbl->it_busno;
+ }
+
+ end |= inc - 1; /* round up end to be different than start */
+
+ mb(); /* Make sure TCEs in memory are written */
+ while (start <= end) {
+ out_be64(invalidate, start);
+ start += inc;
+ }
+}
+
+static int tce_build_pSeries(struct iommu_table *tbl, long index,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ u64 proto_tce;
+ u64 *tcep, *tces;
+ u64 rpn;
+
+ proto_tce = TCE_PCI_READ; // Read allowed
+
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ tces = tcep = ((u64 *)tbl->it_base) + index;
+
+ while (npages--) {
+ /* can't move this out since we might cross MEMBLOCK boundary */
+ rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+ *tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
+
+ uaddr += TCE_PAGE_SIZE;
+ tcep++;
+ }
+
+ if (tbl->it_type == TCE_PCI_SWINV_CREATE)
+ tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
+ return 0;
+}
+
+
+static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
+{
+ u64 *tcep, *tces;
+
+ tces = tcep = ((u64 *)tbl->it_base) + index;
+
+ while (npages--)
+ *(tcep++) = 0;
+
+ if (tbl->it_type == TCE_PCI_SWINV_FREE)
+ tce_invalidate_pSeries_sw(tbl, tces, tcep - 1);
+}
+
+static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
+{
+ u64 *tcep;
+
+ tcep = ((u64 *)tbl->it_base) + index;
+
+ return *tcep;
+}
+
+static void tce_free_pSeriesLP(struct iommu_table*, long, long);
+static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
+
+static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ u64 rc = 0;
+ u64 proto_tce, tce;
+ u64 rpn;
+ int ret = 0;
+ long tcenum_start = tcenum, npages_start = npages;
+
+ rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+ proto_tce = TCE_PCI_READ;
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ while (npages--) {
+ tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
+ rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
+
+ if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+ ret = (int)rc;
+ tce_free_pSeriesLP(tbl, tcenum_start,
+ (npages_start - (npages + 1)));
+ break;
+ }
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+ printk("\ttce val = 0x%llx\n", tce );
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+
+ tcenum++;
+ rpn++;
+ }
+ return ret;
+}
+
+static DEFINE_PER_CPU(u64 *, tce_page);
+
+static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+ long npages, unsigned long uaddr,
+ enum dma_data_direction direction,
+ struct dma_attrs *attrs)
+{
+ u64 rc = 0;
+ u64 proto_tce;
+ u64 *tcep;
+ u64 rpn;
+ long l, limit;
+ long tcenum_start = tcenum, npages_start = npages;
+ int ret = 0;
+
+ if (npages == 1) {
+ return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+ direction, attrs);
+ }
+
+ tcep = __get_cpu_var(tce_page);
+
+ /* This is safe to do since interrupts are off when we're called
+ * from iommu_alloc{,_sg}()
+ */
+ if (!tcep) {
+ tcep = (u64 *)__get_free_page(GFP_ATOMIC);
+ /* If allocation fails, fall back to the loop implementation */
+ if (!tcep) {
+ return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+ direction, attrs);
+ }
+ __get_cpu_var(tce_page) = tcep;
+ }
+
+ rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
+ proto_tce = TCE_PCI_READ;
+ if (direction != DMA_TO_DEVICE)
+ proto_tce |= TCE_PCI_WRITE;
+
+ /* We can map max one pageful of TCEs at a time */
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE);
+
+ for (l = 0; l < limit; l++) {
+ tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
+ rpn++;
+ }
+
+ rc = plpar_tce_put_indirect((u64)tbl->it_index,
+ (u64)tcenum << 12,
+ (u64)virt_to_abs(tcep),
+ limit);
+
+ npages -= limit;
+ tcenum += limit;
+ } while (npages > 0 && !rc);
+
+ if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
+ ret = (int)rc;
+ tce_freemulti_pSeriesLP(tbl, tcenum_start,
+ (npages_start - (npages + limit)));
+ return ret;
+ }
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\tnpages = 0x%llx\n", (u64)npages);
+ printk("\ttce[0] val = 0x%llx\n", tcep[0]);
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+ return ret;
+}
+
+static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+ u64 rc;
+
+ while (npages--) {
+ rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+
+ tcenum++;
+ }
+}
+
+
+static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+ u64 rc;
+
+ rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
+ printk("\trc = %lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\tnpages = 0x%llx\n", (u64)npages);
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+}
+
+static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
+{
+ u64 rc;
+ unsigned long tce_ret;
+
+ rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);
+
+ if (rc && printk_ratelimit()) {
+ printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc);
+ printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
+ printk("\ttcenum = 0x%llx\n", (u64)tcenum);
+ show_stack(current, (unsigned long *)__get_SP());
+ }
+
+ return tce_ret;
+}
+
+/* this is compatible with cells for the device tree property */
+struct dynamic_dma_window_prop {
+ __be32 liobn; /* tce table number */
+ __be64 dma_base; /* address hi,lo */
+ __be32 tce_shift; /* ilog2(tce_page_size) */
+ __be32 window_shift; /* ilog2(tce_window_size) */
+};
+
+struct direct_window {
+ struct device_node *device;
+ const struct dynamic_dma_window_prop *prop;
+ struct list_head list;
+};
+
+/* Dynamic DMA Window support */
+struct ddw_query_response {
+ u32 windows_available;
+ u32 largest_available_block;
+ u32 page_size;
+ u32 migration_capable;
+};
+
+struct ddw_create_response {
+ u32 liobn;
+ u32 addr_hi;
+ u32 addr_lo;
+};
+
+static LIST_HEAD(direct_window_list);
+/* prevents races between memory on/offline and window creation */
+static DEFINE_SPINLOCK(direct_window_list_lock);
+/* protects initializing window twice for same device */
+static DEFINE_MUTEX(direct_window_init_mutex);
+#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
+
+static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
+ unsigned long num_pfn, const void *arg)
+{
+ const struct dynamic_dma_window_prop *maprange = arg;
+ int rc;
+ u64 tce_size, num_tce, dma_offset, next;
+ u32 tce_shift;
+ long limit;
+
+ tce_shift = be32_to_cpu(maprange->tce_shift);
+ tce_size = 1ULL << tce_shift;
+ next = start_pfn << PAGE_SHIFT;
+ num_tce = num_pfn << PAGE_SHIFT;
+
+ /* round back to the beginning of the tce page size */
+ num_tce += next & (tce_size - 1);
+ next &= ~(tce_size - 1);
+
+ /* covert to number of tces */
+ num_tce |= tce_size - 1;
+ num_tce >>= tce_shift;
+
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, num_tce, 512);
+ dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+ rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn),
+ dma_offset,
+ 0, limit);
+ num_tce -= limit;
+ } while (num_tce > 0 && !rc);
+
+ return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
+ unsigned long num_pfn, const void *arg)
+{
+ const struct dynamic_dma_window_prop *maprange = arg;
+ u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn;
+ u32 tce_shift;
+ u64 rc = 0;
+ long l, limit;
+
+ local_irq_disable(); /* to protect tcep and the page behind it */
+ tcep = __get_cpu_var(tce_page);
+
+ if (!tcep) {
+ tcep = (u64 *)__get_free_page(GFP_ATOMIC);
+ if (!tcep) {
+ local_irq_enable();
+ return -ENOMEM;
+ }
+ __get_cpu_var(tce_page) = tcep;
+ }
+
+ proto_tce = TCE_PCI_READ | TCE_PCI_WRITE;
+
+ liobn = (u64)be32_to_cpu(maprange->liobn);
+ tce_shift = be32_to_cpu(maprange->tce_shift);
+ tce_size = 1ULL << tce_shift;
+ next = start_pfn << PAGE_SHIFT;
+ num_tce = num_pfn << PAGE_SHIFT;
+
+ /* round back to the beginning of the tce page size */
+ num_tce += next & (tce_size - 1);
+ next &= ~(tce_size - 1);
+
+ /* covert to number of tces */
+ num_tce |= tce_size - 1;
+ num_tce >>= tce_shift;
+
+ /* We can map max one pageful of TCEs at a time */
+ do {
+ /*
+ * Set up the page with TCE data, looping through and setting
+ * the values.
+ */
+ limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE);
+ dma_offset = next + be64_to_cpu(maprange->dma_base);
+
+ for (l = 0; l < limit; l++) {
+ tcep[l] = proto_tce | next;
+ next += tce_size;
+ }
+
+ rc = plpar_tce_put_indirect(liobn,
+ dma_offset,
+ (u64)virt_to_abs(tcep),
+ limit);
+
+ num_tce -= limit;
+ } while (num_tce > 0 && !rc);
+
+ /* error cleanup: caller will clear whole range */
+
+ local_irq_enable();
+ return rc;
+}
+
+static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
+ unsigned long num_pfn, void *arg)
+{
+ return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
+}
+
+
+#ifdef CONFIG_PCI
+static void iommu_table_setparms(struct pci_controller *phb,
+ struct device_node *dn,
+ struct iommu_table *tbl)
+{
+ struct device_node *node;
+ const unsigned long *basep, *sw_inval;
+ const u32 *sizep;
+
+ node = phb->dn;
+
+ basep = of_get_property(node, "linux,tce-base", NULL);
+ sizep = of_get_property(node, "linux,tce-size", NULL);
+ if (basep == NULL || sizep == NULL) {
+ printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has "
+ "missing tce entries !\n", dn->full_name);
+ return;
+ }
+
+ tbl->it_base = (unsigned long)__va(*basep);
+
+ if (!is_kdump_kernel())
+ memset((void *)tbl->it_base, 0, *sizep);
+
+ tbl->it_busno = phb->bus->number;
+
+ /* Units of tce entries */
+ tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
+
+ /* Test if we are going over 2GB of DMA space */
+ if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
+ udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+ }
+
+ phb->dma_window_base_cur += phb->dma_window_size;
+
+ /* Set the tce table size - measured in entries */
+ tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
+
+ tbl->it_index = 0;
+ tbl->it_blocksize = 16;
+ tbl->it_type = TCE_PCI;
+
+ sw_inval = of_get_property(node, "linux,tce-sw-invalidate-info", NULL);
+ if (sw_inval) {
+ /*
+ * This property contains information on how to
+ * invalidate the TCE entry. The first property is
+ * the base MMIO address used to invalidate entries.
+ * The second property tells us the format of the TCE
+ * invalidate (whether it needs to be shifted) and
+ * some magic routing info to add to our invalidate
+ * command.
+ */
+ tbl->it_index = (unsigned long) ioremap(sw_inval[0], 8);
+ tbl->it_busno = sw_inval[1]; /* overload this with magic */
+ tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE;
+ }
+}
+
+/*
+ * iommu_table_setparms_lpar
+ *
+ * Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
+ */
+static void iommu_table_setparms_lpar(struct pci_controller *phb,
+ struct device_node *dn,
+ struct iommu_table *tbl,
+ const void *dma_window)
+{
+ unsigned long offset, size;
+
+ of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
+
+ tbl->it_busno = phb->bus->number;
+ tbl->it_base = 0;
+ tbl->it_blocksize = 16;
+ tbl->it_type = TCE_PCI;
+ tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
+ tbl->it_size = size >> IOMMU_PAGE_SHIFT;
+}
+
+static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
+{
+ struct device_node *dn;
+ struct iommu_table *tbl;
+ struct device_node *isa_dn, *isa_dn_orig;
+ struct device_node *tmp;
+ struct pci_dn *pci;
+ int children;
+
+ dn = pci_bus_to_OF_node(bus);
+
+ pr_debug("pci_dma_bus_setup_pSeries: setting up bus %s\n", dn->full_name);
+
+ if (bus->self) {
+ /* This is not a root bus, any setup will be done for the
+ * device-side of the bridge in iommu_dev_setup_pSeries().
+ */
+ return;
+ }
+ pci = PCI_DN(dn);
+
+ /* Check if the ISA bus on the system is under
+ * this PHB.
+ */
+ isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa");
+
+ while (isa_dn && isa_dn != dn)
+ isa_dn = isa_dn->parent;
+
+ if (isa_dn_orig)
+ of_node_put(isa_dn_orig);
+
+ /* Count number of direct PCI children of the PHB. */
+ for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling)
+ children++;
+
+ pr_debug("Children: %d\n", children);
+
+ /* Calculate amount of DMA window per slot. Each window must be
+ * a power of two (due to pci_alloc_consistent requirements).
+ *
+ * Keep 256MB aside for PHBs with ISA.
+ */
+
+ if (!isa_dn) {
+ /* No ISA/IDE - just set window size and return */
+ pci->phb->dma_window_size = 0x80000000ul; /* To be divided */
+
+ while (pci->phb->dma_window_size * children > 0x80000000ul)
+ pci->phb->dma_window_size >>= 1;
+ pr_debug("No ISA/IDE, window size is 0x%llx\n",
+ pci->phb->dma_window_size);
+ pci->phb->dma_window_base_cur = 0;
+
+ return;
+ }
+
+ /* If we have ISA, then we probably have an IDE
+ * controller too. Allocate a 128MB table but
+ * skip the first 128MB to avoid stepping on ISA
+ * space.
+ */
+ pci->phb->dma_window_size = 0x8000000ul;
+ pci->phb->dma_window_base_cur = 0x8000000ul;
+
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+ pci->phb->node);
+
+ iommu_table_setparms(pci->phb, dn, tbl);
+ pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+
+ /* Divide the rest (1.75GB) among the children */
+ pci->phb->dma_window_size = 0x80000000ul;
+ while (pci->phb->dma_window_size * children > 0x70000000ul)
+ pci->phb->dma_window_size >>= 1;
+
+ pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
+}
+
+
+static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
+{
+ struct iommu_table *tbl;
+ struct device_node *dn, *pdn;
+ struct pci_dn *ppci;
+ const void *dma_window = NULL;
+
+ dn = pci_bus_to_OF_node(bus);
+
+ pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %s\n",
+ dn->full_name);
+
+ /* Find nearest ibm,dma-window, walking up the device tree */
+ for (pdn = dn; pdn != NULL; pdn = pdn->parent) {
+ dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ if (dma_window != NULL)
+ break;
+ }
+
+ if (dma_window == NULL) {
+ pr_debug(" no ibm,dma-window property !\n");
+ return;
+ }
+
+ ppci = PCI_DN(pdn);
+
+ pr_debug(" parent is %s, iommu_table: 0x%p\n",
+ pdn->full_name, ppci->iommu_table);
+
+ if (!ppci->iommu_table) {
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+ ppci->phb->node);
+ iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+ ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
+ pr_debug(" created table: %p\n", ppci->iommu_table);
+ }
+}
+
+
+static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct iommu_table *tbl;
+
+ pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev));
+
+ dn = dev->dev.of_node;
+
+ /* If we're the direct child of a root bus, then we need to allocate
+ * an iommu table ourselves. The bus setup code should have setup
+ * the window sizes already.
+ */
+ if (!dev->bus->self) {
+ struct pci_controller *phb = PCI_DN(dn)->phb;
+
+ pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+ phb->node);
+ iommu_table_setparms(phb, dn, tbl);
+ PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
+ set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+ return;
+ }
+
+ /* If this device is further down the bus tree, search upwards until
+ * an already allocated iommu table is found and use that.
+ */
+
+ while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
+ dn = dn->parent;
+
+ if (dn && PCI_DN(dn))
+ set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+ else
+ printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
+ pci_name(dev));
+}
+
+static int __read_mostly disable_ddw;
+
+static int __init disable_ddw_setup(char *str)
+{
+ disable_ddw = 1;
+ printk(KERN_INFO "ppc iommu: disabling ddw.\n");
+
+ return 0;
+}
+
+early_param("disable_ddw", disable_ddw_setup);
+
+static void remove_ddw(struct device_node *np)
+{
+ struct dynamic_dma_window_prop *dwp;
+ struct property *win64;
+ const u32 *ddw_avail;
+ u64 liobn;
+ int len, ret;
+
+ ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len);
+ win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
+ if (!win64)
+ return;
+
+ if (!ddw_avail || len < 3 * sizeof(u32) || win64->length < sizeof(*dwp))
+ goto delprop;
+
+ dwp = win64->value;
+ liobn = (u64)be32_to_cpu(dwp->liobn);
+
+ /* clear the whole window, note the arg is in kernel pages */
+ ret = tce_clearrange_multi_pSeriesLP(0,
+ 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
+ if (ret)
+ pr_warning("%s failed to clear tces in window.\n",
+ np->full_name);
+ else
+ pr_debug("%s successfully cleared tces in window.\n",
+ np->full_name);
+
+ ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
+ if (ret)
+ pr_warning("%s: failed to remove direct window: rtas returned "
+ "%d to ibm,remove-pe-dma-window(%x) %llx\n",
+ np->full_name, ret, ddw_avail[2], liobn);
+ else
+ pr_debug("%s: successfully removed direct window: rtas returned "
+ "%d to ibm,remove-pe-dma-window(%x) %llx\n",
+ np->full_name, ret, ddw_avail[2], liobn);
+
+delprop:
+ ret = prom_remove_property(np, win64);
+ if (ret)
+ pr_warning("%s: failed to remove direct window property: %d\n",
+ np->full_name, ret);
+}
+
+static u64 find_existing_ddw(struct device_node *pdn)
+{
+ struct direct_window *window;
+ const struct dynamic_dma_window_prop *direct64;
+ u64 dma_addr = 0;
+
+ spin_lock(&direct_window_list_lock);
+ /* check if we already created a window and dupe that config if so */
+ list_for_each_entry(window, &direct_window_list, list) {
+ if (window->device == pdn) {
+ direct64 = window->prop;
+ dma_addr = direct64->dma_base;
+ break;
+ }
+ }
+ spin_unlock(&direct_window_list_lock);
+
+ return dma_addr;
+}
+
+static int find_existing_ddw_windows(void)
+{
+ int len;
+ struct device_node *pdn;
+ struct direct_window *window;
+ const struct dynamic_dma_window_prop *direct64;
+
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ return 0;
+
+ for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
+ direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
+ if (!direct64)
+ continue;
+
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
+ kfree(window);
+ remove_ddw(pdn);
+ continue;
+ }
+
+ window->device = pdn;
+ window->prop = direct64;
+ spin_lock(&direct_window_list_lock);
+ list_add(&window->list, &direct_window_list);
+ spin_unlock(&direct_window_list_lock);
+ }
+
+ return 0;
+}
+machine_arch_initcall(pseries, find_existing_ddw_windows);
+
+static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
+ struct ddw_query_response *query)
+{
+ struct device_node *dn;
+ struct pci_dn *pcidn;
+ u32 cfg_addr;
+ u64 buid;
+ int ret;
+
+ /*
+ * Get the config address and phb buid of the PE window.
+ * Rely on eeh to retrieve this for us.
+ * Retrieve them from the pci device, not the node with the
+ * dma-window property
+ */
+ dn = pci_device_to_OF_node(dev);
+ pcidn = PCI_DN(dn);
+ cfg_addr = pcidn->eeh_config_addr;
+ if (pcidn->eeh_pe_config_addr)
+ cfg_addr = pcidn->eeh_pe_config_addr;
+ buid = pcidn->phb->buid;
+ ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
+ cfg_addr, BUID_HI(buid), BUID_LO(buid));
+ dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
+ " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
+ BUID_LO(buid), ret);
+ return ret;
+}
+
+static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail,
+ struct ddw_create_response *create, int page_shift,
+ int window_shift)
+{
+ struct device_node *dn;
+ struct pci_dn *pcidn;
+ u32 cfg_addr;
+ u64 buid;
+ int ret;
+
+ /*
+ * Get the config address and phb buid of the PE window.
+ * Rely on eeh to retrieve this for us.
+ * Retrieve them from the pci device, not the node with the
+ * dma-window property
+ */
+ dn = pci_device_to_OF_node(dev);
+ pcidn = PCI_DN(dn);
+ cfg_addr = pcidn->eeh_config_addr;
+ if (pcidn->eeh_pe_config_addr)
+ cfg_addr = pcidn->eeh_pe_config_addr;
+ buid = pcidn->phb->buid;
+
+ do {
+ /* extra outputs are LIOBN and dma-addr (hi, lo) */
+ ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr,
+ BUID_HI(buid), BUID_LO(buid), page_shift, window_shift);
+ } while (rtas_busy_delay(ret));
+ dev_info(&dev->dev,
+ "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d "
+ "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1],
+ cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift,
+ window_shift, ret, create->liobn, create->addr_hi, create->addr_lo);
+
+ return ret;
+}
+
+/*
+ * If the PE supports dynamic dma windows, and there is space for a table
+ * that can map all pages in a linear offset, then setup such a table,
+ * and record the dma-offset in the struct device.
+ *
+ * dev: the pci device we are checking
+ * pdn: the parent pe node with the ibm,dma_window property
+ * Future: also check if we can remap the base window for our base page size
+ *
+ * returns the dma offset for use by dma_set_mask
+ */
+static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
+{
+ int len, ret;
+ struct ddw_query_response query;
+ struct ddw_create_response create;
+ int page_shift;
+ u64 dma_addr, max_addr;
+ struct device_node *dn;
+ const u32 *uninitialized_var(ddw_avail);
+ struct direct_window *window;
+ struct property *win64;
+ struct dynamic_dma_window_prop *ddwprop;
+
+ mutex_lock(&direct_window_init_mutex);
+
+ dma_addr = find_existing_ddw(pdn);
+ if (dma_addr != 0)
+ goto out_unlock;
+
+ /*
+ * the ibm,ddw-applicable property holds the tokens for:
+ * ibm,query-pe-dma-window
+ * ibm,create-pe-dma-window
+ * ibm,remove-pe-dma-window
+ * for the given node in that order.
+ * the property is actually in the parent, not the PE
+ */
+ ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", &len);
+ if (!ddw_avail || len < 3 * sizeof(u32))
+ goto out_unlock;
+
+ /*
+ * Query if there is a second window of size to map the
+ * whole partition. Query returns number of windows, largest
+ * block assigned to PE (partition endpoint), and two bitmasks
+ * of page sizes: supported and supported for migrate-dma.
+ */
+ dn = pci_device_to_OF_node(dev);
+ ret = query_ddw(dev, ddw_avail, &query);
+ if (ret != 0)
+ goto out_unlock;
+
+ if (query.windows_available == 0) {
+ /*
+ * no additional windows are available for this device.
+ * We might be able to reallocate the existing window,
+ * trading in for a larger page size.
+ */
+ dev_dbg(&dev->dev, "no free dynamic windows");
+ goto out_unlock;
+ }
+ if (query.page_size & 4) {
+ page_shift = 24; /* 16MB */
+ } else if (query.page_size & 2) {
+ page_shift = 16; /* 64kB */
+ } else if (query.page_size & 1) {
+ page_shift = 12; /* 4kB */
+ } else {
+ dev_dbg(&dev->dev, "no supported direct page size in mask %x",
+ query.page_size);
+ goto out_unlock;
+ }
+ /* verify the window * number of ptes will map the partition */
+ /* check largest block * page size > max memory hotplug addr */
+ max_addr = memory_hotplug_max();
+ if (query.largest_available_block < (max_addr >> page_shift)) {
+ dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u "
+ "%llu-sized pages\n", max_addr, query.largest_available_block,
+ 1ULL << page_shift);
+ goto out_unlock;
+ }
+ len = order_base_2(max_addr);
+ win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
+ if (!win64) {
+ dev_info(&dev->dev,
+ "couldn't allocate property for 64bit dma window\n");
+ goto out_unlock;
+ }
+ win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
+ win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
+ win64->length = sizeof(*ddwprop);
+ if (!win64->name || !win64->value) {
+ dev_info(&dev->dev,
+ "couldn't allocate property name and value\n");
+ goto out_free_prop;
+ }
+
+ ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
+ if (ret != 0)
+ goto out_free_prop;
+
+ ddwprop->liobn = cpu_to_be32(create.liobn);
+ ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2));
+ ddwprop->tce_shift = cpu_to_be32(page_shift);
+ ddwprop->window_shift = cpu_to_be32(len);
+
+ dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n",
+ create.liobn, dn->full_name);
+
+ window = kzalloc(sizeof(*window), GFP_KERNEL);
+ if (!window)
+ goto out_clear_window;
+
+ ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
+ win64->value, tce_setrange_multi_pSeriesLP_walk);
+ if (ret) {
+ dev_info(&dev->dev, "failed to map direct window for %s: %d\n",
+ dn->full_name, ret);
+ goto out_free_window;
+ }
+
+ ret = prom_add_property(pdn, win64);
+ if (ret) {
+ dev_err(&dev->dev, "unable to add dma window property for %s: %d",
+ pdn->full_name, ret);
+ goto out_free_window;
+ }
+
+ window->device = pdn;
+ window->prop = ddwprop;
+ spin_lock(&direct_window_list_lock);
+ list_add(&window->list, &direct_window_list);
+ spin_unlock(&direct_window_list_lock);
+
+ dma_addr = of_read_number(&create.addr_hi, 2);
+ goto out_unlock;
+
+out_free_window:
+ kfree(window);
+
+out_clear_window:
+ remove_ddw(pdn);
+
+out_free_prop:
+ kfree(win64->name);
+ kfree(win64->value);
+ kfree(win64);
+
+out_unlock:
+ mutex_unlock(&direct_window_init_mutex);
+ return dma_addr;
+}
+
+static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
+{
+ struct device_node *pdn, *dn;
+ struct iommu_table *tbl;
+ const void *dma_window = NULL;
+ struct pci_dn *pci;
+
+ pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
+
+ /* dev setup for LPAR is a little tricky, since the device tree might
+ * contain the dma-window properties per-device and not necessarily
+ * for the bus. So we need to search upwards in the tree until we
+ * either hit a dma-window property, OR find a parent with a table
+ * already allocated.
+ */
+ dn = pci_device_to_OF_node(dev);
+ pr_debug(" node is %s\n", dn->full_name);
+
+ for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+ pdn = pdn->parent) {
+ dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ if (dma_window)
+ break;
+ }
+
+ if (!pdn || !PCI_DN(pdn)) {
+ printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
+ "no DMA window found for pci dev=%s dn=%s\n",
+ pci_name(dev), dn? dn->full_name : "<null>");
+ return;
+ }
+ pr_debug(" parent is %s\n", pdn->full_name);
+
+ pci = PCI_DN(pdn);
+ if (!pci->iommu_table) {
+ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+ pci->phb->node);
+ iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+ pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
+ pr_debug(" created table: %p\n", pci->iommu_table);
+ } else {
+ pr_debug(" found DMA window, table: %p\n", pci->iommu_table);
+ }
+
+ set_iommu_table_base(&dev->dev, pci->iommu_table);
+}
+
+static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
+{
+ bool ddw_enabled = false;
+ struct device_node *pdn, *dn;
+ struct pci_dev *pdev;
+ const void *dma_window = NULL;
+ u64 dma_offset;
+
+ if (!dev->dma_mask)
+ return -EIO;
+
+ if (!dev_is_pci(dev))
+ goto check_mask;
+
+ pdev = to_pci_dev(dev);
+
+ /* only attempt to use a new window if 64-bit DMA is requested */
+ if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) {
+ dn = pci_device_to_OF_node(pdev);
+ dev_dbg(dev, "node is %s\n", dn->full_name);
+
+ /*
+ * the device tree might contain the dma-window properties
+ * per-device and not necessarily for the bus. So we need to
+ * search upwards in the tree until we either hit a dma-window
+ * property, OR find a parent with a table already allocated.
+ */
+ for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+ pdn = pdn->parent) {
+ dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
+ if (dma_window)
+ break;
+ }
+ if (pdn && PCI_DN(pdn)) {
+ dma_offset = enable_ddw(pdev, pdn);
+ if (dma_offset != 0) {
+ dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset);
+ set_dma_offset(dev, dma_offset);
+ set_dma_ops(dev, &dma_direct_ops);
+ ddw_enabled = true;
+ }
+ }
+ }
+
+ /* fall back on iommu ops, restore table pointer with ops */
+ if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) {
+ dev_info(dev, "Restoring 32-bit DMA via iommu\n");
+ set_dma_ops(dev, &dma_iommu_ops);
+ pci_dma_dev_setup_pSeriesLP(pdev);
+ }
+
+check_mask:
+ if (!dma_supported(dev, dma_mask))
+ return -EIO;
+
+ *dev->dma_mask = dma_mask;
+ return 0;
+}
+
+static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
+{
+ if (!dev->dma_mask)
+ return 0;
+
+ if (!disable_ddw && dev_is_pci(dev)) {
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct device_node *dn;
+
+ dn = pci_device_to_OF_node(pdev);
+
+ /* search upwards for ibm,dma-window */
+ for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table;
+ dn = dn->parent)
+ if (of_get_property(dn, "ibm,dma-window", NULL))
+ break;
+ /* if there is a ibm,ddw-applicable property require 64 bits */
+ if (dn && PCI_DN(dn) &&
+ of_get_property(dn, "ibm,ddw-applicable", NULL))
+ return DMA_BIT_MASK(64);
+ }
+
+ return dma_iommu_ops.get_required_mask(dev);
+}
+
+#else /* CONFIG_PCI */
+#define pci_dma_bus_setup_pSeries NULL
+#define pci_dma_dev_setup_pSeries NULL
+#define pci_dma_bus_setup_pSeriesLP NULL
+#define pci_dma_dev_setup_pSeriesLP NULL
+#define dma_set_mask_pSeriesLP NULL
+#define dma_get_required_mask_pSeriesLP NULL
+#endif /* !CONFIG_PCI */
+
+static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct direct_window *window;
+ struct memory_notify *arg = data;
+ int ret = 0;
+
+ switch (action) {
+ case MEM_GOING_ONLINE:
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
+ arg->nr_pages, window->prop);
+ /* XXX log error */
+ }
+ spin_unlock(&direct_window_list_lock);
+ break;
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE:
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
+ arg->nr_pages, window->prop);
+ /* XXX log error */
+ }
+ spin_unlock(&direct_window_list_lock);
+ break;
+ default:
+ break;
+ }
+ if (ret && action != MEM_CANCEL_ONLINE)
+ return NOTIFY_BAD;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block iommu_mem_nb = {
+ .notifier_call = iommu_mem_notifier,
+};
+
+static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+ int err = NOTIFY_OK;
+ struct device_node *np = node;
+ struct pci_dn *pci = PCI_DN(np);
+ struct direct_window *window;
+
+ switch (action) {
+ case PSERIES_RECONFIG_REMOVE:
+ if (pci && pci->iommu_table)
+ iommu_free_table(pci->iommu_table, np->full_name);
+
+ spin_lock(&direct_window_list_lock);
+ list_for_each_entry(window, &direct_window_list, list) {
+ if (window->device == np) {
+ list_del(&window->list);
+ kfree(window);
+ break;
+ }
+ }
+ spin_unlock(&direct_window_list_lock);
+
+ /*
+ * Because the notifier runs after isolation of the
+ * slot, we are guaranteed any DMA window has already
+ * been revoked and the TCEs have been marked invalid,
+ * so we don't need a call to remove_ddw(np). However,
+ * if an additional notifier action is added before the
+ * isolate call, we should update this code for
+ * completeness with such a call.
+ */
+ break;
+ default:
+ err = NOTIFY_DONE;
+ break;
+ }
+ return err;
+}
+
+static struct notifier_block iommu_reconfig_nb = {
+ .notifier_call = iommu_reconfig_notifier,
+};
+
+/* These are called very early. */
+void iommu_init_early_pSeries(void)
+{
+ if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL))
+ return;
+
+ if (firmware_has_feature(FW_FEATURE_LPAR)) {
+ if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
+ ppc_md.tce_build = tce_buildmulti_pSeriesLP;
+ ppc_md.tce_free = tce_freemulti_pSeriesLP;
+ } else {
+ ppc_md.tce_build = tce_build_pSeriesLP;
+ ppc_md.tce_free = tce_free_pSeriesLP;
+ }
+ ppc_md.tce_get = tce_get_pSeriesLP;
+ ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
+ ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
+ ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
+ ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
+ } else {
+ ppc_md.tce_build = tce_build_pSeries;
+ ppc_md.tce_free = tce_free_pSeries;
+ ppc_md.tce_get = tce_get_pseries;
+ ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeries;
+ ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeries;
+ }
+
+
+ pSeries_reconfig_notifier_register(&iommu_reconfig_nb);
+ register_memory_notifier(&iommu_mem_nb);
+
+ set_pci_dma_ops(&dma_iommu_ops);
+}
+
+static int __init disable_multitce(char *str)
+{
+ if (strcmp(str, "off") == 0 &&
+ firmware_has_feature(FW_FEATURE_LPAR) &&
+ firmware_has_feature(FW_FEATURE_MULTITCE)) {
+ printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
+ ppc_md.tce_build = tce_build_pSeriesLP;
+ ppc_md.tce_free = tce_free_pSeriesLP;
+ powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
+ }
+ return 1;
+}
+
+__setup("multitce=", disable_multitce);
diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c
new file mode 100644
index 00000000000..7d94bdc63d5
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/kexec.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2006 Michael Ellerman, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+
+#include <asm/machdep.h>
+#include <asm/page.h>
+#include <asm/firmware.h>
+#include <asm/kexec.h>
+#include <asm/mpic.h>
+#include <asm/xics.h>
+#include <asm/smp.h>
+
+#include "pseries.h"
+#include "plpar_wrappers.h"
+
+static void pseries_kexec_cpu_down(int crash_shutdown, int secondary)
+{
+ /* Don't risk a hypervisor call if we're crashing */
+ if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) {
+ int ret;
+ int cpu = smp_processor_id();
+ int hwcpu = hard_smp_processor_id();
+
+ if (get_lppaca()->dtl_enable_mask) {
+ ret = unregister_dtl(hwcpu);
+ if (ret) {
+ pr_err("WARNING: DTL deregistration for cpu "
+ "%d (hw %d) failed with %d\n",
+ cpu, hwcpu, ret);
+ }
+ }
+
+ ret = unregister_slb_shadow(hwcpu);
+ if (ret) {
+ pr_err("WARNING: SLB shadow buffer deregistration "
+ "for cpu %d (hw %d) failed with %d\n",
+ cpu, hwcpu, ret);
+ }
+
+ ret = unregister_vpa(hwcpu);
+ if (ret) {
+ pr_err("WARNING: VPA deregistration for cpu %d "
+ "(hw %d) failed with %d\n", cpu, hwcpu, ret);
+ }
+ }
+}
+
+static void pseries_kexec_cpu_down_mpic(int crash_shutdown, int secondary)
+{
+ pseries_kexec_cpu_down(crash_shutdown, secondary);
+ mpic_teardown_this_cpu(secondary);
+}
+
+void __init setup_kexec_cpu_down_mpic(void)
+{
+ ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_mpic;
+}
+
+static void pseries_kexec_cpu_down_xics(int crash_shutdown, int secondary)
+{
+ pseries_kexec_cpu_down(crash_shutdown, secondary);
+ xics_kexec_teardown_cpu(secondary);
+}
+
+void __init setup_kexec_cpu_down_xics(void)
+{
+ ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics;
+}
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
new file mode 100644
index 00000000000..7bc73af6c7b
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -0,0 +1,640 @@
+/*
+ * pSeries_lpar.c
+ * Copyright (C) 2001 Todd Inglett, IBM Corporation
+ *
+ * pSeries LPAR support.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* Enables debugging of low-level hash table routines - careful! */
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include <linux/console.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/machdep.h>
+#include <asm/abs_addr.h>
+#include <asm/mmu_context.h>
+#include <asm/iommu.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/prom.h>
+#include <asm/cputable.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+#include <asm/trace.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+
+/* in hvCall.S */
+EXPORT_SYMBOL(plpar_hcall);
+EXPORT_SYMBOL(plpar_hcall9);
+EXPORT_SYMBOL(plpar_hcall_norets);
+
+extern void pSeries_find_serial_port(void);
+
+void vpa_init(int cpu)
+{
+ int hwcpu = get_hard_smp_processor_id(cpu);
+ unsigned long addr;
+ long ret;
+ struct paca_struct *pp;
+ struct dtl_entry *dtl;
+
+ if (cpu_has_feature(CPU_FTR_ALTIVEC))
+ lppaca_of(cpu).vmxregs_in_use = 1;
+
+ addr = __pa(&lppaca_of(cpu));
+ ret = register_vpa(hwcpu, addr);
+
+ if (ret) {
+ pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
+ "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
+ return;
+ }
+ /*
+ * PAPR says this feature is SLB-Buffer but firmware never
+ * reports that. All SPLPAR support SLB shadow buffer.
+ */
+ addr = __pa(&slb_shadow[cpu]);
+ if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ ret = register_slb_shadow(hwcpu, addr);
+ if (ret)
+ pr_err("WARNING: SLB shadow buffer registration for "
+ "cpu %d (hw %d) of area %lx failed with %ld\n",
+ cpu, hwcpu, addr, ret);
+ }
+
+ /*
+ * Register dispatch trace log, if one has been allocated.
+ */
+ pp = &paca[cpu];
+ dtl = pp->dispatch_log;
+ if (dtl) {
+ pp->dtl_ridx = 0;
+ pp->dtl_curr = dtl;
+ lppaca_of(cpu).dtl_idx = 0;
+
+ /* hypervisor reads buffer length from this field */
+ dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
+ ret = register_dtl(hwcpu, __pa(dtl));
+ if (ret)
+ pr_err("WARNING: DTL registration of cpu %d (hw %d) "
+ "failed with %ld\n", smp_processor_id(),
+ hwcpu, ret);
+ lppaca_of(cpu).dtl_enable_mask = 2;
+ }
+}
+
+static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
+ unsigned long va, unsigned long pa,
+ unsigned long rflags, unsigned long vflags,
+ int psize, int ssize)
+{
+ unsigned long lpar_rc;
+ unsigned long flags;
+ unsigned long slot;
+ unsigned long hpte_v, hpte_r;
+
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, "
+ "rflags=%lx, vflags=%lx, psize=%d)\n",
+ hpte_group, va, pa, rflags, vflags, psize);
+
+ hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID;
+ hpte_r = hpte_encode_r(pa, psize) | rflags;
+
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
+
+ /* Now fill in the actual HPTE */
+ /* Set CEC cookie to 0 */
+ /* Zero page = 0 */
+ /* I-cache Invalidate = 0 */
+ /* I-cache synchronize = 0 */
+ /* Exact = 0 */
+ flags = 0;
+
+ /* Make pHyp happy */
+ if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU))
+ hpte_r &= ~_PAGE_COHERENT;
+ if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
+ flags |= H_COALESCE_CAND;
+
+ lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
+ if (unlikely(lpar_rc == H_PTEG_FULL)) {
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel(" full\n");
+ return -1;
+ }
+
+ /*
+ * Since we try and ioremap PHBs we don't own, the pte insert
+ * will fail. However we must catch the failure in hash_page
+ * or we will loop forever, so return -2 in this case.
+ */
+ if (unlikely(lpar_rc != H_SUCCESS)) {
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel(" lpar err %lu\n", lpar_rc);
+ return -2;
+ }
+ if (!(vflags & HPTE_V_BOLTED))
+ pr_devel(" -> slot: %lu\n", slot & 7);
+
+ /* Because of iSeries, we have to pass down the secondary
+ * bucket bit here as well
+ */
+ return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
+}
+
+static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);
+
+static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
+{
+ unsigned long slot_offset;
+ unsigned long lpar_rc;
+ int i;
+ unsigned long dummy1, dummy2;
+
+ /* pick a random slot to start at */
+ slot_offset = mftb() & 0x7;
+
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+
+ /* don't remove a bolted entry */
+ lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
+ (0x1UL << 4), &dummy1, &dummy2);
+ if (lpar_rc == H_SUCCESS)
+ return i;
+ BUG_ON(lpar_rc != H_NOT_FOUND);
+
+ slot_offset++;
+ slot_offset &= 0x7;
+ }
+
+ return -1;
+}
+
+static void pSeries_lpar_hptab_clear(void)
+{
+ unsigned long size_bytes = 1UL << ppc64_pft_size;
+ unsigned long hpte_count = size_bytes >> 4;
+ struct {
+ unsigned long pteh;
+ unsigned long ptel;
+ } ptes[4];
+ long lpar_rc;
+ unsigned long i, j;
+
+ /* Read in batches of 4,
+ * invalidate only valid entries not in the VRMA
+ * hpte_count will be a multiple of 4
+ */
+ for (i = 0; i < hpte_count; i += 4) {
+ lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
+ if (lpar_rc != H_SUCCESS)
+ continue;
+ for (j = 0; j < 4; j++){
+ if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
+ HPTE_V_VRMA_MASK)
+ continue;
+ if (ptes[j].pteh & HPTE_V_VALID)
+ plpar_pte_remove_raw(0, i + j, 0,
+ &(ptes[j].pteh), &(ptes[j].ptel));
+ }
+ }
+}
+
+/*
+ * This computes the AVPN and B fields of the first dword of a HPTE,
+ * for use when we want to match an existing PTE. The bottom 7 bits
+ * of the returned value are zero.
+ */
+static inline unsigned long hpte_encode_avpn(unsigned long va, int psize,
+ int ssize)
+{
+ unsigned long v;
+
+ v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm);
+ v <<= HPTE_V_AVPN_SHIFT;
+ v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
+ return v;
+}
+
+/*
+ * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
+ * the low 3 bits of flags happen to line up. So no transform is needed.
+ * We can probably optimize here and assume the high bits of newpp are
+ * already zero. For now I am paranoid.
+ */
+static long pSeries_lpar_hpte_updatepp(unsigned long slot,
+ unsigned long newpp,
+ unsigned long va,
+ int psize, int ssize, int local)
+{
+ unsigned long lpar_rc;
+ unsigned long flags = (newpp & 7) | H_AVPN;
+ unsigned long want_v;
+
+ want_v = hpte_encode_avpn(va, psize, ssize);
+
+ pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
+ want_v, slot, flags, psize);
+
+ lpar_rc = plpar_pte_protect(flags, slot, want_v);
+
+ if (lpar_rc == H_NOT_FOUND) {
+ pr_devel("not found !\n");
+ return -1;
+ }
+
+ pr_devel("ok\n");
+
+ BUG_ON(lpar_rc != H_SUCCESS);
+
+ return 0;
+}
+
+static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot)
+{
+ unsigned long dword0;
+ unsigned long lpar_rc;
+ unsigned long dummy_word1;
+ unsigned long flags;
+
+ /* Read 1 pte at a time */
+ /* Do not need RPN to logical page translation */
+ /* No cross CEC PFT access */
+ flags = 0;
+
+ lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1);
+
+ BUG_ON(lpar_rc != H_SUCCESS);
+
+ return dword0;
+}
+
+static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize)
+{
+ unsigned long hash;
+ unsigned long i;
+ long slot;
+ unsigned long want_v, hpte_v;
+
+ hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize);
+ want_v = hpte_encode_avpn(va, psize, ssize);
+
+ /* Bolted entries are always in the primary group */
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ for (i = 0; i < HPTES_PER_GROUP; i++) {
+ hpte_v = pSeries_lpar_hpte_getword0(slot);
+
+ if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
+ /* HPTE matches */
+ return slot;
+ ++slot;
+ }
+
+ return -1;
+}
+
+static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
+ unsigned long ea,
+ int psize, int ssize)
+{
+ unsigned long lpar_rc, slot, vsid, va, flags;
+
+ vsid = get_kernel_vsid(ea, ssize);
+ va = hpt_va(ea, vsid, ssize);
+
+ slot = pSeries_lpar_hpte_find(va, psize, ssize);
+ BUG_ON(slot == -1);
+
+ flags = newpp & 7;
+ lpar_rc = plpar_pte_protect(flags, slot, 0);
+
+ BUG_ON(lpar_rc != H_SUCCESS);
+}
+
+static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va,
+ int psize, int ssize, int local)
+{
+ unsigned long want_v;
+ unsigned long lpar_rc;
+ unsigned long dummy1, dummy2;
+
+ pr_devel(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n",
+ slot, va, psize, local);
+
+ want_v = hpte_encode_avpn(va, psize, ssize);
+ lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
+ if (lpar_rc == H_NOT_FOUND)
+ return;
+
+ BUG_ON(lpar_rc != H_SUCCESS);
+}
+
+static void pSeries_lpar_hpte_removebolted(unsigned long ea,
+ int psize, int ssize)
+{
+ unsigned long slot, vsid, va;
+
+ vsid = get_kernel_vsid(ea, ssize);
+ va = hpt_va(ea, vsid, ssize);
+
+ slot = pSeries_lpar_hpte_find(va, psize, ssize);
+ BUG_ON(slot == -1);
+
+ pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0);
+}
+
+/* Flag bits for H_BULK_REMOVE */
+#define HBR_REQUEST 0x4000000000000000UL
+#define HBR_RESPONSE 0x8000000000000000UL
+#define HBR_END 0xc000000000000000UL
+#define HBR_AVPN 0x0200000000000000UL
+#define HBR_ANDCOND 0x0100000000000000UL
+
+/*
+ * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
+ * lock.
+ */
+static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
+{
+ unsigned long i, pix, rc;
+ unsigned long flags = 0;
+ struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
+ int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
+ unsigned long param[9];
+ unsigned long va;
+ unsigned long hash, index, shift, hidx, slot;
+ real_pte_t pte;
+ int psize, ssize;
+
+ if (lock_tlbie)
+ spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+
+ psize = batch->psize;
+ ssize = batch->ssize;
+ pix = 0;
+ for (i = 0; i < number; i++) {
+ va = batch->vaddr[i];
+ pte = batch->pte[i];
+ pte_iterate_hashed_subpages(pte, psize, va, index, shift) {
+ hash = hpt_hash(va, shift, ssize);
+ hidx = __rpte_to_hidx(pte, index);
+ if (hidx & _PTEIDX_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+ slot += hidx & _PTEIDX_GROUP_IX;
+ if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+ pSeries_lpar_hpte_invalidate(slot, va, psize,
+ ssize, local);
+ } else {
+ param[pix] = HBR_REQUEST | HBR_AVPN | slot;
+ param[pix+1] = hpte_encode_avpn(va, psize,
+ ssize);
+ pix += 2;
+ if (pix == 8) {
+ rc = plpar_hcall9(H_BULK_REMOVE, param,
+ param[0], param[1], param[2],
+ param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ pix = 0;
+ }
+ }
+ } pte_iterate_hashed_end();
+ }
+ if (pix) {
+ param[pix] = HBR_END;
+ rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
+ param[2], param[3], param[4], param[5],
+ param[6], param[7]);
+ BUG_ON(rc != H_SUCCESS);
+ }
+
+ if (lock_tlbie)
+ spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+}
+
+static int __init disable_bulk_remove(char *str)
+{
+ if (strcmp(str, "off") == 0 &&
+ firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
+ printk(KERN_INFO "Disabling BULK_REMOVE firmware feature");
+ powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
+ }
+ return 1;
+}
+
+__setup("bulk_remove=", disable_bulk_remove);
+
+void __init hpte_init_lpar(void)
+{
+ ppc_md.hpte_invalidate = pSeries_lpar_hpte_invalidate;
+ ppc_md.hpte_updatepp = pSeries_lpar_hpte_updatepp;
+ ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
+ ppc_md.hpte_insert = pSeries_lpar_hpte_insert;
+ ppc_md.hpte_remove = pSeries_lpar_hpte_remove;
+ ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted;
+ ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range;
+ ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear;
+}
+
+#ifdef CONFIG_PPC_SMLPAR
+#define CMO_FREE_HINT_DEFAULT 1
+static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;
+
+static int __init cmo_free_hint(char *str)
+{
+ char *parm;
+ parm = strstrip(str);
+
+ if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
+ printk(KERN_INFO "cmo_free_hint: CMO free page hinting is not active.\n");
+ cmo_free_hint_flag = 0;
+ return 1;
+ }
+
+ cmo_free_hint_flag = 1;
+ printk(KERN_INFO "cmo_free_hint: CMO free page hinting is active.\n");
+
+ if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
+ return 1;
+
+ return 0;
+}
+
+__setup("cmo_free_hint=", cmo_free_hint);
+
+static void pSeries_set_page_state(struct page *page, int order,
+ unsigned long state)
+{
+ int i, j;
+ unsigned long cmo_page_sz, addr;
+
+ cmo_page_sz = cmo_get_page_size();
+ addr = __pa((unsigned long)page_address(page));
+
+ for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
+ for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
+ plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
+ }
+}
+
+void arch_free_page(struct page *page, int order)
+{
+ if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
+ return;
+
+ pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
+}
+EXPORT_SYMBOL(arch_free_page);
+
+#endif
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * We optimise our hcall path by placing hcall_tracepoint_refcount
+ * directly in the TOC so we can check if the hcall tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long hcall_tracepoint_refcount;
+
+/*
+ * Since the tracing code might execute hcalls we need to guard against
+ * recursion. One example of this are spinlocks calling H_YIELD on
+ * shared processor partitions.
+ */
+static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);
+
+void hcall_tracepoint_regfunc(void)
+{
+ hcall_tracepoint_refcount++;
+}
+
+void hcall_tracepoint_unregfunc(void)
+{
+ hcall_tracepoint_refcount--;
+}
+
+void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
+{
+ unsigned long flags;
+ unsigned int *depth;
+
+ /*
+ * We cannot call tracepoints inside RCU idle regions which
+ * means we must not trace H_CEDE.
+ */
+ if (opcode == H_CEDE)
+ return;
+
+ local_irq_save(flags);
+
+ depth = &__get_cpu_var(hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
+ preempt_disable();
+ trace_hcall_entry(opcode, args);
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
+
+void __trace_hcall_exit(long opcode, unsigned long retval,
+ unsigned long *retbuf)
+{
+ unsigned long flags;
+ unsigned int *depth;
+
+ if (opcode == H_CEDE)
+ return;
+
+ local_irq_save(flags);
+
+ depth = &__get_cpu_var(hcall_trace_depth);
+
+ if (*depth)
+ goto out;
+
+ (*depth)++;
+ trace_hcall_exit(opcode, retval, retbuf);
+ preempt_enable();
+ (*depth)--;
+
+out:
+ local_irq_restore(flags);
+}
+#endif
+
+/**
+ * h_get_mpp
+ * H_GET_MPP hcall returns info in 7 parms
+ */
+int h_get_mpp(struct hvcall_mpp_data *mpp_data)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+ rc = plpar_hcall9(H_GET_MPP, retbuf);
+
+ mpp_data->entitled_mem = retbuf[0];
+ mpp_data->mapped_mem = retbuf[1];
+
+ mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
+ mpp_data->pool_num = retbuf[2] & 0xffff;
+
+ mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
+ mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
+ mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff;
+
+ mpp_data->pool_size = retbuf[4];
+ mpp_data->loan_request = retbuf[5];
+ mpp_data->backing_mem = retbuf[6];
+
+ return rc;
+}
+EXPORT_SYMBOL(h_get_mpp);
+
+int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };
+
+ rc = plpar_hcall9(H_GET_MPP_X, retbuf);
+
+ mpp_x_data->coalesced_bytes = retbuf[0];
+ mpp_x_data->pool_coalesced_bytes = retbuf[1];
+ mpp_x_data->pool_purr_cycles = retbuf[2];
+ mpp_x_data->pool_spurr_cycles = retbuf[3];
+
+ return rc;
+}
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
new file mode 100644
index 00000000000..029a562af37
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -0,0 +1,363 @@
+/*
+ * Support for Partition Mobility/Migration
+ *
+ * Copyright (C) 2010 Nathan Fontenot
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/completion.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include <asm/rtas.h>
+#include "pseries.h"
+
+static struct kobject *mobility_kobj;
+
+struct update_props_workarea {
+ u32 phandle;
+ u32 state;
+ u64 reserved;
+ u32 nprops;
+};
+
+#define NODE_ACTION_MASK 0xff000000
+#define NODE_COUNT_MASK 0x00ffffff
+
+#define DELETE_DT_NODE 0x01000000
+#define UPDATE_DT_NODE 0x02000000
+#define ADD_DT_NODE 0x03000000
+
+static int mobility_rtas_call(int token, char *buf)
+{
+ int rc;
+
+ spin_lock(&rtas_data_buf_lock);
+
+ memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
+ rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1);
+ memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+ spin_unlock(&rtas_data_buf_lock);
+ return rc;
+}
+
+static int delete_dt_node(u32 phandle)
+{
+ struct device_node *dn;
+
+ dn = of_find_node_by_phandle(phandle);
+ if (!dn)
+ return -ENOENT;
+
+ dlpar_detach_node(dn);
+ return 0;
+}
+
+static int update_dt_property(struct device_node *dn, struct property **prop,
+ const char *name, u32 vd, char *value)
+{
+ struct property *new_prop = *prop;
+ struct property *old_prop;
+ int more = 0;
+
+ /* A negative 'vd' value indicates that only part of the new property
+ * value is contained in the buffer and we need to call
+ * ibm,update-properties again to get the rest of the value.
+ *
+ * A negative value is also the two's compliment of the actual value.
+ */
+ if (vd & 0x80000000) {
+ vd = ~vd + 1;
+ more = 1;
+ }
+
+ if (new_prop) {
+ /* partial property fixup */
+ char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL);
+ if (!new_data)
+ return -ENOMEM;
+
+ memcpy(new_data, new_prop->value, new_prop->length);
+ memcpy(new_data + new_prop->length, value, vd);
+
+ kfree(new_prop->value);
+ new_prop->value = new_data;
+ new_prop->length += vd;
+ } else {
+ new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
+ if (!new_prop)
+ return -ENOMEM;
+
+ new_prop->name = kstrdup(name, GFP_KERNEL);
+ if (!new_prop->name) {
+ kfree(new_prop);
+ return -ENOMEM;
+ }
+
+ new_prop->length = vd;
+ new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
+ if (!new_prop->value) {
+ kfree(new_prop->name);
+ kfree(new_prop);
+ return -ENOMEM;
+ }
+
+ memcpy(new_prop->value, value, vd);
+ *prop = new_prop;
+ }
+
+ if (!more) {
+ old_prop = of_find_property(dn, new_prop->name, NULL);
+ if (old_prop)
+ prom_update_property(dn, new_prop, old_prop);
+ else
+ prom_add_property(dn, new_prop);
+
+ new_prop = NULL;
+ }
+
+ return 0;
+}
+
+static int update_dt_node(u32 phandle)
+{
+ struct update_props_workarea *upwa;
+ struct device_node *dn;
+ struct property *prop = NULL;
+ int i, rc;
+ char *prop_data;
+ char *rtas_buf;
+ int update_properties_token;
+
+ update_properties_token = rtas_token("ibm,update-properties");
+ if (update_properties_token == RTAS_UNKNOWN_SERVICE)
+ return -EINVAL;
+
+ rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!rtas_buf)
+ return -ENOMEM;
+
+ dn = of_find_node_by_phandle(phandle);
+ if (!dn) {
+ kfree(rtas_buf);
+ return -ENOENT;
+ }
+
+ upwa = (struct update_props_workarea *)&rtas_buf[0];
+ upwa->phandle = phandle;
+
+ do {
+ rc = mobility_rtas_call(update_properties_token, rtas_buf);
+ if (rc < 0)
+ break;
+
+ prop_data = rtas_buf + sizeof(*upwa);
+
+ for (i = 0; i < upwa->nprops; i++) {
+ char *prop_name;
+ u32 vd;
+
+ prop_name = prop_data + 1;
+ prop_data += strlen(prop_name) + 1;
+ vd = *prop_data++;
+
+ switch (vd) {
+ case 0x00000000:
+ /* name only property, nothing to do */
+ break;
+
+ case 0x80000000:
+ prop = of_find_property(dn, prop_name, NULL);
+ prom_remove_property(dn, prop);
+ prop = NULL;
+ break;
+
+ default:
+ rc = update_dt_property(dn, &prop, prop_name,
+ vd, prop_data);
+ if (rc) {
+ printk(KERN_ERR "Could not update %s"
+ " property\n", prop_name);
+ }
+
+ prop_data += vd;
+ }
+ }
+ } while (rc == 1);
+
+ of_node_put(dn);
+ kfree(rtas_buf);
+ return 0;
+}
+
+static int add_dt_node(u32 parent_phandle, u32 drc_index)
+{
+ struct device_node *dn;
+ struct device_node *parent_dn;
+ int rc;
+
+ dn = dlpar_configure_connector(drc_index);
+ if (!dn)
+ return -ENOENT;
+
+ parent_dn = of_find_node_by_phandle(parent_phandle);
+ if (!parent_dn) {
+ dlpar_free_cc_nodes(dn);
+ return -ENOENT;
+ }
+
+ dn->parent = parent_dn;
+ rc = dlpar_attach_node(dn);
+ if (rc)
+ dlpar_free_cc_nodes(dn);
+
+ of_node_put(parent_dn);
+ return rc;
+}
+
+static int pseries_devicetree_update(void)
+{
+ char *rtas_buf;
+ u32 *data;
+ int update_nodes_token;
+ int rc;
+
+ update_nodes_token = rtas_token("ibm,update-nodes");
+ if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
+ return -EINVAL;
+
+ rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!rtas_buf)
+ return -ENOMEM;
+
+ do {
+ rc = mobility_rtas_call(update_nodes_token, rtas_buf);
+ if (rc && rc != 1)
+ break;
+
+ data = (u32 *)rtas_buf + 4;
+ while (*data & NODE_ACTION_MASK) {
+ int i;
+ u32 action = *data & NODE_ACTION_MASK;
+ int node_count = *data & NODE_COUNT_MASK;
+
+ data++;
+
+ for (i = 0; i < node_count; i++) {
+ u32 phandle = *data++;
+ u32 drc_index;
+
+ switch (action) {
+ case DELETE_DT_NODE:
+ delete_dt_node(phandle);
+ break;
+ case UPDATE_DT_NODE:
+ update_dt_node(phandle);
+ break;
+ case ADD_DT_NODE:
+ drc_index = *data++;
+ add_dt_node(phandle, drc_index);
+ break;
+ }
+ }
+ }
+ } while (rc == 1);
+
+ kfree(rtas_buf);
+ return rc;
+}
+
+void post_mobility_fixup(void)
+{
+ int rc;
+ int activate_fw_token;
+
+ rc = pseries_devicetree_update();
+ if (rc) {
+ printk(KERN_ERR "Initial post-mobility device tree update "
+ "failed: %d\n", rc);
+ return;
+ }
+
+ activate_fw_token = rtas_token("ibm,activate-firmware");
+ if (activate_fw_token == RTAS_UNKNOWN_SERVICE) {
+ printk(KERN_ERR "Could not make post-mobility "
+ "activate-fw call.\n");
+ return;
+ }
+
+ rc = rtas_call(activate_fw_token, 0, 1, NULL);
+ if (!rc) {
+ rc = pseries_devicetree_update();
+ if (rc)
+ printk(KERN_ERR "Secondary post-mobility device tree "
+ "update failed: %d\n", rc);
+ } else {
+ printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);
+ return;
+ }
+
+ return;
+}
+
+static ssize_t migrate_store(struct class *class, struct class_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct rtas_args args;
+ u64 streamid;
+ int rc;
+
+ rc = strict_strtoull(buf, 0, &streamid);
+ if (rc)
+ return rc;
+
+ memset(&args, 0, sizeof(args));
+ args.token = rtas_token("ibm,suspend-me");
+ args.nargs = 2;
+ args.nret = 1;
+
+ args.args[0] = streamid >> 32 ;
+ args.args[1] = streamid & 0xffffffff;
+ args.rets = &args.args[args.nargs];
+
+ do {
+ args.rets[0] = 0;
+ rc = rtas_ibm_suspend_me(&args);
+ if (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE)
+ ssleep(1);
+ } while (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE);
+
+ if (rc)
+ return rc;
+ else if (args.rets[0])
+ return args.rets[0];
+
+ post_mobility_fixup();
+ return count;
+}
+
+static CLASS_ATTR(migration, S_IWUSR, NULL, migrate_store);
+
+static int __init mobility_sysfs_init(void)
+{
+ int rc;
+
+ mobility_kobj = kobject_create_and_add("mobility", kernel_kobj);
+ if (!mobility_kobj)
+ return -ENOMEM;
+
+ rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr);
+
+ return rc;
+}
+device_initcall(mobility_sysfs_init);
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
new file mode 100644
index 00000000000..38d24e7e7bb
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright 2006 Jake Moilanen <moilanen@austin.ibm.com>, IBM Corp.
+ * Copyright 2006-2007 Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/irq.h>
+#include <linux/msi.h>
+
+#include <asm/rtas.h>
+#include <asm/hw_irq.h>
+#include <asm/ppc-pci.h>
+
+static int query_token, change_token;
+
+#define RTAS_QUERY_FN 0
+#define RTAS_CHANGE_FN 1
+#define RTAS_RESET_FN 2
+#define RTAS_CHANGE_MSI_FN 3
+#define RTAS_CHANGE_MSIX_FN 4
+
+static struct pci_dn *get_pdn(struct pci_dev *pdev)
+{
+ struct device_node *dn;
+ struct pci_dn *pdn;
+
+ dn = pci_device_to_OF_node(pdev);
+ if (!dn) {
+ dev_dbg(&pdev->dev, "rtas_msi: No OF device node\n");
+ return NULL;
+ }
+
+ pdn = PCI_DN(dn);
+ if (!pdn) {
+ dev_dbg(&pdev->dev, "rtas_msi: No PCI DN\n");
+ return NULL;
+ }
+
+ return pdn;
+}
+
+/* RTAS Helpers */
+
+static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs)
+{
+ u32 addr, seq_num, rtas_ret[3];
+ unsigned long buid;
+ int rc;
+
+ addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+ buid = pdn->phb->buid;
+
+ seq_num = 1;
+ do {
+ if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN)
+ rc = rtas_call(change_token, 6, 4, rtas_ret, addr,
+ BUID_HI(buid), BUID_LO(buid),
+ func, num_irqs, seq_num);
+ else
+ rc = rtas_call(change_token, 6, 3, rtas_ret, addr,
+ BUID_HI(buid), BUID_LO(buid),
+ func, num_irqs, seq_num);
+
+ seq_num = rtas_ret[1];
+ } while (rtas_busy_delay(rc));
+
+ /*
+ * If the RTAS call succeeded, return the number of irqs allocated.
+ * If not, make sure we return a negative error code.
+ */
+ if (rc == 0)
+ rc = rtas_ret[0];
+ else if (rc > 0)
+ rc = -rc;
+
+ pr_debug("rtas_msi: ibm,change_msi(func=%d,num=%d), got %d rc = %d\n",
+ func, num_irqs, rtas_ret[0], rc);
+
+ return rc;
+}
+
+static void rtas_disable_msi(struct pci_dev *pdev)
+{
+ struct pci_dn *pdn;
+
+ pdn = get_pdn(pdev);
+ if (!pdn)
+ return;
+
+ /*
+ * disabling MSI with the explicit interface also disables MSI-X
+ */
+ if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) {
+ /*
+ * may have failed because explicit interface is not
+ * present
+ */
+ if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) {
+ pr_debug("rtas_msi: Setting MSIs to 0 failed!\n");
+ }
+ }
+}
+
+static int rtas_query_irq_number(struct pci_dn *pdn, int offset)
+{
+ u32 addr, rtas_ret[2];
+ unsigned long buid;
+ int rc;
+
+ addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+ buid = pdn->phb->buid;
+
+ do {
+ rc = rtas_call(query_token, 4, 3, rtas_ret, addr,
+ BUID_HI(buid), BUID_LO(buid), offset);
+ } while (rtas_busy_delay(rc));
+
+ if (rc) {
+ pr_debug("rtas_msi: error (%d) querying source number\n", rc);
+ return rc;
+ }
+
+ return rtas_ret[0];
+}
+
+static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
+{
+ struct msi_desc *entry;
+
+ list_for_each_entry(entry, &pdev->msi_list, list) {
+ if (entry->irq == NO_IRQ)
+ continue;
+
+ irq_set_msi_desc(entry->irq, NULL);
+ irq_dispose_mapping(entry->irq);
+ }
+
+ rtas_disable_msi(pdev);
+}
+
+static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
+{
+ struct device_node *dn;
+ struct pci_dn *pdn;
+ const u32 *req_msi;
+
+ pdn = get_pdn(pdev);
+ if (!pdn)
+ return -ENODEV;
+
+ dn = pdn->node;
+
+ req_msi = of_get_property(dn, prop_name, NULL);
+ if (!req_msi) {
+ pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name);
+ return -ENOENT;
+ }
+
+ if (*req_msi < nvec) {
+ pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec);
+
+ if (*req_msi == 0) /* Be paranoid */
+ return -ENOSPC;
+
+ return *req_msi;
+ }
+
+ return 0;
+}
+
+static int check_req_msi(struct pci_dev *pdev, int nvec)
+{
+ return check_req(pdev, nvec, "ibm,req#msi");
+}
+
+static int check_req_msix(struct pci_dev *pdev, int nvec)
+{
+ return check_req(pdev, nvec, "ibm,req#msi-x");
+}
+
+/* Quota calculation */
+
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+ struct device_node *dn;
+ const u32 *p;
+
+ dn = of_node_get(pci_device_to_OF_node(dev));
+ while (dn) {
+ p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
+ if (p) {
+ pr_debug("rtas_msi: found prop on dn %s\n",
+ dn->full_name);
+ *total = *p;
+ return dn;
+ }
+
+ dn = of_get_next_parent(dn);
+ }
+
+ return NULL;
+}
+
+static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
+{
+ struct device_node *dn;
+
+ /* Found our PE and assume 8 at that point. */
+
+ dn = pci_device_to_OF_node(dev);
+ if (!dn)
+ return NULL;
+
+ dn = find_device_pe(dn);
+ if (!dn)
+ return NULL;
+
+ /* We actually want the parent */
+ dn = of_get_parent(dn);
+ if (!dn)
+ return NULL;
+
+ /* Hardcode of 8 for old firmwares */
+ *total = 8;
+ pr_debug("rtas_msi: using PE dn %s\n", dn->full_name);
+
+ return dn;
+}
+
+struct msi_counts {
+ struct device_node *requestor;
+ int num_devices;
+ int request;
+ int quota;
+ int spare;
+ int over_quota;
+};
+
+static void *count_non_bridge_devices(struct device_node *dn, void *data)
+{
+ struct msi_counts *counts = data;
+ const u32 *p;
+ u32 class;
+
+ pr_debug("rtas_msi: counting %s\n", dn->full_name);
+
+ p = of_get_property(dn, "class-code", NULL);
+ class = p ? *p : 0;
+
+ if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
+ counts->num_devices++;
+
+ return NULL;
+}
+
+static void *count_spare_msis(struct device_node *dn, void *data)
+{
+ struct msi_counts *counts = data;
+ const u32 *p;
+ int req;
+
+ if (dn == counts->requestor)
+ req = counts->request;
+ else {
+ /* We don't know if a driver will try to use MSI or MSI-X,
+ * so we just have to punt and use the larger of the two. */
+ req = 0;
+ p = of_get_property(dn, "ibm,req#msi", NULL);
+ if (p)
+ req = *p;
+
+ p = of_get_property(dn, "ibm,req#msi-x", NULL);
+ if (p)
+ req = max(req, (int)*p);
+ }
+
+ if (req < counts->quota)
+ counts->spare += counts->quota - req;
+ else if (req > counts->quota)
+ counts->over_quota++;
+
+ return NULL;
+}
+
+static int msi_quota_for_device(struct pci_dev *dev, int request)
+{
+ struct device_node *pe_dn;
+ struct msi_counts counts;
+ int total;
+
+ pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev),
+ request);
+
+ pe_dn = find_pe_total_msi(dev, &total);
+ if (!pe_dn)
+ pe_dn = find_pe_dn(dev, &total);
+
+ if (!pe_dn) {
+ pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev));
+ goto out;
+ }
+
+ pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name);
+
+ memset(&counts, 0, sizeof(struct msi_counts));
+
+ /* Work out how many devices we have below this PE */
+ traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
+
+ if (counts.num_devices == 0) {
+ pr_err("rtas_msi: found 0 devices under PE for %s\n",
+ pci_name(dev));
+ goto out;
+ }
+
+ counts.quota = total / counts.num_devices;
+ if (request <= counts.quota)
+ goto out;
+
+ /* else, we have some more calculating to do */
+ counts.requestor = pci_device_to_OF_node(dev);
+ counts.request = request;
+ traverse_pci_devices(pe_dn, count_spare_msis, &counts);
+
+ /* If the quota isn't an integer multiple of the total, we can
+ * use the remainder as spare MSIs for anyone that wants them. */
+ counts.spare += total % counts.num_devices;
+
+ /* Divide any spare by the number of over-quota requestors */
+ if (counts.over_quota)
+ counts.quota += counts.spare / counts.over_quota;
+
+ /* And finally clamp the request to the possibly adjusted quota */
+ request = min(counts.quota, request);
+
+ pr_debug("rtas_msi: request clamped to quota %d\n", request);
+out:
+ of_node_put(pe_dn);
+
+ return request;
+}
+
+static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type)
+{
+ int quota, rc;
+
+ if (type == PCI_CAP_ID_MSIX)
+ rc = check_req_msix(pdev, nvec);
+ else
+ rc = check_req_msi(pdev, nvec);
+
+ if (rc)
+ return rc;
+
+ quota = msi_quota_for_device(pdev, nvec);
+
+ if (quota && quota < nvec)
+ return quota;
+
+ return 0;
+}
+
+static int check_msix_entries(struct pci_dev *pdev)
+{
+ struct msi_desc *entry;
+ int expected;
+
+ /* There's no way for us to express to firmware that we want
+ * a discontiguous, or non-zero based, range of MSI-X entries.
+ * So we must reject such requests. */
+
+ expected = 0;
+ list_for_each_entry(entry, &pdev->msi_list, list) {
+ if (entry->msi_attrib.entry_nr != expected) {
+ pr_debug("rtas_msi: bad MSI-X entries.\n");
+ return -EINVAL;
+ }
+ expected++;
+ }
+
+ return 0;
+}
+
+static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+ struct pci_dn *pdn;
+ int hwirq, virq, i, rc;
+ struct msi_desc *entry;
+ struct msi_msg msg;
+
+ pdn = get_pdn(pdev);
+ if (!pdn)
+ return -ENODEV;
+
+ if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev))
+ return -EINVAL;
+
+ /*
+ * Try the new more explicit firmware interface, if that fails fall
+ * back to the old interface. The old interface is known to never
+ * return MSI-Xs.
+ */
+ if (type == PCI_CAP_ID_MSI) {
+ rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec);
+
+ if (rc < 0) {
+ pr_debug("rtas_msi: trying the old firmware call.\n");
+ rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec);
+ }
+ } else
+ rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec);
+
+ if (rc != nvec) {
+ pr_debug("rtas_msi: rtas_change_msi() failed\n");
+ return rc;
+ }
+
+ i = 0;
+ list_for_each_entry(entry, &pdev->msi_list, list) {
+ hwirq = rtas_query_irq_number(pdn, i++);
+ if (hwirq < 0) {
+ pr_debug("rtas_msi: error (%d) getting hwirq\n", rc);
+ return hwirq;
+ }
+
+ virq = irq_create_mapping(NULL, hwirq);
+
+ if (virq == NO_IRQ) {
+ pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
+ return -ENOSPC;
+ }
+
+ dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq);
+ irq_set_msi_desc(virq, entry);
+
+ /* Read config space back so we can restore after reset */
+ read_msi_msg(virq, &msg);
+ entry->msg = msg;
+ }
+
+ return 0;
+}
+
+static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
+{
+ /* No LSI -> leave MSIs (if any) configured */
+ if (pdev->irq == NO_IRQ) {
+ dev_dbg(&pdev->dev, "rtas_msi: no LSI, nothing to do.\n");
+ return;
+ }
+
+ /* No MSI -> MSIs can't have been assigned by fw, leave LSI */
+ if (check_req_msi(pdev, 1) && check_req_msix(pdev, 1)) {
+ dev_dbg(&pdev->dev, "rtas_msi: no req#msi/x, nothing to do.\n");
+ return;
+ }
+
+ dev_dbg(&pdev->dev, "rtas_msi: disabling existing MSI.\n");
+ rtas_disable_msi(pdev);
+}
+
+static int rtas_msi_init(void)
+{
+ query_token = rtas_token("ibm,query-interrupt-source-number");
+ change_token = rtas_token("ibm,change-msi");
+
+ if ((query_token == RTAS_UNKNOWN_SERVICE) ||
+ (change_token == RTAS_UNKNOWN_SERVICE)) {
+ pr_debug("rtas_msi: no RTAS tokens, no MSI support.\n");
+ return -1;
+ }
+
+ pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
+
+ WARN_ON(ppc_md.setup_msi_irqs);
+ ppc_md.setup_msi_irqs = rtas_setup_msi_irqs;
+ ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs;
+ ppc_md.msi_check_device = rtas_msi_check_device;
+
+ WARN_ON(ppc_md.pci_irq_fixup);
+ ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
+
+ return 0;
+}
+arch_initcall(rtas_msi_init);
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
new file mode 100644
index 00000000000..36f957f3184
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -0,0 +1,679 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * /dev/nvram driver for PPC64
+ *
+ * This perhaps should live in drivers/char
+ */
+
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/kmsg_dump.h>
+#include <linux/ctype.h>
+#include <linux/zlib.h>
+#include <asm/uaccess.h>
+#include <asm/nvram.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/machdep.h>
+
+/* Max bytes to read/write in one go */
+#define NVRW_CNT 0x20
+
+static unsigned int nvram_size;
+static int nvram_fetch, nvram_store;
+static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */
+static DEFINE_SPINLOCK(nvram_lock);
+
+struct err_log_info {
+ int error_type;
+ unsigned int seq_num;
+};
+
+struct nvram_os_partition {
+ const char *name;
+ int req_size; /* desired size, in bytes */
+ int min_size; /* minimum acceptable size (0 means req_size) */
+ long size; /* size of data portion (excluding err_log_info) */
+ long index; /* offset of data portion of partition */
+};
+
+static struct nvram_os_partition rtas_log_partition = {
+ .name = "ibm,rtas-log",
+ .req_size = 2079,
+ .min_size = 1055,
+ .index = -1
+};
+
+static struct nvram_os_partition oops_log_partition = {
+ .name = "lnx,oops-log",
+ .req_size = 4000,
+ .min_size = 2000,
+ .index = -1
+};
+
+static const char *pseries_nvram_os_partitions[] = {
+ "ibm,rtas-log",
+ "lnx,oops-log",
+ NULL
+};
+
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *old_msgs, unsigned long old_len,
+ const char *new_msgs, unsigned long new_len);
+
+static struct kmsg_dumper nvram_kmsg_dumper = {
+ .dump = oops_to_nvram
+};
+
+/* See clobbering_unread_rtas_event() */
+#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */
+static unsigned long last_unread_rtas_event; /* timestamp */
+
+/*
+ * For capturing and compressing an oops or panic report...
+
+ * big_oops_buf[] holds the uncompressed text we're capturing.
+ *
+ * oops_buf[] holds the compressed text, preceded by a prefix.
+ * The prefix is just a u16 holding the length of the compressed* text.
+ * (*Or uncompressed, if compression fails.) oops_buf[] gets written
+ * to NVRAM.
+ *
+ * oops_len points to the prefix. oops_data points to the compressed text.
+ *
+ * +- oops_buf
+ * | +- oops_data
+ * v v
+ * +------------+-----------------------------------------------+
+ * | length | text |
+ * | (2 bytes) | (oops_data_sz bytes) |
+ * +------------+-----------------------------------------------+
+ * ^
+ * +- oops_len
+ *
+ * We preallocate these buffers during init to avoid kmalloc during oops/panic.
+ */
+static size_t big_oops_buf_sz;
+static char *big_oops_buf, *oops_buf;
+static u16 *oops_len;
+static char *oops_data;
+static size_t oops_data_sz;
+
+/* Compression parameters */
+#define COMPR_LEVEL 6
+#define WINDOW_BITS 12
+#define MEM_LEVEL 4
+static struct z_stream_s stream;
+
+static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index)
+{
+ unsigned int i;
+ unsigned long len;
+ int done;
+ unsigned long flags;
+ char *p = buf;
+
+
+ if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ if (*index >= nvram_size)
+ return 0;
+
+ i = *index;
+ if (i + count > nvram_size)
+ count = nvram_size - i;
+
+ spin_lock_irqsave(&nvram_lock, flags);
+
+ for (; count != 0; count -= len) {
+ len = count;
+ if (len > NVRW_CNT)
+ len = NVRW_CNT;
+
+ if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf),
+ len) != 0) || len != done) {
+ spin_unlock_irqrestore(&nvram_lock, flags);
+ return -EIO;
+ }
+
+ memcpy(p, nvram_buf, len);
+
+ p += len;
+ i += len;
+ }
+
+ spin_unlock_irqrestore(&nvram_lock, flags);
+
+ *index = i;
+ return p - buf;
+}
+
+static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index)
+{
+ unsigned int i;
+ unsigned long len;
+ int done;
+ unsigned long flags;
+ const char *p = buf;
+
+ if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ if (*index >= nvram_size)
+ return 0;
+
+ i = *index;
+ if (i + count > nvram_size)
+ count = nvram_size - i;
+
+ spin_lock_irqsave(&nvram_lock, flags);
+
+ for (; count != 0; count -= len) {
+ len = count;
+ if (len > NVRW_CNT)
+ len = NVRW_CNT;
+
+ memcpy(nvram_buf, p, len);
+
+ if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf),
+ len) != 0) || len != done) {
+ spin_unlock_irqrestore(&nvram_lock, flags);
+ return -EIO;
+ }
+
+ p += len;
+ i += len;
+ }
+ spin_unlock_irqrestore(&nvram_lock, flags);
+
+ *index = i;
+ return p - buf;
+}
+
+static ssize_t pSeries_nvram_get_size(void)
+{
+ return nvram_size ? nvram_size : -ENODEV;
+}
+
+
+/* nvram_write_os_partition, nvram_write_error_log
+ *
+ * We need to buffer the error logs into nvram to ensure that we have
+ * the failure information to decode. If we have a severe error there
+ * is no way to guarantee that the OS or the machine is in a state to
+ * get back to user land and write the error to disk. For example if
+ * the SCSI device driver causes a Machine Check by writing to a bad
+ * IO address, there is no way of guaranteeing that the device driver
+ * is in any state that is would also be able to write the error data
+ * captured to disk, thus we buffer it in NVRAM for analysis on the
+ * next boot.
+ *
+ * In NVRAM the partition containing the error log buffer will looks like:
+ * Header (in bytes):
+ * +-----------+----------+--------+------------+------------------+
+ * | signature | checksum | length | name | data |
+ * |0 |1 |2 3|4 15|16 length-1|
+ * +-----------+----------+--------+------------+------------------+
+ *
+ * The 'data' section would look like (in bytes):
+ * +--------------+------------+-----------------------------------+
+ * | event_logged | sequence # | error log |
+ * |0 3|4 7|8 error_log_size-1|
+ * +--------------+------------+-----------------------------------+
+ *
+ * event_logged: 0 if event has not been logged to syslog, 1 if it has
+ * sequence #: The unique sequence # for each event. (until it wraps)
+ * error log: The error log from event_scan
+ */
+int nvram_write_os_partition(struct nvram_os_partition *part, char * buff,
+ int length, unsigned int err_type, unsigned int error_log_cnt)
+{
+ int rc;
+ loff_t tmp_index;
+ struct err_log_info info;
+
+ if (part->index == -1) {
+ return -ESPIPE;
+ }
+
+ if (length > part->size) {
+ length = part->size;
+ }
+
+ info.error_type = err_type;
+ info.seq_num = error_log_cnt;
+
+ tmp_index = part->index;
+
+ rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index);
+ if (rc <= 0) {
+ pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
+ return rc;
+ }
+
+ rc = ppc_md.nvram_write(buff, length, &tmp_index);
+ if (rc <= 0) {
+ pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+int nvram_write_error_log(char * buff, int length,
+ unsigned int err_type, unsigned int error_log_cnt)
+{
+ int rc = nvram_write_os_partition(&rtas_log_partition, buff, length,
+ err_type, error_log_cnt);
+ if (!rc)
+ last_unread_rtas_event = get_seconds();
+ return rc;
+}
+
+/* nvram_read_error_log
+ *
+ * Reads nvram for error log for at most 'length'
+ */
+int nvram_read_error_log(char * buff, int length,
+ unsigned int * err_type, unsigned int * error_log_cnt)
+{
+ int rc;
+ loff_t tmp_index;
+ struct err_log_info info;
+
+ if (rtas_log_partition.index == -1)
+ return -1;
+
+ if (length > rtas_log_partition.size)
+ length = rtas_log_partition.size;
+
+ tmp_index = rtas_log_partition.index;
+
+ rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index);
+ if (rc <= 0) {
+ printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+ return rc;
+ }
+
+ rc = ppc_md.nvram_read(buff, length, &tmp_index);
+ if (rc <= 0) {
+ printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc);
+ return rc;
+ }
+
+ *error_log_cnt = info.seq_num;
+ *err_type = info.error_type;
+
+ return 0;
+}
+
+/* This doesn't actually zero anything, but it sets the event_logged
+ * word to tell that this event is safely in syslog.
+ */
+int nvram_clear_error_log(void)
+{
+ loff_t tmp_index;
+ int clear_word = ERR_FLAG_ALREADY_LOGGED;
+ int rc;
+
+ if (rtas_log_partition.index == -1)
+ return -1;
+
+ tmp_index = rtas_log_partition.index;
+
+ rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
+ if (rc <= 0) {
+ printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc);
+ return rc;
+ }
+ last_unread_rtas_event = 0;
+
+ return 0;
+}
+
+/* pseries_nvram_init_os_partition
+ *
+ * This sets up a partition with an "OS" signature.
+ *
+ * The general strategy is the following:
+ * 1.) If a partition with the indicated name already exists...
+ * - If it's large enough, use it.
+ * - Otherwise, recycle it and keep going.
+ * 2.) Search for a free partition that is large enough.
+ * 3.) If there's not a free partition large enough, recycle any obsolete
+ * OS partitions and try again.
+ * 4.) Will first try getting a chunk that will satisfy the requested size.
+ * 5.) If a chunk of the requested size cannot be allocated, then try finding
+ * a chunk that will satisfy the minum needed.
+ *
+ * Returns 0 on success, else -1.
+ */
+static int __init pseries_nvram_init_os_partition(struct nvram_os_partition
+ *part)
+{
+ loff_t p;
+ int size;
+
+ /* Scan nvram for partitions */
+ nvram_scan_partitions();
+
+ /* Look for ours */
+ p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size);
+
+ /* Found one but too small, remove it */
+ if (p && size < part->min_size) {
+ pr_info("nvram: Found too small %s partition,"
+ " removing it...\n", part->name);
+ nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL);
+ p = 0;
+ }
+
+ /* Create one if we didn't find */
+ if (!p) {
+ p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+ part->req_size, part->min_size);
+ if (p == -ENOSPC) {
+ pr_info("nvram: No room to create %s partition, "
+ "deleting any obsolete OS partitions...\n",
+ part->name);
+ nvram_remove_partition(NULL, NVRAM_SIG_OS,
+ pseries_nvram_os_partitions);
+ p = nvram_create_partition(part->name, NVRAM_SIG_OS,
+ part->req_size, part->min_size);
+ }
+ }
+
+ if (p <= 0) {
+ pr_err("nvram: Failed to find or create %s"
+ " partition, err %d\n", part->name, (int)p);
+ return -1;
+ }
+
+ part->index = p;
+ part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info);
+
+ return 0;
+}
+
+static void __init nvram_init_oops_partition(int rtas_partition_exists)
+{
+ int rc;
+
+ rc = pseries_nvram_init_os_partition(&oops_log_partition);
+ if (rc != 0) {
+ if (!rtas_partition_exists)
+ return;
+ pr_notice("nvram: Using %s partition to log both"
+ " RTAS errors and oops/panic reports\n",
+ rtas_log_partition.name);
+ memcpy(&oops_log_partition, &rtas_log_partition,
+ sizeof(rtas_log_partition));
+ }
+ oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL);
+ if (!oops_buf) {
+ pr_err("nvram: No memory for %s partition\n",
+ oops_log_partition.name);
+ return;
+ }
+ oops_len = (u16*) oops_buf;
+ oops_data = oops_buf + sizeof(u16);
+ oops_data_sz = oops_log_partition.size - sizeof(u16);
+
+ /*
+ * Figure compression (preceded by elimination of each line's <n>
+ * severity prefix) will reduce the oops/panic report to at most
+ * 45% of its original size.
+ */
+ big_oops_buf_sz = (oops_data_sz * 100) / 45;
+ big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
+ if (big_oops_buf) {
+ stream.workspace = kmalloc(zlib_deflate_workspacesize(
+ WINDOW_BITS, MEM_LEVEL), GFP_KERNEL);
+ if (!stream.workspace) {
+ pr_err("nvram: No memory for compression workspace; "
+ "skipping compression of %s partition data\n",
+ oops_log_partition.name);
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+ }
+ } else {
+ pr_err("No memory for uncompressed %s data; "
+ "skipping compression\n", oops_log_partition.name);
+ stream.workspace = NULL;
+ }
+
+ rc = kmsg_dump_register(&nvram_kmsg_dumper);
+ if (rc != 0) {
+ pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc);
+ kfree(oops_buf);
+ kfree(big_oops_buf);
+ kfree(stream.workspace);
+ }
+}
+
+static int __init pseries_nvram_init_log_partitions(void)
+{
+ int rc;
+
+ rc = pseries_nvram_init_os_partition(&rtas_log_partition);
+ nvram_init_oops_partition(rc == 0);
+ return 0;
+}
+machine_arch_initcall(pseries, pseries_nvram_init_log_partitions);
+
+int __init pSeries_nvram_init(void)
+{
+ struct device_node *nvram;
+ const unsigned int *nbytes_p;
+ unsigned int proplen;
+
+ nvram = of_find_node_by_type(NULL, "nvram");
+ if (nvram == NULL)
+ return -ENODEV;
+
+ nbytes_p = of_get_property(nvram, "#bytes", &proplen);
+ if (nbytes_p == NULL || proplen != sizeof(unsigned int)) {
+ of_node_put(nvram);
+ return -EIO;
+ }
+
+ nvram_size = *nbytes_p;
+
+ nvram_fetch = rtas_token("nvram-fetch");
+ nvram_store = rtas_token("nvram-store");
+ printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
+ of_node_put(nvram);
+
+ ppc_md.nvram_read = pSeries_nvram_read;
+ ppc_md.nvram_write = pSeries_nvram_write;
+ ppc_md.nvram_size = pSeries_nvram_get_size;
+
+ return 0;
+}
+
+/*
+ * Try to capture the last capture_len bytes of the printk buffer. Return
+ * the amount actually captured.
+ */
+static size_t capture_last_msgs(const char *old_msgs, size_t old_len,
+ const char *new_msgs, size_t new_len,
+ char *captured, size_t capture_len)
+{
+ if (new_len >= capture_len) {
+ memcpy(captured, new_msgs + (new_len - capture_len),
+ capture_len);
+ return capture_len;
+ } else {
+ /* Grab the end of old_msgs. */
+ size_t old_tail_len = min(old_len, capture_len - new_len);
+ memcpy(captured, old_msgs + (old_len - old_tail_len),
+ old_tail_len);
+ memcpy(captured + old_tail_len, new_msgs, new_len);
+ return old_tail_len + new_len;
+ }
+}
+
+/*
+ * Are we using the ibm,rtas-log for oops/panic reports? And if so,
+ * would logging this oops/panic overwrite an RTAS event that rtas_errd
+ * hasn't had a chance to read and process? Return 1 if so, else 0.
+ *
+ * We assume that if rtas_errd hasn't read the RTAS event in
+ * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to.
+ */
+static int clobbering_unread_rtas_event(void)
+{
+ return (oops_log_partition.index == rtas_log_partition.index
+ && last_unread_rtas_event
+ && get_seconds() - last_unread_rtas_event <=
+ NVRAM_RTAS_READ_TIMEOUT);
+}
+
+/* Squeeze out each line's <n> severity prefix. */
+static size_t elide_severities(char *buf, size_t len)
+{
+ char *in, *out, *buf_end = buf + len;
+ /* Assume a <n> at the very beginning marks the start of a line. */
+ int newline = 1;
+
+ in = out = buf;
+ while (in < buf_end) {
+ if (newline && in+3 <= buf_end &&
+ *in == '<' && isdigit(in[1]) && in[2] == '>') {
+ in += 3;
+ newline = 0;
+ } else {
+ newline = (*in == '\n');
+ *out++ = *in++;
+ }
+ }
+ return out - buf;
+}
+
+/* Derived from logfs_compress() */
+static int nvram_compress(const void *in, void *out, size_t inlen,
+ size_t outlen)
+{
+ int err, ret;
+
+ ret = -EIO;
+ err = zlib_deflateInit2(&stream, COMPR_LEVEL, Z_DEFLATED, WINDOW_BITS,
+ MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (err != Z_OK)
+ goto error;
+
+ stream.next_in = in;
+ stream.avail_in = inlen;
+ stream.total_in = 0;
+ stream.next_out = out;
+ stream.avail_out = outlen;
+ stream.total_out = 0;
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END)
+ goto error;
+
+ err = zlib_deflateEnd(&stream);
+ if (err != Z_OK)
+ goto error;
+
+ if (stream.total_out >= stream.total_in)
+ goto error;
+
+ ret = stream.total_out;
+error:
+ return ret;
+}
+
+/* Compress the text from big_oops_buf into oops_buf. */
+static int zip_oops(size_t text_len)
+{
+ int zipped_len = nvram_compress(big_oops_buf, oops_data, text_len,
+ oops_data_sz);
+ if (zipped_len < 0) {
+ pr_err("nvram: compression failed; returned %d\n", zipped_len);
+ pr_err("nvram: logging uncompressed oops/panic report\n");
+ return -1;
+ }
+ *oops_len = (u16) zipped_len;
+ return 0;
+}
+
+/*
+ * This is our kmsg_dump callback, called after an oops or panic report
+ * has been written to the printk buffer. We want to capture as much
+ * of the printk buffer as possible. First, capture as much as we can
+ * that we think will compress sufficiently to fit in the lnx,oops-log
+ * partition. If that's too much, go back and capture uncompressed text.
+ */
+static void oops_to_nvram(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *old_msgs, unsigned long old_len,
+ const char *new_msgs, unsigned long new_len)
+{
+ static unsigned int oops_count = 0;
+ static bool panicking = false;
+ static DEFINE_SPINLOCK(lock);
+ unsigned long flags;
+ size_t text_len;
+ unsigned int err_type = ERR_TYPE_KERNEL_PANIC_GZ;
+ int rc = -1;
+
+ switch (reason) {
+ case KMSG_DUMP_RESTART:
+ case KMSG_DUMP_HALT:
+ case KMSG_DUMP_POWEROFF:
+ /* These are almost always orderly shutdowns. */
+ return;
+ case KMSG_DUMP_OOPS:
+ break;
+ case KMSG_DUMP_PANIC:
+ panicking = true;
+ break;
+ case KMSG_DUMP_EMERG:
+ if (panicking)
+ /* Panic report already captured. */
+ return;
+ break;
+ default:
+ pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n",
+ __FUNCTION__, (int) reason);
+ return;
+ }
+
+ if (clobbering_unread_rtas_event())
+ return;
+
+ if (!spin_trylock_irqsave(&lock, flags))
+ return;
+
+ if (big_oops_buf) {
+ text_len = capture_last_msgs(old_msgs, old_len,
+ new_msgs, new_len, big_oops_buf, big_oops_buf_sz);
+ text_len = elide_severities(big_oops_buf, text_len);
+ rc = zip_oops(text_len);
+ }
+ if (rc != 0) {
+ text_len = capture_last_msgs(old_msgs, old_len,
+ new_msgs, new_len, oops_data, oops_data_sz);
+ err_type = ERR_TYPE_KERNEL_PANIC;
+ *oops_len = (u16) text_len;
+ }
+
+ (void) nvram_write_os_partition(&oops_log_partition, oops_buf,
+ (int) (sizeof(*oops_len) + *oops_len), err_type, ++oops_count);
+
+ spin_unlock_irqrestore(&lock, flags);
+}
diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h
new file mode 100644
index 00000000000..08672d9136a
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/offline_states.h
@@ -0,0 +1,37 @@
+#ifndef _OFFLINE_STATES_H_
+#define _OFFLINE_STATES_H_
+
+/* Cpu offline states go here */
+enum cpu_state_vals {
+ CPU_STATE_OFFLINE,
+ CPU_STATE_INACTIVE,
+ CPU_STATE_ONLINE,
+ CPU_MAX_OFFLINE_STATES
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern enum cpu_state_vals get_cpu_current_state(int cpu);
+extern void set_cpu_current_state(int cpu, enum cpu_state_vals state);
+extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state);
+extern void set_default_offline_state(int cpu);
+#else
+static inline enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+ return CPU_STATE_ONLINE;
+}
+
+static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+}
+
+static inline void set_default_offline_state(int cpu)
+{
+}
+#endif
+
+extern enum cpu_state_vals get_preferred_offline_state(int cpu);
+#endif
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
new file mode 100644
index 00000000000..2c6ded29f73
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen, IBM Corporation
+ * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * pSeries specific routines for PCI.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/eeh.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/ppc-pci.h>
+
+#if 0
+void pcibios_name_device(struct pci_dev *dev)
+{
+ struct device_node *dn;
+
+ /*
+ * Add IBM loc code (slot) as a prefix to the device names for service
+ */
+ dn = pci_device_to_OF_node(dev);
+ if (dn) {
+ const char *loc_code = of_get_property(dn, "ibm,loc-code", 0);
+ if (loc_code) {
+ int loc_len = strlen(loc_code);
+ if (loc_len < sizeof(dev->dev.name)) {
+ memmove(dev->dev.name+loc_len+1, dev->dev.name,
+ sizeof(dev->dev.name)-loc_len-1);
+ memcpy(dev->dev.name, loc_code, loc_len);
+ dev->dev.name[loc_len] = ' ';
+ dev->dev.name[sizeof(dev->dev.name)-1] = '\0';
+ }
+ }
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device);
+#endif
+
+static void __init pSeries_request_regions(void)
+{
+ if (!isa_io_base)
+ return;
+
+ request_region(0x20,0x20,"pic1");
+ request_region(0xa0,0x20,"pic2");
+ request_region(0x00,0x20,"dma1");
+ request_region(0x40,0x20,"timer");
+ request_region(0x80,0x10,"dma page reg");
+ request_region(0xc0,0x20,"dma2");
+}
+
+void __init pSeries_final_fixup(void)
+{
+ pSeries_request_regions();
+
+ pci_addr_cache_build();
+}
+
+/*
+ * Assume the winbond 82c105 is the IDE controller on a
+ * p610/p615/p630. We should probably be more careful in case
+ * someone tries to plug in a similar adapter.
+ */
+static void fixup_winbond_82c105(struct pci_dev* dev)
+{
+ int i;
+ unsigned int reg;
+
+ if (!machine_is(pseries))
+ return;
+
+ printk("Using INTC for W82c105 IDE controller.\n");
+ pci_read_config_dword(dev, 0x40, &reg);
+ /* Enable LEGIRQ to use INTC instead of ISA interrupts */
+ pci_write_config_dword(dev, 0x40, reg | (1<<11));
+
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) {
+ /* zap the 2nd function of the winbond chip */
+ if (dev->resource[i].flags & IORESOURCE_IO
+ && dev->bus->number == 0 && dev->devfn == 0x81)
+ dev->resource[i].flags &= ~IORESOURCE_IO;
+ if (dev->resource[i].start == 0 && dev->resource[i].end) {
+ dev->resource[i].flags = 0;
+ dev->resource[i].end = 0;
+ }
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105,
+ fixup_winbond_82c105);
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
new file mode 100644
index 00000000000..55d4ec1bd1a
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -0,0 +1,209 @@
+/*
+ * PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code
+ * for RPA-compliant PPC64 platform.
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+static struct pci_bus *
+find_bus_among_children(struct pci_bus *bus,
+ struct device_node *dn)
+{
+ struct pci_bus *child = NULL;
+ struct list_head *tmp;
+ struct device_node *busdn;
+
+ busdn = pci_bus_to_OF_node(bus);
+ if (busdn == dn)
+ return bus;
+
+ list_for_each(tmp, &bus->children) {
+ child = find_bus_among_children(pci_bus_b(tmp), dn);
+ if (child)
+ break;
+ };
+ return child;
+}
+
+struct pci_bus *
+pcibios_find_pci_bus(struct device_node *dn)
+{
+ struct pci_dn *pdn = dn->data;
+
+ if (!pdn || !pdn->phb || !pdn->phb->bus)
+ return NULL;
+
+ return find_bus_among_children(pdn->phb->bus, dn);
+}
+EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
+
+/**
+ * pcibios_remove_pci_devices - remove all devices under this bus
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ */
+void pcibios_remove_pci_devices(struct pci_bus *bus)
+{
+ struct pci_dev *dev, *tmp;
+ struct pci_bus *child_bus;
+
+ /* First go down child busses */
+ list_for_each_entry(child_bus, &bus->children, node)
+ pcibios_remove_pci_devices(child_bus);
+
+ pr_debug("PCI: Removing devices on bus %04x:%02x\n",
+ pci_domain_nr(bus), bus->number);
+ list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+ pr_debug(" * Removing %s...\n", pci_name(dev));
+ eeh_remove_bus_device(dev);
+ pci_remove_bus_device(dev);
+ }
+}
+EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+
+/**
+ * pcibios_add_pci_devices - adds new pci devices to bus
+ *
+ * This routine will find and fixup new pci devices under
+ * the indicated bus. This routine presumes that there
+ * might already be some devices under this bridge, so
+ * it carefully tries to add only new devices. (And that
+ * is how this routine differs from other, similar pcibios
+ * routines.)
+ */
+void pcibios_add_pci_devices(struct pci_bus * bus)
+{
+ int slotno, num, mode, pass, max;
+ struct pci_dev *dev;
+ struct device_node *dn = pci_bus_to_OF_node(bus);
+
+ eeh_add_device_tree_early(dn);
+
+ mode = PCI_PROBE_NORMAL;
+ if (ppc_md.pci_probe_mode)
+ mode = ppc_md.pci_probe_mode(bus);
+
+ if (mode == PCI_PROBE_DEVTREE) {
+ /* use ofdt-based probe */
+ of_rescan_bus(dn, bus);
+ } else if (mode == PCI_PROBE_NORMAL) {
+ /* use legacy probe */
+ slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
+ num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+ if (!num)
+ return;
+ pcibios_setup_bus_devices(bus);
+ max = bus->secondary;
+ for (pass=0; pass < 2; pass++)
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+ dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+ max = pci_scan_bridge(bus, dev, max, pass);
+ }
+ }
+ pcibios_finish_adding_to_bus(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
+
+struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
+{
+ struct pci_controller *phb;
+
+ pr_debug("PCI: Initializing new hotplug PHB %s\n", dn->full_name);
+
+ phb = pcibios_alloc_controller(dn);
+ if (!phb)
+ return NULL;
+ rtas_setup_phb(phb);
+ pci_process_bridge_OF_ranges(phb, dn, 0);
+
+ pci_devs_phb_init_dynamic(phb);
+
+ if (dn->child)
+ eeh_add_device_tree_early(dn);
+
+ pcibios_scan_phb(phb);
+ pcibios_finish_adding_to_bus(phb->bus);
+
+ return phb;
+}
+EXPORT_SYMBOL_GPL(init_phb_dynamic);
+
+/* RPA-specific bits for removing PHBs */
+int remove_phb_dynamic(struct pci_controller *phb)
+{
+ struct pci_bus *b = phb->bus;
+ struct resource *res;
+ int rc, i;
+
+ pr_debug("PCI: Removing PHB %04x:%02x...\n",
+ pci_domain_nr(b), b->number);
+
+ /* We cannot to remove a root bus that has children */
+ if (!(list_empty(&b->children) && list_empty(&b->devices)))
+ return -EBUSY;
+
+ /* We -know- there aren't any child devices anymore at this stage
+ * and thus, we can safely unmap the IO space as it's not in use
+ */
+ res = &phb->io_resource;
+ if (res->flags & IORESOURCE_IO) {
+ rc = pcibios_unmap_io_space(b);
+ if (rc) {
+ printk(KERN_ERR "%s: failed to unmap IO on bus %s\n",
+ __func__, b->name);
+ return 1;
+ }
+ }
+
+ /* Unregister the bridge device from sysfs and remove the PCI bus */
+ device_unregister(b->bridge);
+ phb->bus = NULL;
+ pci_remove_bus(b);
+
+ /* Now release the IO resource */
+ if (res->flags & IORESOURCE_IO)
+ release_resource(res);
+
+ /* Release memory resources */
+ for (i = 0; i < 3; ++i) {
+ res = &phb->mem_resources[i];
+ if (!(res->flags & IORESOURCE_MEM))
+ continue;
+ release_resource(res);
+ }
+
+ /* Free pci_controller data structure */
+ pcibios_free_controller(phb);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(remove_phb_dynamic);
diff --git a/arch/powerpc/platforms/pseries/phyp_dump.c b/arch/powerpc/platforms/pseries/phyp_dump.c
new file mode 100644
index 00000000000..6e7742da007
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/phyp_dump.c
@@ -0,0 +1,513 @@
+/*
+ * Hypervisor-assisted dump
+ *
+ * Linas Vepstas, Manish Ahuja 2008
+ * Copyright 2008 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/of.h>
+#include <linux/pfn.h>
+#include <linux/swap.h>
+#include <linux/sysfs.h>
+
+#include <asm/page.h>
+#include <asm/phyp_dump.h>
+#include <asm/machdep.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+/* Variables, used to communicate data between early boot and late boot */
+static struct phyp_dump phyp_dump_vars;
+struct phyp_dump *phyp_dump_info = &phyp_dump_vars;
+
+static int ibm_configure_kernel_dump;
+/* ------------------------------------------------- */
+/* RTAS interfaces to declare the dump regions */
+
+struct dump_section {
+ u32 dump_flags;
+ u16 source_type;
+ u16 error_flags;
+ u64 source_address;
+ u64 source_length;
+ u64 length_copied;
+ u64 destination_address;
+};
+
+struct phyp_dump_header {
+ u32 version;
+ u16 num_of_sections;
+ u16 status;
+
+ u32 first_offset_section;
+ u32 dump_disk_section;
+ u64 block_num_dd;
+ u64 num_of_blocks_dd;
+ u32 offset_dd;
+ u32 maxtime_to_auto;
+ /* No dump disk path string used */
+
+ struct dump_section cpu_data;
+ struct dump_section hpte_data;
+ struct dump_section kernel_data;
+};
+
+/* The dump header *must be* in low memory, so .bss it */
+static struct phyp_dump_header phdr;
+
+#define NUM_DUMP_SECTIONS 3
+#define DUMP_HEADER_VERSION 0x1
+#define DUMP_REQUEST_FLAG 0x1
+#define DUMP_SOURCE_CPU 0x0001
+#define DUMP_SOURCE_HPTE 0x0002
+#define DUMP_SOURCE_RMO 0x0011
+#define DUMP_ERROR_FLAG 0x2000
+#define DUMP_TRIGGERED 0x4000
+#define DUMP_PERFORMED 0x8000
+
+
+/**
+ * init_dump_header() - initialize the header declaring a dump
+ * Returns: length of dump save area.
+ *
+ * When the hypervisor saves crashed state, it needs to put
+ * it somewhere. The dump header tells the hypervisor where
+ * the data can be saved.
+ */
+static unsigned long init_dump_header(struct phyp_dump_header *ph)
+{
+ unsigned long addr_offset = 0;
+
+ /* Set up the dump header */
+ ph->version = DUMP_HEADER_VERSION;
+ ph->num_of_sections = NUM_DUMP_SECTIONS;
+ ph->status = 0;
+
+ ph->first_offset_section =
+ (u32)offsetof(struct phyp_dump_header, cpu_data);
+ ph->dump_disk_section = 0;
+ ph->block_num_dd = 0;
+ ph->num_of_blocks_dd = 0;
+ ph->offset_dd = 0;
+
+ ph->maxtime_to_auto = 0; /* disabled */
+
+ /* The first two sections are mandatory */
+ ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG;
+ ph->cpu_data.source_type = DUMP_SOURCE_CPU;
+ ph->cpu_data.source_address = 0;
+ ph->cpu_data.source_length = phyp_dump_info->cpu_state_size;
+ ph->cpu_data.destination_address = addr_offset;
+ addr_offset += phyp_dump_info->cpu_state_size;
+
+ ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG;
+ ph->hpte_data.source_type = DUMP_SOURCE_HPTE;
+ ph->hpte_data.source_address = 0;
+ ph->hpte_data.source_length = phyp_dump_info->hpte_region_size;
+ ph->hpte_data.destination_address = addr_offset;
+ addr_offset += phyp_dump_info->hpte_region_size;
+
+ /* This section describes the low kernel region */
+ ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG;
+ ph->kernel_data.source_type = DUMP_SOURCE_RMO;
+ ph->kernel_data.source_address = PHYP_DUMP_RMR_START;
+ ph->kernel_data.source_length = PHYP_DUMP_RMR_END;
+ ph->kernel_data.destination_address = addr_offset;
+ addr_offset += ph->kernel_data.source_length;
+
+ return addr_offset;
+}
+
+static void print_dump_header(const struct phyp_dump_header *ph)
+{
+#ifdef DEBUG
+ if (ph == NULL)
+ return;
+
+ printk(KERN_INFO "dump header:\n");
+ /* setup some ph->sections required */
+ printk(KERN_INFO "version = %d\n", ph->version);
+ printk(KERN_INFO "Sections = %d\n", ph->num_of_sections);
+ printk(KERN_INFO "Status = 0x%x\n", ph->status);
+
+ /* No ph->disk, so all should be set to 0 */
+ printk(KERN_INFO "Offset to first section 0x%x\n",
+ ph->first_offset_section);
+ printk(KERN_INFO "dump disk sections should be zero\n");
+ printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section);
+ printk(KERN_INFO "block num = %lld\n", ph->block_num_dd);
+ printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd);
+ printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd);
+ printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto);
+
+ /*set cpu state and hpte states as well scratch pad area */
+ printk(KERN_INFO " CPU AREA\n");
+ printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags);
+ printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type);
+ printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags);
+ printk(KERN_INFO "cpu source_address =%llx\n",
+ ph->cpu_data.source_address);
+ printk(KERN_INFO "cpu source_length =%llx\n",
+ ph->cpu_data.source_length);
+ printk(KERN_INFO "cpu length_copied =%llx\n",
+ ph->cpu_data.length_copied);
+
+ printk(KERN_INFO " HPTE AREA\n");
+ printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags);
+ printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type);
+ printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags);
+ printk(KERN_INFO "HPTE source_address =%llx\n",
+ ph->hpte_data.source_address);
+ printk(KERN_INFO "HPTE source_length =%llx\n",
+ ph->hpte_data.source_length);
+ printk(KERN_INFO "HPTE length_copied =%llx\n",
+ ph->hpte_data.length_copied);
+
+ printk(KERN_INFO " SRSD AREA\n");
+ printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags);
+ printk(KERN_INFO "SRSD source_type =%d\n", ph->kernel_data.source_type);
+ printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags);
+ printk(KERN_INFO "SRSD source_address =%llx\n",
+ ph->kernel_data.source_address);
+ printk(KERN_INFO "SRSD source_length =%llx\n",
+ ph->kernel_data.source_length);
+ printk(KERN_INFO "SRSD length_copied =%llx\n",
+ ph->kernel_data.length_copied);
+#endif
+}
+
+static ssize_t show_phyp_dump_active(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+
+ /* create filesystem entry so kdump is phyp-dump aware */
+ return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot);
+}
+
+static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600,
+ show_phyp_dump_active,
+ NULL);
+
+static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr)
+{
+ int rc;
+
+ /* Add addr value if not initialized before */
+ if (ph->cpu_data.destination_address == 0) {
+ ph->cpu_data.destination_address += addr;
+ ph->hpte_data.destination_address += addr;
+ ph->kernel_data.destination_address += addr;
+ }
+
+ /* ToDo Invalidate kdump and free memory range. */
+
+ do {
+ rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
+ 1, ph, sizeof(struct phyp_dump_header));
+ } while (rtas_busy_delay(rc));
+
+ if (rc) {
+ printk(KERN_ERR "phyp-dump: unexpected error (%d) on "
+ "register\n", rc);
+ print_dump_header(ph);
+ return;
+ }
+
+ rc = sysfs_create_file(kernel_kobj, &pdl.attr);
+ if (rc)
+ printk(KERN_ERR "phyp-dump: unable to create sysfs"
+ " file (%d)\n", rc);
+}
+
+static
+void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr)
+{
+ int rc;
+
+ /* Add addr value if not initialized before */
+ if (ph->cpu_data.destination_address == 0) {
+ ph->cpu_data.destination_address += addr;
+ ph->hpte_data.destination_address += addr;
+ ph->kernel_data.destination_address += addr;
+ }
+
+ do {
+ rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL,
+ 2, ph, sizeof(struct phyp_dump_header));
+ } while (rtas_busy_delay(rc));
+
+ if (rc) {
+ printk(KERN_ERR "phyp-dump: unexpected error (%d) "
+ "on invalidate\n", rc);
+ print_dump_header(ph);
+ }
+}
+
+/* ------------------------------------------------- */
+/**
+ * release_memory_range -- release memory previously memblock_reserved
+ * @start_pfn: starting physical frame number
+ * @nr_pages: number of pages to free.
+ *
+ * This routine will release memory that had been previously
+ * memblock_reserved in early boot. The released memory becomes
+ * available for genreal use.
+ */
+static void release_memory_range(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct page *rpage;
+ unsigned long end_pfn;
+ long i;
+
+ end_pfn = start_pfn + nr_pages;
+
+ for (i = start_pfn; i <= end_pfn; i++) {
+ rpage = pfn_to_page(i);
+ if (PageReserved(rpage)) {
+ ClearPageReserved(rpage);
+ init_page_count(rpage);
+ __free_page(rpage);
+ totalram_pages++;
+ }
+ }
+}
+
+/**
+ * track_freed_range -- Counts the range being freed.
+ * Once the counter goes to zero, it re-registers dump for
+ * future use.
+ */
+static void
+track_freed_range(unsigned long addr, unsigned long length)
+{
+ static unsigned long scratch_area_size, reserved_area_size;
+
+ if (addr < phyp_dump_info->init_reserve_start)
+ return;
+
+ if ((addr >= phyp_dump_info->init_reserve_start) &&
+ (addr <= phyp_dump_info->init_reserve_start +
+ phyp_dump_info->init_reserve_size))
+ reserved_area_size += length;
+
+ if ((addr >= phyp_dump_info->reserved_scratch_addr) &&
+ (addr <= phyp_dump_info->reserved_scratch_addr +
+ phyp_dump_info->reserved_scratch_size))
+ scratch_area_size += length;
+
+ if ((reserved_area_size == phyp_dump_info->init_reserve_size) &&
+ (scratch_area_size == phyp_dump_info->reserved_scratch_size)) {
+
+ invalidate_last_dump(&phdr,
+ phyp_dump_info->reserved_scratch_addr);
+ register_dump_area(&phdr,
+ phyp_dump_info->reserved_scratch_addr);
+ }
+}
+
+/* ------------------------------------------------- */
+/**
+ * sysfs_release_region -- sysfs interface to release memory range.
+ *
+ * Usage:
+ * "echo <start addr> <length> > /sys/kernel/release_region"
+ *
+ * Example:
+ * "echo 0x40000000 0x10000000 > /sys/kernel/release_region"
+ *
+ * will release 256MB starting at 1GB.
+ */
+static ssize_t store_release_region(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long start_addr, length, end_addr;
+ unsigned long start_pfn, nr_pages;
+ ssize_t ret;
+
+ ret = sscanf(buf, "%lx %lx", &start_addr, &length);
+ if (ret != 2)
+ return -EINVAL;
+
+ track_freed_range(start_addr, length);
+
+ /* Range-check - don't free any reserved memory that
+ * wasn't reserved for phyp-dump */
+ if (start_addr < phyp_dump_info->init_reserve_start)
+ start_addr = phyp_dump_info->init_reserve_start;
+
+ end_addr = phyp_dump_info->init_reserve_start +
+ phyp_dump_info->init_reserve_size;
+ if (start_addr+length > end_addr)
+ length = end_addr - start_addr;
+
+ /* Release the region of memory assed in by user */
+ start_pfn = PFN_DOWN(start_addr);
+ nr_pages = PFN_DOWN(length);
+ release_memory_range(start_pfn, nr_pages);
+
+ return count;
+}
+
+static ssize_t show_release_region(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ u64 second_addr_range;
+
+ /* total reserved size - start of scratch area */
+ second_addr_range = phyp_dump_info->init_reserve_size -
+ phyp_dump_info->reserved_scratch_size;
+ return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:"
+ " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n",
+ phdr.cpu_data.destination_address,
+ phdr.cpu_data.length_copied,
+ phdr.hpte_data.destination_address,
+ phdr.hpte_data.length_copied,
+ phdr.kernel_data.destination_address,
+ phdr.kernel_data.length_copied,
+ phyp_dump_info->init_reserve_start,
+ second_addr_range);
+}
+
+static struct kobj_attribute rr = __ATTR(release_region, 0600,
+ show_release_region,
+ store_release_region);
+
+static int __init phyp_dump_setup(void)
+{
+ struct device_node *rtas;
+ const struct phyp_dump_header *dump_header = NULL;
+ unsigned long dump_area_start;
+ unsigned long dump_area_length;
+ int header_len = 0;
+ int rc;
+
+ /* If no memory was reserved in early boot, there is nothing to do */
+ if (phyp_dump_info->init_reserve_size == 0)
+ return 0;
+
+ /* Return if phyp dump not supported */
+ if (!phyp_dump_info->phyp_dump_configured)
+ return -ENOSYS;
+
+ /* Is there dump data waiting for us? If there isn't,
+ * then register a new dump area, and release all of
+ * the rest of the reserved ram.
+ *
+ * The /rtas/ibm,kernel-dump rtas node is present only
+ * if there is dump data waiting for us.
+ */
+ rtas = of_find_node_by_path("/rtas");
+ if (rtas) {
+ dump_header = of_get_property(rtas, "ibm,kernel-dump",
+ &header_len);
+ of_node_put(rtas);
+ }
+
+ ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump");
+
+ print_dump_header(dump_header);
+ dump_area_length = init_dump_header(&phdr);
+ /* align down */
+ dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK;
+
+ if (dump_header == NULL) {
+ register_dump_area(&phdr, dump_area_start);
+ return 0;
+ }
+
+ /* re-register the dump area, if old dump was invalid */
+ if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) {
+ invalidate_last_dump(&phdr, dump_area_start);
+ register_dump_area(&phdr, dump_area_start);
+ return 0;
+ }
+
+ if (dump_header) {
+ phyp_dump_info->reserved_scratch_addr =
+ dump_header->cpu_data.destination_address;
+ phyp_dump_info->reserved_scratch_size =
+ dump_header->cpu_data.source_length +
+ dump_header->hpte_data.source_length +
+ dump_header->kernel_data.source_length;
+ }
+
+ /* Should we create a dump_subsys, analogous to s390/ipl.c ? */
+ rc = sysfs_create_file(kernel_kobj, &rr.attr);
+ if (rc)
+ printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n",
+ rc);
+
+ /* ToDo: re-register the dump area, for next time. */
+ return 0;
+}
+machine_subsys_initcall(pseries, phyp_dump_setup);
+
+int __init early_init_dt_scan_phyp_dump(unsigned long node,
+ const char *uname, int depth, void *data)
+{
+ const unsigned int *sizes;
+
+ phyp_dump_info->phyp_dump_configured = 0;
+ phyp_dump_info->phyp_dump_is_active = 0;
+
+ if (depth != 1 || strcmp(uname, "rtas") != 0)
+ return 0;
+
+ if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL))
+ phyp_dump_info->phyp_dump_configured++;
+
+ if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL))
+ phyp_dump_info->phyp_dump_is_active++;
+
+ sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+ NULL);
+ if (!sizes)
+ return 0;
+
+ if (sizes[0] == 1)
+ phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]);
+
+ if (sizes[3] == 2)
+ phyp_dump_info->hpte_region_size =
+ *((unsigned long *)&sizes[4]);
+ return 1;
+}
+
+/* Look for phyp_dump= cmdline option */
+static int __init early_phyp_dump_enabled(char *p)
+{
+ phyp_dump_info->phyp_dump_at_boot = 1;
+
+ if (!p)
+ return 0;
+
+ if (strncmp(p, "1", 1) == 0)
+ phyp_dump_info->phyp_dump_at_boot = 1;
+ else if (strncmp(p, "0", 1) == 0)
+ phyp_dump_info->phyp_dump_at_boot = 0;
+
+ return 0;
+}
+early_param("phyp_dump", early_phyp_dump_enabled);
+
+/* Look for phyp_dump_reserve_size= cmdline option */
+static int __init early_phyp_dump_reserve_size(char *p)
+{
+ if (p)
+ phyp_dump_info->reserve_bootvar = memparse(p, &p);
+
+ return 0;
+}
+early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size);
diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h
new file mode 100644
index 00000000000..342797fc0f9
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h
@@ -0,0 +1,276 @@
+#ifndef _PSERIES_PLPAR_WRAPPERS_H
+#define _PSERIES_PLPAR_WRAPPERS_H
+
+#include <linux/string.h>
+
+#include <asm/hvcall.h>
+#include <asm/paca.h>
+#include <asm/page.h>
+
+/* Get state of physical CPU from query_cpu_stopped */
+int smp_query_cpu_stopped(unsigned int pcpu);
+#define QCSS_STOPPED 0
+#define QCSS_STOPPING 1
+#define QCSS_NOT_STOPPED 2
+#define QCSS_HARDWARE_ERROR -1
+#define QCSS_HARDWARE_BUSY -2
+
+static inline long poll_pending(void)
+{
+ return plpar_hcall_norets(H_POLL_PENDING);
+}
+
+static inline u8 get_cede_latency_hint(void)
+{
+ return get_lppaca()->gpr5_dword.fields.cede_latency_hint;
+}
+
+static inline void set_cede_latency_hint(u8 latency_hint)
+{
+ get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint;
+}
+
+static inline long cede_processor(void)
+{
+ return plpar_hcall_norets(H_CEDE);
+}
+
+static inline long extended_cede_processor(unsigned long latency_hint)
+{
+ long rc;
+ u8 old_latency_hint = get_cede_latency_hint();
+
+ set_cede_latency_hint(latency_hint);
+ rc = cede_processor();
+ set_cede_latency_hint(old_latency_hint);
+
+ return rc;
+}
+
+static inline long vpa_call(unsigned long flags, unsigned long cpu,
+ unsigned long vpa)
+{
+ /* flags are in bits 16-18 (counting from most significant bit) */
+ flags = flags << (63 - 18);
+
+ return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa);
+}
+
+static inline long unregister_vpa(unsigned long cpu)
+{
+ return vpa_call(0x5, cpu, 0);
+}
+
+static inline long register_vpa(unsigned long cpu, unsigned long vpa)
+{
+ return vpa_call(0x1, cpu, vpa);
+}
+
+static inline long unregister_slb_shadow(unsigned long cpu)
+{
+ return vpa_call(0x7, cpu, 0);
+}
+
+static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa)
+{
+ return vpa_call(0x3, cpu, vpa);
+}
+
+static inline long unregister_dtl(unsigned long cpu)
+{
+ return vpa_call(0x6, cpu, 0);
+}
+
+static inline long register_dtl(unsigned long cpu, unsigned long vpa)
+{
+ return vpa_call(0x2, cpu, vpa);
+}
+
+static inline long plpar_page_set_loaned(unsigned long vpa)
+{
+ unsigned long cmo_page_sz = cmo_get_page_size();
+ long rc = 0;
+ int i;
+
+ for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+ rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0);
+
+ for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+ plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE,
+ vpa + i - cmo_page_sz, 0);
+
+ return rc;
+}
+
+static inline long plpar_page_set_active(unsigned long vpa)
+{
+ unsigned long cmo_page_sz = cmo_get_page_size();
+ long rc = 0;
+ int i;
+
+ for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+ rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0);
+
+ for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+ plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED,
+ vpa + i - cmo_page_sz, 0);
+
+ return rc;
+}
+
+extern void vpa_init(int cpu);
+
+static inline long plpar_pte_enter(unsigned long flags,
+ unsigned long hpte_group, unsigned long hpte_v,
+ unsigned long hpte_r, unsigned long *slot)
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall(H_ENTER, retbuf, flags, hpte_group, hpte_v, hpte_r);
+
+ *slot = retbuf[0];
+
+ return rc;
+}
+
+static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex,
+ unsigned long avpn, unsigned long *old_pteh_ret,
+ unsigned long *old_ptel_ret)
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall(H_REMOVE, retbuf, flags, ptex, avpn);
+
+ *old_pteh_ret = retbuf[0];
+ *old_ptel_ret = retbuf[1];
+
+ return rc;
+}
+
+/* plpar_pte_remove_raw can be called in real mode. It calls plpar_hcall_raw */
+static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex,
+ unsigned long avpn, unsigned long *old_pteh_ret,
+ unsigned long *old_ptel_ret)
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall_raw(H_REMOVE, retbuf, flags, ptex, avpn);
+
+ *old_pteh_ret = retbuf[0];
+ *old_ptel_ret = retbuf[1];
+
+ return rc;
+}
+
+static inline long plpar_pte_read(unsigned long flags, unsigned long ptex,
+ unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall(H_READ, retbuf, flags, ptex);
+
+ *old_pteh_ret = retbuf[0];
+ *old_ptel_ret = retbuf[1];
+
+ return rc;
+}
+
+/* plpar_pte_read_raw can be called in real mode. It calls plpar_hcall_raw */
+static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex,
+ unsigned long *old_pteh_ret, unsigned long *old_ptel_ret)
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall_raw(H_READ, retbuf, flags, ptex);
+
+ *old_pteh_ret = retbuf[0];
+ *old_ptel_ret = retbuf[1];
+
+ return rc;
+}
+
+/*
+ * plpar_pte_read_4_raw can be called in real mode.
+ * ptes must be 8*sizeof(unsigned long)
+ */
+static inline long plpar_pte_read_4_raw(unsigned long flags, unsigned long ptex,
+ unsigned long *ptes)
+
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+ rc = plpar_hcall9_raw(H_READ, retbuf, flags | H_READ_4, ptex);
+
+ memcpy(ptes, retbuf, 8*sizeof(unsigned long));
+
+ return rc;
+}
+
+static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex,
+ unsigned long avpn)
+{
+ return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
+}
+
+static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba,
+ unsigned long *tce_ret)
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba);
+
+ *tce_ret = retbuf[0];
+
+ return rc;
+}
+
+static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba,
+ unsigned long tceval)
+{
+ return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval);
+}
+
+static inline long plpar_tce_put_indirect(unsigned long liobn,
+ unsigned long ioba, unsigned long page, unsigned long count)
+{
+ return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count);
+}
+
+static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba,
+ unsigned long tceval, unsigned long count)
+{
+ return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count);
+}
+
+static inline long plpar_get_term_char(unsigned long termno,
+ unsigned long *len_ret, char *buf_ret)
+{
+ long rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+ unsigned long *lbuf = (unsigned long *)buf_ret; /* TODO: alignment? */
+
+ rc = plpar_hcall(H_GET_TERM_CHAR, retbuf, termno);
+
+ *len_ret = retbuf[0];
+ lbuf[0] = retbuf[1];
+ lbuf[1] = retbuf[2];
+
+ return rc;
+}
+
+static inline long plpar_put_term_char(unsigned long termno, unsigned long len,
+ const char *buffer)
+{
+ unsigned long *lbuf = (unsigned long *)buffer; /* TODO: alignment? */
+ return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0],
+ lbuf[1]);
+}
+
+#endif /* _PSERIES_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c
new file mode 100644
index 00000000000..6d626623644
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/power.c
@@ -0,0 +1,81 @@
+/*
+ * Interface for power-management for ppc64 compliant platform
+ *
+ * Manish Ahuja <mahuja@us.ibm.com>
+ *
+ * Feb 2007
+ *
+ * Copyright (C) 2007 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+unsigned long rtas_poweron_auto; /* default and normal state is 0 */
+
+static ssize_t auto_poweron_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", rtas_poweron_auto);
+}
+
+static ssize_t auto_poweron_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int ret;
+ unsigned long ups_restart;
+ ret = sscanf(buf, "%lu", &ups_restart);
+
+ if ((ret == 1) && ((ups_restart == 1) || (ups_restart == 0))){
+ rtas_poweron_auto = ups_restart;
+ return n;
+ }
+ return -EINVAL;
+}
+
+static struct kobj_attribute auto_poweron_attr =
+ __ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store);
+
+#ifndef CONFIG_PM
+struct kobject *power_kobj;
+
+static struct attribute *g[] = {
+ &auto_poweron_attr.attr,
+ NULL,
+};
+
+static struct attribute_group attr_group = {
+ .attrs = g,
+};
+
+static int __init pm_init(void)
+{
+ power_kobj = kobject_create_and_add("power", NULL);
+ if (!power_kobj)
+ return -ENOMEM;
+ return sysfs_create_group(power_kobj, &attr_group);
+}
+core_initcall(pm_init);
+#else
+static int __init apo_pm_init(void)
+{
+ return (sysfs_create_file(power_kobj, &auto_poweron_attr.attr));
+}
+__initcall(apo_pm_init);
+#endif
diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c
new file mode 100644
index 00000000000..085fd3f45ad
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/processor_idle.c
@@ -0,0 +1,329 @@
+/*
+ * processor_idle - idle state cpuidle driver.
+ * Adapted from drivers/idle/intel_idle.c and
+ * drivers/acpi/processor_idle.c
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu.h>
+
+#include <asm/paca.h>
+#include <asm/reg.h>
+#include <asm/system.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+struct cpuidle_driver pseries_idle_driver = {
+ .name = "pseries_idle",
+ .owner = THIS_MODULE,
+};
+
+#define MAX_IDLE_STATE_COUNT 2
+
+static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
+static struct cpuidle_device __percpu *pseries_cpuidle_devices;
+static struct cpuidle_state *cpuidle_state_table;
+
+void update_smt_snooze_delay(int snooze)
+{
+ struct cpuidle_driver *drv = cpuidle_get_driver();
+ if (drv)
+ drv->states[0].target_residency = snooze;
+}
+
+static inline void idle_loop_prolog(unsigned long *in_purr, ktime_t *kt_before)
+{
+
+ *kt_before = ktime_get_real();
+ *in_purr = mfspr(SPRN_PURR);
+ /*
+ * Indicate to the HV that we are idle. Now would be
+ * a good time to find other work to dispatch.
+ */
+ get_lppaca()->idle = 1;
+}
+
+static inline s64 idle_loop_epilog(unsigned long in_purr, ktime_t kt_before)
+{
+ get_lppaca()->wait_state_cycles += mfspr(SPRN_PURR) - in_purr;
+ get_lppaca()->idle = 0;
+
+ return ktime_to_us(ktime_sub(ktime_get_real(), kt_before));
+}
+
+static int snooze_loop(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index)
+{
+ unsigned long in_purr;
+ ktime_t kt_before;
+ unsigned long start_snooze;
+ long snooze = drv->states[0].target_residency;
+
+ idle_loop_prolog(&in_purr, &kt_before);
+
+ if (snooze) {
+ start_snooze = get_tb() + snooze * tb_ticks_per_usec;
+ local_irq_enable();
+ set_thread_flag(TIF_POLLING_NRFLAG);
+
+ while ((snooze < 0) || (get_tb() < start_snooze)) {
+ if (need_resched() || cpu_is_offline(dev->cpu))
+ goto out;
+ ppc64_runlatch_off();
+ HMT_low();
+ HMT_very_low();
+ }
+
+ HMT_medium();
+ clear_thread_flag(TIF_POLLING_NRFLAG);
+ smp_mb();
+ local_irq_disable();
+ }
+
+out:
+ HMT_medium();
+ dev->last_residency =
+ (int)idle_loop_epilog(in_purr, kt_before);
+ return index;
+}
+
+static int dedicated_cede_loop(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index)
+{
+ unsigned long in_purr;
+ ktime_t kt_before;
+
+ idle_loop_prolog(&in_purr, &kt_before);
+ get_lppaca()->donate_dedicated_cpu = 1;
+
+ ppc64_runlatch_off();
+ HMT_medium();
+ cede_processor();
+
+ get_lppaca()->donate_dedicated_cpu = 0;
+ dev->last_residency =
+ (int)idle_loop_epilog(in_purr, kt_before);
+ return index;
+}
+
+static int shared_cede_loop(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index)
+{
+ unsigned long in_purr;
+ ktime_t kt_before;
+
+ idle_loop_prolog(&in_purr, &kt_before);
+
+ /*
+ * Yield the processor to the hypervisor. We return if
+ * an external interrupt occurs (which are driven prior
+ * to returning here) or if a prod occurs from another
+ * processor. When returning here, external interrupts
+ * are enabled.
+ */
+ cede_processor();
+
+ dev->last_residency =
+ (int)idle_loop_epilog(in_purr, kt_before);
+ return index;
+}
+
+/*
+ * States for dedicated partition case.
+ */
+static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
+ { /* Snooze */
+ .name = "snooze",
+ .desc = "snooze",
+ .flags = CPUIDLE_FLAG_TIME_VALID,
+ .exit_latency = 0,
+ .target_residency = 0,
+ .enter = &snooze_loop },
+ { /* CEDE */
+ .name = "CEDE",
+ .desc = "CEDE",
+ .flags = CPUIDLE_FLAG_TIME_VALID,
+ .exit_latency = 1,
+ .target_residency = 10,
+ .enter = &dedicated_cede_loop },
+};
+
+/*
+ * States for shared partition case.
+ */
+static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
+ { /* Shared Cede */
+ .name = "Shared Cede",
+ .desc = "Shared Cede",
+ .flags = CPUIDLE_FLAG_TIME_VALID,
+ .exit_latency = 0,
+ .target_residency = 0,
+ .enter = &shared_cede_loop },
+};
+
+int pseries_notify_cpuidle_add_cpu(int cpu)
+{
+ struct cpuidle_device *dev =
+ per_cpu_ptr(pseries_cpuidle_devices, cpu);
+ if (dev && cpuidle_get_driver()) {
+ cpuidle_disable_device(dev);
+ cpuidle_enable_device(dev);
+ }
+ return 0;
+}
+
+/*
+ * pseries_cpuidle_driver_init()
+ */
+static int pseries_cpuidle_driver_init(void)
+{
+ int idle_state;
+ struct cpuidle_driver *drv = &pseries_idle_driver;
+
+ drv->state_count = 0;
+
+ for (idle_state = 0; idle_state < MAX_IDLE_STATE_COUNT; ++idle_state) {
+
+ if (idle_state > max_idle_state)
+ break;
+
+ /* is the state not enabled? */
+ if (cpuidle_state_table[idle_state].enter == NULL)
+ continue;
+
+ drv->states[drv->state_count] = /* structure copy */
+ cpuidle_state_table[idle_state];
+
+ if (cpuidle_state_table == dedicated_states)
+ drv->states[drv->state_count].target_residency =
+ __get_cpu_var(smt_snooze_delay);
+
+ drv->state_count += 1;
+ }
+
+ return 0;
+}
+
+/* pseries_idle_devices_uninit(void)
+ * unregister cpuidle devices and de-allocate memory
+ */
+static void pseries_idle_devices_uninit(void)
+{
+ int i;
+ struct cpuidle_device *dev;
+
+ for_each_possible_cpu(i) {
+ dev = per_cpu_ptr(pseries_cpuidle_devices, i);
+ cpuidle_unregister_device(dev);
+ }
+
+ free_percpu(pseries_cpuidle_devices);
+ return;
+}
+
+/* pseries_idle_devices_init()
+ * allocate, initialize and register cpuidle device
+ */
+static int pseries_idle_devices_init(void)
+{
+ int i;
+ struct cpuidle_driver *drv = &pseries_idle_driver;
+ struct cpuidle_device *dev;
+
+ pseries_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+ if (pseries_cpuidle_devices == NULL)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ dev = per_cpu_ptr(pseries_cpuidle_devices, i);
+ dev->state_count = drv->state_count;
+ dev->cpu = i;
+ if (cpuidle_register_device(dev)) {
+ printk(KERN_DEBUG \
+ "cpuidle_register_device %d failed!\n", i);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * pseries_idle_probe()
+ * Choose state table for shared versus dedicated partition
+ */
+static int pseries_idle_probe(void)
+{
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+ return -ENODEV;
+
+ if (cpuidle_disable != IDLE_NO_OVERRIDE)
+ return -ENODEV;
+
+ if (max_idle_state == 0) {
+ printk(KERN_DEBUG "pseries processor idle disabled.\n");
+ return -EPERM;
+ }
+
+ if (get_lppaca()->shared_proc)
+ cpuidle_state_table = shared_states;
+ else
+ cpuidle_state_table = dedicated_states;
+
+ return 0;
+}
+
+static int __init pseries_processor_idle_init(void)
+{
+ int retval;
+
+ retval = pseries_idle_probe();
+ if (retval)
+ return retval;
+
+ pseries_cpuidle_driver_init();
+ retval = cpuidle_register_driver(&pseries_idle_driver);
+ if (retval) {
+ printk(KERN_DEBUG "Registration of pseries driver failed.\n");
+ return retval;
+ }
+
+ retval = pseries_idle_devices_init();
+ if (retval) {
+ pseries_idle_devices_uninit();
+ cpuidle_unregister_driver(&pseries_idle_driver);
+ return retval;
+ }
+
+ printk(KERN_DEBUG "pseries_idle_driver registered\n");
+
+ return 0;
+}
+
+static void __exit pseries_processor_idle_exit(void)
+{
+
+ pseries_idle_devices_uninit();
+ cpuidle_unregister_driver(&pseries_idle_driver);
+
+ return;
+}
+
+module_init(pseries_processor_idle_init);
+module_exit(pseries_processor_idle_exit);
+
+MODULE_AUTHOR("Deepthi Dharwar <deepthi@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Cpuidle driver for POWER");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
new file mode 100644
index 00000000000..9a3dda07566
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2006 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _PSERIES_PSERIES_H
+#define _PSERIES_PSERIES_H
+
+#include <linux/interrupt.h>
+
+struct device_node;
+
+extern void request_event_sources_irqs(struct device_node *np,
+ irq_handler_t handler, const char *name);
+
+#include <linux/of.h>
+
+extern void __init fw_feature_init(const char *hypertas, unsigned long len);
+
+struct pt_regs;
+
+extern int pSeries_system_reset_exception(struct pt_regs *regs);
+extern int pSeries_machine_check_exception(struct pt_regs *regs);
+
+#ifdef CONFIG_SMP
+extern void smp_init_pseries_mpic(void);
+extern void smp_init_pseries_xics(void);
+#else
+static inline void smp_init_pseries_mpic(void) { };
+static inline void smp_init_pseries_xics(void) { };
+#endif
+
+#ifdef CONFIG_KEXEC
+extern void setup_kexec_cpu_down_xics(void);
+extern void setup_kexec_cpu_down_mpic(void);
+#else
+static inline void setup_kexec_cpu_down_xics(void) { }
+static inline void setup_kexec_cpu_down_mpic(void) { }
+#endif
+
+extern void pSeries_final_fixup(void);
+
+/* Poweron flag used for enabling auto ups restart */
+extern unsigned long rtas_poweron_auto;
+
+/* Provided by HVC VIO */
+extern void hvc_vio_init_early(void);
+
+/* Dynamic logical Partitioning/Mobility */
+extern void dlpar_free_cc_nodes(struct device_node *);
+extern void dlpar_free_cc_property(struct property *);
+extern struct device_node *dlpar_configure_connector(u32);
+extern int dlpar_attach_node(struct device_node *);
+extern int dlpar_detach_node(struct device_node *);
+
+/* Snooze Delay, pseries_idle */
+DECLARE_PER_CPU(long, smt_snooze_delay);
+
+#endif /* _PSERIES_PSERIES_H */
diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
new file mode 100644
index 00000000000..af281dce510
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -0,0 +1,323 @@
+/*
+ * POWER platform energy management driver
+ * Copyright (C) 2010 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This pseries platform device driver provides access to
+ * platform energy management capabilities.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "pseries_energy"
+
+/* Driver flags */
+
+static int sysfs_entries;
+
+/* Helper routines */
+
+/*
+ * Routine to detect firmware support for hcall
+ * return 1 if H_BEST_ENERGY is supported
+ * else return 0
+ */
+
+static int check_for_h_best_energy(void)
+{
+ struct device_node *rtas = NULL;
+ const char *hypertas, *s;
+ int length;
+ int rc = 0;
+
+ rtas = of_find_node_by_path("/rtas");
+ if (!rtas)
+ return 0;
+
+ hypertas = of_get_property(rtas, "ibm,hypertas-functions", &length);
+ if (!hypertas) {
+ of_node_put(rtas);
+ return 0;
+ }
+
+ /* hypertas will have list of strings with hcall names */
+ for (s = hypertas; s < hypertas + length; s += strlen(s) + 1) {
+ if (!strncmp("hcall-best-energy-1", s, 19)) {
+ rc = 1; /* Found the string */
+ break;
+ }
+ }
+ of_node_put(rtas);
+ return rc;
+}
+
+/* Helper Routines to convert between drc_index to cpu numbers */
+
+static u32 cpu_to_drc_index(int cpu)
+{
+ struct device_node *dn = NULL;
+ const int *indexes;
+ int i;
+ int rc = 1;
+ u32 ret = 0;
+
+ dn = of_find_node_by_path("/cpus");
+ if (dn == NULL)
+ goto err;
+ indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+ if (indexes == NULL)
+ goto err_of_node_put;
+ /* Convert logical cpu number to core number */
+ i = cpu_core_index_of_thread(cpu);
+ /*
+ * The first element indexes[0] is the number of drc_indexes
+ * returned in the list. Hence i+1 will get the drc_index
+ * corresponding to core number i.
+ */
+ WARN_ON(i > indexes[0]);
+ ret = indexes[i + 1];
+ rc = 0;
+
+err_of_node_put:
+ of_node_put(dn);
+err:
+ if (rc)
+ printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu);
+ return ret;
+}
+
+static int drc_index_to_cpu(u32 drc_index)
+{
+ struct device_node *dn = NULL;
+ const int *indexes;
+ int i, cpu = 0;
+ int rc = 1;
+
+ dn = of_find_node_by_path("/cpus");
+ if (dn == NULL)
+ goto err;
+ indexes = of_get_property(dn, "ibm,drc-indexes", NULL);
+ if (indexes == NULL)
+ goto err_of_node_put;
+ /*
+ * First element in the array is the number of drc_indexes
+ * returned. Search through the list to find the matching
+ * drc_index and get the core number
+ */
+ for (i = 0; i < indexes[0]; i++) {
+ if (indexes[i + 1] == drc_index)
+ break;
+ }
+ /* Convert core number to logical cpu number */
+ cpu = cpu_first_thread_of_core(i);
+ rc = 0;
+
+err_of_node_put:
+ of_node_put(dn);
+err:
+ if (rc)
+ printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index);
+ return cpu;
+}
+
+/*
+ * pseries hypervisor call H_BEST_ENERGY provides hints to OS on
+ * preferred logical cpus to activate or deactivate for optimized
+ * energy consumption.
+ */
+
+#define FLAGS_MODE1 0x004E200000080E01
+#define FLAGS_MODE2 0x004E200000080401
+#define FLAGS_ACTIVATE 0x100
+
+static ssize_t get_best_energy_list(char *page, int activate)
+{
+ int rc, cnt, i, cpu;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+ unsigned long flags = 0;
+ u32 *buf_page;
+ char *s = page;
+
+ buf_page = (u32 *) get_zeroed_page(GFP_KERNEL);
+ if (!buf_page)
+ return -ENOMEM;
+
+ flags = FLAGS_MODE1;
+ if (activate)
+ flags |= FLAGS_ACTIVATE;
+
+ rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page),
+ 0, 0, 0, 0, 0, 0);
+ if (rc != H_SUCCESS) {
+ free_page((unsigned long) buf_page);
+ return -EINVAL;
+ }
+
+ cnt = retbuf[0];
+ for (i = 0; i < cnt; i++) {
+ cpu = drc_index_to_cpu(buf_page[2*i+1]);
+ if ((cpu_online(cpu) && !activate) ||
+ (!cpu_online(cpu) && activate))
+ s += sprintf(s, "%d,", cpu);
+ }
+ if (s > page) { /* Something to show */
+ s--; /* Suppress last comma */
+ s += sprintf(s, "\n");
+ }
+
+ free_page((unsigned long) buf_page);
+ return s-page;
+}
+
+static ssize_t get_best_energy_data(struct device *dev,
+ char *page, int activate)
+{
+ int rc;
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+ unsigned long flags = 0;
+
+ flags = FLAGS_MODE2;
+ if (activate)
+ flags |= FLAGS_ACTIVATE;
+
+ rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags,
+ cpu_to_drc_index(dev->id),
+ 0, 0, 0, 0, 0, 0, 0);
+
+ if (rc != H_SUCCESS)
+ return -EINVAL;
+
+ return sprintf(page, "%lu\n", retbuf[1] >> 32);
+}
+
+/* Wrapper functions */
+
+static ssize_t cpu_activate_hint_list_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_list(page, 1);
+}
+
+static ssize_t cpu_deactivate_hint_list_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_list(page, 0);
+}
+
+static ssize_t percpu_activate_hint_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_data(dev, page, 1);
+}
+
+static ssize_t percpu_deactivate_hint_show(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ return get_best_energy_data(dev, page, 0);
+}
+
+/*
+ * Create sysfs interface:
+ * /sys/devices/system/cpu/pseries_activate_hint_list
+ * /sys/devices/system/cpu/pseries_deactivate_hint_list
+ * Comma separated list of cpus to activate or deactivate
+ * /sys/devices/system/cpu/cpuN/pseries_activate_hint
+ * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint
+ * Per-cpu value of the hint
+ */
+
+struct device_attribute attr_cpu_activate_hint_list =
+ __ATTR(pseries_activate_hint_list, 0444,
+ cpu_activate_hint_list_show, NULL);
+
+struct device_attribute attr_cpu_deactivate_hint_list =
+ __ATTR(pseries_deactivate_hint_list, 0444,
+ cpu_deactivate_hint_list_show, NULL);
+
+struct device_attribute attr_percpu_activate_hint =
+ __ATTR(pseries_activate_hint, 0444,
+ percpu_activate_hint_show, NULL);
+
+struct device_attribute attr_percpu_deactivate_hint =
+ __ATTR(pseries_deactivate_hint, 0444,
+ percpu_deactivate_hint_show, NULL);
+
+static int __init pseries_energy_init(void)
+{
+ int cpu, err;
+ struct device *cpu_dev;
+
+ if (!check_for_h_best_energy()) {
+ printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
+ return 0;
+ }
+ /* Create the sysfs files */
+ err = device_create_file(cpu_subsys.dev_root,
+ &attr_cpu_activate_hint_list);
+ if (!err)
+ err = device_create_file(cpu_subsys.dev_root,
+ &attr_cpu_deactivate_hint_list);
+
+ if (err)
+ return err;
+ for_each_possible_cpu(cpu) {
+ cpu_dev = get_cpu_device(cpu);
+ err = device_create_file(cpu_dev,
+ &attr_percpu_activate_hint);
+ if (err)
+ break;
+ err = device_create_file(cpu_dev,
+ &attr_percpu_deactivate_hint);
+ if (err)
+ break;
+ }
+
+ if (err)
+ return err;
+
+ sysfs_entries = 1; /* Removed entries on cleanup */
+ return 0;
+
+}
+
+static void __exit pseries_energy_cleanup(void)
+{
+ int cpu;
+ struct device *cpu_dev;
+
+ if (!sysfs_entries)
+ return;
+
+ /* Remove the sysfs files */
+ device_remove_file(cpu_subsys.dev_root, &attr_cpu_activate_hint_list);
+ device_remove_file(cpu_subsys.dev_root, &attr_cpu_deactivate_hint_list);
+
+ for_each_possible_cpu(cpu) {
+ cpu_dev = get_cpu_device(cpu);
+ sysfs_remove_file(&cpu_dev->kobj,
+ &attr_percpu_activate_hint.attr);
+ sysfs_remove_file(&cpu_dev->kobj,
+ &attr_percpu_deactivate_hint.attr);
+ }
+}
+
+module_init(pseries_energy_init);
+module_exit(pseries_energy_cleanup);
+MODULE_DESCRIPTION("Driver for pSeries platform energy management");
+MODULE_AUTHOR("Vaidyanathan Srinivasan");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
new file mode 100644
index 00000000000..086d2ae4e06
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* Change Activity:
+ * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support.
+ * End Change Activity
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/kernel_stat.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/random.h>
+#include <linux/sysrq.h>
+#include <linux/bitops.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/irq.h>
+#include <asm/cache.h>
+#include <asm/prom.h>
+#include <asm/ptrace.h>
+#include <asm/machdep.h>
+#include <asm/rtas.h>
+#include <asm/udbg.h>
+#include <asm/firmware.h>
+
+#include "pseries.h"
+
+static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_SPINLOCK(ras_log_buf_lock);
+
+static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
+static DEFINE_PER_CPU(__u64, mce_data_buf);
+
+static int ras_get_sensor_state_token;
+static int ras_check_exception_token;
+
+#define EPOW_SENSOR_TOKEN 9
+#define EPOW_SENSOR_INDEX 0
+
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
+
+
+/*
+ * Initialize handlers for the set of interrupts caused by hardware errors
+ * and power system events.
+ */
+static int __init init_ras_IRQ(void)
+{
+ struct device_node *np;
+
+ ras_get_sensor_state_token = rtas_token("get-sensor-state");
+ ras_check_exception_token = rtas_token("check-exception");
+
+ /* Internal Errors */
+ np = of_find_node_by_path("/event-sources/internal-errors");
+ if (np != NULL) {
+ request_event_sources_irqs(np, ras_error_interrupt,
+ "RAS_ERROR");
+ of_node_put(np);
+ }
+
+ /* EPOW Events */
+ np = of_find_node_by_path("/event-sources/epow-events");
+ if (np != NULL) {
+ request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
+ of_node_put(np);
+ }
+
+ return 0;
+}
+__initcall(init_ras_IRQ);
+
+/*
+ * Handle power subsystem events (EPOW).
+ *
+ * Presently we just log the event has occurred. This should be fixed
+ * to examine the type of power failure and take appropriate action where
+ * the time horizon permits something useful to be done.
+ */
+static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
+{
+ int status = 0xdeadbeef;
+ int state = 0;
+ int critical;
+
+ status = rtas_call(ras_get_sensor_state_token, 2, 2, &state,
+ EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX);
+
+ if (state > 3)
+ critical = 1; /* Time Critical */
+ else
+ critical = 0;
+
+ spin_lock(&ras_log_buf_lock);
+
+ status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT,
+ virq_to_hw(irq),
+ RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS,
+ critical, __pa(&ras_log_buf),
+ rtas_get_error_log_max());
+
+ udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status, state);
+ printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status, state);
+
+ /* format and print the extended information */
+ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
+
+ spin_unlock(&ras_log_buf_lock);
+ return IRQ_HANDLED;
+}
+
+/*
+ * Handle hardware error interrupts.
+ *
+ * RTAS check-exception is called to collect data on the exception. If
+ * the error is deemed recoverable, we log a warning and return.
+ * For nonrecoverable errors, an error is logged and we stop all processing
+ * as quickly as possible in order to prevent propagation of the failure.
+ */
+static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
+{
+ struct rtas_error_log *rtas_elog;
+ int status = 0xdeadbeef;
+ int fatal;
+
+ spin_lock(&ras_log_buf_lock);
+
+ status = rtas_call(ras_check_exception_token, 6, 1, NULL,
+ RTAS_VECTOR_EXTERNAL_INTERRUPT,
+ virq_to_hw(irq),
+ RTAS_INTERNAL_ERROR, 1 /*Time Critical */,
+ __pa(&ras_log_buf),
+ rtas_get_error_log_max());
+
+ rtas_elog = (struct rtas_error_log *)ras_log_buf;
+
+ if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC))
+ fatal = 1;
+ else
+ fatal = 0;
+
+ /* format and print the extended information */
+ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
+
+ if (fatal) {
+ udbg_printf("Fatal HW Error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+ printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+
+#ifndef DEBUG_RTAS_POWER_OFF
+ /* Don't actually power off when debugging so we can test
+ * without actually failing while injecting errors.
+ * Error data will not be logged to syslog.
+ */
+ ppc_md.power_off();
+#endif
+ } else {
+ udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+ printk(KERN_WARNING
+ "Warning: Recoverable hardware error <0x%lx 0x%x>\n",
+ *((unsigned long *)&ras_log_buf), status);
+ }
+
+ spin_unlock(&ras_log_buf_lock);
+ return IRQ_HANDLED;
+}
+
+/*
+ * Some versions of FWNMI place the buffer inside the 4kB page starting at
+ * 0x7000. Other versions place it inside the rtas buffer. We check both.
+ */
+#define VALID_FWNMI_BUFFER(A) \
+ ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
+ (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
+
+/*
+ * Get the error information for errors coming through the
+ * FWNMI vectors. The pt_regs' r3 will be updated to reflect
+ * the actual r3 if possible, and a ptr to the error log entry
+ * will be returned if found.
+ *
+ * If the RTAS error is not of the extended type, then we put it in a per
+ * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf.
+ *
+ * The global_mce_data_buf does not have any locks or protection around it,
+ * if a second machine check comes in, or a system reset is done
+ * before we have logged the error, then we will get corruption in the
+ * error log. This is preferable over holding off on calling
+ * ibm,nmi-interlock which would result in us checkstopping if a
+ * second machine check did come in.
+ */
+static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
+{
+ unsigned long *savep;
+ struct rtas_error_log *h, *errhdr = NULL;
+
+ if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
+ printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
+ return NULL;
+ }
+
+ savep = __va(regs->gpr[3]);
+ regs->gpr[3] = savep[0]; /* restore original r3 */
+
+ /* If it isn't an extended log we can use the per cpu 64bit buffer */
+ h = (struct rtas_error_log *)&savep[1];
+ if (!h->extended) {
+ memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
+ errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
+ } else {
+ int len;
+
+ len = max_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX);
+ memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
+ memcpy(global_mce_data_buf, h, len);
+ errhdr = (struct rtas_error_log *)global_mce_data_buf;
+ }
+
+ return errhdr;
+}
+
+/* Call this when done with the data returned by FWNMI_get_errinfo.
+ * It will release the saved data area for other CPUs in the
+ * partition to receive FWNMI errors.
+ */
+static void fwnmi_release_errinfo(void)
+{
+ int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
+ if (ret != 0)
+ printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
+}
+
+int pSeries_system_reset_exception(struct pt_regs *regs)
+{
+ if (fwnmi_active) {
+ struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
+ if (errhdr) {
+ /* XXX Should look at FWNMI information */
+ }
+ fwnmi_release_errinfo();
+ }
+ return 0; /* need to perform reset */
+}
+
+/*
+ * See if we can recover from a machine check exception.
+ * This is only called on power4 (or above) and only via
+ * the Firmware Non-Maskable Interrupts (fwnmi) handler
+ * which provides the error analysis for us.
+ *
+ * Return 1 if corrected (or delivered a signal).
+ * Return 0 if there is nothing we can do.
+ */
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
+{
+ int recovered = 0;
+
+ if (!(regs->msr & MSR_RI)) {
+ /* If MSR_RI isn't set, we cannot recover */
+ recovered = 0;
+
+ } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+ /* Platform corrected itself */
+ recovered = 1;
+
+ } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
+ /* Platform corrected itself but could be degraded */
+ printk(KERN_ERR "MCE: limited recovery, system may "
+ "be degraded\n");
+ recovered = 1;
+
+ } else if (user_mode(regs) && !is_global_init(current) &&
+ err->severity == RTAS_SEVERITY_ERROR_SYNC) {
+
+ /*
+ * If we received a synchronous error when in userspace
+ * kill the task. Firmware may report details of the fail
+ * asynchronously, so we can't rely on the target and type
+ * fields being valid here.
+ */
+ printk(KERN_ERR "MCE: uncorrectable error, killing task "
+ "%s:%d\n", current->comm, current->pid);
+
+ _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+ recovered = 1;
+ }
+
+ log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
+
+ return recovered;
+}
+
+/*
+ * Handle a machine check.
+ *
+ * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi)
+ * should be present. If so the handler which called us tells us if the
+ * error was recovered (never true if RI=0).
+ *
+ * On hardware prior to Power 4 these exceptions were asynchronous which
+ * means we can't tell exactly where it occurred and so we can't recover.
+ */
+int pSeries_machine_check_exception(struct pt_regs *regs)
+{
+ struct rtas_error_log *errp;
+
+ if (fwnmi_active) {
+ errp = fwnmi_get_errinfo(regs);
+ fwnmi_release_errinfo();
+ if (errp && recover_mce(regs, errp))
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c
new file mode 100644
index 00000000000..168651acdd8
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/reconfig.c
@@ -0,0 +1,564 @@
+/*
+ * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI
+ * Hotplug and Dynamic Logical Partitioning on RPA platforms).
+ *
+ * Copyright (C) 2005 Nathan Lynch
+ * Copyright (C) 2005 IBM Corporation
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/uaccess.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/mmu.h>
+
+
+
+/*
+ * Routines for "runtime" addition and removal of device tree nodes.
+ */
+#ifdef CONFIG_PROC_DEVICETREE
+/*
+ * Add a node to /proc/device-tree.
+ */
+static void add_node_proc_entries(struct device_node *np)
+{
+ struct proc_dir_entry *ent;
+
+ ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde);
+ if (ent)
+ proc_device_tree_add_node(np, ent);
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+ struct property *pp = np->properties;
+ struct device_node *parent = np->parent;
+
+ while (pp) {
+ remove_proc_entry(pp->name, np->pde);
+ pp = pp->next;
+ }
+ if (np->pde)
+ remove_proc_entry(np->pde->name, parent->pde);
+}
+#else /* !CONFIG_PROC_DEVICETREE */
+static void add_node_proc_entries(struct device_node *np)
+{
+ return;
+}
+
+static void remove_node_proc_entries(struct device_node *np)
+{
+ return;
+}
+#endif /* CONFIG_PROC_DEVICETREE */
+
+/**
+ * derive_parent - basically like dirname(1)
+ * @path: the full_name of a node to be added to the tree
+ *
+ * Returns the node which should be the parent of the node
+ * described by path. E.g., for path = "/foo/bar", returns
+ * the node with full_name = "/foo".
+ */
+static struct device_node *derive_parent(const char *path)
+{
+ struct device_node *parent = NULL;
+ char *parent_path = "/";
+ size_t parent_path_len = strrchr(path, '/') - path + 1;
+
+ /* reject if path is "/" */
+ if (!strcmp(path, "/"))
+ return ERR_PTR(-EINVAL);
+
+ if (strrchr(path, '/') != path) {
+ parent_path = kmalloc(parent_path_len, GFP_KERNEL);
+ if (!parent_path)
+ return ERR_PTR(-ENOMEM);
+ strlcpy(parent_path, path, parent_path_len);
+ }
+ parent = of_find_node_by_path(parent_path);
+ if (!parent)
+ return ERR_PTR(-EINVAL);
+ if (strcmp(parent_path, "/"))
+ kfree(parent_path);
+ return parent;
+}
+
+static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
+
+int pSeries_reconfig_notifier_register(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb);
+}
+
+void pSeries_reconfig_notifier_unregister(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb);
+}
+
+int pSeries_reconfig_notify(unsigned long action, void *p)
+{
+ int err = blocking_notifier_call_chain(&pSeries_reconfig_chain,
+ action, p);
+
+ return notifier_to_errno(err);
+}
+
+static int pSeries_reconfig_add_node(const char *path, struct property *proplist)
+{
+ struct device_node *np;
+ int err = -ENOMEM;
+
+ np = kzalloc(sizeof(*np), GFP_KERNEL);
+ if (!np)
+ goto out_err;
+
+ np->full_name = kstrdup(path, GFP_KERNEL);
+ if (!np->full_name)
+ goto out_err;
+
+ np->properties = proplist;
+ of_node_set_flag(np, OF_DYNAMIC);
+ kref_init(&np->kref);
+
+ np->parent = derive_parent(path);
+ if (IS_ERR(np->parent)) {
+ err = PTR_ERR(np->parent);
+ goto out_err;
+ }
+
+ err = pSeries_reconfig_notify(PSERIES_RECONFIG_ADD, np);
+ if (err) {
+ printk(KERN_ERR "Failed to add device node %s\n", path);
+ goto out_err;
+ }
+
+ of_attach_node(np);
+
+ add_node_proc_entries(np);
+
+ of_node_put(np->parent);
+
+ return 0;
+
+out_err:
+ if (np) {
+ of_node_put(np->parent);
+ kfree(np->full_name);
+ kfree(np);
+ }
+ return err;
+}
+
+static int pSeries_reconfig_remove_node(struct device_node *np)
+{
+ struct device_node *parent, *child;
+
+ parent = of_get_parent(np);
+ if (!parent)
+ return -EINVAL;
+
+ if ((child = of_get_next_child(np, NULL))) {
+ of_node_put(child);
+ of_node_put(parent);
+ return -EBUSY;
+ }
+
+ remove_node_proc_entries(np);
+
+ pSeries_reconfig_notify(PSERIES_RECONFIG_REMOVE, np);
+ of_detach_node(np);
+
+ of_node_put(parent);
+ of_node_put(np); /* Must decrement the refcount */
+ return 0;
+}
+
+/*
+ * /proc/powerpc/ofdt - yucky binary interface for adding and removing
+ * OF device nodes. Should be deprecated as soon as we get an
+ * in-kernel wrapper for the RTAS ibm,configure-connector call.
+ */
+
+static void release_prop_list(const struct property *prop)
+{
+ struct property *next;
+ for (; prop; prop = next) {
+ next = prop->next;
+ kfree(prop->name);
+ kfree(prop->value);
+ kfree(prop);
+ }
+
+}
+
+/**
+ * parse_next_property - process the next property from raw input buffer
+ * @buf: input buffer, must be nul-terminated
+ * @end: end of the input buffer + 1, for validation
+ * @name: return value; set to property name in buf
+ * @length: return value; set to length of value
+ * @value: return value; set to the property value in buf
+ *
+ * Note that the caller must make copies of the name and value returned,
+ * this function does no allocation or copying of the data. Return value
+ * is set to the next name in buf, or NULL on error.
+ */
+static char * parse_next_property(char *buf, char *end, char **name, int *length,
+ unsigned char **value)
+{
+ char *tmp;
+
+ *name = buf;
+
+ tmp = strchr(buf, ' ');
+ if (!tmp) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ *tmp = '\0';
+
+ if (++tmp >= end) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+
+ /* now we're on the length */
+ *length = -1;
+ *length = simple_strtoul(tmp, &tmp, 10);
+ if (*length == -1) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ if (*tmp != ' ' || ++tmp >= end) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+
+ /* now we're on the value */
+ *value = tmp;
+ tmp += *length;
+ if (tmp > end) {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ else if (tmp < end && *tmp != ' ' && *tmp != '\0') {
+ printk(KERN_ERR "property parse failed in %s at line %d\n",
+ __func__, __LINE__);
+ return NULL;
+ }
+ tmp++;
+
+ /* and now we should be on the next name, or the end */
+ return tmp;
+}
+
+static struct property *new_property(const char *name, const int length,
+ const unsigned char *value, struct property *last)
+{
+ struct property *new = kzalloc(sizeof(*new), GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL)))
+ goto cleanup;
+ if (!(new->value = kmalloc(length + 1, GFP_KERNEL)))
+ goto cleanup;
+
+ strcpy(new->name, name);
+ memcpy(new->value, value, length);
+ *(((char *)new->value) + length) = 0;
+ new->length = length;
+ new->next = last;
+ return new;
+
+cleanup:
+ kfree(new->name);
+ kfree(new->value);
+ kfree(new);
+ return NULL;
+}
+
+static int do_add_node(char *buf, size_t bufsize)
+{
+ char *path, *end, *name;
+ struct device_node *np;
+ struct property *prop = NULL;
+ unsigned char* value;
+ int length, rv = 0;
+
+ end = buf + bufsize;
+ path = buf;
+ buf = strchr(buf, ' ');
+ if (!buf)
+ return -EINVAL;
+ *buf = '\0';
+ buf++;
+
+ if ((np = of_find_node_by_path(path))) {
+ of_node_put(np);
+ return -EINVAL;
+ }
+
+ /* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */
+ while (buf < end &&
+ (buf = parse_next_property(buf, end, &name, &length, &value))) {
+ struct property *last = prop;
+
+ prop = new_property(name, length, value, last);
+ if (!prop) {
+ rv = -ENOMEM;
+ prop = last;
+ goto out;
+ }
+ }
+ if (!buf) {
+ rv = -EINVAL;
+ goto out;
+ }
+
+ rv = pSeries_reconfig_add_node(path, prop);
+
+out:
+ if (rv)
+ release_prop_list(prop);
+ return rv;
+}
+
+static int do_remove_node(char *buf)
+{
+ struct device_node *node;
+ int rv = -ENODEV;
+
+ if ((node = of_find_node_by_path(buf)))
+ rv = pSeries_reconfig_remove_node(node);
+
+ of_node_put(node);
+ return rv;
+}
+
+static char *parse_node(char *buf, size_t bufsize, struct device_node **npp)
+{
+ char *handle_str;
+ phandle handle;
+ *npp = NULL;
+
+ handle_str = buf;
+
+ buf = strchr(buf, ' ');
+ if (!buf)
+ return NULL;
+ *buf = '\0';
+ buf++;
+
+ handle = simple_strtoul(handle_str, NULL, 0);
+
+ *npp = of_find_node_by_phandle(handle);
+ return buf;
+}
+
+static int do_add_property(char *buf, size_t bufsize)
+{
+ struct property *prop = NULL;
+ struct device_node *np;
+ unsigned char *value;
+ char *name, *end;
+ int length;
+ end = buf + bufsize;
+ buf = parse_node(buf, bufsize, &np);
+
+ if (!np)
+ return -ENODEV;
+
+ if (parse_next_property(buf, end, &name, &length, &value) == NULL)
+ return -EINVAL;
+
+ prop = new_property(name, length, value, NULL);
+ if (!prop)
+ return -ENOMEM;
+
+ prom_add_property(np, prop);
+
+ return 0;
+}
+
+static int do_remove_property(char *buf, size_t bufsize)
+{
+ struct device_node *np;
+ char *tmp;
+ struct property *prop;
+ buf = parse_node(buf, bufsize, &np);
+
+ if (!np)
+ return -ENODEV;
+
+ tmp = strchr(buf,' ');
+ if (tmp)
+ *tmp = '\0';
+
+ if (strlen(buf) == 0)
+ return -EINVAL;
+
+ prop = of_find_property(np, buf, NULL);
+
+ return prom_remove_property(np, prop);
+}
+
+static int do_update_property(char *buf, size_t bufsize)
+{
+ struct device_node *np;
+ unsigned char *value;
+ char *name, *end, *next_prop;
+ int rc, length;
+ struct property *newprop, *oldprop;
+ buf = parse_node(buf, bufsize, &np);
+ end = buf + bufsize;
+
+ if (!np)
+ return -ENODEV;
+
+ next_prop = parse_next_property(buf, end, &name, &length, &value);
+ if (!next_prop)
+ return -EINVAL;
+
+ newprop = new_property(name, length, value, NULL);
+ if (!newprop)
+ return -ENOMEM;
+
+ if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size"))
+ slb_set_size(*(int *)value);
+
+ oldprop = of_find_property(np, name,NULL);
+ if (!oldprop) {
+ if (strlen(name))
+ return prom_add_property(np, newprop);
+ return -ENODEV;
+ }
+
+ rc = prom_update_property(np, newprop, oldprop);
+ if (rc)
+ return rc;
+
+ /* For memory under the ibm,dynamic-reconfiguration-memory node
+ * of the device tree, adding and removing memory is just an update
+ * to the ibm,dynamic-memory property instead of adding/removing a
+ * memory node in the device tree. For these cases we still need to
+ * involve the notifier chain.
+ */
+ if (!strcmp(name, "ibm,dynamic-memory")) {
+ int action;
+
+ next_prop = parse_next_property(next_prop, end, &name,
+ &length, &value);
+ if (!next_prop)
+ return -EINVAL;
+
+ if (!strcmp(name, "add"))
+ action = PSERIES_DRCONF_MEM_ADD;
+ else
+ action = PSERIES_DRCONF_MEM_REMOVE;
+
+ rc = pSeries_reconfig_notify(action, value);
+ if (rc) {
+ prom_update_property(np, oldprop, newprop);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * ofdt_write - perform operations on the Open Firmware device tree
+ *
+ * @file: not used
+ * @buf: command and arguments
+ * @count: size of the command buffer
+ * @off: not used
+ *
+ * Operations supported at this time are addition and removal of
+ * whole nodes along with their properties. Operations on individual
+ * properties are not implemented (yet).
+ */
+static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count,
+ loff_t *off)
+{
+ int rv = 0;
+ char *kbuf;
+ char *tmp;
+
+ if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) {
+ rv = -ENOMEM;
+ goto out;
+ }
+ if (copy_from_user(kbuf, buf, count)) {
+ rv = -EFAULT;
+ goto out;
+ }
+
+ kbuf[count] = '\0';
+
+ tmp = strchr(kbuf, ' ');
+ if (!tmp) {
+ rv = -EINVAL;
+ goto out;
+ }
+ *tmp = '\0';
+ tmp++;
+
+ if (!strcmp(kbuf, "add_node"))
+ rv = do_add_node(tmp, count - (tmp - kbuf));
+ else if (!strcmp(kbuf, "remove_node"))
+ rv = do_remove_node(tmp);
+ else if (!strcmp(kbuf, "add_property"))
+ rv = do_add_property(tmp, count - (tmp - kbuf));
+ else if (!strcmp(kbuf, "remove_property"))
+ rv = do_remove_property(tmp, count - (tmp - kbuf));
+ else if (!strcmp(kbuf, "update_property"))
+ rv = do_update_property(tmp, count - (tmp - kbuf));
+ else
+ rv = -EINVAL;
+out:
+ kfree(kbuf);
+ return rv ? rv : count;
+}
+
+static const struct file_operations ofdt_fops = {
+ .write = ofdt_write,
+ .llseek = noop_llseek,
+};
+
+/* create /proc/powerpc/ofdt write-only by root */
+static int proc_ppc64_create_ofdt(void)
+{
+ struct proc_dir_entry *ent;
+
+ if (!machine_is(pseries))
+ return 0;
+
+ ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops);
+ if (ent)
+ ent->size = 0;
+
+ return 0;
+}
+__initcall(proc_ppc64_create_ofdt);
diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c
new file mode 100644
index 00000000000..554457294a2
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/scanlog.c
@@ -0,0 +1,214 @@
+/*
+ * c 2001 PPC 64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com>
+ *
+ * When ppc64 hardware fails the service processor dumps internal state
+ * of the system. After a reboot the operating system can access a dump
+ * of this data using this driver. A dump exists if the device-tree
+ * /chosen/ibm,scan-log-data property exists.
+ *
+ * This driver exports /proc/powerpc/scan-log-dump which can be read.
+ * The driver supports only sequential reads.
+ *
+ * The driver looks at a write to the driver for the single word "reset".
+ * If given, the driver will reset the scanlog so the platform can free it.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "scanlog"
+
+/* Status returns from ibm,scan-log-dump */
+#define SCANLOG_COMPLETE 0
+#define SCANLOG_HWERROR -1
+#define SCANLOG_CONTINUE 1
+
+
+static unsigned int ibm_scan_log_dump; /* RTAS token */
+static struct proc_dir_entry *proc_ppc64_scan_log_dump; /* The proc file */
+
+static ssize_t scanlog_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode * inode = file->f_path.dentry->d_inode;
+ struct proc_dir_entry *dp;
+ unsigned int *data;
+ int status;
+ unsigned long len, off;
+ unsigned int wait_time;
+
+ dp = PDE(inode);
+ data = (unsigned int *)dp->data;
+
+ if (count > RTAS_DATA_BUF_SIZE)
+ count = RTAS_DATA_BUF_SIZE;
+
+ if (count < 1024) {
+ /* This is the min supported by this RTAS call. Rather
+ * than do all the buffering we insist the user code handle
+ * larger reads. As long as cp works... :)
+ */
+ printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count);
+ return -EINVAL;
+ }
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ for (;;) {
+ wait_time = 500; /* default wait if no data */
+ spin_lock(&rtas_data_buf_lock);
+ memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE);
+ status = rtas_call(ibm_scan_log_dump, 2, 1, NULL,
+ (u32) __pa(rtas_data_buf), (u32) count);
+ memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+ spin_unlock(&rtas_data_buf_lock);
+
+ pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \
+ "data[2]=%x\n", status, data[0], data[1], data[2]);
+ switch (status) {
+ case SCANLOG_COMPLETE:
+ pr_debug("scanlog: hit eof\n");
+ return 0;
+ case SCANLOG_HWERROR:
+ pr_debug("scanlog: hardware error reading data\n");
+ return -EIO;
+ case SCANLOG_CONTINUE:
+ /* We may or may not have data yet */
+ len = data[1];
+ off = data[2];
+ if (len > 0) {
+ if (copy_to_user(buf, ((char *)data)+off, len))
+ return -EFAULT;
+ return len;
+ }
+ /* Break to sleep default time */
+ break;
+ default:
+ /* Assume extended busy */
+ wait_time = rtas_busy_delay_time(status);
+ if (!wait_time) {
+ printk(KERN_ERR "scanlog: unknown error " \
+ "from rtas: %d\n", status);
+ return -EIO;
+ }
+ }
+ /* Apparently no data yet. Wait and try again. */
+ msleep_interruptible(wait_time);
+ }
+ /*NOTREACHED*/
+}
+
+static ssize_t scanlog_write(struct file * file, const char __user * buf,
+ size_t count, loff_t *ppos)
+{
+ char stkbuf[20];
+ int status;
+
+ if (count > 19) count = 19;
+ if (copy_from_user (stkbuf, buf, count)) {
+ return -EFAULT;
+ }
+ stkbuf[count] = 0;
+
+ if (buf) {
+ if (strncmp(stkbuf, "reset", 5) == 0) {
+ pr_debug("scanlog: reset scanlog\n");
+ status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0);
+ pr_debug("scanlog: rtas returns %d\n", status);
+ }
+ }
+ return count;
+}
+
+static int scanlog_open(struct inode * inode, struct file * file)
+{
+ struct proc_dir_entry *dp = PDE(inode);
+ unsigned int *data = (unsigned int *)dp->data;
+
+ if (data[0] != 0) {
+ /* This imperfect test stops a second copy of the
+ * data (or a reset while data is being copied)
+ */
+ return -EBUSY;
+ }
+
+ data[0] = 0; /* re-init so we restart the scan */
+
+ return 0;
+}
+
+static int scanlog_release(struct inode * inode, struct file * file)
+{
+ struct proc_dir_entry *dp = PDE(inode);
+ unsigned int *data = (unsigned int *)dp->data;
+
+ data[0] = 0;
+
+ return 0;
+}
+
+const struct file_operations scanlog_fops = {
+ .owner = THIS_MODULE,
+ .read = scanlog_read,
+ .write = scanlog_write,
+ .open = scanlog_open,
+ .release = scanlog_release,
+ .llseek = noop_llseek,
+};
+
+static int __init scanlog_init(void)
+{
+ struct proc_dir_entry *ent;
+ void *data;
+ int err = -ENOMEM;
+
+ ibm_scan_log_dump = rtas_token("ibm,scan-log-dump");
+ if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE)
+ return -ENODEV;
+
+ /* Ideally we could allocate a buffer < 4G */
+ data = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+ if (!data)
+ goto err;
+
+ ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL,
+ &scanlog_fops, data);
+ if (!ent)
+ goto err;
+
+ proc_ppc64_scan_log_dump = ent;
+
+ return 0;
+err:
+ kfree(data);
+ return err;
+}
+
+static void __exit scanlog_cleanup(void)
+{
+ if (proc_ppc64_scan_log_dump) {
+ kfree(proc_ppc64_scan_log_dump->data);
+ remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent);
+ }
+}
+
+module_init(scanlog_init);
+module_exit(scanlog_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
new file mode 100644
index 00000000000..f79f1278dfc
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -0,0 +1,653 @@
+/*
+ * 64-bit pSeries and RS/6000 setup code.
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ * Adapted from 'alpha' version by Gary Thomas
+ * Modified by Cort Dougan (cort@cs.nmt.edu)
+ * Modified by PPC64 Team, IBM Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * bootup setup stuff..
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/user.h>
+#include <linux/tty.h>
+#include <linux/major.h>
+#include <linux/interrupt.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/console.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/adb.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/irq.h>
+#include <linux/seq_file.h>
+#include <linux/root_dev.h>
+#include <linux/cpuidle.h>
+
+#include <asm/mmu.h>
+#include <asm/processor.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+#include <asm/pci-bridge.h>
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+#include <asm/nvram.h>
+#include <asm/pmc.h>
+#include <asm/mpic.h>
+#include <asm/xics.h>
+#include <asm/ppc-pci.h>
+#include <asm/i8259.h>
+#include <asm/udbg.h>
+#include <asm/smp.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+#include <asm/pSeries_reconfig.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+int CMO_PrPSP = -1;
+int CMO_SecPSP = -1;
+unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT);
+EXPORT_SYMBOL(CMO_PageSize);
+
+int fwnmi_active; /* TRUE if an FWNMI handler is present */
+
+static struct device_node *pSeries_mpic_node;
+
+static void pSeries_show_cpuinfo(struct seq_file *m)
+{
+ struct device_node *root;
+ const char *model = "";
+
+ root = of_find_node_by_path("/");
+ if (root)
+ model = of_get_property(root, "model", NULL);
+ seq_printf(m, "machine\t\t: CHRP %s\n", model);
+ of_node_put(root);
+}
+
+/* Initialize firmware assisted non-maskable interrupts if
+ * the firmware supports this feature.
+ */
+static void __init fwnmi_init(void)
+{
+ unsigned long system_reset_addr, machine_check_addr;
+
+ int ibm_nmi_register = rtas_token("ibm,nmi-register");
+ if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE)
+ return;
+
+ /* If the kernel's not linked at zero we point the firmware at low
+ * addresses anyway, and use a trampoline to get to the real code. */
+ system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START;
+ machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START;
+
+ if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr,
+ machine_check_addr))
+ fwnmi_active = 1;
+}
+
+static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ unsigned int cascade_irq = i8259_irq();
+
+ if (cascade_irq != NO_IRQ)
+ generic_handle_irq(cascade_irq);
+
+ chip->irq_eoi(&desc->irq_data);
+}
+
+static void __init pseries_setup_i8259_cascade(void)
+{
+ struct device_node *np, *old, *found = NULL;
+ unsigned int cascade;
+ const u32 *addrp;
+ unsigned long intack = 0;
+ int naddr;
+
+ for_each_node_by_type(np, "interrupt-controller") {
+ if (of_device_is_compatible(np, "chrp,iic")) {
+ found = np;
+ break;
+ }
+ }
+
+ if (found == NULL) {
+ printk(KERN_DEBUG "pic: no ISA interrupt controller\n");
+ return;
+ }
+
+ cascade = irq_of_parse_and_map(found, 0);
+ if (cascade == NO_IRQ) {
+ printk(KERN_ERR "pic: failed to map cascade interrupt");
+ return;
+ }
+ pr_debug("pic: cascade mapped to irq %d\n", cascade);
+
+ for (old = of_node_get(found); old != NULL ; old = np) {
+ np = of_get_parent(old);
+ of_node_put(old);
+ if (np == NULL)
+ break;
+ if (strcmp(np->name, "pci") != 0)
+ continue;
+ addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL);
+ if (addrp == NULL)
+ continue;
+ naddr = of_n_addr_cells(np);
+ intack = addrp[naddr-1];
+ if (naddr > 1)
+ intack |= ((unsigned long)addrp[naddr-2]) << 32;
+ }
+ if (intack)
+ printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack);
+ i8259_init(found, intack);
+ of_node_put(found);
+ irq_set_chained_handler(cascade, pseries_8259_cascade);
+}
+
+static void __init pseries_mpic_init_IRQ(void)
+{
+ struct device_node *np;
+ const unsigned int *opprop;
+ unsigned long openpic_addr = 0;
+ int naddr, n, i, opplen;
+ struct mpic *mpic;
+
+ np = of_find_node_by_path("/");
+ naddr = of_n_addr_cells(np);
+ opprop = of_get_property(np, "platform-open-pic", &opplen);
+ if (opprop != 0) {
+ openpic_addr = of_read_number(opprop, naddr);
+ printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr);
+ }
+ of_node_put(np);
+
+ BUG_ON(openpic_addr == 0);
+
+ /* Setup the openpic driver */
+ mpic = mpic_alloc(pSeries_mpic_node, openpic_addr, 0,
+ 16, 250, /* isu size, irq count */
+ " MPIC ");
+ BUG_ON(mpic == NULL);
+
+ /* Add ISUs */
+ opplen /= sizeof(u32);
+ for (n = 0, i = naddr; i < opplen; i += naddr, n++) {
+ unsigned long isuaddr = of_read_number(opprop + i, naddr);
+ mpic_assign_isu(mpic, n, isuaddr);
+ }
+
+ /* Setup top-level get_irq */
+ ppc_md.get_irq = mpic_get_irq;
+
+ /* All ISUs are setup, complete initialization */
+ mpic_init(mpic);
+
+ /* Look for cascade */
+ pseries_setup_i8259_cascade();
+}
+
+static void __init pseries_xics_init_IRQ(void)
+{
+ xics_init();
+ pseries_setup_i8259_cascade();
+}
+
+static void pseries_lpar_enable_pmcs(void)
+{
+ unsigned long set, reset;
+
+ set = 1UL << 63;
+ reset = 0;
+ plpar_hcall_norets(H_PERFMON, set, reset);
+}
+
+static void __init pseries_discover_pic(void)
+{
+ struct device_node *np;
+ const char *typep;
+
+ for (np = NULL; (np = of_find_node_by_name(np,
+ "interrupt-controller"));) {
+ typep = of_get_property(np, "compatible", NULL);
+ if (strstr(typep, "open-pic")) {
+ pSeries_mpic_node = of_node_get(np);
+ ppc_md.init_IRQ = pseries_mpic_init_IRQ;
+ setup_kexec_cpu_down_mpic();
+ smp_init_pseries_mpic();
+ return;
+ } else if (strstr(typep, "ppc-xicp")) {
+ ppc_md.init_IRQ = pseries_xics_init_IRQ;
+ setup_kexec_cpu_down_xics();
+ smp_init_pseries_xics();
+ return;
+ }
+ }
+ printk(KERN_ERR "pSeries_discover_pic: failed to recognize"
+ " interrupt-controller\n");
+}
+
+static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node)
+{
+ struct device_node *np = node;
+ struct pci_dn *pci = NULL;
+ int err = NOTIFY_OK;
+
+ switch (action) {
+ case PSERIES_RECONFIG_ADD:
+ pci = np->parent->data;
+ if (pci)
+ update_dn_pci_info(np, pci->phb);
+ break;
+ default:
+ err = NOTIFY_DONE;
+ break;
+ }
+ return err;
+}
+
+static struct notifier_block pci_dn_reconfig_nb = {
+ .notifier_call = pci_dn_reconfig_notifier,
+};
+
+struct kmem_cache *dtl_cache;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/*
+ * Allocate space for the dispatch trace log for all possible cpus
+ * and register the buffers with the hypervisor. This is used for
+ * computing time stolen by the hypervisor.
+ */
+static int alloc_dispatch_logs(void)
+{
+ int cpu, ret;
+ struct paca_struct *pp;
+ struct dtl_entry *dtl;
+
+ if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+ return 0;
+
+ if (!dtl_cache)
+ return 0;
+
+ for_each_possible_cpu(cpu) {
+ pp = &paca[cpu];
+ dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL);
+ if (!dtl) {
+ pr_warn("Failed to allocate dispatch trace log for cpu %d\n",
+ cpu);
+ pr_warn("Stolen time statistics will be unreliable\n");
+ break;
+ }
+
+ pp->dtl_ridx = 0;
+ pp->dispatch_log = dtl;
+ pp->dispatch_log_end = dtl + N_DISPATCH_LOG;
+ pp->dtl_curr = dtl;
+ }
+
+ /* Register the DTL for the current (boot) cpu */
+ dtl = get_paca()->dispatch_log;
+ get_paca()->dtl_ridx = 0;
+ get_paca()->dtl_curr = dtl;
+ get_paca()->lppaca_ptr->dtl_idx = 0;
+
+ /* hypervisor reads buffer length from this field */
+ dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES;
+ ret = register_dtl(hard_smp_processor_id(), __pa(dtl));
+ if (ret)
+ pr_err("WARNING: DTL registration of cpu %d (hw %d) failed "
+ "with %d\n", smp_processor_id(),
+ hard_smp_processor_id(), ret);
+ get_paca()->lppaca_ptr->dtl_enable_mask = 2;
+
+ return 0;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+static inline int alloc_dispatch_logs(void)
+{
+ return 0;
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+static int alloc_dispatch_log_kmem_cache(void)
+{
+ dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES,
+ DISPATCH_LOG_BYTES, 0, NULL);
+ if (!dtl_cache) {
+ pr_warn("Failed to create dispatch trace log buffer cache\n");
+ pr_warn("Stolen time statistics will be unreliable\n");
+ return 0;
+ }
+
+ return alloc_dispatch_logs();
+}
+early_initcall(alloc_dispatch_log_kmem_cache);
+
+static void pSeries_idle(void)
+{
+ /* This would call on the cpuidle framework, and the back-end pseries
+ * driver to go to idle states
+ */
+ if (cpuidle_idle_call()) {
+ /* On error, execute default handler
+ * to go into low thread priority and possibly
+ * low power mode.
+ */
+ HMT_low();
+ HMT_very_low();
+ }
+}
+
+static void __init pSeries_setup_arch(void)
+{
+ panic_timeout = 10;
+
+ /* Discover PIC type and setup ppc_md accordingly */
+ pseries_discover_pic();
+
+ /* openpic global configuration register (64-bit format). */
+ /* openpic Interrupt Source Unit pointer (64-bit format). */
+ /* python0 facility area (mmio) (64-bit format) REAL address. */
+
+ /* init to some ~sane value until calibrate_delay() runs */
+ loops_per_jiffy = 50000000;
+
+ fwnmi_init();
+
+ /* Find and initialize PCI host bridges */
+ init_pci_config_tokens();
+ find_and_init_phbs();
+ pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb);
+ eeh_init();
+
+ pSeries_nvram_init();
+
+ if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+ vpa_init(boot_cpuid);
+ ppc_md.power_save = pSeries_idle;
+ }
+
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
+ else
+ ppc_md.enable_pmcs = power4_enable_pmcs;
+}
+
+static int __init pSeries_init_panel(void)
+{
+ /* Manually leave the kernel version on the panel. */
+ ppc_md.progress("Linux ppc64\n", 0);
+ ppc_md.progress(init_utsname()->version, 0);
+
+ return 0;
+}
+machine_arch_initcall(pseries, pSeries_init_panel);
+
+static int pseries_set_dabr(unsigned long dabr)
+{
+ return plpar_hcall_norets(H_SET_DABR, dabr);
+}
+
+static int pseries_set_xdabr(unsigned long dabr)
+{
+ /* We want to catch accesses from kernel and userspace */
+ return plpar_hcall_norets(H_SET_XDABR, dabr,
+ H_DABRX_KERNEL | H_DABRX_USER);
+}
+
+#define CMO_CHARACTERISTICS_TOKEN 44
+#define CMO_MAXLENGTH 1026
+
+void pSeries_coalesce_init(void)
+{
+ struct hvcall_mpp_x_data mpp_x_data;
+
+ if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data))
+ powerpc_firmware_features |= FW_FEATURE_XCMO;
+ else
+ powerpc_firmware_features &= ~FW_FEATURE_XCMO;
+}
+
+/**
+ * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions,
+ * handle that here. (Stolen from parse_system_parameter_string)
+ */
+void pSeries_cmo_feature_init(void)
+{
+ char *ptr, *key, *value, *end;
+ int call_status;
+ int page_order = IOMMU_PAGE_SHIFT;
+
+ pr_debug(" -> fw_cmo_feature_init()\n");
+ spin_lock(&rtas_data_buf_lock);
+ memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
+ call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL,
+ CMO_CHARACTERISTICS_TOKEN,
+ __pa(rtas_data_buf),
+ RTAS_DATA_BUF_SIZE);
+
+ if (call_status != 0) {
+ spin_unlock(&rtas_data_buf_lock);
+ pr_debug("CMO not available\n");
+ pr_debug(" <- fw_cmo_feature_init()\n");
+ return;
+ }
+
+ end = rtas_data_buf + CMO_MAXLENGTH - 2;
+ ptr = rtas_data_buf + 2; /* step over strlen value */
+ key = value = ptr;
+
+ while (*ptr && (ptr <= end)) {
+ /* Separate the key and value by replacing '=' with '\0' and
+ * point the value at the string after the '='
+ */
+ if (ptr[0] == '=') {
+ ptr[0] = '\0';
+ value = ptr + 1;
+ } else if (ptr[0] == '\0' || ptr[0] == ',') {
+ /* Terminate the string containing the key/value pair */
+ ptr[0] = '\0';
+
+ if (key == value) {
+ pr_debug("Malformed key/value pair\n");
+ /* Never found a '=', end processing */
+ break;
+ }
+
+ if (0 == strcmp(key, "CMOPageSize"))
+ page_order = simple_strtol(value, NULL, 10);
+ else if (0 == strcmp(key, "PrPSP"))
+ CMO_PrPSP = simple_strtol(value, NULL, 10);
+ else if (0 == strcmp(key, "SecPSP"))
+ CMO_SecPSP = simple_strtol(value, NULL, 10);
+ value = key = ptr + 1;
+ }
+ ptr++;
+ }
+
+ /* Page size is returned as the power of 2 of the page size,
+ * convert to the page size in bytes before returning
+ */
+ CMO_PageSize = 1 << page_order;
+ pr_debug("CMO_PageSize = %lu\n", CMO_PageSize);
+
+ if (CMO_PrPSP != -1 || CMO_SecPSP != -1) {
+ pr_info("CMO enabled\n");
+ pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
+ CMO_SecPSP);
+ powerpc_firmware_features |= FW_FEATURE_CMO;
+ pSeries_coalesce_init();
+ } else
+ pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
+ CMO_SecPSP);
+ spin_unlock(&rtas_data_buf_lock);
+ pr_debug(" <- fw_cmo_feature_init()\n");
+}
+
+/*
+ * Early initialization. Relocation is on but do not reference unbolted pages
+ */
+static void __init pSeries_init_early(void)
+{
+ pr_debug(" -> pSeries_init_early()\n");
+
+#ifdef CONFIG_HVC_CONSOLE
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ hvc_vio_init_early();
+#endif
+ if (firmware_has_feature(FW_FEATURE_DABR))
+ ppc_md.set_dabr = pseries_set_dabr;
+ else if (firmware_has_feature(FW_FEATURE_XDABR))
+ ppc_md.set_dabr = pseries_set_xdabr;
+
+ pSeries_cmo_feature_init();
+ iommu_init_early_pSeries();
+
+ pr_debug(" <- pSeries_init_early()\n");
+}
+
+/*
+ * Called very early, MMU is off, device-tree isn't unflattened
+ */
+
+static int __init pSeries_probe_hypertas(unsigned long node,
+ const char *uname, int depth,
+ void *data)
+{
+ const char *hypertas;
+ unsigned long len;
+
+ if (depth != 1 ||
+ (strcmp(uname, "rtas") != 0 && strcmp(uname, "rtas@0") != 0))
+ return 0;
+
+ hypertas = of_get_flat_dt_prop(node, "ibm,hypertas-functions", &len);
+ if (!hypertas)
+ return 1;
+
+ powerpc_firmware_features |= FW_FEATURE_LPAR;
+ fw_feature_init(hypertas, len);
+
+ return 1;
+}
+
+static int __init pSeries_probe(void)
+{
+ unsigned long root = of_get_flat_dt_root();
+ char *dtype = of_get_flat_dt_prop(root, "device_type", NULL);
+
+ if (dtype == NULL)
+ return 0;
+ if (strcmp(dtype, "chrp"))
+ return 0;
+
+ /* Cell blades firmware claims to be chrp while it's not. Until this
+ * is fixed, we need to avoid those here.
+ */
+ if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0") ||
+ of_flat_dt_is_compatible(root, "IBM,CBEA"))
+ return 0;
+
+ pr_debug("pSeries detected, looking for LPAR capability...\n");
+
+ /* Now try to figure out if we are running on LPAR */
+ of_scan_flat_dt(pSeries_probe_hypertas, NULL);
+
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ hpte_init_lpar();
+ else
+ hpte_init_native();
+
+ pr_debug("Machine is%s LPAR !\n",
+ (powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not");
+
+ return 1;
+}
+
+static int pSeries_pci_probe_mode(struct pci_bus *bus)
+{
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ return PCI_PROBE_DEVTREE;
+ return PCI_PROBE_NORMAL;
+}
+
+/**
+ * pSeries_power_off - tell firmware about how to power off the system.
+ *
+ * This function calls either the power-off rtas token in normal cases
+ * or the ibm,power-off-ups token (if present & requested) in case of
+ * a power failure. If power-off token is used, power on will only be
+ * possible with power button press. If ibm,power-off-ups token is used
+ * it will allow auto poweron after power is restored.
+ */
+static void pSeries_power_off(void)
+{
+ int rc;
+ int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups");
+
+ if (rtas_flash_term_hook)
+ rtas_flash_term_hook(SYS_POWER_OFF);
+
+ if (rtas_poweron_auto == 0 ||
+ rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) {
+ rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1);
+ printk(KERN_INFO "RTAS power-off returned %d\n", rc);
+ } else {
+ rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL);
+ printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc);
+ }
+ for (;;);
+}
+
+#ifndef CONFIG_PCI
+void pSeries_final_fixup(void) { }
+#endif
+
+define_machine(pseries) {
+ .name = "pSeries",
+ .probe = pSeries_probe,
+ .setup_arch = pSeries_setup_arch,
+ .init_early = pSeries_init_early,
+ .show_cpuinfo = pSeries_show_cpuinfo,
+ .log_error = pSeries_log_error,
+ .pcibios_fixup = pSeries_final_fixup,
+ .pci_probe_mode = pSeries_pci_probe_mode,
+ .restart = rtas_restart,
+ .power_off = pSeries_power_off,
+ .halt = rtas_halt,
+ .panic = rtas_os_term,
+ .get_boot_time = rtas_get_boot_time,
+ .get_rtc_time = rtas_get_rtc_time,
+ .set_rtc_time = rtas_set_rtc_time,
+ .calibrate_decr = generic_calibrate_decr,
+ .progress = rtas_progress,
+ .system_reset_exception = pSeries_system_reset_exception,
+ .machine_check_exception = pSeries_machine_check_exception,
+};
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
new file mode 100644
index 00000000000..eadba9521a3
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -0,0 +1,259 @@
+/*
+ * SMP support for pSeries machines.
+ *
+ * Dave Engebretsen, Peter Bergner, and
+ * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com
+ *
+ * Plus various changes from other IBM teams...
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+
+#include <asm/ptrace.h>
+#include <linux/atomic.h>
+#include <asm/irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+#include <asm/prom.h>
+#include <asm/smp.h>
+#include <asm/paca.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/firmware.h>
+#include <asm/system.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+#include <asm/mpic.h>
+#include <asm/vdso_datapage.h>
+#include <asm/cputhreads.h>
+#include <asm/xics.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+#include "offline_states.h"
+
+
+/*
+ * The Primary thread of each non-boot processor was started from the OF client
+ * interface by prom_hold_cpus and is spinning on secondary_hold_spinloop.
+ */
+static cpumask_var_t of_spin_mask;
+
+/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
+int smp_query_cpu_stopped(unsigned int pcpu)
+{
+ int cpu_status, status;
+ int qcss_tok = rtas_token("query-cpu-stopped-state");
+
+ if (qcss_tok == RTAS_UNKNOWN_SERVICE) {
+ printk_once(KERN_INFO
+ "Firmware doesn't support query-cpu-stopped-state\n");
+ return QCSS_HARDWARE_ERROR;
+ }
+
+ status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
+ if (status != 0) {
+ printk(KERN_ERR
+ "RTAS query-cpu-stopped-state failed: %i\n", status);
+ return status;
+ }
+
+ return cpu_status;
+}
+
+/**
+ * smp_startup_cpu() - start the given cpu
+ *
+ * At boot time, there is nothing to do for primary threads which were
+ * started from Open Firmware. For anything else, call RTAS with the
+ * appropriate start location.
+ *
+ * Returns:
+ * 0 - failure
+ * 1 - success
+ */
+static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+{
+ int status;
+ unsigned long start_here = __pa((u32)*((unsigned long *)
+ generic_secondary_smp_init));
+ unsigned int pcpu;
+ int start_cpu;
+
+ if (cpumask_test_cpu(lcpu, of_spin_mask))
+ /* Already started by OF and sitting in spin loop */
+ return 1;
+
+ pcpu = get_hard_smp_processor_id(lcpu);
+
+ /* Check to see if the CPU out of FW already for kexec */
+ if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){
+ cpumask_set_cpu(lcpu, of_spin_mask);
+ return 1;
+ }
+
+ /* Fixup atomic count: it exited inside IRQ handler. */
+ task_thread_info(paca[lcpu].__current)->preempt_count = 0;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
+ goto out;
+#endif
+ /*
+ * If the RTAS start-cpu token does not exist then presume the
+ * cpu is already spinning.
+ */
+ start_cpu = rtas_token("start-cpu");
+ if (start_cpu == RTAS_UNKNOWN_SERVICE)
+ return 1;
+
+ status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, pcpu);
+ if (status != 0) {
+ printk(KERN_ERR "start-cpu failed: %i\n", status);
+ return 0;
+ }
+
+#ifdef CONFIG_HOTPLUG_CPU
+out:
+#endif
+ return 1;
+}
+
+static void __devinit smp_xics_setup_cpu(int cpu)
+{
+ if (cpu != boot_cpuid)
+ xics_setup_cpu();
+
+ if (firmware_has_feature(FW_FEATURE_SPLPAR))
+ vpa_init(cpu);
+
+ cpumask_clear_cpu(cpu, of_spin_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+ set_cpu_current_state(cpu, CPU_STATE_ONLINE);
+ set_default_offline_state(cpu);
+#endif
+ pseries_notify_cpuidle_add_cpu(cpu);
+}
+
+static int __devinit smp_pSeries_kick_cpu(int nr)
+{
+ BUG_ON(nr < 0 || nr >= NR_CPUS);
+
+ if (!smp_startup_cpu(nr))
+ return -ENOENT;
+
+ /*
+ * The processor is currently spinning, waiting for the
+ * cpu_start field to become non-zero After we set cpu_start,
+ * the processor will continue on to secondary_start
+ */
+ paca[nr].cpu_start = 1;
+#ifdef CONFIG_HOTPLUG_CPU
+ set_preferred_offline_state(nr, CPU_STATE_ONLINE);
+
+ if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
+ long rc;
+ unsigned long hcpuid;
+
+ hcpuid = get_hard_smp_processor_id(nr);
+ rc = plpar_hcall_norets(H_PROD, hcpuid);
+ if (rc != H_SUCCESS)
+ printk(KERN_ERR "Error: Prod to wake up processor %d "
+ "Ret= %ld\n", nr, rc);
+ }
+#endif
+
+ return 0;
+}
+
+static int smp_pSeries_cpu_bootable(unsigned int nr)
+{
+ /* Special case - we inhibit secondary thread startup
+ * during boot if the user requests it.
+ */
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
+ return 0;
+ if (smt_enabled_at_boot
+ && cpu_thread_in_core(nr) >= smt_enabled_at_boot)
+ return 0;
+ }
+
+ return 1;
+}
+
+static struct smp_ops_t pSeries_mpic_smp_ops = {
+ .message_pass = smp_mpic_message_pass,
+ .probe = smp_mpic_probe,
+ .kick_cpu = smp_pSeries_kick_cpu,
+ .setup_cpu = smp_mpic_setup_cpu,
+};
+
+static struct smp_ops_t pSeries_xics_smp_ops = {
+ .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
+ .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
+ .probe = xics_smp_probe,
+ .kick_cpu = smp_pSeries_kick_cpu,
+ .setup_cpu = smp_xics_setup_cpu,
+ .cpu_bootable = smp_pSeries_cpu_bootable,
+};
+
+/* This is called very early */
+static void __init smp_init_pseries(void)
+{
+ int i;
+
+ pr_debug(" -> smp_init_pSeries()\n");
+
+ alloc_bootmem_cpumask_var(&of_spin_mask);
+
+ /* Mark threads which are still spinning in hold loops. */
+ if (cpu_has_feature(CPU_FTR_SMT)) {
+ for_each_present_cpu(i) {
+ if (cpu_thread_in_core(i) == 0)
+ cpumask_set_cpu(i, of_spin_mask);
+ }
+ } else {
+ cpumask_copy(of_spin_mask, cpu_present_mask);
+ }
+
+ cpumask_clear_cpu(boot_cpuid, of_spin_mask);
+
+ /* Non-lpar has additional take/give timebase */
+ if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+ smp_ops->give_timebase = rtas_give_timebase;
+ smp_ops->take_timebase = rtas_take_timebase;
+ }
+
+ pr_debug(" <- smp_init_pSeries()\n");
+}
+
+void __init smp_init_pseries_mpic(void)
+{
+ smp_ops = &pSeries_mpic_smp_ops;
+
+ smp_init_pseries();
+}
+
+void __init smp_init_pseries_xics(void)
+{
+ smp_ops = &pSeries_xics_smp_ops;
+
+ smp_init_pseries();
+}
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
new file mode 100644
index 00000000000..47226e04126
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2010 Brian King IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/suspend.h>
+#include <linux/stat.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <asm/machdep.h>
+#include <asm/mmu.h>
+#include <asm/rtas.h>
+#include <asm/topology.h>
+
+static u64 stream_id;
+static struct device suspend_dev;
+static DECLARE_COMPLETION(suspend_work);
+static struct rtas_suspend_me_data suspend_data;
+static atomic_t suspending;
+
+/**
+ * pseries_suspend_begin - First phase of hibernation
+ *
+ * Check to ensure we are in a valid state to hibernate
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_suspend_begin(suspend_state_t state)
+{
+ long vasi_state, rc;
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ /* Make sure the state is valid */
+ rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id);
+
+ vasi_state = retbuf[0];
+
+ if (rc) {
+ pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc);
+ return rc;
+ } else if (vasi_state == H_VASI_ENABLED) {
+ return -EAGAIN;
+ } else if (vasi_state != H_VASI_SUSPENDING) {
+ pr_err("pseries_suspend_begin: vasi_state returned state %ld\n",
+ vasi_state);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
+ * pseries_suspend_cpu - Suspend a single CPU
+ *
+ * Makes the H_JOIN call to suspend the CPU
+ *
+ **/
+static int pseries_suspend_cpu(void)
+{
+ if (atomic_read(&suspending))
+ return rtas_suspend_cpu(&suspend_data);
+ return 0;
+}
+
+/**
+ * pseries_suspend_enter - Final phase of hibernation
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_suspend_enter(suspend_state_t state)
+{
+ int rc = rtas_suspend_last_cpu(&suspend_data);
+
+ atomic_set(&suspending, 0);
+ atomic_set(&suspend_data.done, 1);
+ return rc;
+}
+
+/**
+ * pseries_prepare_late - Prepare to suspend all other CPUs
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_prepare_late(void)
+{
+ atomic_set(&suspending, 1);
+ atomic_set(&suspend_data.working, 0);
+ atomic_set(&suspend_data.done, 0);
+ atomic_set(&suspend_data.error, 0);
+ suspend_data.complete = &suspend_work;
+ INIT_COMPLETION(suspend_work);
+ return 0;
+}
+
+/**
+ * store_hibernate - Initiate partition hibernation
+ * @dev: subsys root device
+ * @attr: device attribute struct
+ * @buf: buffer
+ * @count: buffer size
+ *
+ * Write the stream ID received from the HMC to this file
+ * to trigger hibernating the partition
+ *
+ * Return value:
+ * number of bytes printed to buffer / other on failure
+ **/
+static ssize_t store_hibernate(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int rc;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ stream_id = simple_strtoul(buf, NULL, 16);
+
+ do {
+ rc = pseries_suspend_begin(PM_SUSPEND_MEM);
+ if (rc == -EAGAIN)
+ ssleep(1);
+ } while (rc == -EAGAIN);
+
+ if (!rc) {
+ stop_topology_update();
+ rc = pm_suspend(PM_SUSPEND_MEM);
+ start_topology_update();
+ }
+
+ stream_id = 0;
+
+ if (!rc)
+ rc = count;
+ return rc;
+}
+
+static DEVICE_ATTR(hibernate, S_IWUSR, NULL, store_hibernate);
+
+static struct bus_type suspend_subsys = {
+ .name = "power",
+ .dev_name = "power",
+};
+
+static const struct platform_suspend_ops pseries_suspend_ops = {
+ .valid = suspend_valid_only_mem,
+ .begin = pseries_suspend_begin,
+ .prepare_late = pseries_prepare_late,
+ .enter = pseries_suspend_enter,
+};
+
+/**
+ * pseries_suspend_sysfs_register - Register with sysfs
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int pseries_suspend_sysfs_register(struct device *dev)
+{
+ int rc;
+
+ if ((rc = subsys_system_register(&suspend_subsys, NULL)))
+ return rc;
+
+ dev->id = 0;
+ dev->bus = &suspend_subsys;
+
+ if ((rc = device_create_file(suspend_subsys.dev_root, &dev_attr_hibernate)))
+ goto subsys_unregister;
+
+ return 0;
+
+subsys_unregister:
+ bus_unregister(&suspend_subsys);
+ return rc;
+}
+
+/**
+ * pseries_suspend_init - initcall for pSeries suspend
+ *
+ * Return value:
+ * 0 on success / other on failure
+ **/
+static int __init pseries_suspend_init(void)
+{
+ int rc;
+
+ if (!machine_is(pseries) || !firmware_has_feature(FW_FEATURE_LPAR))
+ return 0;
+
+ suspend_data.token = rtas_token("ibm,suspend-me");
+ if (suspend_data.token == RTAS_UNKNOWN_SERVICE)
+ return 0;
+
+ if ((rc = pseries_suspend_sysfs_register(&suspend_dev)))
+ return rc;
+
+ ppc_md.suspend_disable_cpu = pseries_suspend_cpu;
+ suspend_set_ops(&pseries_suspend_ops);
+ return 0;
+}
+
+__initcall(pseries_suspend_init);