From 0e83815be719d3391bf5ea24b7fe696c07dbd417 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 27 Jul 2009 19:43:52 +0200
Subject: x86: fix section mismatch for i386 init code

Startup code for i386 in arch/x86/kernel/head_32.S is using the
reference variable initial_code that is located in the .cpuinit.data
section. If CONFIG_HOTPLUG_CPU is enabled, startup code is not in an
init section and can be called later too. In this case the reference
initial_code must be kept too. This patch fixes this. See below for
the section mismatch warning.

 WARNING: vmlinux.o(.cpuinit.data+0x0): Section mismatch in reference
 from the variable initial_code to the function
 .init.text:i386_start_kernel()
 The variable __cpuinitdata initial_code references
 a function __init i386_start_kernel().
 If i386_start_kernel is only used by initial_code then
 annotate i386_start_kernel with a matching annotation.

Signed-off-by: Robert Richter <robert.richter@amd.com>
LKML-Reference: <1248716632-26844-1-git-send-email-robert.richter@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/head_32.S | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 8663afb5653..0d98a01cbdb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -602,7 +602,11 @@ ignore_int:
 #endif
 	iret
 
-.section .cpuinit.data,"wa"
+#ifndef CONFIG_HOTPLUG_CPU
+	__CPUINITDATA
+#else
+	__REFDATA
+#endif
 .align 4
 ENTRY(initial_code)
 	.long i386_start_kernel
-- 
cgit 


From 6c6c51e4cc11a5456fb1172008f7c69d955af9f6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 3 Aug 2009 22:47:32 +1000
Subject: x86: Add quirk to make Apple MacBook5,2 use reboot=pci

The latest Apple MacBook (MacBook5,2) doesn't reboot successfully
under Linux; neither the EFI reboot method nor the default method
using the keyboard controller works (the system just hangs and doesn't
reset).  However, the method using the "PCI reset register" at 0xcf9
does work.

This adds a quirk to detect this machine via DMI and force the
reboot_type to BOOT_CF9.  With this it reboots successfully without
requiring a command-line option.  Note that the EFI code forces
reboot_type to BOOT_EFI when the machine is booted via EFI, but this
overrides that since the core_initcall runs after the EFI
initialization code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
LKML-Reference: <19062.56420.501516.316181@cargo.ozlabs.ibm.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/reboot.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 508e982dd07..834c9da8bf9 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -3,6 +3,7 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/efi.h>
+#include <linux/dmi.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -17,7 +18,6 @@
 #include <asm/cpu.h>
 
 #ifdef CONFIG_X86_32
-# include <linux/dmi.h>
 # include <linux/ctype.h>
 # include <linux/mc146818rtc.h>
 #else
@@ -404,6 +404,38 @@ EXPORT_SYMBOL(machine_real_restart);
 
 #endif /* CONFIG_X86_32 */
 
+/*
+ * Apple MacBook5,2 (2009 MacBook) needs reboot=p
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+	if (reboot_type != BOOT_CF9) {
+		reboot_type = BOOT_CF9;
+		printk(KERN_INFO "%s series board detected. "
+		       "Selecting PCI-method for reboots.\n", d->ident);
+	}
+	return 0;
+}
+
+static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
+	{	/* Handle problems with rebooting on Apple MacBook5,2 */
+		.callback = set_pci_reboot,
+		.ident = "Apple MacBook",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"),
+		},
+	},
+	{ }
+};
+
+static int __init pci_reboot_init(void)
+{
+	dmi_check_system(pci_reboot_dmi_table);
+	return 0;
+}
+core_initcall(pci_reboot_init);
+
 static inline void kb_wait(void)
 {
 	int i;
-- 
cgit 


From 6a7bbd57ed50bb62c9a81ae5f2e202ca689e5964 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 3 Aug 2009 22:38:10 +1000
Subject: x86: Make 64-bit efi_ioremap use ioremap on MMIO regions

Booting current 64-bit x86 kernels on the latest Apple MacBook
(MacBook5,2) via EFI gives the following warning:

[    0.182209] ------------[ cut here ]------------
[    0.182222] WARNING: at arch/x86/mm/pageattr.c:581 __cpa_process_fault+0x44/0xa0()
[    0.182227] Hardware name: MacBook5,2
[    0.182231] CPA: called for zero pte. vaddr = ffff8800ffe00000 cpa->vaddr = ffff8800ffe00000
[    0.182236] Modules linked in:
[    0.182242] Pid: 0, comm: swapper Not tainted 2.6.31-rc4 #6
[    0.182246] Call Trace:
[    0.182254]  [<ffffffff8102c754>] ? __cpa_process_fault+0x44/0xa0
[    0.182261]  [<ffffffff81048668>] warn_slowpath_common+0x78/0xd0
[    0.182266]  [<ffffffff81048744>] warn_slowpath_fmt+0x64/0x70
[    0.182272]  [<ffffffff8102c7ec>] ? update_page_count+0x3c/0x50
[    0.182280]  [<ffffffff818d25c5>] ? phys_pmd_init+0x140/0x22e
[    0.182286]  [<ffffffff8102c754>] __cpa_process_fault+0x44/0xa0
[    0.182292]  [<ffffffff8102ce60>] __change_page_attr_set_clr+0x5f0/0xb40
[    0.182301]  [<ffffffff810d1035>] ? vm_unmap_aliases+0x175/0x190
[    0.182307]  [<ffffffff8102d4ae>] change_page_attr_set_clr+0xfe/0x3d0
[    0.182314]  [<ffffffff8102dcca>] _set_memory_uc+0x2a/0x30
[    0.182319]  [<ffffffff8102dd4b>] set_memory_uc+0x7b/0xb0
[    0.182327]  [<ffffffff818afe31>] efi_enter_virtual_mode+0x2ad/0x2c9
[    0.182334]  [<ffffffff818a1c66>] start_kernel+0x2db/0x3f4
[    0.182340]  [<ffffffff818a1289>] x86_64_start_reservations+0x99/0xb9
[    0.182345]  [<ffffffff818a1389>] x86_64_start_kernel+0xe0/0xf2
[    0.182357] ---[ end trace 4eaa2a86a8e2da22 ]---
[    0.182982] init_memory_mapping: 00000000ffffc000-0000000100000000
[    0.182993]  00ffffc000 - 0100000000 page 4k

This happens because the 64-bit version of efi_ioremap calls
init_memory_mapping for all addresses, regardless of whether they are
RAM or MMIO.  The EFI tables on this machine ask for runtime access to
some MMIO regions:

[    0.000000] EFI: mem195: type=11, attr=0x8000000000000000, range=[0x0000000093400000-0x0000000093401000) (0MB)
[    0.000000] EFI: mem196: type=11, attr=0x8000000000000000, range=[0x00000000ffc00000-0x00000000ffc40000) (0MB)
[    0.000000] EFI: mem197: type=11, attr=0x8000000000000000, range=[0x00000000ffc40000-0x00000000ffc80000) (0MB)
[    0.000000] EFI: mem198: type=11, attr=0x8000000000000000, range=[0x00000000ffc80000-0x00000000ffca4000) (0MB)
[    0.000000] EFI: mem199: type=11, attr=0x8000000000000000, range=[0x00000000ffca4000-0x00000000ffcb4000) (0MB)
[    0.000000] EFI: mem200: type=11, attr=0x8000000000000000, range=[0x00000000ffcb4000-0x00000000ffffc000) (3MB)
[    0.000000] EFI: mem201: type=11, attr=0x8000000000000000, range=[0x00000000ffffc000-0x0000000100000000) (0MB)

This arranges to pass the EFI memory type through to efi_ioremap, and
makes efi_ioremap use ioremap rather than init_memory_mapping if the
type is EFI_MEMORY_MAPPED_IO.  With this, the above warning goes away.

Signed-off-by: Paul Mackerras <paulus@samba.org>
LKML-Reference: <19062.55858.533494.471153@cargo.ozlabs.ibm.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/efi.c    | 2 +-
 arch/x86/kernel/efi_64.c | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 96f7ac0bbf0..19ccf6d0dcc 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -512,7 +512,7 @@ void __init efi_enter_virtual_mode(void)
 			&& end_pfn <= max_pfn_mapped))
 			va = __va(md->phys_addr);
 		else
-			va = efi_ioremap(md->phys_addr, size);
+			va = efi_ioremap(md->phys_addr, size, md->type);
 
 		md->virt_addr = (u64) (unsigned long) va;
 
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 22c3b7828c5..ac0621a7ac3 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -98,10 +98,14 @@ void __init efi_call_phys_epilog(void)
 	early_runtime_code_mapping_set_exec(0);
 }
 
-void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
+				 u32 type)
 {
 	unsigned long last_map_pfn;
 
+	if (type == EFI_MEMORY_MAPPED_IO)
+		return ioremap(phys_addr, size);
+
 	last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
 	if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
 		return NULL;
-- 
cgit 


From d2ba8b211bb8abc29aa627dbd4dce08cfbc8082b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 3 Aug 2009 14:44:54 -0700
Subject: x86: Fix assert syntax in vmlinux.lds.S

Older versions of binutils did not accept the naked "ASSERT" syntax;
it is considered an expression whose value needs to be assigned to
something.

Reported-tested-and-fixed-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/vmlinux.lds.S | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 59f31d2dd43..78d185d797d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -393,8 +393,8 @@ SECTIONS
 
 
 #ifdef CONFIG_X86_32
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-        "kernel image bigger than KERNEL_IMAGE_SIZE")
+. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+	   "kernel image bigger than KERNEL_IMAGE_SIZE");
 #else
 /*
  * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -407,12 +407,12 @@ INIT_PER_CPU(irq_stack_union);
 /*
  * Build-time check on the image size:
  */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-	"kernel image bigger than KERNEL_IMAGE_SIZE")
+. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+	   "kernel image bigger than KERNEL_IMAGE_SIZE");
 
 #ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
-        "irq_stack_union is not at start of per-cpu area");
+. = ASSERT((per_cpu__irq_stack_union == 0),
+           "irq_stack_union is not at start of per-cpu area");
 #endif
 
 #endif /* CONFIG_X86_32 */
@@ -420,7 +420,7 @@ ASSERT((per_cpu__irq_stack_union == 0),
 #ifdef CONFIG_KEXEC
 #include <asm/kexec.h>
 
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
+. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+           "kexec control code size is too big");
 #endif
 
-- 
cgit 


From 6c7184b77464261b7d55583a48accbd1350923a3 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 27 Jul 2009 09:35:07 -0500
Subject: x86, UV: Handle missing blade-local memory correctly

UV blades may not have any blade-local memory. Add a field
(nid) to the UV blade structure to indicates whether the node
has local memory. This is needed by the GRU driver (pushed
separately).

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: linux-mm@kvack.org
LKML-Reference: <20090727143507.GA7006@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 096d19aea2f..ad3f0a984ca 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -591,6 +591,8 @@ void __init uv_system_init(void)
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = kmalloc(bytes, GFP_KERNEL);
 	BUG_ON(!uv_blade_info);
+	for (blade = 0; blade < uv_num_possible_blades(); blade++)
+		uv_blade_info[blade].memory_nid = -1;
 
 	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
 
@@ -629,6 +631,9 @@ void __init uv_system_init(void)
 		lcpu = uv_blade_info[blade].nr_possible_cpus;
 		uv_blade_info[blade].nr_possible_cpus++;
 
+		/* Any node on the blade, else will contain -1. */
+		uv_blade_info[blade].memory_nid = nid;
+
 		uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
 		uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
 		uv_cpu_hub_info(cpu)->m_val = m_val;
-- 
cgit 


From cc5e4fa1bd4d2f56da07f9092281afdcd2374ab9 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 27 Jul 2009 09:36:56 -0500
Subject: x86, UV: Delete mapping of MMR rangs mapped by BIOS

The UV BIOS has added additional MMR ranges that are mapped via
EFI virtual mode mappings. These ranges should be deleted from
ranges mapped by uv_system_init().

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: linux-mm@kvack.org
LKML-Reference: <20090727143656.GA7698@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 31 +------------------------------
 1 file changed, 1 insertion(+), 30 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ad3f0a984ca..445c210daf8 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -362,12 +362,6 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
 	BUG();
 }
 
-static __init void map_low_mmrs(void)
-{
-	init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
-	init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
-}
-
 enum map_type {map_wb, map_uc};
 
 static __init void map_high(char *id, unsigned long base, int shift,
@@ -395,26 +389,6 @@ static __init void map_gru_high(int max_pnode)
 		map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
 }
 
-static __init void map_config_high(int max_pnode)
-{
-	union uvh_rh_gam_cfg_overlay_config_mmr_u cfg;
-	int shift = UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT;
-
-	cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
-	if (cfg.s.enable)
-		map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
-}
-
-static __init void map_mmr_high(int max_pnode)
-{
-	union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
-	int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
-
-	mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
-	if (mmr.s.enable)
-		map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
-}
-
 static __init void map_mmioh_high(int max_pnode)
 {
 	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -566,8 +540,6 @@ void __init uv_system_init(void)
 	unsigned long mmr_base, present, paddr;
 	unsigned short pnode_mask;
 
-	map_low_mmrs();
-
 	m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
 	m_val = m_n_config.s.m_skt;
 	n_val = m_n_config.s.n_skt;
@@ -667,11 +639,10 @@ void __init uv_system_init(void)
 		pnode = (paddr >> m_val) & pnode_mask;
 		blade = boot_pnode_to_blade(pnode);
 		uv_node_to_blade[nid] = blade;
+		max_pnode = max(pnode, max_pnode);
 	}
 
 	map_gru_high(max_pnode);
-	map_mmr_high(max_pnode);
-	map_config_high(max_pnode);
 	map_mmioh_high(max_pnode);
 
 	uv_cpu_init();
-- 
cgit 


From c5997fa8d7aca3c9876a6ff71bacf27c41095ce9 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 27 Jul 2009 09:38:56 -0500
Subject: x86, UV: Fix UV apic mode

Change SGI UV default apicid mode to "physical". This is
required to match settings in the UV hub chip.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090727143856.GA8905@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 445c210daf8..832e908adcb 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -261,7 +261,7 @@ struct apic apic_x2apic_uv_x = {
 	.apic_id_registered		= uv_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
-	.irq_dest_mode			= 1, /* logical */
+	.irq_dest_mode			= 0, /* physical */
 
 	.target_cpus			= uv_target_cpus,
 	.disable_esr			= 0,
-- 
cgit 


From d8c7eb34c2db6268909ae8c3958be63bde254292 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 25 Jul 2009 03:23:09 -0700
Subject: x86: Don't use current_cpu_data in x2apic phys_pkg_id

One system has socket 1 come up as BSP.

kexeced kernel reports BSP as:

[    1.524550] Initializing cgroup subsys cpuacct
[    1.536064] initial_apicid:20
[    1.537135] ht_mask_width:1
[    1.538128] core_select_mask:f
[    1.539126] core_plus_mask_width:5
[    1.558479] CPU: Physical Processor ID: 0
[    1.559501] CPU: Processor Core ID: 0
[    1.560539] CPU: L1 I cache: 32K, L1 D cache: 32K
[    1.579098] CPU: L2 cache: 256K
[    1.580085] CPU: L3 cache: 24576K
[    1.581108] CPU 0/0x20 -> Node 0
[    1.596193] CPU 0 microcode level: 0xffff0008

It doesn't have correct physical processor id and will get an
error:

[   38.840859] CPU0 attaching sched-domain:
[   38.848287]  domain 0: span 0,8,72 level SIBLING
[   38.851151]   groups: 0 8 72
[   38.858137]   domain 1: span 0,8-15,72-79 level MC
[   38.868944]    groups: 0,8,72 9,73 10,74 11,75 12,76 13,77 14,78 15,79
[   38.881383] ERROR: parent span is not a superset of domain->span
[   38.890724]    domain 2: span 0-7,64-71 level CPU
[   38.899237] ERROR: domain->groups does not contain CPU0
[   38.909229]     groups: 8-15,72-79
[   38.912547] ERROR: groups don't span domain->span
[   38.919665]     domain 3: span 0-127 level NODE
[   38.930739]      groups: 0-7,64-71 8-15,72-79 16-23,80-87 24-31,88-95 32-39,96-103 40-47,104-111 48-55,112-119 56-63,120-127

it turns out: we can not use current_cpu_data in phys_pgd_id
for x2apic.

identify_boot_cpu() is called by check_bugs() before
smp_prepare_cpus() and till smp_prepare_cpus() current_cpu_data
for bsp is assigned with boot_cpu_data.

Just make phys_pkg_id for x2apic is aligned to xapic.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A6ADD0D.10002@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 arch/x86/kernel/apic/x2apic_phys.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 8e4cbb255c3..2ed4e2bb3b3 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -170,7 +170,7 @@ static unsigned long set_apic_id(unsigned int id)
 
 static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
 {
-	return current_cpu_data.initial_apicid >> index_msb;
+	return initial_apicid >> index_msb;
 }
 
 static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a284359627e..0b631c6a2e0 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -162,7 +162,7 @@ static unsigned long set_apic_id(unsigned int id)
 
 static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
 {
-	return current_cpu_data.initial_apicid >> index_msb;
+	return initial_apicid >> index_msb;
 }
 
 static void x2apic_send_IPI_self(int vector)
-- 
cgit 


From 2a5ef41661b56cf4eee042a6967c4e14b63e8eac Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 20 Jul 2009 09:28:41 -0500
Subject: x86, UV: Complete IRQ interrupt migration in arch_enable_uv_irq()

In uv_setup_irq(), the call to create_irq() initially assigns
IRQ vectors to cpu 0. The subsequent call to
assign_irq_vector() in arch_enable_uv_irq() migrates the IRQ to
another cpu and frees the cpu 0 vector - at least it will be
freed as soon as the "IRQ move" completes.

arch_enable_uv_irq() needs to send a cleanup IPI to complete
the IRQ move. Otherwise, assignment of GRU interrupts on large
systems (>200 cpus) will exhaust the cpu 0 interrupt vectors
and initialization of the GRU driver will fail.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090720142840.GA8885@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2284a4812b6..d2ed6c5ddc8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3793,6 +3793,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	mmr_pnode = uv_blade_to_pnode(mmr_blade);
 	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
 
+	if (cfg->move_in_progress)
+		send_cleanup_vector(cfg);
+
 	return irq;
 }
 
-- 
cgit 


From dc731fbbadf5d65c98fcd6c86472aa286c16458a Mon Sep 17 00:00:00 2001
From: Subrata Modak <subrata@linux.vnet.ibm.com>
Date: Tue, 21 Jul 2009 08:02:27 +0530
Subject: x86: Work around compilation warning in arch/x86/kernel/apm_32.c

The following fix was initially inspired by David Howells fix
few days back:

  http://lkml.org/lkml/2009/7/9/109

However, Ingo disapproves such fixes as it's dangerous (it can
hide future, relevant warnings) - in something as
performance-uncritical.

So, initialize 'err' to '0' to work around a GCC false positive
warning:

  http://lkml.org/lkml/2009/7/18/89

Signed-off-by: Subrata Modak<subrata@linux.vnet.ibm.com>
Cc: Sachin P Sant <sachinp@linux.vnet.ibm.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
LKML-Reference: <20090721023226.31855.67236.sendpatchset@subratamodak.linux.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apm_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 79302e9a33a..442b5508893 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -811,7 +811,7 @@ static int apm_do_idle(void)
 	u8 ret = 0;
 	int idled = 0;
 	int polling;
-	int err;
+	int err = 0;
 
 	polling = !!(current_thread_info()->status & TS_POLLING);
 	if (polling) {
-- 
cgit 


From 7d5b005652bc5ae3e1e0efc53fd0e25a643ec506 Mon Sep 17 00:00:00 2001
From: Alok Kataria <akataria@vmware.com>
Date: Tue, 4 Aug 2009 15:34:22 -0700
Subject: x86: Fix VMI && stack protector

With CONFIG_STACK_PROTECTOR turned on, VMI doesn't boot with
more than one processor. The problem is with the gs value not
being initialized correctly when registering the secondary
processor for VMI's case.

The patch below initializes the gs value for the AP to
__KERNEL_STACK_CANARY. Without this the secondary processor
keeps on taking a GP on every gs access.

Signed-off-by: Alok N Kataria <akataria@vmware.com>
Cc: <stable@kernel.org> # for v2.6.30.x
LKML-Reference: <1249425262.18955.40.camel@ank32.eng.vmware.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmi_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b263423fbe2..95a7289e4b0 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -441,7 +441,7 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 	ap.ds = __USER_DS;
 	ap.es = __USER_DS;
 	ap.fs = __KERNEL_PERCPU;
-	ap.gs = 0;
+	ap.gs = __KERNEL_STACK_CANARY;
 
 	ap.eflags = 0;
 
-- 
cgit 


From 087d7e56deffb611a098e7e257388a41edbeef1f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 4 Aug 2009 08:59:59 -0700
Subject: x86: Fix MSI-X initialization by using online_mask for x2apic
 target_cpus

found a system where x2apic reports an MSI-X irq initialization
failure:

[  302.859446] igbvf 0000:81:10.4: enabling device (0000 -> 0002)
[  302.874369] igbvf 0000:81:10.4: using 64bit DMA mask
[  302.879023] igbvf 0000:81:10.4: using 64bit consistent DMA mask
[  302.894386] igbvf 0000:81:10.4: enabling bus mastering
[  302.898171] igbvf 0000:81:10.4: setting latency timer to 64
[  302.914050] reserve_memtype added 0xefb08000-0xefb0c000, track uncached-minus, req uncached-minus, ret uncached-minus
[  302.933839] reserve_memtype added 0xefb28000-0xefb29000, track uncached-minus, req uncached-minus, ret uncached-minus
[  302.940367]   alloc irq_desc for 265 on node 4
[  302.956874]   alloc kstat_irqs on node 4
[  302.959452] alloc irq_2_iommu on node 0
[  302.974328] igbvf 0000:81:10.4: irq 265 for MSI/MSI-X
[  302.977778]   alloc irq_desc for 266 on node 4
[  302.980347]   alloc kstat_irqs on node 4
[  302.995312] free_memtype request 0xefb28000-0xefb29000
[  302.998816] igbvf 0000:81:10.4: Failed to initialize MSI-X interrupts.

... it turns out that when trying to enable MSI-X,
__assign_irq_vector(new, cfg_new, apic->target_cpus()) can not
get vector because for x2apic target-cpus returns cpumask_of(0)

Update that to online_mask like xapic.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <4A785AFF.3050902@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 8 +++++---
 arch/x86/kernel/apic/x2apic_phys.c    | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 2ed4e2bb3b3..a5371ec3677 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -17,11 +17,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	return x2apic_enabled();
 }
 
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-	return cpumask_of(0);
+	return cpu_online_mask;
 }
 
 /*
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 0b631c6a2e0..a8989aadc99 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -27,11 +27,13 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		return 0;
 }
 
-/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
-
+/*
+ * need to use more than cpu 0, because we need more vectors when
+ * MSI-X are used.
+ */
 static const struct cpumask *x2apic_target_cpus(void)
 {
-	return cpumask_of(0);
+	return cpu_online_mask;
 }
 
 static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-- 
cgit 


From 498cdbfbcf98e0d2c90a26e6a02a82f043876e48 Mon Sep 17 00:00:00 2001
From: Ozan Çağlayan <ozan@pardus.org.tr>
Date: Tue, 4 Aug 2009 19:39:31 +0300
Subject: x86: Add quirk to make Apple MacBookPro5,1 use reboot=pci
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MacBookPro5,1 is not able to reboot unless reboot=pci is set.
This patch forces it through a DMI quirk specific to this
device.

Signed-off-by: Ozan Çağlayan <ozan@pardus.org.tr>
LKML-Reference: <1249403971-6543-1-git-send-email-ozan@pardus.org.tr>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 834c9da8bf9..9eb89760370 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -405,7 +405,7 @@ EXPORT_SYMBOL(machine_real_restart);
 #endif /* CONFIG_X86_32 */
 
 /*
- * Apple MacBook5,2 (2009 MacBook) needs reboot=p
+ * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
  */
 static int __init set_pci_reboot(const struct dmi_system_id *d)
 {
@@ -426,6 +426,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"),
 		},
 	},
+	{	/* Handle problems with rebooting on Apple MacBookPro5,1 */
+		.callback = set_pci_reboot,
+		.ident = "Apple MacBookPro5,1",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"),
+		},
+	},
 	{ }
 };
 
-- 
cgit 


From fdb8a42742ac95606668f73481dfb2f760658fdd Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 6 Aug 2009 15:58:13 -0700
Subject: x86: fix buffer overflow in efi_init()

If the vendor name (from c16) can be longer than 100 bytes (or missing a
terminating null), then the null is written past the end of vendor[].

Found with Parfait, http://research.sun.com/projects/parfait/

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Ying <ying.huang@intel.com>
---
 arch/x86/kernel/efi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 19ccf6d0dcc..fe26ba3e345 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -354,7 +354,7 @@ void __init efi_init(void)
 	 */
 	c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
 	if (c16) {
-		for (i = 0; i < sizeof(vendor) && *c16; ++i)
+		for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
 			vendor[i] = *c16++;
 		vendor[i] = '\0';
 	} else
-- 
cgit 


From b6e61eef4f9f94714ac3ee4a5c96862d9bcd1836 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 31 Jul 2009 12:45:41 -0700
Subject: x86: Fix serialization in pit_expect_msb()

Wei Chong Tan reported a fast-PIT-calibration corner-case:

| pit_expect_msb() is vulnerable to SMI disturbance corner case
| in some platforms which causes /proc/cpuinfo to show wrong
| CPU MHz value when quick_pit_calibrate() jumps to success
| section.

I think that the real issue isn't even an SMI - but the fact
that in the very last iteration of the loop, there's no
serializing instruction _after_ the last 'rdtsc'. So even in
the absense of SMI's, we do have a situation where the cycle
counter was read without proper serialization.

The last check should be done outside the outer loop, since
_inside_ the outer loop, we'll be testing that the PIT has
the right MSB value has the right value in the next iteration.

So only the _last_ iteration is special, because that's the one
that will not check the PIT MSB value any more, and because the
final 'get_cycles()' isn't serialized.

In other words:

 - I'd like to move the PIT MSB check to after the last
   iteration, rather than in every iteration

 - I think we should comment on the fact that it's also a
   serializing instruction and so 'fences in' the TSC read.

Here's a suggested replacement.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: "Tan, Wei Chong" <wei.chong.tan@intel.com>
Tested-by: "Tan, Wei Chong" <wei.chong.tan@intel.com>
LKML-Reference: <B28277FD4E0F9247A3D55704C440A140D5D683F3@pgsmsx504.gar.corp.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6e1a368d21d..71f4368b357 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -275,15 +275,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
  * use the TSC value at the transitions to calculate a pretty
  * good value for the TSC frequencty.
  */
+static inline int pit_verify_msb(unsigned char val)
+{
+	/* Ignore LSB */
+	inb(0x42);
+	return inb(0x42) == val;
+}
+
 static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
 {
 	int count;
 	u64 tsc = 0;
 
 	for (count = 0; count < 50000; count++) {
-		/* Ignore LSB */
-		inb(0x42);
-		if (inb(0x42) != val)
+		if (!pit_verify_msb(val))
 			break;
 		tsc = get_cycles();
 	}
@@ -336,8 +341,7 @@ static unsigned long quick_pit_calibrate(void)
 	 * to do that is to just read back the 16-bit counter
 	 * once from the PIT.
 	 */
-	inb(0x42);
-	inb(0x42);
+	pit_verify_msb(0);
 
 	if (pit_expect_msb(0xff, &tsc, &d1)) {
 		for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
@@ -348,8 +352,19 @@ static unsigned long quick_pit_calibrate(void)
 			 * Iterate until the error is less than 500 ppm
 			 */
 			delta -= tsc;
-			if (d1+d2 < delta >> 11)
-				goto success;
+			if (d1+d2 >= delta >> 11)
+				continue;
+
+			/*
+			 * Check the PIT one more time to verify that
+			 * all TSC reads were stable wrt the PIT.
+			 *
+			 * This also guarantees serialization of the
+			 * last cycle read ('d2') in pit_expect_msb.
+			 */
+			if (!pit_verify_msb(0xfe - i))
+				break;
+			goto success;
 		}
 	}
 	printk("Fast TSC calibration failed\n");
-- 
cgit 


From 3e03bbeac541856aaaf1ce1ab0250b6a490e4099 Mon Sep 17 00:00:00 2001
From: Shunichi Fuji <palglowr@gmail.com>
Date: Tue, 11 Aug 2009 03:34:40 +0900
Subject: x86: Add reboot quirk for every 5 series MacBook/Pro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reboot does not work on my MacBook Pro 13 inch (MacBookPro5,5)
too. It seems all unibody MacBook and MacBookPro require
PCI reboot handling, i guess.

Following model/machine ID list shows unibody MacBook/Pro have
the 5 series of model number:

   http://www.everymac.com/systems/by_capability/macs-by-machine-model-machine-id.html

Signed-off-by: Shunichi Fuji <palglowr@gmail.com>
Cc: Ozan Çağlayan <ozan@pardus.org.tr>
LKML-Reference: <30046e3b0908101134p6487ddbftd8776e4ddef204be@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 9eb89760370..a06e8d10184 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -418,20 +418,20 @@ static int __init set_pci_reboot(const struct dmi_system_id *d)
 }
 
 static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
-	{	/* Handle problems with rebooting on Apple MacBook5,2 */
+	{	/* Handle problems with rebooting on Apple MacBook5 */
 		.callback = set_pci_reboot,
-		.ident = "Apple MacBook",
+		.ident = "Apple MacBook5",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5,2"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
 		},
 	},
-	{	/* Handle problems with rebooting on Apple MacBookPro5,1 */
+	{	/* Handle problems with rebooting on Apple MacBookPro5 */
 		.callback = set_pci_reboot,
-		.ident = "Apple MacBookPro5,1",
+		.ident = "Apple MacBookPro5",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
-			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5,1"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
 		},
 	},
 	{ }
-- 
cgit 


From 0d01f31439c1e4d602bf9fdc924ab66f407f5e38 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Sun, 9 Aug 2009 21:44:49 -0700
Subject: x86, mce: therm_throt - change when we print messages

My Latitude d630 seems to be handling thermal events in SMI by
lowering the max frequency of the CPU till it cools down but
still leaks the "everything is normal" events.

This spams the console and with high priority printks.

Adjust therm_throt driver to only print messages about the fact
that temperatire returned back to normal when leaving the
throttling state.

Also lower the severity of "back to normal" message from
KERN_CRIT to KERN_INFO.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
Acked-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <20090810051513.0558F526EC9@mailhub.coreip.homeip.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index bff8dd191dd..8bc64cfbe93 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -36,6 +36,7 @@
 
 static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
+static DEFINE_PER_CPU(bool, thermal_throttle_active);
 
 static atomic_t therm_throt_en		= ATOMIC_INIT(0);
 
@@ -96,24 +97,27 @@ static int therm_throt_process(int curr)
 {
 	unsigned int cpu = smp_processor_id();
 	__u64 tmp_jiffs = get_jiffies_64();
+	bool was_throttled = __get_cpu_var(thermal_throttle_active);
+	bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr;
 
-	if (curr)
+	if (is_throttled)
 		__get_cpu_var(thermal_throttle_count)++;
 
-	if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+	if (!(was_throttled ^ is_throttled) &&
+	    time_before64(tmp_jiffs, __get_cpu_var(next_check)))
 		return 0;
 
 	__get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
 
 	/* if we just entered the thermal event */
-	if (curr) {
+	if (is_throttled) {
 		printk(KERN_CRIT "CPU%d: Temperature above threshold, "
-		       "cpu clock throttled (total events = %lu)\n", cpu,
-		       __get_cpu_var(thermal_throttle_count));
+		       "cpu clock throttled (total events = %lu)\n",
+		       cpu, __get_cpu_var(thermal_throttle_count));
 
 		add_taint(TAINT_MACHINE_CHECK);
-	} else {
-		printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+	} else if (was_throttled) {
+		printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
 	}
 
 	return 1;
-- 
cgit 


From 3c581a7f94542341bf0da496a226b44ac63521a8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 Aug 2009 10:47:36 +0200
Subject: perf_counter, x86: Fix lapic printk message

Instead of this garbled bootup on UP Pentium-M systems:

[    0.015048] Performance Counters:
[    0.016004] no Local APIC, try rebooting with lapicno PMU driver, software counters only.

Print:

[    0.015050] Performance Counters:
[    0.016004] no APIC, boot with the "lapic" boot parameter to force-enable it.
[    0.017003] no PMU driver, software counters only.

Cf: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a7aa8f90095..40e233a24d9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1590,7 +1590,7 @@ static int p6_pmu_init(void)
 	}
 
 	if (!cpu_has_apic) {
-		pr_info("no Local APIC, try rebooting with lapic");
+		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
 		return -ENODEV;
 	}
 
-- 
cgit 


From f64ccccb8afa43abdd63fcbd230f818d6ea0883f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 Aug 2009 10:26:33 +0200
Subject: perf_counter, x86: Fix generic cache events on P6-mobile CPUs

Johannes Stezenbach reported that 'perf stat' does not count
cache-miss and cache-references events on his Pentium-M based
laptop.

This is because we left them blank in p6_perfmon_event_map[],
fill them in.

Reported-by: Johannes Stezenbach <js@sig21.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 40e233a24d9..fffc126dbdf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -72,8 +72,8 @@ static const u64 p6_perfmon_event_map[] =
 {
   [PERF_COUNT_HW_CPU_CYCLES]		= 0x0079,
   [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0000,
-  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0000,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x012e,
   [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
   [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
   [PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,
-- 
cgit 


From fbd8b1819e80ac5a176d085fdddc3a34d1499318 Mon Sep 17 00:00:00 2001
From: Kevin Winchester <kjwinchester@gmail.com>
Date: Mon, 10 Aug 2009 19:56:45 -0300
Subject: x86: Clear incorrectly forced X86_FEATURE_LAHF_LM flag

Due to an erratum with certain AMD Athlon 64 processors, the
BIOS may need to force enable the LAHF_LM capability.
Unfortunately, in at least one case, the BIOS does this even
for processors that do not support the functionality.

Add a specific check that will clear the feature bit for
processors known not to support the LAHF/SAHF instructions.

Signed-off-by: Kevin Winchester <kjwinchester@gmail.com>
Acked-by: Borislav Petkov <petkovbb@googlemail.com>
LKML-Reference: <4A80A5AD.2000209@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e2485b03f1c..63fddcd082c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -400,6 +400,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		level = cpuid_eax(1);
 		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
 			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+		/*
+		 * Some BIOSes incorrectly force this feature, but only K8
+		 * revision D (model = 0x14) and later actually support it.
+		 */
+		if (c->x86_model < 0x14)
+			clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
 	}
 	if (c->x86 == 0x10 || c->x86 == 0x11)
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-- 
cgit 


From e8055139d996e85722984968472868d6dccb1490 Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@rainbow-software.org>
Date: Tue, 11 Aug 2009 20:00:11 +0200
Subject: x86: Fix oops in identify_cpu() on CPUs without CPUID

Kernel is broken for x86 CPUs without CPUID since 2.6.28. It
crashes with NULL pointer dereference in identify_cpu():

766        generic_identify(c);
767
768-->     if (this_cpu->c_identify)
769               this_cpu->c_identify(c);

this_cpu is NULL. This is because it's only initialized in
get_cpu_vendor() function, which is not called if the CPU has
no CPUID instruction.

Signed-off-by: Ondrej Zary <linux@rainbow-software.org>
LKML-Reference: <200908112000.15993.linux@rainbow-software.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/common.c | 48 ++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f1961c07af9..5ce60a88027 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -59,7 +59,30 @@ void __init setup_cpu_local_masks(void)
 	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
 }
 
-static const struct cpu_dev *this_cpu __cpuinitdata;
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+	display_cacheinfo(c);
+#else
+	/* Not much we can do here... */
+	/* Check if at least it has cpuid */
+	if (c->cpuid_level == -1) {
+		/* No cpuid. It must be an ancient CPU */
+		if (c->x86 == 4)
+			strcpy(c->x86_model_id, "486");
+		else if (c->x86 == 3)
+			strcpy(c->x86_model_id, "386");
+	}
+#endif
+}
+
+static const struct cpu_dev __cpuinitconst default_cpu = {
+	.c_init		= default_init,
+	.c_vendor	= "Unknown",
+	.c_x86_vendor	= X86_VENDOR_UNKNOWN,
+};
+
+static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
 
 DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
@@ -332,29 +355,6 @@ void switch_to_new_gdt(int cpu)
 
 static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
 
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_64
-	display_cacheinfo(c);
-#else
-	/* Not much we can do here... */
-	/* Check if at least it has cpuid */
-	if (c->cpuid_level == -1) {
-		/* No cpuid. It must be an ancient CPU */
-		if (c->x86 == 4)
-			strcpy(c->x86_model_id, "486");
-		else if (c->x86 == 3)
-			strcpy(c->x86_model_id, "386");
-	}
-#endif
-}
-
-static const struct cpu_dev __cpuinitconst default_cpu = {
-	.c_init	= default_init,
-	.c_vendor = "Unknown",
-	.c_x86_vendor = X86_VENDOR_UNKNOWN,
-};
-
 static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
-- 
cgit 


From 04da8a43da804723a550f00dd158fd5b5e25ae35 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 Aug 2009 10:40:08 +0200
Subject: perf_counter, x86: Fix/improve apic fallback

Johannes Stezenbach reported that his Pentium-M based
laptop does not have the local APIC enabled by default,
and hence perfcounters do not get initialized.

Add a fallback for this case: allow non-sampled counters
and return with an error on sampled counters. This allows
'perf stat' to work out of box - and allows 'perf top'
and 'perf record' to fall back on a hrtimer based sampling
method.

( Passing 'lapic' on the boot line will allow hardware
  sampling to occur - but if the APIC is disabled
  permanently by the hardware then this fallback still
  allows more systems to use perfcounters. )

Also decouple perfcounter support from X86_LOCAL_APIC.

-v2: fix typo breaking counters on all other systems ...

Reported-by: Johannes Stezenbach <js@sig21.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fffc126dbdf..900332b800f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -55,6 +55,7 @@ struct x86_pmu {
 	int		num_counters_fixed;
 	int		counter_bits;
 	u64		counter_mask;
+	int		apic;
 	u64		max_period;
 	u64		intel_ctrl;
 };
@@ -613,6 +614,7 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
 
 static bool reserve_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -627,9 +629,11 @@ static bool reserve_pmc_hardware(void)
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
+#endif
 
 	return true;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -644,10 +648,12 @@ perfctr_fail:
 		enable_lapic_nmi_watchdog();
 
 	return false;
+#endif
 }
 
 static void release_pmc_hardware(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	for (i = 0; i < x86_pmu.num_counters; i++) {
@@ -657,6 +663,7 @@ static void release_pmc_hardware(void)
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
+#endif
 }
 
 static void hw_perf_counter_destroy(struct perf_counter *counter)
@@ -748,6 +755,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
 		atomic64_set(&hwc->period_left, hwc->sample_period);
+	} else {
+		/*
+		 * If we have a PMU initialized but no APIC
+		 * interrupts, we cannot sample hardware
+		 * counters (user-space has to fall back and
+		 * sample via a hrtimer based software counter):
+		 */
+		if (!x86_pmu.apic)
+			return -EOPNOTSUPP;
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -1449,18 +1465,22 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
 
 void set_perf_counter_pending(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+#endif
 }
 
 void perf_counters_lapic_init(void)
 {
-	if (!x86_pmu_initialized())
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (!x86_pmu.apic || !x86_pmu_initialized())
 		return;
 
 	/*
 	 * Always use NMI for PMU
 	 */
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
 }
 
 static int __kprobes
@@ -1484,7 +1504,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 
 	regs = args->regs;
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
+#endif
 	/*
 	 * Can't rely on the handled return value to say it was our NMI, two
 	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1515,6 +1537,7 @@ static struct x86_pmu p6_pmu = {
 	.event_map		= p6_pmu_event_map,
 	.raw_event		= p6_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
+	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
 	.version		= 0,
 	.num_counters		= 2,
@@ -1541,6 +1564,7 @@ static struct x86_pmu intel_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	.apic			= 1,
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -1564,6 +1588,7 @@ static struct x86_pmu amd_pmu = {
 	.num_counters		= 4,
 	.counter_bits		= 48,
 	.counter_mask		= (1ULL << 48) - 1,
+	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
 };
@@ -1589,13 +1614,14 @@ static int p6_pmu_init(void)
 		return -ENODEV;
 	}
 
+	x86_pmu = p6_pmu;
+
 	if (!cpu_has_apic) {
 		pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
-		return -ENODEV;
+		pr_info("no hardware sampling interrupt available.\n");
+		x86_pmu.apic = 0;
 	}
 
-	x86_pmu				= p6_pmu;
-
 	return 0;
 }
 
-- 
cgit 


From 74d46d6b2d23d44d72c37df4c6a5d2e782f7b088 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Jul 2009 17:11:50 +0900
Subject: percpu, sparc64: fix sparse possible cpu map handling

percpu code has been assuming num_possible_cpus() == nr_cpu_ids which
is incorrect if cpu_possible_map contains holes.  This causes percpu
code to access beyond allocated memories and vmalloc areas.  On a
sparc64 machine with cpus 0 and 2 (u60), this triggers the following
warning or fails boot.

 WARNING: at /devel/tj/os/work/mm/vmalloc.c:106 vmap_page_range_noflush+0x1f0/0x240()
 Modules linked in:
 Call Trace:
  [00000000004b17d0] vmap_page_range_noflush+0x1f0/0x240
  [00000000004b1840] map_vm_area+0x20/0x60
  [00000000004b1950] __vmalloc_area_node+0xd0/0x160
  [0000000000593434] deflate_init+0x14/0xe0
  [0000000000583b94] __crypto_alloc_tfm+0xd4/0x1e0
  [00000000005844f0] crypto_alloc_base+0x50/0xa0
  [000000000058b898] alg_test_comp+0x18/0x80
  [000000000058dad4] alg_test+0x54/0x180
  [000000000058af00] cryptomgr_test+0x40/0x60
  [0000000000473098] kthread+0x58/0x80
  [000000000042b590] kernel_thread+0x30/0x60
  [0000000000472fd0] kthreadd+0xf0/0x160
 ---[ end trace 429b268a213317ba ]---

This patch fixes generic percpu functions and sparc64
setup_per_cpu_areas() so that they handle sparse cpu_possible_map
properly.

Please note that on x86, cpu_possible_map() doesn't contain holes and
thus num_possible_cpus() == nr_cpu_ids and this patch doesn't cause
any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 29a3eef7cf4..07d81916f21 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -165,7 +165,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
-		size_t tot_size = num_possible_cpus() * PMD_SIZE;
+		size_t tot_size = nr_cpu_ids * PMD_SIZE;
 
 		/* on non-NUMA, embedding is better */
 		if (!pcpu_need_numa())
@@ -199,7 +199,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 	dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
 
 	/* allocate pointer array and alloc large pages */
-	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+	map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
 	pcpul_map = alloc_bootmem(map_size);
 
 	for_each_possible_cpu(cpu) {
@@ -228,7 +228,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 
 	/* allocate address and map */
 	pcpul_vm.flags = VM_ALLOC;
-	pcpul_vm.size = num_possible_cpus() * PMD_SIZE;
+	pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
 	vm_area_register_early(&pcpul_vm, PMD_SIZE);
 
 	for_each_possible_cpu(cpu) {
@@ -250,8 +250,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 				     PMD_SIZE, pcpul_vm.addr, NULL);
 
 	/* sort pcpul_map array for pcpu_lpage_remapped() */
-	for (i = 0; i < num_possible_cpus() - 1; i++)
-		for (j = i + 1; j < num_possible_cpus(); j++)
+	for (i = 0; i < nr_cpu_ids - 1; i++)
+		for (j = i + 1; j < nr_cpu_ids; j++)
 			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
 				struct pcpul_ent tmp = pcpul_map[i];
 				pcpul_map[i] = pcpul_map[j];
@@ -288,7 +288,7 @@ void *pcpu_lpage_remapped(void *kaddr)
 {
 	void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK);
 	unsigned long offset = (unsigned long)kaddr & ~PMD_MASK;
-	int left = 0, right = num_possible_cpus() - 1;
+	int left = 0, right = nr_cpu_ids - 1;
 	int pos;
 
 	/* pcpul in use at all? */
@@ -377,7 +377,7 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pcpu4k_nr_static_pages = PFN_UP(static_size);
 
 	/* unaligned allocations can't be freed, round up to page size */
-	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
 			       * sizeof(pcpu4k_pages[0]));
 	pcpu4k_pages = alloc_bootmem(pages_size);
 
-- 
cgit 


From 3ef12c3c97603bad405d30c989718cc9405e2759 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Fri, 14 Aug 2009 13:56:37 -0500
Subject: x86: Fix UV BAU destination subnode id

The SGI UV Broadcast Assist Unit is used to send TLB shootdown
messages to remote nodes of the system.  The header of the
message must contain the subnode id of the block in the
receiving hub that handles such messages.  It should always be
0x10, the id of the "LB" block.

It had previously been documented as a "must be zero" field.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Acked-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <E1Mc1x7-0005Ce-6t@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tlb_uv.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8ccabb8a2f6..77b9689f8ed 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -744,6 +744,7 @@ uv_activation_descriptor_init(int node, int pnode)
 		 * note that base_dest_nodeid is actually a nasid.
 		 */
 		ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+		ad2->header.dest_subnodeid = 0x10; /* the LB */
 		ad2->header.command = UV_NET_ENDPOINT_INTD;
 		ad2->header.int_both = 1;
 		/*
-- 
cgit 


From 4e5c25d405e18a2f279ca2bfc855508ec3a0186b Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Date: Sun, 16 Aug 2009 15:54:37 +0100
Subject: x86, mce: therm_throt: Don't log redundant normality

0d01f31439c1e4d602bf9fdc924ab66f407f5e38 "x86, mce: therm_throt
- change when we print messages" removed redundant
announcements of "Temperature/speed normal".

They're not worth logging and remove their accompanying
"Machine check events logged" messages as well from the
console.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dmitry Torokhov <dtor@mail.ru>
LKML-Reference: <Pine.LNX.4.64.0908161544100.7929@sister.anvils>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 8bc64cfbe93..5957a93e517 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -116,11 +116,14 @@ static int therm_throt_process(int curr)
 		       cpu, __get_cpu_var(thermal_throttle_count));
 
 		add_taint(TAINT_MACHINE_CHECK);
-	} else if (was_throttled) {
+		return 1;
+	}
+	if (was_throttled) {
 		printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
+		return 1;
 	}
 
-	return 1;
+	return 0;
 }
 
 #ifdef CONFIG_SYSFS
-- 
cgit 


From 52459ab91363343af8ae252766e9da762344a2e7 Mon Sep 17 00:00:00 2001
From: Leonardo Potenza <lpotenza@inwind.it>
Date: Sun, 16 Aug 2009 18:55:48 +0200
Subject: x86: Annotate section mismatch warnings in kernel/apic/x2apic_uv_x.c

The function uv_acpi_madt_oem_check() has been marked __init,
the struct apic_x2apic_uv_x has been marked __refdata.

The aim is to address the following section mismatch messages:

WARNING: arch/x86/kernel/apic/built-in.o(.data+0x1368): Section mismatch in reference from the variable apic_x2apic_uv_x to the function .cpuinit.text:uv_wakeup_secondary()
The variable apic_x2apic_uv_x references
the function __cpuinit uv_wakeup_secondary()
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*driver, *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console,

WARNING: arch/x86/kernel/built-in.o(.data+0x68e8): Section mismatch in reference from the variable apic_x2apic_uv_x to the function .cpuinit.text:uv_wakeup_secondary()
The variable apic_x2apic_uv_x references
the function __cpuinit uv_wakeup_secondary()
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*driver, *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console,

WARNING: arch/x86/built-in.o(.text+0x7b36f): Section mismatch in reference from the function uv_acpi_madt_oem_check() to the function .init.text:early_ioremap()
The function uv_acpi_madt_oem_check() references
the function __init early_ioremap().
This is often because uv_acpi_madt_oem_check lacks a __init
annotation or the annotation of early_ioremap is wrong.

WARNING: arch/x86/built-in.o(.text+0x7b38d): Section mismatch in reference from the function uv_acpi_madt_oem_check() to the function .init.text:early_iounmap()
The function uv_acpi_madt_oem_check() references
the function __init early_iounmap().
This is often because uv_acpi_madt_oem_check lacks a __init
annotation or the annotation of early_iounmap is wrong.

WARNING: arch/x86/built-in.o(.data+0x8668): Section mismatch in reference from the variable apic_x2apic_uv_x to the function .cpuinit.text:uv_wakeup_secondary()
The variable apic_x2apic_uv_x references
the function __cpuinit uv_wakeup_secondary()
If the reference is valid then annotate the
variable with __init* or __refdata (see linux/init.h) or name the variable:
*driver, *_template, *_timer, *_sht, *_ops, *_probe, *_probe_one, *_console,

Signed-off-by: Leonardo Potenza <lpotenza@inwind.it>
LKML-Reference: <200908161855.48302.lpotenza@inwind.it>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 832e908adcb..601159374e8 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -46,7 +46,7 @@ static int early_get_nodeid(void)
 	return node_id.s.node_id;
 }
 
-static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	if (!strcmp(oem_id, "SGI")) {
 		if (!strcmp(oem_table_id, "UVL"))
@@ -253,7 +253,7 @@ static void uv_send_IPI_self(int vector)
 	apic_write(APIC_SELF_IPI, vector);
 }
 
-struct apic apic_x2apic_uv_x = {
+struct apic __refdata apic_x2apic_uv_x = {
 
 	.name				= "UV large system",
 	.probe				= NULL,
-- 
cgit 


From c7f6fa44115d401e89db730f357629d39f8e4ba6 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 28 Jul 2009 23:52:54 +0200
Subject: x86, mce: don't log boot MCEs on Pentium M (model == 13) CPUs

On my legacy Pentium M laptop (Acer Extensa 2900) I get bogus MCE on a cold
boot with CONFIG_X86_NEW_MCE enabled, i.e. (after decoding it with mcelog):

MCE 0
HARDWARE ERROR. This is *NOT* a software problem!
Please contact your hardware vendor
CPU 0 BANK 1 MCG status:
MCi status:
Error overflow
Uncorrected error
Error enabled
Processor context corrupt
MCA: Data CACHE Level-1 UNKNOWN Error
STATUS f200000000000195 MCGSTATUS 0

[ The other STATUS values observed: f2000000000001b5 (... UNKNOWN error)
  and f200000000000115 (... READ Error).

  To verify that this is not a CONFIG_X86_NEW_MCE bug I also modified
  the CONFIG_X86_OLD_MCE code (which doesn't log any MCEs) to dump
  content of STATUS MSR before it is cleared during initialization. ]

Since the bogus MCE results in a kernel taint (which in turn disables
lockdep support) don't log boot MCEs on Pentium M (model == 13) CPUs
by default ("mce=bootlog" boot parameter can be be used to get the old
behavior).

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1cfb623ce11..a0c2910d96a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1273,6 +1273,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
 			monarch_timeout < 0)
 			monarch_timeout = USEC_PER_SEC;
+
+		/* There are also broken BIOSes on some Pentium M systems. */
+		if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0)
+			mce_bootlog = 0;
 	}
 	if (monarch_timeout < 0)
 		monarch_timeout = 0;
-- 
cgit 


From e412cd257e0d51e0ecbb89f50953835b5a0681b2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 17 Aug 2009 10:19:00 +0200
Subject: x86, mce: Don't initialize MCEs on unknown CPUs

An older test-box started hanging at the following point during
bootup:

 [    0.022996] Mount-cache hash table entries: 512
 [    0.024996] Initializing cgroup subsys debug
 [    0.025996] Initializing cgroup subsys cpuacct
 [    0.026995] Initializing cgroup subsys devices
 [    0.027995] Initializing cgroup subsys freezer
 [    0.028995] mce: CPU supports 5 MCE banks

I've bisected it down to commit 4efc0670 ("x86, mce: use 64bit
machine check code on 32bit"), which utilizes the MCE code on
32-bit systems too.

The problem is caused by this detail in my config:

  # CONFIG_CPU_SUP_INTEL is not set

This disables the quirks in mce_cpu_quirks() but still enables
MCE support - which then hangs due to the missing quirk
workaround needed on this CPU:

	if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
		mce_banks[0].init = 0;

The safe solution is to not initialize MCEs if we dont know on
what CPU we are running (or if that CPU's support code got
disabled in the config).

Also be a bit more defensive on 32-bit systems: dont do a
boot-time dump of pending MCEs not just on the specific system
that we found a problem with (Pentium-M), but earlier ones as
well.

Now this problem is probably not common and disabling CPU
support is rare - but still being more defensive in something
we turned on for a wide range of CPUs is prudent.

Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
LKML-Reference: Message-ID: <4A88E3E4.40506@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a0c2910d96a..01213048f62 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1226,8 +1226,13 @@ static void mce_init(void)
 }
 
 /* Add per CPU specific workarounds here */
-static void mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
+	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+		return -EOPNOTSUPP;
+	}
+
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
 		if (c->x86 == 15 && banks > 4) {
@@ -1274,14 +1279,19 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 			monarch_timeout < 0)
 			monarch_timeout = USEC_PER_SEC;
 
-		/* There are also broken BIOSes on some Pentium M systems. */
-		if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0)
+		/*
+		 * There are also broken BIOSes on some Pentium M and
+		 * earlier systems:
+		 */
+		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
 			mce_bootlog = 0;
 	}
 	if (monarch_timeout < 0)
 		monarch_timeout = 0;
 	if (mce_bootlog != 0)
 		mce_panic_timeout = 30;
+
+	return 0;
 }
 
 static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
@@ -1342,11 +1352,10 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 	if (!mce_available(c))
 		return;
 
-	if (mce_cap_init() < 0) {
+	if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
 		mce_disabled = 1;
 		return;
 	}
-	mce_cpu_quirks(c);
 
 	machine_check_vector = do_machine_check;
 
-- 
cgit 


From 78b89ecd731798f2fec8cc26ca90739253cec33c Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Tue, 18 Aug 2009 16:41:33 +0100
Subject: i386: Fix section mismatches for init code with !HOTPLUG_CPU

Commit 0e83815be719d3391bf5ea24b7fe696c07dbd417 changed the
section the initial_code variable gets allocated in, in an
attempt to address a section conflict warning. This, however
created a new section conflict when building without
HOTPLUG_CPU. The apparently only (reasonable) way to address
this is to always use __REFDATA.

Once at it, also fix a second section mismatch when not using
HOTPLUG_CPU.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <4A8AE7CD020000780001054B@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_32.S | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0d98a01cbdb..cc827ac9e8d 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -261,9 +261,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
  * which will be freed later
  */
 
-#ifndef CONFIG_HOTPLUG_CPU
-.section .init.text,"ax",@progbits
-#endif
+__CPUINIT
 
 #ifdef CONFIG_SMP
 ENTRY(startup_32_smp)
@@ -602,11 +600,7 @@ ignore_int:
 #endif
 	iret
 
-#ifndef CONFIG_HOTPLUG_CPU
-	__CPUINITDATA
-#else
 	__REFDATA
-#endif
 .align 4
 ENTRY(initial_code)
 	.long i386_start_kernel
-- 
cgit 


From f833bab87fca5c3ce13778421b1365845843b976 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 17 Aug 2009 14:34:59 -0700
Subject: clockevent: Prevent dead lock on clockevents_lock

Currently clockevents_notify() is called with interrupts enabled at
some places and interrupts disabled at some other places.

This results in a deadlock in this scenario.

cpu A holds clockevents_lock in clockevents_notify() with irqs enabled
cpu B waits for clockevents_lock in clockevents_notify() with irqs disabled
cpu C doing set_mtrr() which will try to rendezvous of all the cpus.

This will result in C and A come to the rendezvous point and waiting
for B. B is stuck forever waiting for the spinlock and thus not
reaching the rendezvous point.

Fix the clockevents code so that clockevents_lock is taken with
interrupts disabled and thus avoid the above deadlock.

Also call lapic_timer_propagate_broadcast() on the destination cpu so
that we avoid calling smp_call_function() in the clockevents notifier
chain.

This issue left us wondering if we need to change the MTRR rendezvous
logic to use stop machine logic (instead of smp_call_function) or add
a check in spinlock debug code to see if there are other spinlocks
which gets taken under both interrupts enabled/disabled conditions.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: "Pallipadi Venkatesh" <venkatesh.pallipadi@intel.com>
Cc: "Brown Len" <len.brown@intel.com>
LKML-Reference: <1250544899.2709.210.camel@sbs-t61.sc.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 994dd6a4a2a..071166a4ba8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -519,16 +519,12 @@ static void c1e_idle(void)
 		if (!cpumask_test_cpu(cpu, c1e_mask)) {
 			cpumask_set_cpu(cpu, c1e_mask);
 			/*
-			 * Force broadcast so ACPI can not interfere. Needs
-			 * to run with interrupts enabled as it uses
-			 * smp_function_call.
+			 * Force broadcast so ACPI can not interfere.
 			 */
-			local_irq_enable();
 			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
 					   &cpu);
 			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
 			       cpu);
-			local_irq_disable();
 		}
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
 
-- 
cgit 


From 5416c2663517ebd0be0664c4d4ce3df0b116c059 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 17 Aug 2009 12:25:41 -0700
Subject: x86: make sure load_percpu_segment has no stackprotector

load_percpu_segment() is used to set up the per-cpu segment registers,
which are also used for -fstack-protector.  Make sure that the
load_percpu_segment() function doesn't have stackprotector enabled.

[ Impact: allow percpu setup before calling stack-protected functions ]

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/kernel/cpu/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e..8b5b9b625ed 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -7,6 +7,10 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_common.o = -pg
 endif
 
+# Make sure load_percpu_segment has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_common.o		:= $(nostackp)
+
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
 obj-y			+= vmware.o hypervisor.o
-- 
cgit 


From 83d349f35e1ae72268c5104dbf9ab2ae635425d4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 21 Aug 2009 09:23:57 -0700
Subject: x86: don't send an IPI to the empty set of CPU's
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The default_send_IPI_mask_logical() function uses the "flat" APIC mode
to send an IPI to a set of CPU's at once, but if that set happens to be
empty, some older local APIC's will apparently be rather unhappy.  So
just warn if a caller gives us an empty mask, and ignore it.

This fixes a regression in 2.6.30.x, due to commit 4595f9620 ("x86:
change flush_tlb_others to take a const struct cpumask"), documented
here:

	http://bugzilla.kernel.org/show_bug.cgi?id=13933

which causes a silent lock-up.  It only seems to happen on PPro, P2, P3
and Athlon XP cores.  Most developers sadly (or not so sadly, if you're
a developer..) have more modern CPU's.  Also, on x86-64 we don't use the
flat APIC mode, so it would never trigger there even if the APIC didn't
like sending an empty IPI mask.

Reported-by: Pavel Vilim <wylda@volny.cz>
Reported-and-tested-by: Thomas Björnell <thomas.bjornell@gmail.com>
Reported-and-tested-by: Martin Rogge <marogge@onlinehome.de>
Cc: Mike Travis <travis@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/apic/ipi.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index dbf5445727a..6ef00ba4c88 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,6 +106,9 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
 	unsigned long mask = cpumask_bits(cpumask)[0];
 	unsigned long flags;
 
+	if (WARN_ONCE(!mask, "empty IPI mask"))
+		return;
+
 	local_irq_save(flags);
 	WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
 	__default_send_IPI_dest_field(mask, vector, apic->dest_logical);
-- 
cgit 


From c62e43202e7cf50ca24bce58b255df7bf5de69d0 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Tue, 25 Aug 2009 14:50:53 +0100
Subject: x86: Fix build with older binutils and consolidate linker script

binutils prior to 2.17 can't deal with the currently possible
situation of a new segment following the per-CPU segment, but
that new segment being empty - objcopy misplaces the .bss (and
perhaps also the .brk) sections outside of any segment.

However, the current ordering of sections really just appears
to be the effect of cumulative unrelated changes; re-ordering
things allows to easily guarantee that the segment following
the per-CPU one is non-empty, and at once eliminates the need
for the bogus data.init2 segment.

Once touching this code, also use the various data section
helper macros from include/asm-generic/vmlinux.lds.h.

-v2: fix !SMP builds.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Cc: <sam@ravnborg.org>
LKML-Reference: <4A94085D02000078000119A5@vpn.id2.novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S | 126 ++++++++++++++++--------------------------
 1 file changed, 47 insertions(+), 79 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 78d185d797d..9fc178255c0 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,11 +46,10 @@ PHDRS {
 	data PT_LOAD FLAGS(7);          /* RWE */
 #ifdef CONFIG_X86_64
 	user PT_LOAD FLAGS(7);          /* RWE */
-	data.init PT_LOAD FLAGS(7);     /* RWE */
 #ifdef CONFIG_SMP
 	percpu PT_LOAD FLAGS(7);        /* RWE */
 #endif
-	data.init2 PT_LOAD FLAGS(7);    /* RWE */
+	init PT_LOAD FLAGS(7);          /* RWE */
 #endif
 	note PT_NOTE FLAGS(0);          /* ___ */
 }
@@ -103,65 +102,43 @@ SECTIONS
 		__stop___ex_table = .;
 	} :text = 0x9090
 
-	RODATA
+	RO_DATA(PAGE_SIZE)
 
 	/* Data */
-	. = ALIGN(PAGE_SIZE);
 	.data : AT(ADDR(.data) - LOAD_OFFSET) {
 		/* Start of data section */
 		_sdata = .;
-		DATA_DATA
-		CONSTRUCTORS
-	} :data
+
+		/* init_task */
+		INIT_TASK_DATA(THREAD_SIZE)
 
 #ifdef CONFIG_X86_32
-	/* 32 bit has nosave before _edata */
-	. = ALIGN(PAGE_SIZE);
-	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-	}
+		/* 32 bit has nosave before _edata */
+		NOSAVE_DATA
 #endif
 
-	. = ALIGN(PAGE_SIZE);
-	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-		*(.data.page_aligned)
+		PAGE_ALIGNED_DATA(PAGE_SIZE)
 		*(.data.idt)
-	}
 
-#ifdef CONFIG_X86_32
-	. = ALIGN(32);
-#else
-	. = ALIGN(PAGE_SIZE);
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-#endif
-	.data.cacheline_aligned :
-		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-		*(.data.cacheline_aligned)
-	}
+		CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
 
-	/* rarely changed data like cpu maps */
-#ifdef CONFIG_X86_32
-	. = ALIGN(32);
-#else
-	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-#endif
-	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-		*(.data.read_mostly)
+		DATA_DATA
+		CONSTRUCTORS
+
+		/* rarely changed data like cpu maps */
+		READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
 
 		/* End of data section */
 		_edata = .;
-	}
+	} :data
 
 #ifdef CONFIG_X86_64
 
 #define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
+                            PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
+                            PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
 
 #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
 #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
@@ -227,35 +204,29 @@ SECTIONS
 
 #endif /* CONFIG_X86_64 */
 
-	/* init_task */
-	. = ALIGN(THREAD_SIZE);
-	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-		*(.data.init_task)
+	/* Init code and data - will be freed after init */
+	. = ALIGN(PAGE_SIZE);
+	.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+		__init_begin = .; /* paired with __init_end */
 	}
-#ifdef CONFIG_X86_64
-	 :data.init
-#endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
 	/*
-	 * smp_locks might be freed after init
-	 * start/end must be page aligned
+	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+	 * output PHDR, so the next output section - .init.text - should
+	 * start another segment - init.
 	 */
-	. = ALIGN(PAGE_SIZE);
-	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-		__smp_locks = .;
-		*(.smp_locks)
-		__smp_locks_end = .;
-		. = ALIGN(PAGE_SIZE);
-	}
+	PERCPU_VADDR(0, :percpu)
+#endif
 
-	/* Init code and data - will be freed after init */
-	. = ALIGN(PAGE_SIZE);
 	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		__init_begin = .; /* paired with __init_end */
 		_sinittext = .;
 		INIT_TEXT
 		_einittext = .;
 	}
+#ifdef CONFIG_X86_64
+	:init
+#endif
 
 	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
 		INIT_DATA
@@ -326,17 +297,7 @@ SECTIONS
 	}
 #endif
 
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
-	/*
-	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-	 * output PHDR, so the next output section - __data_nosave - should
-	 * start another section data.init2.  Also, pda should be at the head of
-	 * percpu area.  Preallocate it and define the percpu offset symbol
-	 * so that it can be accessed as a percpu variable.
-	 */
-	. = ALIGN(PAGE_SIZE);
-	PERCPU_VADDR(0, :percpu)
-#else
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
 	PERCPU(PAGE_SIZE)
 #endif
 
@@ -347,15 +308,22 @@ SECTIONS
 		__init_end = .;
 	}
 
+	/*
+	 * smp_locks might be freed after init
+	 * start/end must be page aligned
+	 */
+	. = ALIGN(PAGE_SIZE);
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+		. = ALIGN(PAGE_SIZE);
+	}
+
 #ifdef CONFIG_X86_64
 	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-	} :data.init2
-	/* use another section data.init2, see PERCPU_VADDR() above */
+		NOSAVE_DATA
+	}
 #endif
 
 	/* BSS */
-- 
cgit 


From 295594e9cf6ae2efd73371777aa8feba0f87f42f Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Tue, 25 Aug 2009 13:44:44 -0700
Subject: x86: Fix vSMP boot crash

2.6.31-rc7 does not boot on vSMP systems:

[    8.501108] CPU31: Thermal monitoring enabled (TM1)
[    8.501127] CPU 31 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8
[    8.650254] CPU31: Intel(R) Xeon(R) CPU           E5540  @ 2.53GHz stepping 04
[    8.710324] Brought up 32 CPUs
[    8.713916] Total of 32 processors activated (162314.96 BogoMIPS).
[    8.721489] ERROR: parent span is not a superset of domain->span
[    8.727686] ERROR: domain->groups does not contain CPU0
[    8.733091] ERROR: groups don't span domain->span
[    8.737975] ERROR: domain->cpu_power not set
[    8.742416]

Ravikiran Thirumalai bisected it to:

| commit 2759c3287de27266e06f1f4e82cbd2d65f6a044c
| x86: don't call read_apic_id if !cpu_has_apic

The problem is that on vSMP systems the CPUID derived
initial-APICIDs are overlapping - so we need to fall
back on hard_smp_processor_id() which reads the local
APIC.

Both come from the hardware (influenced by firmware
though) so it's a tough call which one to trust.

Doing the quirk expresses the vSMP property properly
and also does not affect other systems, so we go for
this solution instead of a revert.

Reported-and-Tested-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Shai Fultheim <shai@scalex86.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <4A944D3C.5030100@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/probe_64.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index bc3e880f9b8..fcec2f1d34a 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -44,6 +44,11 @@ static struct apic *apic_probe[] __initdata = {
 	NULL,
 };
 
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+	return hard_smp_processor_id() >> index_msb;
+}
+
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
@@ -69,6 +74,11 @@ void __init default_setup_apic_routing(void)
 		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 	}
 
+	if (is_vsmp_box()) {
+		/* need to update phys_pkg_id */
+		apic->phys_pkg_id = apicid_phys_pkg_id;
+	}
+
 	/*
 	 * Now that apic routing model is selected, configure the
 	 * fault handling for intr remapping.
-- 
cgit 


From e3e59876e82a5e1a07f365d5474e7b6943524725 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 14:02:10 +0200
Subject: x86/amd-iommu: Dump fault entry on DTE error

This patch adds code to dump the content of the device table
entry which caused an ILLEGAL_DEV_TABLE_ENTRY error from the
IOMMU hardware.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f503780..364c6de2637 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -138,6 +138,15 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
  *
  ****************************************************************************/
 
+static void dump_dte_entry(u16 devid)
+{
+	int i;
+
+	for (i = 0; i < 8; ++i)
+		pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
+			amd_iommu_dev_table[devid].data[i]);
+}
+
 static void iommu_print_event(void *__evt)
 {
 	u32 *event = __evt;
@@ -155,6 +164,7 @@ static void iommu_print_event(void *__evt)
 		       "address=0x%016llx flags=0x%04x]\n",
 		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
 		       address, flags);
+		dump_dte_entry(devid);
 		break;
 	case EVENT_TYPE_IO_FAULT:
 		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
-- 
cgit 


From 945b4ac44e5700acd3d974c176c8ace34b4d2e8e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 14:25:02 +0200
Subject: x86/amd-iommu: Dump illegal command on ILLEGAL_COMMAND_ERROR

This patch adds code to dump the command which caused an
ILLEGAL_COMMAND_ERROR raised by the IOMMU hardware.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 364c6de2637..e62b35f5df1 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -147,6 +147,15 @@ static void dump_dte_entry(u16 devid)
 			amd_iommu_dev_table[devid].data[i]);
 }
 
+static void dump_command(unsigned long phys_addr)
+{
+	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
+	int i;
+
+	for (i = 0; i < 4; ++i)
+		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
+}
+
 static void iommu_print_event(void *__evt)
 {
 	u32 *event = __evt;
@@ -186,6 +195,7 @@ static void iommu_print_event(void *__evt)
 		break;
 	case EVENT_TYPE_ILL_CMD:
 		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+		dump_command(address);
 		break;
 	case EVENT_TYPE_CMD_HARD_ERR:
 		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
-- 
cgit 


From e394d72aa8b319211b8f947d151d9d50b0fde842 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 15:28:33 +0200
Subject: x86/amd-iommu: Introduce function for iommu-local domain flush

This patch introduces a function to flush all domain tlbs
for on one given IOMMU. This is required later to reset the
command buffer on one IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 49 ++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index e62b35f5df1..64cc582feb9 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -465,38 +465,55 @@ static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid)
 }
 
 /*
- * This function is used to flush the IO/TLB for a given protection domain
- * on every IOMMU in the system
+ * This function flushes one domain on one IOMMU
  */
-static void iommu_flush_domain(u16 domid)
+static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid)
 {
-	unsigned long flags;
-	struct amd_iommu *iommu;
 	struct iommu_cmd cmd;
-
-	INC_STATS_COUNTER(domain_flush_all);
+	unsigned long flags;
 
 	__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
 				      domid, 1, 1);
 
-	for_each_iommu(iommu) {
-		spin_lock_irqsave(&iommu->lock, flags);
-		__iommu_queue_command(iommu, &cmd);
-		__iommu_completion_wait(iommu);
-		__iommu_wait_for_completion(iommu);
-		spin_unlock_irqrestore(&iommu->lock, flags);
-	}
+	spin_lock_irqsave(&iommu->lock, flags);
+	__iommu_queue_command(iommu, &cmd);
+	__iommu_completion_wait(iommu);
+	__iommu_wait_for_completion(iommu);
+	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
-void amd_iommu_flush_all_domains(void)
+static void flush_all_domains_on_iommu(struct amd_iommu *iommu)
 {
 	int i;
 
 	for (i = 1; i < MAX_DOMAIN_ID; ++i) {
 		if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
 			continue;
-		iommu_flush_domain(i);
+		flush_domain_on_iommu(iommu, i);
 	}
+
+}
+
+/*
+ * This function is used to flush the IO/TLB for a given protection domain
+ * on every IOMMU in the system
+ */
+static void iommu_flush_domain(u16 domid)
+{
+	struct amd_iommu *iommu;
+
+	INC_STATS_COUNTER(domain_flush_all);
+
+	for_each_iommu(iommu)
+		flush_domain_on_iommu(iommu, domid);
+}
+
+void amd_iommu_flush_all_domains(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		flush_all_domains_on_iommu(iommu);
 }
 
 void amd_iommu_flush_all_devices(void)
-- 
cgit 


From f2430bd104bec2706315e9e983a9d9f828ff9565 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 26 Aug 2009 12:10:19 +0200
Subject: x86/amd-iommu: Remove some merge helper code

This patch removes some left-overs which where put into the code to
simplify merging code which also depends on changes in other trees.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f503780..70fdef54e06 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -41,9 +41,7 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 static LIST_HEAD(iommu_pd_list);
 static DEFINE_SPINLOCK(iommu_pd_list_lock);
 
-#ifdef CONFIG_IOMMU_API
 static struct iommu_ops amd_iommu_ops;
-#endif
 
 /*
  * general struct to manage commands send to an IOMMU
@@ -62,10 +60,6 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages);
 
-#ifndef BUS_NOTIFY_UNBOUND_DRIVER
-#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
-#endif
-
 #ifdef CONFIG_AMD_IOMMU_STATS
 
 /*
-- 
cgit 


From 4c6f40d4e0f0bba77a5f27eec4e1c6d1c457d324 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 1 Sep 2009 16:43:58 +0200
Subject: x86/amd-iommu: replace "AMD IOMMU" by "AMD-Vi"

This patch replaces the "AMD IOMMU" printk strings with the
official name for the hardware: "AMD-Vi".

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c      |  2 +-
 arch/x86/kernel/amd_iommu_init.c | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 70fdef54e06..3e62d783652 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -141,7 +141,7 @@ static void iommu_print_event(void *__evt)
 	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
 	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
 
-	printk(KERN_ERR "AMD IOMMU: Event logged [");
+	printk(KERN_ERR "AMD-Vi: Event logged [");
 
 	switch (type) {
 	case EVENT_TYPE_ILL_DEV:
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252..169958ad624 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,7 +252,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 /* Function to enable the hardware */
 static void iommu_enable(struct amd_iommu *iommu)
 {
-	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+	printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
 	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
 
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
@@ -902,7 +902,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 
 	r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
 			IRQF_SAMPLE_RANDOM,
-			"AMD IOMMU",
+			"AMD-Vi",
 			NULL);
 
 	if (r) {
@@ -1150,7 +1150,7 @@ int __init amd_iommu_init(void)
 
 
 	if (no_iommu) {
-		printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+		printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
 		return 0;
 	}
 
@@ -1248,16 +1248,16 @@ int __init amd_iommu_init(void)
 
 	enable_iommus();
 
-	printk(KERN_INFO "AMD IOMMU: device isolation ");
+	printk(KERN_INFO "AMD-Vi: device isolation ");
 	if (amd_iommu_isolate)
 		printk("enabled\n");
 	else
 		printk("disabled\n");
 
 	if (amd_iommu_unmap_flush)
-		printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
+		printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
 	else
-		printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
+		printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
 
 out:
 	return ret;
-- 
cgit 


From ae908c22aa2b9f7d4b41bd02d14e473f79c22dd3 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 1 Sep 2009 16:52:16 +0200
Subject: x86/amd-iommu: Remove redundant 'IOMMU' string

The 'IOMMU: ' prefix is not necessary because the
DUMP_printk macro already prints its own prefix.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 169958ad624..264b3ef9dd6 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -858,7 +858,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 		switch (*p) {
 		case ACPI_IVHD_TYPE:
 
-			DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+			DUMP_printk("device: %02x:%02x.%01x cap: %04x "
 				    "seg: %d flags: %01x info %04x\n",
 				    PCI_BUS(h->devid), PCI_SLOT(h->devid),
 				    PCI_FUNC(h->devid), h->cap_ptr,
-- 
cgit 


From e0faf54ee82bf9c07f0307b4391caad4020bd659 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 15:45:51 +0200
Subject: x86/amd-iommu: fix broken check in amd_iommu_flush_all_devices

The amd_iommu_pd_table is indexed by protection domain
number and not by device id. So this check is broken and
must be removed.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 3e62d783652..009d722af00 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -479,8 +479,6 @@ void amd_iommu_flush_all_devices(void)
 	int i;
 
 	for (i = 0; i <= amd_iommu_last_bdf; ++i) {
-		if (amd_iommu_pd_table[i] == NULL)
-			continue;
 
 		iommu = amd_iommu_rlookup_table[i];
 		if (!iommu)
-- 
cgit 


From d586d7852ccd0cecb502bf4809f827e60c486af0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 15:39:23 +0200
Subject: x86/amd-iommu: Add function to flush all DTEs on one IOMMU

This function flushes all DTE entries on one IOMMU for all
devices behind this IOMMU. This is required for command
buffer resetting later.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 64cc582feb9..2dc093370d2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -516,6 +516,19 @@ void amd_iommu_flush_all_domains(void)
 		flush_all_domains_on_iommu(iommu);
 }
 
+static void flush_all_devices_for_iommu(struct amd_iommu *iommu)
+{
+	int i;
+
+	for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+		if (iommu != amd_iommu_rlookup_table[i])
+			continue;
+
+		iommu_queue_inv_dev_entry(iommu, i);
+		iommu_completion_wait(iommu);
+	}
+}
+
 void amd_iommu_flush_all_devices(void)
 {
 	struct amd_iommu *iommu;
-- 
cgit 


From 93f1cc67cf3196174412adca87321b25c1c986b0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 14:50:20 +0200
Subject: x86/amd-iommu: Add reset function for command buffers

This patch factors parts of the command buffer
initialization code into a seperate function which can be
used to reset the command buffer later.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252..1752afef948 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -434,6 +434,20 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 	return cmd_buf;
 }
 
+/*
+ * This function resets the command buffer if the IOMMU stopped fetching
+ * commands from it.
+ */
+void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
+{
+	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
+
+	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+}
+
 /*
  * This function writes the command buffer address to the hardware and
  * enables it.
@@ -450,11 +464,7 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
 	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
 		    &entry, sizeof(entry));
 
-	/* set head and tail to zero manually */
-	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
-	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+	amd_iommu_reset_cmd_buffer(iommu);
 }
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
-- 
cgit 


From a345b23b79f1900e7d87c3165182504419180de4 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 15:01:43 +0200
Subject: x86/amd-iommu: Reset command buffer on ILLEGAL_COMMAND_ERROR

On an ILLEGAL_COMMAND_ERROR the IOMMU stops executing
further commands. This patch changes the code to handle this
case better by resetting the command buffer in the IOMMU.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 2dc093370d2..2333d615f5e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -61,6 +61,7 @@ static u64* alloc_pte(struct protection_domain *dom,
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages);
+static void reset_iommu_command_buffer(struct amd_iommu *iommu);
 
 #ifndef BUS_NOTIFY_UNBOUND_DRIVER
 #define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
@@ -156,7 +157,7 @@ static void dump_command(unsigned long phys_addr)
 		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
 }
 
-static void iommu_print_event(void *__evt)
+static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
 {
 	u32 *event = __evt;
 	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
@@ -195,6 +196,7 @@ static void iommu_print_event(void *__evt)
 		break;
 	case EVENT_TYPE_ILL_CMD:
 		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+		reset_iommu_command_buffer(iommu);
 		dump_command(address);
 		break;
 	case EVENT_TYPE_CMD_HARD_ERR:
@@ -229,7 +231,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
 	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
 
 	while (head != tail) {
-		iommu_print_event(iommu->evt_buf + head);
+		iommu_print_event(iommu, iommu->evt_buf + head);
 		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
 	}
 
@@ -547,6 +549,15 @@ void amd_iommu_flush_all_devices(void)
 	}
 }
 
+static void reset_iommu_command_buffer(struct amd_iommu *iommu)
+{
+	pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
+
+	amd_iommu_reset_cmd_buffer(iommu);
+	flush_all_devices_for_iommu(iommu);
+	flush_all_domains_on_iommu(iommu);
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
-- 
cgit 


From b26e81b871bd18184968f0bb3f12945906eadfce Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 15:08:09 +0200
Subject: x86/amd-iommu: Panic if IOMMU command buffer reset fails

To prevent the driver from doing recursive command buffer
resets, just panic when that recursion happens.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 2333d615f5e..b62a2f64dfc 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -553,9 +553,16 @@ static void reset_iommu_command_buffer(struct amd_iommu *iommu)
 {
 	pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
 
+	if (iommu->reset_in_progress)
+		panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+
+	iommu->reset_in_progress = true;
+
 	amd_iommu_reset_cmd_buffer(iommu);
 	flush_all_devices_for_iommu(iommu);
 	flush_all_domains_on_iommu(iommu);
+
+	iommu->reset_in_progress = false;
 }
 
 /****************************************************************************
-- 
cgit 


From 6a1eddd2f951656a6abbd42e2cddc2267c4a639d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 15:15:10 +0200
Subject: x86/amd-iommu: Reset command buffer if wait loop fails

Instead of a panic on an comletion wait loop failure, try to
recover from that event from resetting the command buffer.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index b62a2f64dfc..cfca80bfe75 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -318,8 +318,11 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
 	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
 	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
 
-	if (unlikely(i == EXIT_LOOP_COUNT))
-		panic("AMD IOMMU: Completion wait loop failed\n");
+	if (unlikely(i == EXIT_LOOP_COUNT)) {
+		spin_unlock(&iommu->lock);
+		reset_iommu_command_buffer(iommu);
+		spin_lock(&iommu->lock);
+	}
 }
 
 /*
-- 
cgit 


From 9355a08186e52b7c120adea91c984923b54efa10 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 14:24:08 +0200
Subject: x86/amd-iommu: Make fetch_pte aware of dynamic mapping levels

This patch changes the fetch_pte function in the AMD IOMMU
driver to support dynamic mapping levels.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f503780..29bcd358b6c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -61,6 +61,8 @@ static u64* alloc_pte(struct protection_domain *dom,
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages);
+static u64 *fetch_pte(struct protection_domain *domain,
+		      unsigned long address);
 
 #ifndef BUS_NOTIFY_UNBOUND_DRIVER
 #define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
@@ -670,24 +672,24 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  * This function checks if there is a PTE for a given dma address. If
  * there is one, it returns the pointer to it.
  */
-static u64* fetch_pte(struct protection_domain *domain,
+static u64 *fetch_pte(struct protection_domain *domain,
 		      unsigned long address)
 {
+	int level;
 	u64 *pte;
 
-	pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+	level =  domain->mode - 1;
+	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-	if (!IOMMU_PTE_PRESENT(*pte))
-		return NULL;
+	while (level > 0) {
+		if (!IOMMU_PTE_PRESENT(*pte))
+			return NULL;
 
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+		level -= 1;
 
-	if (!IOMMU_PTE_PRESENT(*pte))
-		return NULL;
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+		pte = IOMMU_PTE_PAGE(*pte);
+		pte = &pte[PM_LEVEL_INDEX(level, address)];
+	}
 
 	return pte;
 }
-- 
cgit 


From 38a76eeeafb251bf67d143a34b37a8105cff302e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 17:02:47 +0200
Subject: x86/amd-iommu: Use fetch_pte in iommu_unmap_page

Instead of reimplementing existing logic use fetch_pte to
walk the page table in iommu_unmap_page.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 29bcd358b6c..5e527989999 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -546,23 +546,10 @@ static int iommu_map_page(struct protection_domain *dom,
 static void iommu_unmap_page(struct protection_domain *dom,
 			     unsigned long bus_addr)
 {
-	u64 *pte;
-
-	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
-	if (!IOMMU_PTE_PRESENT(*pte))
-		return;
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
-	if (!IOMMU_PTE_PRESENT(*pte))
-		return;
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+	u64 *pte = fetch_pte(dom, bus_addr);
 
-	*pte = 0;
+	if (pte)
+		*pte = 0;
 }
 
 /*
-- 
cgit 


From a6d41a4027b758a9473197a78fab45afb31003aa Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 17:08:55 +0200
Subject: x86/amd-iommu: Use fetch_pte in amd_iommu_iova_to_phys

Don't reimplement the page table walker in this function.
Use the generic one.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 5e527989999..4a54366d422 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2140,21 +2140,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 	phys_addr_t paddr;
 	u64 *pte;
 
-	pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
+	pte = fetch_pte(domain, iova);
 
-	if (!IOMMU_PTE_PRESENT(*pte))
-		return 0;
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
-
-	if (!IOMMU_PTE_PRESENT(*pte))
-		return 0;
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
-
-	if (!IOMMU_PTE_PRESENT(*pte))
+	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
 
 	paddr  = *pte & IOMMU_PAGE_MASK;
-- 
cgit 


From 6a0dbcbe4e612fbc9d73cd4dde8ebef19295058a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 15:41:59 +0200
Subject: x86/amd-iommu: Add a gneric version of amd_iommu_flush_all_devices

This patch adds a generic variant of
amd_iommu_flush_all_devices function which flushes only the
DTEs for a given protection domain.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 4a54366d422..5265dd17eb5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -481,13 +481,14 @@ void amd_iommu_flush_all_domains(void)
 	}
 }
 
-void amd_iommu_flush_all_devices(void)
+static void flush_devices_by_domain(struct protection_domain *domain)
 {
 	struct amd_iommu *iommu;
 	int i;
 
 	for (i = 0; i <= amd_iommu_last_bdf; ++i) {
-		if (amd_iommu_pd_table[i] == NULL)
+		if ((domain == NULL && amd_iommu_pd_table[i] == NULL) ||
+		    (amd_iommu_pd_table[i] != domain))
 			continue;
 
 		iommu = amd_iommu_rlookup_table[i];
@@ -499,6 +500,11 @@ void amd_iommu_flush_all_devices(void)
 	}
 }
 
+void amd_iommu_flush_all_devices(void)
+{
+	flush_devices_by_domain(NULL);
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
-- 
cgit 


From 407d733e30a97daf5ea6f9eb5f9ebbd42a0a9ef2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 16:07:00 +0200
Subject: x86/amd-iommu: Introduce set_dte_entry function

This function factors out some logic of attach_device to a
seperate function. This new function will be used to update
device table entries when necessary.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 5265dd17eb5..0fab1f1d135 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1058,18 +1058,10 @@ static struct protection_domain *domain_for_device(u16 devid)
 	return dom;
 }
 
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static void attach_device(struct amd_iommu *iommu,
-			  struct protection_domain *domain,
-			  u16 devid)
+static void set_dte_entry(u16 devid, struct protection_domain *domain)
 {
-	unsigned long flags;
 	u64 pte_root = virt_to_phys(domain->pt_root);
-
-	domain->dev_cnt += 1;
+	unsigned long flags;
 
 	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
 		    << DEV_ENTRY_MODE_SHIFT;
@@ -1082,6 +1074,21 @@ static void attach_device(struct amd_iommu *iommu,
 
 	amd_iommu_pd_table[devid] = domain;
 	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+/*
+ * If a device is not yet associated with a domain, this function does
+ * assigns it visible for the hardware
+ */
+static void attach_device(struct amd_iommu *iommu,
+			  struct protection_domain *domain,
+			  u16 devid)
+{
+	/* set the DTE entry */
+	set_dte_entry(devid, domain);
+
+	/* increase reference counter */
+	domain->dev_cnt += 1;
 
        /*
         * We might boot into a crash-kernel here. The crashed kernel
-- 
cgit 


From 04bfdd8406099fca2e6b8844748c4d6c5eba8c8d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 16:00:23 +0200
Subject: x86/amd-iommu: Flush domains if address space size was increased

Thist patch introduces the update_domain function which
propagates the larger address space of a protection domain
to the device table and flushes all relevant DTEs and the
domain TLB.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0fab1f1d135..5eab6a84b9c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -63,6 +63,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned int pages);
 static u64 *fetch_pte(struct protection_domain *domain,
 		      unsigned long address);
+static void update_domain(struct protection_domain *domain);
 
 #ifndef BUS_NOTIFY_UNBOUND_DRIVER
 #define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
@@ -546,6 +547,8 @@ static int iommu_map_page(struct protection_domain *dom,
 
 	*pte = __pte;
 
+	update_domain(dom);
+
 	return 0;
 }
 
@@ -762,9 +765,13 @@ static int alloc_new_range(struct amd_iommu *iommu,
 		dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
 	}
 
+	update_domain(&dma_dom->domain);
+
 	return 0;
 
 out_free:
+	update_domain(&dma_dom->domain);
+
 	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
 
 	kfree(dma_dom->aperture[index]);
@@ -1294,6 +1301,29 @@ static int get_device_resources(struct device *dev,
 	return 1;
 }
 
+static void update_device_table(struct protection_domain *domain)
+{
+	int i;
+
+	for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+		if (amd_iommu_pd_table[i] != domain)
+			continue;
+		set_dte_entry(i, domain);
+	}
+}
+
+static void update_domain(struct protection_domain *domain)
+{
+	if (!domain->updated)
+		return;
+
+	update_device_table(domain);
+	flush_devices_by_domain(domain);
+	iommu_flush_domain(domain->id);
+
+	domain->updated = false;
+}
+
 /*
  * If the pte_page is not yet allocated this function is called
  */
@@ -1351,6 +1381,8 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 	} else
 		pte += IOMMU_PTE_L0_INDEX(address);
 
+	update_domain(&dom->domain);
+
 	return pte;
 }
 
-- 
cgit 


From 50020fb6324465e478d6c8cdbf3c695f0a60358d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 15:38:40 +0200
Subject: x86/amd-iommu: Introduce increase_address_space function

This function will be used to increase the address space
size of a protection domain.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 5eab6a84b9c..fc97b51f028 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1324,6 +1324,33 @@ static void update_domain(struct protection_domain *domain)
 	domain->updated = false;
 }
 
+/*
+ * This function is used to add another level to an IO page table. Adding
+ * another level increases the size of the address space by 9 bits to a size up
+ * to 64 bits.
+ */
+static bool increase_address_space(struct protection_domain *domain,
+				   gfp_t gfp)
+{
+	u64 *pte;
+
+	if (domain->mode == PAGE_MODE_6_LEVEL)
+		/* address space already 64 bit large */
+		return false;
+
+	pte = (void *)get_zeroed_page(gfp);
+	if (!pte)
+		return false;
+
+	*pte             = PM_LEVEL_PDE(domain->mode,
+					virt_to_phys(domain->pt_root));
+	domain->pt_root  = pte;
+	domain->mode    += 1;
+	domain->updated  = true;
+
+	return true;
+}
+
 /*
  * If the pte_page is not yet allocated this function is called
  */
-- 
cgit 


From 8bc3e127421bf3b735edbde05135892c12c5f615 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 16:48:40 +0200
Subject: x86/amd-iommu: Change alloc_pte to support 64 bit address space

This patch changes the alloc_pte function to be able to map
pages into the whole 64 bit address space supported by AMD
IOMMU hardware from the old limit of 2**39 bytes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index fc97b51f028..3be2b61fc31 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,7 @@ struct iommu_cmd {
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 static struct dma_ops_domain *find_protection_domain(u16 devid);
-static u64* alloc_pte(struct protection_domain *dom,
+static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address, u64
 		      **pte_page, gfp_t gfp);
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
@@ -1351,39 +1351,35 @@ static bool increase_address_space(struct protection_domain *domain,
 	return true;
 }
 
-/*
- * If the pte_page is not yet allocated this function is called
- */
-static u64* alloc_pte(struct protection_domain *dom,
+static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address, u64 **pte_page, gfp_t gfp)
 {
 	u64 *pte, *page;
+	int level;
 
-	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+	while (address > PM_LEVEL_SIZE(domain->mode))
+		increase_address_space(domain, gfp);
 
-	if (!IOMMU_PTE_PRESENT(*pte)) {
-		page = (u64 *)get_zeroed_page(gfp);
-		if (!page)
-			return NULL;
-		*pte = IOMMU_L2_PDE(virt_to_phys(page));
-	}
+	level =  domain->mode - 1;
+	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+	while (level > 0) {
+		if (!IOMMU_PTE_PRESENT(*pte)) {
+			page = (u64 *)get_zeroed_page(gfp);
+			if (!page)
+				return NULL;
+			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
+		}
 
-	if (!IOMMU_PTE_PRESENT(*pte)) {
-		page = (u64 *)get_zeroed_page(gfp);
-		if (!page)
-			return NULL;
-		*pte = IOMMU_L1_PDE(virt_to_phys(page));
-	}
+		level -= 1;
 
-	pte = IOMMU_PTE_PAGE(*pte);
+		pte = IOMMU_PTE_PAGE(*pte);
 
-	if (pte_page)
-		*pte_page = pte;
+		if (pte_page && level == 0)
+			*pte_page = pte;
 
-	pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+		pte = &pte[PM_LEVEL_INDEX(level, address)];
+	}
 
 	return pte;
 }
-- 
cgit 


From 8c8c143cdc95ebe50fd962917556e25e8912997b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 17:30:00 +0200
Subject: x86/amd-iommu: Remove last usages of IOMMU_PTE_L0_INDEX

This change allows to remove these old macros later.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 3be2b61fc31..ebc1c844392 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1402,7 +1402,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 		pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
 		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
 	} else
-		pte += IOMMU_PTE_L0_INDEX(address);
+		pte += PM_LEVEL_INDEX(0, address);
 
 	update_domain(&dom->domain);
 
@@ -1466,7 +1466,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 	if (!pte)
 		return;
 
-	pte += IOMMU_PTE_L0_INDEX(address);
+	pte += PM_LEVEL_INDEX(0, address);
 
 	WARN_ON(!*pte);
 
-- 
cgit 


From bad1cac28a707c69301a4d0612da9ccbecd51953 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 16:52:23 +0200
Subject: x86/amd-iommu: Remove bus_addr check in iommu_map_page

The driver now supports full 64 bit device address spaces.
So this check is not longer required.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index ebc1c844392..6ffb3e63765 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -530,8 +530,7 @@ static int iommu_map_page(struct protection_domain *dom,
 	bus_addr  = PAGE_ALIGN(bus_addr);
 	phys_addr = PAGE_ALIGN(phys_addr);
 
-	/* only support 512GB address spaces for now */
-	if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+	if (!(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
 	pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
-- 
cgit 


From 8f7a017ce05ed4522809448e169daa44fe6edeb1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 2 Sep 2009 16:55:24 +0200
Subject: x86/amd-iommu: Use 2-level page tables for dma_ops domains

The driver now supports a dynamic number of levels for IO
page tables. This allows to reduce the number of levels for
dma_ops domains by one because a dma_ops domain has usually
an address space size between 128MB and 4G.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6ffb3e63765..addf6588c36 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1010,7 +1010,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
 	dma_dom->domain.id = domain_id_alloc();
 	if (dma_dom->domain.id == 0)
 		goto free_dma_dom;
-	dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
 	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
 	dma_dom->domain.flags = PD_DMA_OPS_MASK;
 	dma_dom->domain.priv = dma_dom;
-- 
cgit 


From a6b256b41357c33ccb2d105a4457e22bdc83e021 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 12:21:31 +0200
Subject: x86/amd-iommu: Support higher level PTEs in iommu_page_unmap

This patch changes fetch_pte and iommu_page_unmap to support
different page sizes too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index addf6588c36..002cf9cab9e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -62,7 +62,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages);
 static u64 *fetch_pte(struct protection_domain *domain,
-		      unsigned long address);
+		      unsigned long address, int map_size);
 static void update_domain(struct protection_domain *domain);
 
 #ifndef BUS_NOTIFY_UNBOUND_DRIVER
@@ -552,9 +552,9 @@ static int iommu_map_page(struct protection_domain *dom,
 }
 
 static void iommu_unmap_page(struct protection_domain *dom,
-			     unsigned long bus_addr)
+			     unsigned long bus_addr, int map_size)
 {
-	u64 *pte = fetch_pte(dom, bus_addr);
+	u64 *pte = fetch_pte(dom, bus_addr, map_size);
 
 	if (pte)
 		*pte = 0;
@@ -668,7 +668,7 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  * there is one, it returns the pointer to it.
  */
 static u64 *fetch_pte(struct protection_domain *domain,
-		      unsigned long address)
+		      unsigned long address, int map_size)
 {
 	int level;
 	u64 *pte;
@@ -676,7 +676,7 @@ static u64 *fetch_pte(struct protection_domain *domain,
 	level =  domain->mode - 1;
 	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-	while (level > 0) {
+	while (level > map_size) {
 		if (!IOMMU_PTE_PRESENT(*pte))
 			return NULL;
 
@@ -684,6 +684,11 @@ static u64 *fetch_pte(struct protection_domain *domain,
 
 		pte = IOMMU_PTE_PAGE(*pte);
 		pte = &pte[PM_LEVEL_INDEX(level, address)];
+
+		if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
+			pte = NULL;
+			break;
+		}
 	}
 
 	return pte;
@@ -757,7 +762,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
 	for (i = dma_dom->aperture[index]->offset;
 	     i < dma_dom->aperture_size;
 	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i);
+		u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
 		if (!pte || !IOMMU_PTE_PRESENT(*pte))
 			continue;
 
@@ -2192,7 +2197,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 	iova  &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova);
+		iommu_unmap_page(domain, iova, PM_MAP_4k);
 		iova  += PAGE_SIZE;
 	}
 
@@ -2207,7 +2212,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 	phys_addr_t paddr;
 	u64 *pte;
 
-	pte = fetch_pte(domain, iova);
+	pte = fetch_pte(domain, iova, PM_MAP_4k);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
-- 
cgit 


From abdc5eb3d69279039ba6cb89719913d08013ab14 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 3 Sep 2009 11:33:51 +0200
Subject: x86/amd-iommu: Change iommu_map_page to support multiple page sizes

This patch adds a map_size parameter to the iommu_map_page
function which makes it generic enough to handle multiple
page sizes. This also requires a change to alloc_pte which
is also done in this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 002cf9cab9e..45be9499c97 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -56,8 +56,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 static struct dma_ops_domain *find_protection_domain(u16 devid);
 static u64 *alloc_pte(struct protection_domain *domain,
-		      unsigned long address, u64
-		      **pte_page, gfp_t gfp);
+		      unsigned long address, int end_lvl,
+		      u64 **pte_page, gfp_t gfp);
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages);
@@ -523,17 +523,21 @@ void amd_iommu_flush_all_devices(void)
 static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long bus_addr,
 			  unsigned long phys_addr,
-			  int prot)
+			  int prot,
+			  int map_size)
 {
 	u64 __pte, *pte;
 
 	bus_addr  = PAGE_ALIGN(bus_addr);
 	phys_addr = PAGE_ALIGN(phys_addr);
 
+	BUG_ON(!PM_ALIGNED(map_size, bus_addr));
+	BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+
 	if (!(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
+	pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
 
 	if (IOMMU_PTE_PRESENT(*pte))
 		return -EBUSY;
@@ -612,7 +616,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 
 	for (addr = e->address_start; addr < e->address_end;
 	     addr += PAGE_SIZE) {
-		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
+		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
+				     PM_MAP_4k);
 		if (ret)
 			return ret;
 		/*
@@ -729,7 +734,7 @@ static int alloc_new_range(struct amd_iommu *iommu,
 		u64 *pte, *pte_page;
 
 		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address,
+			pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
 					&pte_page, gfp);
 			if (!pte)
 				goto out_free;
@@ -1356,7 +1361,10 @@ static bool increase_address_space(struct protection_domain *domain,
 }
 
 static u64 *alloc_pte(struct protection_domain *domain,
-		      unsigned long address, u64 **pte_page, gfp_t gfp)
+		      unsigned long address,
+		      int end_lvl,
+		      u64 **pte_page,
+		      gfp_t gfp)
 {
 	u64 *pte, *page;
 	int level;
@@ -1367,7 +1375,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
 	level =  domain->mode - 1;
 	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-	while (level > 0) {
+	while (level > end_lvl) {
 		if (!IOMMU_PTE_PRESENT(*pte)) {
 			page = (u64 *)get_zeroed_page(gfp);
 			if (!page)
@@ -1379,7 +1387,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
 
 		pte = IOMMU_PTE_PAGE(*pte);
 
-		if (pte_page && level == 0)
+		if (pte_page && level == end_lvl)
 			*pte_page = pte;
 
 		pte = &pte[PM_LEVEL_INDEX(level, address)];
@@ -1403,7 +1411,8 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 
 	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+		pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+				GFP_ATOMIC);
 		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
 	} else
 		pte += PM_LEVEL_INDEX(0, address);
@@ -2176,7 +2185,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 	paddr &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot);
+		ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
 		if (ret)
 			return ret;
 
-- 
cgit 


From ac0101d396fee24994632f71b55b9f7f9ee16eff Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 1 Sep 2009 16:00:35 +0200
Subject: x86/dma: Mark iommu_pass_through as __read_mostly

This variable is read most of the time. This patch marks it
as such. It also documents the meaning the this variable
while at it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/pci-dma.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 1a041bcf506..873aa079d16 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -32,7 +32,14 @@ int no_iommu __read_mostly;
 /* Set this to 1 if there is a HW IOMMU in the system */
 int iommu_detected __read_mostly = 0;
 
-int iommu_pass_through;
+/*
+ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
+ * If this variable is 1, IOMMU implementations do no DMA ranslation for
+ * devices and allow every device to access to whole physical memory. This is
+ * useful if a user want to use an IOMMU only for KVM device assignment to
+ * guests and not for driver dma translation.
+ */
+int iommu_pass_through __read_mostly;
 
 dma_addr_t bad_dma_address __read_mostly = 0;
 EXPORT_SYMBOL(bad_dma_address);
-- 
cgit 


From 2650815fb03fe2bf1e6701584087ba669dcf92cd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 26 Aug 2009 16:52:40 +0200
Subject: x86/amd-iommu: Add core functions for pd allocation/freeing

This patch factors some code of protection domain allocation
into seperate functions. This way the logic can be used to
allocate the passthrough domain later. As a side effect this
patch fixes an unlikely domain id leakage bug.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 6c99f503780..0934348abfa 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1988,19 +1988,47 @@ static void cleanup_domain(struct protection_domain *domain)
 	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 }
 
-static int amd_iommu_domain_init(struct iommu_domain *dom)
+static void protection_domain_free(struct protection_domain *domain)
+{
+	if (!domain)
+		return;
+
+	if (domain->id)
+		domain_id_free(domain->id);
+
+	kfree(domain);
+}
+
+static struct protection_domain *protection_domain_alloc(void)
 {
 	struct protection_domain *domain;
 
 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 	if (!domain)
-		return -ENOMEM;
+		return NULL;
 
 	spin_lock_init(&domain->lock);
-	domain->mode = PAGE_MODE_3_LEVEL;
 	domain->id = domain_id_alloc();
 	if (!domain->id)
+		goto out_err;
+
+	return domain;
+
+out_err:
+	kfree(domain);
+
+	return NULL;
+}
+
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+	struct protection_domain *domain;
+
+	domain = protection_domain_alloc();
+	if (!domain)
 		goto out_free;
+
+	domain->mode    = PAGE_MODE_3_LEVEL;
 	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!domain->pt_root)
 		goto out_free;
@@ -2010,7 +2038,7 @@ static int amd_iommu_domain_init(struct iommu_domain *dom)
 	return 0;
 
 out_free:
-	kfree(domain);
+	protection_domain_free(domain);
 
 	return -ENOMEM;
 }
-- 
cgit 


From 0feae533ddebe02cda6ccce5cac7349b446776a8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 26 Aug 2009 15:26:30 +0200
Subject: x86/amd-iommu: Add passthrough mode initialization functions

When iommu=pt is passed on kernel command line the devices
should run untranslated. This requires the allocation of a
special domain for that purpose. This patch implements the
allocation and initialization path for iommu=pt.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 72 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 64 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0934348abfa..7987f20499a 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -41,6 +41,12 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 static LIST_HEAD(iommu_pd_list);
 static DEFINE_SPINLOCK(iommu_pd_list_lock);
 
+/*
+ * Domain for untranslated devices - only allocated
+ * if iommu=pt passed on kernel cmd line.
+ */
+static struct protection_domain *pt_domain;
+
 #ifdef CONFIG_IOMMU_API
 static struct iommu_ops amd_iommu_ops;
 #endif
@@ -1067,9 +1073,9 @@ static struct protection_domain *domain_for_device(u16 devid)
  * If a device is not yet associated with a domain, this function does
  * assigns it visible for the hardware
  */
-static void attach_device(struct amd_iommu *iommu,
-			  struct protection_domain *domain,
-			  u16 devid)
+static void __attach_device(struct amd_iommu *iommu,
+			    struct protection_domain *domain,
+			    u16 devid)
 {
 	unsigned long flags;
 	u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1087,12 +1093,19 @@ static void attach_device(struct amd_iommu *iommu,
 
 	amd_iommu_pd_table[devid] = domain;
 	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
 
-       /*
-        * We might boot into a crash-kernel here. The crashed kernel
-        * left the caches in the IOMMU dirty. So we have to flush
-        * here to evict all dirty stuff.
-        */
+static void attach_device(struct amd_iommu *iommu,
+			  struct protection_domain *domain,
+			  u16 devid)
+{
+	__attach_device(iommu, domain, devid);
+
+	/*
+	 * We might boot into a crash-kernel here. The crashed kernel
+	 * left the caches in the IOMMU dirty. So we have to flush
+	 * here to evict all dirty stuff.
+	 */
 	iommu_queue_inv_dev_entry(iommu, devid);
 	iommu_flush_tlb_pde(iommu, domain->id);
 }
@@ -2219,3 +2232,46 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
 
+/*****************************************************************************
+ *
+ * The next functions do a basic initialization of IOMMU for pass through
+ * mode
+ *
+ * In passthrough mode the IOMMU is initialized and enabled but not used for
+ * DMA-API translation.
+ *
+ *****************************************************************************/
+
+int __init amd_iommu_init_passthrough(void)
+{
+	struct pci_dev *dev = NULL;
+	u16 devid, devid2;
+
+	/* allocate passthroug domain */
+	pt_domain = protection_domain_alloc();
+	if (!pt_domain)
+		return -ENOMEM;
+
+	pt_domain->mode |= PAGE_MODE_NONE;
+
+	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+		struct amd_iommu *iommu;
+
+		devid = calc_devid(dev->bus->number, dev->devfn);
+		if (devid > amd_iommu_last_bdf)
+			continue;
+
+		devid2 = amd_iommu_alias_table[devid];
+
+		iommu = amd_iommu_rlookup_table[devid2];
+		if (!iommu)
+			continue;
+
+		__attach_device(iommu, pt_domain, devid);
+		__attach_device(iommu, pt_domain, devid2);
+	}
+
+	pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
+
+	return 0;
+}
-- 
cgit 


From aa879fff5d12318259816aa35023e941a1e4d3d9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 31 Aug 2009 16:01:48 +0200
Subject: x86/amd-iommu: Fix device table write order

The V bit of the device table entry has to be set after the
rest of the entry is written to not confuse the hardware.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 7987f20499a..2b1e77c714f 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1087,9 +1087,9 @@ static void __attach_device(struct amd_iommu *iommu,
 	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
 	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
-	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
 	amd_iommu_dev_table[devid].data[2] = domain->id;
+	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
+	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
 
 	amd_iommu_pd_table[devid] = domain;
 	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-- 
cgit 


From eba6ac60ba66c6bf6858938442204feaa67dea31 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 1 Sep 2009 12:07:08 +0200
Subject: x86/amd-iommu: Align locking between attach_device and detach_device

This patch makes the locking behavior between the functions
attach_device and __attach_device consistent with the
locking behavior between detach_device and __detach_device.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 2b1e77c714f..9aa135d4453 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1077,29 +1077,38 @@ static void __attach_device(struct amd_iommu *iommu,
 			    struct protection_domain *domain,
 			    u16 devid)
 {
-	unsigned long flags;
-	u64 pte_root = virt_to_phys(domain->pt_root);
+	u64 pte_root;
 
-	domain->dev_cnt += 1;
+	/* lock domain */
+	spin_lock(&domain->lock);
+
+	pte_root = virt_to_phys(domain->pt_root);
 
 	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
 		    << DEV_ENTRY_MODE_SHIFT;
 	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
 	amd_iommu_dev_table[devid].data[2] = domain->id;
 	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
 	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
 
 	amd_iommu_pd_table[devid] = domain;
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	domain->dev_cnt += 1;
+
+	/* ready */
+	spin_unlock(&domain->lock);
 }
 
 static void attach_device(struct amd_iommu *iommu,
 			  struct protection_domain *domain,
 			  u16 devid)
 {
+	unsigned long flags;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
 	__attach_device(iommu, domain, devid);
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
 
 	/*
 	 * We might boot into a crash-kernel here. The crashed kernel
-- 
cgit 


From 21129f786f231f7a9dce5b504617b893f50a435f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 1 Sep 2009 11:59:42 +0200
Subject: x86/amd-iommu: Make sure a device is assigned in passthrough mode

When the IOMMU driver runs in passthrough mode it has to
make sure that every device not assigned to an IOMMU-API
domain must be put into the passthrough domain instead of
keeping it unassigned.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 9aa135d4453..a8e74c34dd2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1141,6 +1141,15 @@ static void __detach_device(struct protection_domain *domain, u16 devid)
 
 	/* ready */
 	spin_unlock(&domain->lock);
+
+	/*
+	 * If we run in passthrough mode the device must be assigned to the
+	 * passthrough domain if it is detached from any other domain
+	 */
+	if (iommu_pass_through) {
+		struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+		__attach_device(iommu, pt_domain, devid);
+	}
 }
 
 /*
-- 
cgit 


From a1ca331c8aa75cd58fdf685e2e8745e1d3ec5c8f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 1 Sep 2009 12:22:22 +0200
Subject: x86/amd-iommu: Don't detach device from pt domain on driver unbind

This patch makes sure a device is not detached from the
passthrough domain when the device driver is unloaded or
does otherwise release the device.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a8e74c34dd2..12a541deae5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1195,6 +1195,8 @@ static int device_change_notifier(struct notifier_block *nb,
 	case BUS_NOTIFY_UNBOUND_DRIVER:
 		if (!domain)
 			goto out;
+		if (iommu_pass_through)
+			break;
 		detach_device(domain, devid);
 		break;
 	case BUS_NOTIFY_ADD_DEVICE:
-- 
cgit 


From 4751a95134e05f1172131d2001c6991d671fa58c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 1 Sep 2009 15:53:54 +0200
Subject: x86/amd-iommu: Initialize passthrough mode when requested

This patch enables the passthrough mode for AMD IOMMU by
running the initialization function when iommu=pt is passed
on the kernel command line.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c1b17e97252..f00f489ab15 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1242,12 +1242,18 @@ int __init amd_iommu_init(void)
 	if (ret)
 		goto free;
 
-	ret = amd_iommu_init_dma_ops();
+	if (iommu_pass_through)
+		ret = amd_iommu_init_passthrough();
+	else
+		ret = amd_iommu_init_dma_ops();
 	if (ret)
 		goto free;
 
 	enable_iommus();
 
+	if (iommu_pass_through)
+		goto out;
+
 	printk(KERN_INFO "AMD IOMMU: device isolation ");
 	if (amd_iommu_isolate)
 		printk("enabled\n");
-- 
cgit