author     Anton Arapov <anton@redhat.com>  2012-04-16 10:05:28 +0200
committer  Anton Arapov <anton@redhat.com>  2012-04-16 10:05:28 +0200
commit     b4b6116a13633898cf868f2f103c96a90c4c20f8 (patch)
tree       93d1b7e2cfcdf473d8d4ff3ad141fa864f8491f6 /arch/x86/kernel
parent     edd4be777c953e5faafc80d091d3084b4343f5d3 (diff)
download   kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.gz
           kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.xz
           kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.zip

fedora kernel: d9aad82f3319f3cfd1aebc01234254ef0c37ad84 (v3.3.2-1)

Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile  114
-rw-r--r--  arch/x86/kernel/acpi/Makefile  14
-rw-r--r--  arch/x86/kernel/acpi/boot.c  1696
-rw-r--r--  arch/x86/kernel/acpi/cstate.c  203
-rw-r--r--  arch/x86/kernel/acpi/realmode/Makefile  59
-rw-r--r--  arch/x86/kernel/acpi/realmode/bioscall.S  1
-rw-r--r--  arch/x86/kernel/acpi/realmode/copy.S  1
-rw-r--r--  arch/x86/kernel/acpi/realmode/regs.c  1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-bios.c  1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-mode.c  1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-vesa.c  1
-rw-r--r--  arch/x86/kernel/acpi/realmode/video-vga.c  1
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakemain.c  81
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S  170
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.h  48
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S  62
-rw-r--r--  arch/x86/kernel/acpi/sleep.c  133
-rw-r--r--  arch/x86/kernel/acpi/sleep.h  15
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S  100
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S  124
-rw-r--r--  arch/x86/kernel/acpi/wakeup_rm.S  12
-rw-r--r--  arch/x86/kernel/alternative.c  742
-rw-r--r--  arch/x86/kernel/amd_gart_64.c  898
-rw-r--r--  arch/x86/kernel/amd_nb.c  282
-rw-r--r--  arch/x86/kernel/apb_timer.c  431
-rw-r--r--  arch/x86/kernel/aperture_64.c  523
-rw-r--r--  arch/x86/kernel/apic/Makefile  27
-rw-r--r--  arch/x86/kernel/apic/apic.c  2464
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c  396
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c  191
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c  294
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c  278
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c  755
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c  90
-rw-r--r--  arch/x86/kernel/apic/io_apic.c  4043
-rw-r--r--  arch/x86/kernel/apic/ipi.c  167
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c  541
-rw-r--r--  arch/x86/kernel/apic/probe_32.c  270
-rw-r--r--  arch/x86/kernel/apic/probe_64.c  79
-rw-r--r--  arch/x86/kernel/apic/summit_32.c  556
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c  268
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c  174
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c  889
-rw-r--r--  arch/x86/kernel/apm_32.c  2453
-rw-r--r--  arch/x86/kernel/asm-offsets.c  72
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c  87
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c  83
-rw-r--r--  arch/x86/kernel/audit_64.c  81
-rw-r--r--  arch/x86/kernel/bootflag.c  101
-rw-r--r--  arch/x86/kernel/check.c  159
-rw-r--r--  arch/x86/kernel/cpu/Makefile  49
-rw-r--r--  arch/x86/kernel/cpu/amd.c  780
-rw-r--r--  arch/x86/kernel/cpu/bugs.c  174
-rw-r--r--  arch/x86/kernel/cpu/bugs_64.c  33
-rw-r--r--  arch/x86/kernel/cpu/centaur.c  500
-rw-r--r--  arch/x86/kernel/cpu/common.c  1343
-rw-r--r--  arch/x86/kernel/cpu/cpu.h  37
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c  461
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c  78
-rw-r--r--  arch/x86/kernel/cpu/intel.c  555
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c  1216
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile  11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c  149
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c  248
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h  53
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c  261
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c  2269
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c  742
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c  229
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c  68
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c  509
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c  29
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c  40
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.pl  32
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c  79
-rw-r--r--  arch/x86/kernel/cpu/mtrr/Makefile  3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c  124
-rw-r--r--  arch/x86/kernel/cpu/mtrr/centaur.c  126
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c  980
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c  282
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c  847
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c  449
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c  764
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h  78
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c  1716
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h  554
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c  656
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd_ibs.c  301
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c  1777
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c  737
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c  220
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c  1332
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c  143
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c  156
-rw-r--r--  arch/x86/kernel/cpu/powerflags.c  21
-rw-r--r--  arch/x86/kernel/cpu/proc.c  167
-rw-r--r--  arch/x86/kernel/cpu/rdrand.c  73
-rw-r--r--  arch/x86/kernel/cpu/scattered.c  70
-rw-r--r--  arch/x86/kernel/cpu/sched.c  55
-rw-r--r--  arch/x86/kernel/cpu/topology.c  99
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c  109
-rw-r--r--  arch/x86/kernel/cpu/umc.c  26
-rw-r--r--  arch/x86/kernel/cpu/vmware.c  134
-rw-r--r--  arch/x86/kernel/cpuid.c  241
-rw-r--r--  arch/x86/kernel/crash.c  106
-rw-r--r--  arch/x86/kernel/crash_dump_32.c  95
-rw-r--r--  arch/x86/kernel/crash_dump_64.c  49
-rw-r--r--  arch/x86/kernel/devicetree.c  429
-rw-r--r--  arch/x86/kernel/doublefault_32.c  69
-rw-r--r--  arch/x86/kernel/dumpstack.c  326
-rw-r--r--  arch/x86/kernel/dumpstack_32.c  141
-rw-r--r--  arch/x86/kernel/dumpstack_64.c  307
-rw-r--r--  arch/x86/kernel/e820.c  1111
-rw-r--r--  arch/x86/kernel/early-quirks.c  292
-rw-r--r--  arch/x86/kernel/early_printk.c  259
-rw-r--r--  arch/x86/kernel/entry_32.S  1417
-rw-r--r--  arch/x86/kernel/entry_64.S  1708
-rw-r--r--  arch/x86/kernel/ftrace.c  455
-rw-r--r--  arch/x86/kernel/head.c  56
-rw-r--r--  arch/x86/kernel/head32.c  68
-rw-r--r--  arch/x86/kernel/head64.c  124
-rw-r--r--  arch/x86/kernel/head_32.S  731
-rw-r--r--  arch/x86/kernel/head_64.S  427
-rw-r--r--  arch/x86/kernel/hpet.c  1209
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c  524
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c  38
-rw-r--r--  arch/x86/kernel/i387.c  748
-rw-r--r--  arch/x86/kernel/i8237.c  55
-rw-r--r--  arch/x86/kernel/i8253.c  43
-rw-r--r--  arch/x86/kernel/i8259.c  402
-rw-r--r--  arch/x86/kernel/init_task.c  42
-rw-r--r--  arch/x86/kernel/io_delay.c  131
-rw-r--r--  arch/x86/kernel/ioport.c  113
-rw-r--r--  arch/x86/kernel/irq.c  328
-rw-r--r--  arch/x86/kernel/irq_32.c  206
-rw-r--r--  arch/x86/kernel/irq_64.c  110
-rw-r--r--  arch/x86/kernel/irq_work.c  30
-rw-r--r--  arch/x86/kernel/irqinit.c  328
-rw-r--r--  arch/x86/kernel/jump_label.c  59
-rw-r--r--  arch/x86/kernel/kdebugfs.c  219
-rw-r--r--  arch/x86/kernel/kgdb.c  812
-rw-r--r--  arch/x86/kernel/kprobes.c  1575
-rw-r--r--  arch/x86/kernel/kvm.c  448
-rw-r--r--  arch/x86/kernel/kvmclock.c  211
-rw-r--r--  arch/x86/kernel/ldt.c  268
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c  273
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c  356
-rw-r--r--  arch/x86/kernel/mca_32.c  477
-rw-r--r--  arch/x86/kernel/microcode_amd.c  393
-rw-r--r--  arch/x86/kernel/microcode_core.c  592
-rw-r--r--  arch/x86/kernel/microcode_intel.c  468
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c  237
-rw-r--r--  arch/x86/kernel/module.c  206
-rw-r--r--  arch/x86/kernel/mpparse.c  921
-rw-r--r--  arch/x86/kernel/msr.c  297
-rw-r--r--  arch/x86/kernel/nmi.c  537
-rw-r--r--  arch/x86/kernel/nmi_selftest.c  180
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c  28
-rw-r--r--  arch/x86/kernel/paravirt.c  491
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c  61
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c  75
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c  1600
-rw-r--r--  arch/x86/kernel/pci-dma.c  271
-rw-r--r--  arch/x86/kernel/pci-iommu_table.c  79
-rw-r--r--  arch/x86/kernel/pci-nommu.c  106
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c  100
-rw-r--r--  arch/x86/kernel/pcspeaker.c  13
-rw-r--r--  arch/x86/kernel/probe_roms.c  267
-rw-r--r--  arch/x86/kernel/process.c  678
-rw-r--r--  arch/x86/kernel/process_32.c  443
-rw-r--r--  arch/x86/kernel/process_64.c  647
-rw-r--r--  arch/x86/kernel/ptrace.c  1430
-rw-r--r--  arch/x86/kernel/pvclock.c  158
-rw-r--r--  arch/x86/kernel/quirks.c  569
-rw-r--r--  arch/x86/kernel/reboot.c  843
-rw-r--r--  arch/x86/kernel/reboot_32.S  135
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c  102
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S  277
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S  268
-rw-r--r--  arch/x86/kernel/resource.c  48
-rw-r--r--  arch/x86/kernel/rtc.c  256
-rw-r--r--  arch/x86/kernel/setup.c  1064
-rw-r--r--  arch/x86/kernel/setup_percpu.c  275
-rw-r--r--  arch/x86/kernel/signal.c  852
-rw-r--r--  arch/x86/kernel/smp.c  312
-rw-r--r--  arch/x86/kernel/smpboot.c  1445
-rw-r--r--  arch/x86/kernel/stacktrace.c  146
-rw-r--r--  arch/x86/kernel/step.c  216
-rw-r--r--  arch/x86/kernel/sys_i386_32.c  40
-rw-r--r--  arch/x86/kernel/sys_x86_64.c  283
-rw-r--r--  arch/x86/kernel/syscall_32.c  25
-rw-r--r--  arch/x86/kernel/syscall_64.c  25
-rw-r--r--  arch/x86/kernel/tboot.c  451
-rw-r--r--  arch/x86/kernel/tce_64.c  189
-rw-r--r--  arch/x86/kernel/test_nx.c  175
-rw-r--r--  arch/x86/kernel/test_rodata.c  86
-rw-r--r--  arch/x86/kernel/time.c  104
-rw-r--r--  arch/x86/kernel/tls.c  218
-rw-r--r--  arch/x86/kernel/tls.h  21
-rw-r--r--  arch/x86/kernel/topology.c  83
-rw-r--r--  arch/x86/kernel/trampoline.c  42
-rw-r--r--  arch/x86/kernel/trampoline_32.S  83
-rw-r--r--  arch/x86/kernel/trampoline_64.S  171
-rw-r--r--  arch/x86/kernel/traps.c  839
-rw-r--r--  arch/x86/kernel/tsc.c  1026
-rw-r--r--  arch/x86/kernel/tsc_sync.c  198
-rw-r--r--  arch/x86/kernel/verify_cpu.S  139
-rw-r--r--  arch/x86/kernel/vm86_32.c  849
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S  376
-rw-r--r--  arch/x86/kernel/vsmp_64.c  160
-rw-r--r--  arch/x86/kernel/vsyscall_64.c  358
-rw-r--r--  arch/x86/kernel/vsyscall_emu_64.S  37
-rw-r--r--  arch/x86/kernel/vsyscall_trace.h  29
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c  60
-rw-r--r--  arch/x86/kernel/x86_init.c  127
-rw-r--r--  arch/x86/kernel/xsave.c  472
216 files changed, 83344 insertions, 0 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 00000000000..5369059c07a
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,114 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+
+CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_tsc.o = -pg
+CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
+endif
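+# CFLAGS_REMOVE_<file>.o strips a flag from the build of a single object;
+# dropping -pg above keeps the tracer itself and the early/timekeeping
+# paths out of the set of profiled functions.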
+
+obj-y := process_$(BITS).o signal.o entry_$(BITS).o
+obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-y += probe_roms.o
+obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
+obj-y += syscall_$(BITS).o
+obj-$(CONFIG_X86_64) += vsyscall_64.o
+obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-y += bootflag.o e820.o
+obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
+obj-y += tsc.o io_delay.o rtc.o
+obj-y += pci-iommu_table.o
+obj-y += resource.o
+
+obj-y += trampoline.o trampoline_$(BITS).o
+obj-y += process.o
+obj-y += i387.o xsave.o
+obj-y += ptrace.o
+obj-$(CONFIG_X86_32) += tls.o
+obj-$(CONFIG_IA32_EMULATION) += tls.o
+obj-y += step.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
+obj-$(CONFIG_ISA_DMA_API) += i8237.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += cpu/
+obj-y += acpi/
+obj-y += reboot.o
+obj-$(CONFIG_X86_32) += reboot_32.o
+obj-$(CONFIG_MCA) += mca_32.o
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_PCI) += early-quirks.o
+apm-y := apm_32.o
+obj-$(CONFIG_APM) += apm.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
+obj-$(CONFIG_SMP) += tsc_sync.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_MPPARSE) += mpparse.o
+obj-y += apic/
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
+obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_VM86) += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_APB_TIMER) += apb_timer.o
+
+obj-$(CONFIG_AMD_NB) += amd_nb.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
+obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
+
+obj-$(CONFIG_KVM_GUEST) += kvm.o
+obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+
+obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
+
+microcode-y := microcode_core.o
+microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
+microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
+obj-$(CONFIG_MICROCODE) += microcode.o
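+# microcode.o is a kbuild composite object: the microcode-y /
+# microcode-$(CONFIG_...) lines above name the parts that get linked
+# into it, so each vendor loader is only built when its option is set.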
+
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+obj-$(CONFIG_OF) += devicetree.o
+
+###
+# 64 bit specific files
+ifeq ($(CONFIG_X86_64),y)
+ obj-$(CONFIG_AUDIT) += audit_64.o
+
+ obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
+ obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+
+ obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+ obj-y += vsmp_64.o
+endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
new file mode 100644
index 00000000000..6f35260bb3e
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile
@@ -0,0 +1,14 @@
+subdir- := realmode
+
+obj-$(CONFIG_ACPI) += boot.o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
+
+ifneq ($(CONFIG_ACPI_PROCESSOR),)
+obj-y += cstate.o
+endif
+
+$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
+
+$(obj)/realmode/wakeup.bin: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/realmode
+
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
new file mode 100644
index 00000000000..ce664f33ea8
--- /dev/null
+++ b/arch/x86/kernel/acpi/boot.c
@@ -0,0 +1,1696 @@
+/*
+ * boot.c - Architecture-Specific Low-Level ACPI Boot Support
+ *
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/efi.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/dmi.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+
+#include <asm/pci_x86.h>
+#include <asm/pgtable.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/io.h>
+#include <asm/mpspec.h>
+#include <asm/smp.h>
+
+static int __initdata acpi_force = 0;
+u32 acpi_rsdt_forced;
+int acpi_disabled;
+EXPORT_SYMBOL(acpi_disabled);
+
+#ifdef CONFIG_X86_64
+# include <asm/proto.h>
+# include <asm/numa_64.h>
+#endif /* X86 */
+
+#define BAD_MADT_ENTRY(entry, end) ( \
+ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
+ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
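+/*
+ * BAD_MADT_ENTRY rejects a MADT subtable when the pointer is NULL, the
+ * entry would run past the end of the table, or its length field is
+ * smaller than the structure being parsed -- guarding every parser
+ * below against truncated or corrupt firmware tables.
+ */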
+
+#define PREFIX "ACPI: "
+
+int acpi_noirq; /* skip ACPI IRQ initialization */
+int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
+EXPORT_SYMBOL(acpi_pci_disabled);
+
+int acpi_lapic;
+int acpi_ioapic;
+int acpi_strict;
+
+u8 acpi_sci_flags __initdata;
+int acpi_sci_override_gsi __initdata;
+int acpi_skip_timer_override __initdata;
+int acpi_use_timer_override __initdata;
+int acpi_fix_pin2_polarity __initdata;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+#endif
+
+#ifndef __HAVE_ARCH_CMPXCHG
+#warning ACPI uses CMPXCHG, i486 and later hardware
+#endif
+
+/* --------------------------------------------------------------------------
+ Boot-time Configuration
+ -------------------------------------------------------------------------- */
+
+/*
+ * The default interrupt routing model is PIC (8259). This gets
+ * overridden if IOAPICs are enumerated (below).
+ */
+enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+
+
+/*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+ unsigned int irq = gsi + NR_IRQS_LEGACY;
+ unsigned int i;
+
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ if (isa_irq_to_gsi[i] == gsi) {
+ return i;
+ }
+ }
+
+ /* Provide an identity mapping of gsi == irq
+ * except on truly weird platforms that have
+ * non isa irqs in the first 16 gsis.
+ */
+ if (gsi >= NR_IRQS_LEGACY)
+ irq = gsi;
+ else
+ irq = gsi_top + gsi;
+
+ return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+ unsigned int gsi;
+
+ if (irq < NR_IRQS_LEGACY)
+ gsi = isa_irq_to_gsi[irq];
+ else if (irq < gsi_top)
+ gsi = irq;
+ else if (irq < (gsi_top + NR_IRQS_LEGACY))
+ gsi = irq - gsi_top;
+ else
+ gsi = 0xffffffff;
+
+ return gsi;
+}
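+
+/*
+ * Illustration of the mapping above: with the common MADT override of
+ * ISA IRQ 0 to GSI 2, isa_irq_to_gsi[0] == 2, so gsi_to_irq(2) returns
+ * IRQ 0 and irq_to_gsi(0) returns 2.  GSIs at or above NR_IRQS_LEGACY
+ * map identity (irq == gsi); a legacy-range GSI that no ISA IRQ points
+ * at is remapped to gsi_top + gsi so it cannot collide with IRQs 0-15.
+ */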
+
+/*
+ * Map an ACPI table during early boot.  The fixmap bookkeeping once
+ * described here (a window of slots counted down from
+ * FIX_IO_APIC_BASE_END, one per IO-APIC) is gone: the implementation
+ * now simply defers to early_ioremap()/early_iounmap(), which also
+ * cope with tables larger than a single page.
+ */
+char *__init __acpi_map_table(unsigned long phys, unsigned long size)
+{
+
+ if (!phys || !size)
+ return NULL;
+
+ return early_ioremap(phys, size);
+}
+void __init __acpi_unmap_table(char *map, unsigned long size)
+{
+ if (!map || !size)
+ return;
+
+ early_iounmap(map, size);
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static int __init acpi_parse_madt(struct acpi_table_header *table)
+{
+ struct acpi_table_madt *madt = NULL;
+
+ if (!cpu_has_apic)
+ return -EINVAL;
+
+ madt = (struct acpi_table_madt *)table;
+ if (!madt) {
+ printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+ return -ENODEV;
+ }
+
+ if (madt->address) {
+ acpi_lapic_addr = (u64) madt->address;
+
+ printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
+ madt->address);
+ }
+
+ default_acpi_madt_oem_check(madt->header.oem_id,
+ madt->header.oem_table_id);
+
+ return 0;
+}
+
+static void __cpuinit acpi_register_lapic(int id, u8 enabled)
+{
+ unsigned int ver = 0;
+
+ if (id >= (MAX_LOCAL_APIC-1)) {
+ printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+ return;
+ }
+
+ if (!enabled) {
+ ++disabled_cpus;
+ return;
+ }
+
+ if (boot_cpu_physical_apicid != -1U)
+ ver = apic_version[boot_cpu_physical_apicid];
+
+ generic_processor_info(id, ver);
+}
+
+static int __init
+acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
+{
+ struct acpi_madt_local_x2apic *processor = NULL;
+ int apic_id;
+ u8 enabled;
+
+ processor = (struct acpi_madt_local_x2apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ apic_id = processor->local_apic_id;
+ enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
+#ifdef CONFIG_X86_X2APIC
+ /*
+ * Register disabled CPUs as well, so that they can be counted.
+ * This lets us size cpu_possible_map more accurately and avoids
+ * preallocating memory for all NR_CPUS when CPU hotplug is used.
+ */
+ if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+ printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+ else
+ acpi_register_lapic(apic_id, enabled);
+#else
+ printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+#endif
+
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_local_apic *processor = NULL;
+
+ processor = (struct acpi_madt_local_apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ /*
+ * Register disabled CPUs as well, so that they can be counted.
+ * This lets us size cpu_possible_map more accurately and avoids
+ * preallocating memory for all NR_CPUS when CPU hotplug is used.
+ */
+ acpi_register_lapic(processor->id, /* APIC ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
+{
+ struct acpi_madt_local_sapic *processor = NULL;
+
+ processor = (struct acpi_madt_local_sapic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
+ const unsigned long end)
+{
+ struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
+
+ lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
+
+ if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
+ return -EINVAL;
+
+ acpi_lapic_addr = lapic_addr_ovr->address;
+
+ return 0;
+}
+
+static int __init
+acpi_parse_x2apic_nmi(struct acpi_subtable_header *header,
+ const unsigned long end)
+{
+ struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL;
+
+ x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header;
+
+ if (BAD_MADT_ENTRY(x2apic_nmi, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ if (x2apic_nmi->lint != 1)
+ printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+ return 0;
+}
+
+static int __init
+acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_local_apic_nmi *lapic_nmi = NULL;
+
+ lapic_nmi = (struct acpi_madt_local_apic_nmi *)header;
+
+ if (BAD_MADT_ENTRY(lapic_nmi, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ if (lapic_nmi->lint != 1)
+ printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+ return 0;
+}
+
+#endif /*CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init
+acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_io_apic *ioapic = NULL;
+
+ ioapic = (struct acpi_madt_io_apic *)header;
+
+ if (BAD_MADT_ENTRY(ioapic, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ mp_register_ioapic(ioapic->id,
+ ioapic->address, ioapic->global_irq_base);
+
+ return 0;
+}
+
+/*
+ * Parse Interrupt Source Override for the ACPI SCI
+ */
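+/*
+ * Trigger and polarity here use the 2-bit MPS INTI encoding: 0 means
+ * "conforms to bus spec", 1 is edge/active-high, 3 is level/active-low
+ * (2 is reserved).  The SCI's "conforms" defaults are rewritten to
+ * level/low before any acpi_sci= command-line override is applied.
+ */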
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
+{
+ if (trigger == 0) /* compatible SCI trigger is level */
+ trigger = 3;
+
+ if (polarity == 0) /* compatible SCI polarity is low */
+ polarity = 3;
+
+ /* Command-line over-ride via acpi_sci= */
+ if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)
+ trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2;
+
+ if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
+ polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
+
+ /*
+ * mp_config_acpi_legacy_irqs() already setup IRQs < 16
+ * If GSI is < 16, this will update its flags,
+ * else it will create a new mp_irqs[] entry.
+ */
+ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+
+ /*
+ * stash over-ride to indicate we've been here
+ * and for later update of acpi_gbl_FADT
+ */
+ acpi_sci_override_gsi = gsi;
+ return;
+}
+
+static int __init
+acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
+ const unsigned long end)
+{
+ struct acpi_madt_interrupt_override *intsrc = NULL;
+
+ intsrc = (struct acpi_madt_interrupt_override *)header;
+
+ if (BAD_MADT_ENTRY(intsrc, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
+ acpi_sci_ioapic_setup(intsrc->source_irq,
+ intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
+ (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+ intsrc->global_irq);
+ return 0;
+ }
+
+ if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
+ if (acpi_skip_timer_override) {
+ printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+ return 0;
+ }
+ if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+ intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
+ printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
+ }
+ }
+
+ mp_override_legacy_irq(intsrc->source_irq,
+ intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
+ (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+ intsrc->global_irq);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end)
+{
+ struct acpi_madt_nmi_source *nmi_src = NULL;
+
+ nmi_src = (struct acpi_madt_nmi_source *)header;
+
+ if (BAD_MADT_ENTRY(nmi_src, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ /* TBD: Support nmi_src entries? */
+
+ return 0;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+
+/*
+ * acpi_pic_sci_set_trigger()
+ *
+ * use ELCR to set PIC-mode trigger type for SCI
+ *
+ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
+ * it may require Edge Trigger -- use "acpi_sci=edge"
+ *
+ * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
+ * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge.
+ * ELCR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0)
+ * ELCR2 is IRQs 8-15 (IRQ 8, 13 must be 0)
+ */
+
+void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
+{
+ unsigned int mask = 1 << irq;
+ unsigned int old, new;
+
+ /* Read the old ELCR mask */
+ old = inb(0x4d0) | (inb(0x4d1) << 8);
+
+ /*
+ * If we use ACPI to set PCI IRQs, then we should clear ELCR
+ * since we will set it correctly as we enable the PCI irq
+ * routing.
+ */
+ new = acpi_noirq ? old : 0;
+
+ /*
+ * Update SCI information in the ELCR, it isn't in the PCI
+ * routing tables..
+ */
+ switch (trigger) {
+ case 1: /* Edge - clear */
+ new &= ~mask;
+ break;
+ case 3: /* Level - set */
+ new |= mask;
+ break;
+ }
+
+ if (old == new)
+ return;
+
+ printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
+ outb(new, 0x4d0);
+ outb(new >> 8, 0x4d1);
+}
+
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+{
+ *irq = gsi_to_irq(gsi);
+
+#ifdef CONFIG_X86_IO_APIC
+ if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
+ setup_IO_APIC_irq_extra(gsi);
+#endif
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
+
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+ if (isa_irq >= 16)
+ return -1;
+ *gsi = irq_to_gsi(isa_irq);
+ return 0;
+}
+
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+#ifdef CONFIG_PCI
+ /*
+ * Make sure all (legacy) PCI IRQs are set as level-triggered.
+ */
+ if (trigger == ACPI_LEVEL_SENSITIVE)
+ eisa_set_level_irq(gsi);
+#endif
+
+ return gsi;
+}
+
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+ int trigger, int polarity)
+{
+#ifdef CONFIG_X86_IO_APIC
+ gsi = mp_register_gsi(dev, gsi, trigger, polarity);
+#endif
+
+ return gsi;
+}
+
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+ int trigger, int polarity) = acpi_register_gsi_pic;
+
+/*
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ unsigned int irq;
+ unsigned int plat_gsi = gsi;
+
+ plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
+ irq = gsi_to_irq(plat_gsi);
+
+ return irq;
+}
+
+void __init acpi_set_irq_model_pic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+ __acpi_register_gsi = acpi_register_gsi_pic;
+ acpi_ioapic = 0;
+}
+
+void __init acpi_set_irq_model_ioapic(void)
+{
+ acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+ __acpi_register_gsi = acpi_register_gsi_ioapic;
+ acpi_ioapic = 1;
+}
+
+/*
+ * ACPI based hotplug support for CPU
+ */
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>
+
+static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+ int nid;
+
+ nid = acpi_get_node(handle);
+ if (nid == -1 || !node_online(nid))
+ return;
+ set_apicid_to_node(physid, nid);
+ numa_set_node(cpu, nid);
+#endif
+}
+
+static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
+{
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *obj;
+ struct acpi_madt_local_apic *lapic;
+ cpumask_var_t tmp_map, new_map;
+ u8 physid;
+ int cpu;
+ int retval = -ENOMEM;
+
+ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+ return -EINVAL;
+
+ if (!buffer.length || !buffer.pointer)
+ return -EINVAL;
+
+ obj = buffer.pointer;
+ if (obj->type != ACPI_TYPE_BUFFER ||
+ obj->buffer.length < sizeof(*lapic)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
+
+ if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
+ !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ physid = lapic->id;
+
+ kfree(buffer.pointer);
+ buffer.length = ACPI_ALLOCATE_BUFFER;
+ buffer.pointer = NULL;
+
+ if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
+ goto out;
+
+ if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
+ goto free_tmp_map;
+
+ cpumask_copy(tmp_map, cpu_present_mask);
+ acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
+
+ /*
+ * If acpi_register_lapic() successfully generated a new logical
+ * cpu number, the following picks out exactly what was mapped.
+ */
+ cpumask_andnot(new_map, cpu_present_mask, tmp_map);
+ if (cpumask_empty(new_map)) {
+ printk(KERN_ERR "Unable to map lapic to logical cpu number\n");
+ retval = -EINVAL;
+ goto free_new_map;
+ }
+
+ acpi_processor_set_pdc(handle);
+
+ cpu = cpumask_first(new_map);
+ acpi_map_cpu2node(handle, cpu, physid);
+
+ *pcpu = cpu;
+ retval = 0;
+
+free_new_map:
+ free_cpumask_var(new_map);
+free_tmp_map:
+ free_cpumask_var(tmp_map);
+out:
+ return retval;
+}
+
+/* wrapper to silence section mismatch warning */
+int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
+{
+ return _acpi_map_lsapic(handle, pcpu);
+}
+EXPORT_SYMBOL(acpi_map_lsapic);
+
+int acpi_unmap_lsapic(int cpu)
+{
+ per_cpu(x86_cpu_to_apicid, cpu) = -1;
+ set_cpu_present(cpu, false);
+ num_processors--;
+
+ return (0);
+}
+
+EXPORT_SYMBOL(acpi_unmap_lsapic);
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+
+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
+{
+ /* TBD */
+ return -EINVAL;
+}
+
+EXPORT_SYMBOL(acpi_register_ioapic);
+
+int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
+{
+ /* TBD */
+ return -EINVAL;
+}
+
+EXPORT_SYMBOL(acpi_unregister_ioapic);
+
+static int __init acpi_parse_sbf(struct acpi_table_header *table)
+{
+ struct acpi_table_boot *sb;
+
+ sb = (struct acpi_table_boot *)table;
+ if (!sb) {
+ printk(KERN_WARNING PREFIX "Unable to map SBF\n");
+ return -ENODEV;
+ }
+
+ sbf_port = sb->cmos_index; /* Save CMOS port */
+
+ return 0;
+}
+
+#ifdef CONFIG_HPET_TIMER
+#include <asm/hpet.h>
+
+static struct __initdata resource *hpet_res;
+
+static int __init acpi_parse_hpet(struct acpi_table_header *table)
+{
+ struct acpi_table_hpet *hpet_tbl;
+
+ hpet_tbl = (struct acpi_table_hpet *)table;
+ if (!hpet_tbl) {
+ printk(KERN_WARNING PREFIX "Unable to map HPET\n");
+ return -ENODEV;
+ }
+
+ if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
+ printk(KERN_WARNING PREFIX "HPET timers must be located in "
+ "memory.\n");
+ return -1;
+ }
+
+ hpet_address = hpet_tbl->address.address;
+ hpet_blockid = hpet_tbl->sequence;
+
+ /*
+ * Some broken BIOSes advertise HPET at 0x0. We really do not
+ * want to allocate a resource there.
+ */
+ if (!hpet_address) {
+ printk(KERN_WARNING PREFIX
+ "HPET id: %#x base: %#lx is invalid\n",
+ hpet_tbl->id, hpet_address);
+ return 0;
+ }
+#ifdef CONFIG_X86_64
+ /*
+ * Some even more broken BIOSes advertise HPET at
+ * 0xfed0000000000000 instead of 0xfed00000. Fix it up and add
+ * some noise:
+ */
+ if (hpet_address == 0xfed0000000000000UL) {
+ if (!hpet_force_user) {
+ printk(KERN_WARNING PREFIX "HPET id: %#x "
+ "base: 0xfed0000000000000 is bogus\n "
+ "try hpet=force on the kernel command line to "
+ "fix it up to 0xfed00000.\n", hpet_tbl->id);
+ hpet_address = 0;
+ return 0;
+ }
+ printk(KERN_WARNING PREFIX
+ "HPET id: %#x base: 0xfed0000000000000 fixed up "
+ "to 0xfed00000.\n", hpet_tbl->id);
+ hpet_address >>= 32;
+ }
+#endif
+ printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+ hpet_tbl->id, hpet_address);
+
+ /*
+ * Allocate and initialize the HPET firmware resource for adding into
+ * the resource tree during the lateinit timeframe.
+ */
+#define HPET_RESOURCE_NAME_SIZE 9
+ hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+
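+ /*
+ * The resource name lives in the same bootmem allocation, right
+ * after the struct: &hpet_res[1] is the first byte past the
+ * structure, leaving HPET_RESOURCE_NAME_SIZE bytes for "HPET %u".
+ */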
+ hpet_res->name = (void *)&hpet_res[1];
+ hpet_res->flags = IORESOURCE_MEM;
+ snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
+ hpet_tbl->sequence);
+
+ hpet_res->start = hpet_address;
+ hpet_res->end = hpet_address + (1 * 1024) - 1;
+
+ return 0;
+}
+
+/*
+ * hpet_insert_resource inserts the HPET resources used into the resource
+ * tree.
+ */
+static __init int hpet_insert_resource(void)
+{
+ if (!hpet_res)
+ return 1;
+
+ return insert_resource(&iomem_resource, hpet_res);
+}
+
+late_initcall(hpet_insert_resource);
+
+#else
+#define acpi_parse_hpet NULL
+#endif
+
+static int __init acpi_parse_fadt(struct acpi_table_header *table)
+{
+
+#ifdef CONFIG_X86_PM_TIMER
+ /* detect the location of the ACPI PM Timer */
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) {
+ /* FADT rev. 2 */
+ if (acpi_gbl_FADT.xpm_timer_block.space_id !=
+ ACPI_ADR_SPACE_SYSTEM_IO)
+ return 0;
+
+ pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address;
+ /*
+ * "X" fields are optional extensions to the original V1.0
+ * fields, so we must selectively expand V1.0 fields if the
+ * corresponding X field is zero.
+ */
+ if (!pmtmr_ioport)
+ pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
+ } else {
+ /* FADT rev. 1 */
+ pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
+ }
+ if (pmtmr_ioport)
+ printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
+ pmtmr_ioport);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Parse LAPIC entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+
+static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
+{
+ int count;
+
+ if (!cpu_has_apic)
+ return -ENODEV;
+
+ /*
+ * Note that the LAPIC address is obtained from the MADT (32-bit value)
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
+ */
+
+ count =
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
+ if (count < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing LAPIC address override entry\n");
+ return count;
+ }
+
+ register_lapic_address(acpi_lapic_addr);
+
+ return count;
+}
+
+static int __init acpi_parse_madt_lapic_entries(void)
+{
+ int count;
+ int x2count = 0;
+
+ if (!cpu_has_apic)
+ return -ENODEV;
+
+ /*
+ * Note that the LAPIC address is obtained from the MADT (32-bit value)
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
+ */
+
+ count =
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
+ if (count < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing LAPIC address override entry\n");
+ return count;
+ }
+
+ register_lapic_address(acpi_lapic_addr);
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
+ acpi_parse_sapic, MAX_LOCAL_APIC);
+
+ if (!count) {
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
+ acpi_parse_x2apic, MAX_LOCAL_APIC);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
+ acpi_parse_lapic, MAX_LOCAL_APIC);
+ }
+ if (!count && !x2count) {
+ printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return -ENODEV;
+ } else if (count < 0 || x2count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+
+ x2count =
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+ acpi_parse_x2apic_nmi, 0);
+ count =
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
+ if (count < 0 || x2count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+ return 0;
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+#ifdef CONFIG_X86_ES7000
+extern int es7000_plat;
+#endif
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+ int ioapic;
+ int pin;
+ struct mpc_intsrc mp_irq;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq; /* IRQ */
+ mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
+ mp_irq.dstirq = pin; /* INTIN# */
+
+ mp_save_irq(&mp_irq);
+
+ isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+ int i;
+ struct mpc_intsrc mp_irq;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+ /*
+ * Fabricate the legacy ISA bus (bus #31).
+ */
+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
+ pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#ifdef CONFIG_X86_ES7000
+ /*
+ * Older generations of ES7000 have no legacy identity mappings
+ */
+ if (es7000_plat == 1)
+ return;
+#endif
+
+ /*
+ * Use the default configuration for the IRQs 0-15. Unless
+ * overridden by (MADT) interrupt source override entries.
+ */
+ for (i = 0; i < 16; i++) {
+ int ioapic, pin;
+ unsigned int dstapic;
+ int idx;
+ u32 gsi;
+
+ /* Locate the gsi that irq i maps to. */
+ if (acpi_isa_irq_to_gsi(i, &gsi))
+ continue;
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQ.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ continue;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ dstapic = mpc_ioapic_id(ioapic);
+
+ for (idx = 0; idx < mp_irq_entries; idx++) {
+ struct mpc_intsrc *irq = mp_irqs + idx;
+
+ /* Do we already have a mapping for this ISA IRQ? */
+ if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i)
+ break;
+
+ /* Do we already have a mapping for this IOAPIC pin */
+ if (irq->dstapic == dstapic && irq->dstirq == pin)
+ break;
+ }
+
+ if (idx != mp_irq_entries) {
+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ continue; /* IRQ already used */
+ }
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqflag = 0; /* Conforming */
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.dstapic = dstapic;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.srcbusirq = i; /* Identity mapped */
+ mp_irq.dstirq = pin;
+
+ mp_save_irq(&mp_irq);
+ }
+}
+
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
+ int ioapic;
+ u8 pin;
+
+ if (!acpi_ioapic)
+ return 0;
+ if (!dev)
+ return 0;
+ if (dev->bus != &pci_bus_type)
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* printing this entry should look identical to an mptable entry */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
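+ /*
+ * MP-spec encoding of a PCI source-bus IRQ: bits 0-1 hold the
+ * interrupt pin (0 = INTA# ... 3 = INTD#) and bits 2-6 the device
+ * number, hence (devfn >> 3) and (pin - 1) below.
+ */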
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_save_irq(&mp_irq);
+#endif
+ return 0;
+}
+
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+ struct io_apic_irq_attr irq_attr;
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return gsi;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+ return gsi;
+ }
+
+ ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
+ "%d-%d\n", mpc_ioapic_id(ioapic),
+ ioapic_pin);
+ return gsi;
+ }
+
+ if (enable_update_mptable)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+
+ set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+ trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
+
+ return gsi;
+}
+
+/*
+ * Parse IOAPIC related entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+static int __init acpi_parse_madt_ioapic_entries(void)
+{
+ int count;
+
+ /*
+ * ACPI interpreter is required to complete interrupt setup,
+ * so if it is off, don't enumerate the io-apics with ACPI.
+ * If MPS is present, it will handle them,
+ * otherwise the system will stay in PIC mode
+ */
+ if (acpi_disabled || acpi_noirq)
+ return -ENODEV;
+
+ if (!cpu_has_apic)
+ return -ENODEV;
+
+ /*
+ * if "noapic" boot option, don't look for IO-APICs
+ */
+ if (skip_ioapic_setup) {
+ printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
+ "due to 'noapic' option.\n");
+ return -ENODEV;
+ }
+
+ count =
+ acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+ MAX_IO_APICS);
+ if (!count) {
+ printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+ return -ENODEV;
+ } else if (count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+ return count;
+ }
+
+ count =
+ acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
+ nr_irqs);
+ if (count < 0) {
+ printk(KERN_ERR PREFIX
+ "Error parsing interrupt source overrides entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+
+ /*
+ * If BIOS did not supply an INT_SRC_OVR for the SCI
+ * pretend we got one so we can set the SCI flags.
+ */
+ if (!acpi_sci_override_gsi)
+ acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+ acpi_gbl_FADT.sci_interrupt);
+
+ /* Fill in identity legacy mappings where no override */
+ mp_config_acpi_legacy_irqs();
+
+ count =
+ acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
+ nr_irqs);
+ if (count < 0) {
+ printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+ /* TBD: Cleanup to allow fallback to MPS */
+ return count;
+ }
+
+ return 0;
+}
+#else
+static inline int acpi_parse_madt_ioapic_entries(void)
+{
+ return -1;
+}
+#endif /* !CONFIG_X86_IO_APIC */
+
+static void __init early_acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ int error;
+
+ if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
+
+ /*
+ * Parse MADT LAPIC entries
+ */
+ error = early_acpi_parse_madt_lapic_addr_ovr();
+ if (!error) {
+ acpi_lapic = 1;
+ smp_found_config = 1;
+ }
+ if (error == -EINVAL) {
+ /*
+ * Dell Precision Workstation 410, 610 come here.
+ */
+ printk(KERN_ERR PREFIX
+ "Invalid BIOS MADT, disabling ACPI\n");
+ disable_acpi();
+ }
+ }
+#endif
+}
+
+static void __init acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+ int error;
+
+ if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) {
+
+ /*
+ * Parse MADT LAPIC entries
+ */
+ error = acpi_parse_madt_lapic_entries();
+ if (!error) {
+ acpi_lapic = 1;
+
+ /*
+ * Parse MADT IO-APIC entries
+ */
+ error = acpi_parse_madt_ioapic_entries();
+ if (!error) {
+ acpi_set_irq_model_ioapic();
+
+ smp_found_config = 1;
+ }
+ }
+ if (error == -EINVAL) {
+ /*
+ * Dell Precision Workstation 410, 610 come here.
+ */
+ printk(KERN_ERR PREFIX
+ "Invalid BIOS MADT, disabling ACPI\n");
+ disable_acpi();
+ }
+ } else {
+ /*
+ * ACPI found no MADT, and so ACPI wants UP PIC mode.
+ * In the event an MPS table was found, forget it.
+ * Boot with "acpi=off" to use MPS on such a system.
+ */
+ if (smp_found_config) {
+ printk(KERN_WARNING PREFIX
+ "No APIC-table, disabling MPS\n");
+ smp_found_config = 0;
+ }
+ }
+
+ /*
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * processors, where MPS only supports physical.
+ */
+ if (acpi_lapic && acpi_ioapic)
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
+ else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
+#endif
+ return;
+}
+
+static int __init disable_acpi_irq(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
+ d->ident);
+ acpi_noirq_set();
+ }
+ return 0;
+}
+
+static int __init disable_acpi_pci(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
+ d->ident);
+ acpi_disable_pci();
+ }
+ return 0;
+}
+
+static int __init dmi_disable_acpi(const struct dmi_system_id *d)
+{
+ if (!acpi_force) {
+ printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
+ disable_acpi();
+ } else {
+ printk(KERN_NOTICE
+ "Warning: DMI blacklist says broken, but acpi forced\n");
+ }
+ return 0;
+}
+
+/*
+ * Force ignoring BIOS IRQ0 pin2 override
+ */
+static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
+{
+ /*
+ * The ati_ixp4x0_rev() early PCI quirk should have set
+ * the acpi_skip_timer_override flag already:
+ */
+ if (!acpi_skip_timer_override) {
+ WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
+ pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ d->ident);
+ acpi_skip_timer_override = 1;
+ }
+ return 0;
+}
+
+/*
+ * If your system is blacklisted here, but you find that acpi=force
+ * works for you, please contact linux-acpi@vger.kernel.org
+ */
+static struct dmi_system_id __initdata acpi_dmi_table[] = {
+ /*
+ * Boxes that need ACPI disabled
+ */
+ {
+ .callback = dmi_disable_acpi,
+ .ident = "IBM Thinkpad",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2629H1G"),
+ },
+ },
+
+ /*
+ * Boxes that need ACPI PCI IRQ routing disabled
+ */
+ {
+ .callback = disable_acpi_irq,
+ .ident = "ASUS A7V",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"),
+ DMI_MATCH(DMI_BOARD_NAME, "<A7V>"),
+ /* newer BIOS, Revision 1011, does work */
+ DMI_MATCH(DMI_BIOS_VERSION,
+ "ASUS A7V ACPI BIOS Revision 1007"),
+ },
+ },
+ {
+ /*
+ * Latest BIOS for IBM 600E (1.16) has bad pcinum
+ * for LPC bridge, which is needed for the PCI
+ * interrupt links to work. DSDT fix is in bug 5966.
+ * 2645, 2646 model numbers are shared with 600/600E/600X
+ */
+ .callback = disable_acpi_irq,
+ .ident = "IBM Thinkpad 600 Series 2645",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2645"),
+ },
+ },
+ {
+ .callback = disable_acpi_irq,
+ .ident = "IBM Thinkpad 600 Series 2646",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2646"),
+ },
+ },
+ /*
+ * Boxes that need ACPI PCI IRQ routing and PCI scan disabled
+ */
+ { /* _BBN 0 bug */
+ .callback = disable_acpi_pci,
+ .ident = "ASUS PR-DLS",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"),
+ DMI_MATCH(DMI_BIOS_VERSION,
+ "ASUS PR-DLS ACPI BIOS Revision 1010"),
+ DMI_MATCH(DMI_BIOS_DATE, "03/21/2003")
+ },
+ },
+ {
+ .callback = disable_acpi_pci,
+ .ident = "Acer TravelMate 36x Laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+ },
+ },
+ {}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
+ /*
+ * HP laptops which use a DSDT reporting as HP/SB400/10000,
+ * which includes some code which overrides all temperature
+ * trip points to 16C if the INTIN2 input of the I/O APIC
+ * is enabled. This input is incorrectly designated as ISA IRQ 0
+ * via an interrupt source override even though it is wired to the
+ * output of the master 8259A and INTIN0 is not connected at all.
+ * Force ignoring the BIOS IRQ0 pin2 override in such cases.
+ */
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP nx6115 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6125 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP NX6325 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+ },
+ },
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "HP 6715b laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+ },
+ },
+ {}
+};
+
+/*
+ * acpi_boot_table_init() and acpi_boot_init()
+ * called from setup_arch(), always.
+ * 1. checksums all tables
+ * 2. enumerates lapics
+ * 3. enumerates io-apics
+ *
+ * acpi_table_init() is separate to allow reading SRAT without
+ * other side effects.
+ *
+ * side effects of acpi_boot_init:
+ * acpi_lapic = 1 if LAPIC found
+ * acpi_ioapic = 1 if IOAPIC found
+ * if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
+ * if acpi_blacklisted() acpi_disabled = 1;
+ * acpi_irq_model=...
+ * ...
+ */
+
+void __init acpi_boot_table_init(void)
+{
+ dmi_check_system(acpi_dmi_table);
+
+ /*
+ * If acpi_disabled, bail out
+ */
+ if (acpi_disabled)
+ return;
+
+ /*
+ * Initialize the ACPI boot-time table parser.
+ */
+ if (acpi_table_init()) {
+ disable_acpi();
+ return;
+ }
+
+ acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+ /*
+ * blacklist may disable ACPI entirely
+ */
+ if (acpi_blacklisted()) {
+ if (acpi_force) {
+ printk(KERN_WARNING PREFIX "acpi=force override\n");
+ } else {
+ printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+ disable_acpi();
+ return;
+ }
+ }
+}
+
+int __init early_acpi_boot_init(void)
+{
+ /*
+ * If acpi_disabled, bail out
+ */
+ if (acpi_disabled)
+ return 1;
+
+ /*
+ * Process the Multiple APIC Description Table (MADT), if present
+ */
+ early_acpi_process_madt();
+
+ return 0;
+}
+
+int __init acpi_boot_init(void)
+{
+ /* those are executed after early-quirks are executed */
+ dmi_check_system(acpi_dmi_table_late);
+
+ /*
+ * If acpi_disabled, bail out
+ */
+ if (acpi_disabled)
+ return 1;
+
+ acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+ /*
+ * set sci_int and PM timer address
+ */
+ acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt);
+
+ /*
+ * Process the Multiple APIC Description Table (MADT), if present
+ */
+ acpi_process_madt();
+
+ acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+
+ if (!acpi_noirq)
+ x86_init.pci.init = pci_acpi_init;
+
+ return 0;
+}
+
+static int __init parse_acpi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /* "acpi=off" disables both ACPI table parsing and interpreter */
+ if (strcmp(arg, "off") == 0) {
+ disable_acpi();
+ }
+ /* acpi=force to over-ride black-list */
+ else if (strcmp(arg, "force") == 0) {
+ acpi_force = 1;
+ acpi_disabled = 0;
+ }
+ /* acpi=strict disables out-of-spec workarounds */
+ else if (strcmp(arg, "strict") == 0) {
+ acpi_strict = 1;
+ }
+ /* acpi=rsdt use RSDT instead of XSDT */
+ else if (strcmp(arg, "rsdt") == 0) {
+ acpi_rsdt_forced = 1;
+ }
+ /* "acpi=noirq" disables ACPI interrupt routing */
+ else if (strcmp(arg, "noirq") == 0) {
+ acpi_noirq_set();
+ }
+ /* "acpi=copy_dsdt" copies the DSDT to local memory */
+ else if (strcmp(arg, "copy_dsdt") == 0) {
+ acpi_gbl_copy_dsdt_locally = 1;
+ } else {
+ /* Core will printk when we return error. */
+ return -EINVAL;
+ }
+ return 0;
+}
+early_param("acpi", parse_acpi);
+
+/* FIXME: Using pci= for an ACPI parameter is a travesty. */
+static int __init parse_pci(char *arg)
+{
+ if (arg && strcmp(arg, "noacpi") == 0)
+ acpi_disable_pci();
+ return 0;
+}
+early_param("pci", parse_pci);
+
+int __init acpi_mps_check(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
+/* mptable code is not built-in */
+ if (acpi_disabled || acpi_noirq) {
+ printk(KERN_WARNING "MPS support code is not built-in.\n"
+ "Using acpi=off or acpi=noirq or pci=noacpi "
+ "may cause problems\n");
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+static int __init parse_acpi_skip_timer_override(char *arg)
+{
+ acpi_skip_timer_override = 1;
+ return 0;
+}
+early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+
+static int __init parse_acpi_use_timer_override(char *arg)
+{
+ acpi_use_timer_override = 1;
+ return 0;
+}
+early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
+#endif /* CONFIG_X86_IO_APIC */
+
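+/*
+ * "acpi_sci=edge|level|high|low" on the kernel command line stashes a
+ * trigger/polarity override in acpi_sci_flags, which
+ * acpi_sci_ioapic_setup() applies when the SCI is configured -- e.g.
+ * "acpi_sci=edge" for boards whose SCI misbehaves in level mode.
+ */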
+static int __init setup_acpi_sci(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "edge"))
+ acpi_sci_flags = ACPI_MADT_TRIGGER_EDGE |
+ (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
+ else if (!strcmp(s, "level"))
+ acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL |
+ (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK);
+ else if (!strcmp(s, "high"))
+ acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH |
+ (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
+ else if (!strcmp(s, "low"))
+ acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW |
+ (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK);
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("acpi_sci", setup_acpi_sci);
+
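+/*
+ * ACPI Global Lock layout (per the ACPI spec): bit 0 is the pending
+ * bit, bit 1 the owned bit.  Acquire computes
+ *   new = (old & ~3) + 2 + ((old >> 1) & 1)
+ * which always sets "owned" and additionally sets "pending" if the lock
+ * was already owned.  For example, a free lock (*lock == 0) becomes 2
+ * and the function returns -1 (acquired); a held lock (*lock == 2)
+ * becomes 3 and it returns 0, telling the caller to wait for the
+ * owner's release signal.  Release clears both bits and returns the old
+ * pending bit so the caller knows whether to signal waiters.
+ */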
+int __acpi_acquire_global_lock(unsigned int *lock)
+{
+ unsigned int old, new, val;
+ do {
+ old = *lock;
+ new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
+ val = cmpxchg(lock, old, new);
+ } while (unlikely (val != old));
+ return (new < 3) ? -1 : 0;
+}
+
+int __acpi_release_global_lock(unsigned int *lock)
+{
+ unsigned int old, new, val;
+ do {
+ old = *lock;
+ new = old & ~0x3;
+ val = cmpxchg(lock, old, new);
+ } while (unlikely (val != old));
+ return old & 0x1;
+}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
new file mode 100644
index 00000000000..f50e7fb2a20
--- /dev/null
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2005 Intel Corporation
+ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * - Added _PDC for SMP C-states on Intel CPUs
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+
+#include <acpi/processor.h>
+#include <asm/acpi.h>
+#include <asm/mwait.h>
+
+/*
+ * Initialize bm_flags based on the CPU cache properties.
+ * On SMP this depends on the cache configuration:
+ * - when the cache is not shared among all CPUs, we flush the cache
+ * before entering C3;
+ * - when the cache is shared among all CPUs, we use the bm_check
+ * mechanism as in the UP case.
+ *
+ * This routine is called only after all the CPUs are online
+ */
+void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
+ unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ flags->bm_check = 0;
+ if (num_online_cpus() == 1)
+ flags->bm_check = 1;
+ else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * Today all MP CPUs that support C3 share cache.
+ * And caches should not be flushed by software while
+ * entering C3 type state.
+ */
+ flags->bm_check = 1;
+ }
+
+ /*
+ * On all recent Intel platforms, ARB_DISABLE is a nop.
+ * So, set bm_control to zero to indicate that ARB_DISABLE
+ * is not required while entering C3 type state on
+ * P4, Core and beyond CPUs
+ */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
+ flags->bm_control = 0;
+}
+EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
+
+/* The code below handles C-state entry with the MONITOR/MWAIT pair on Intel. */
+
+struct cstate_entry {
+ struct {
+ unsigned int eax;
+ unsigned int ecx;
+ } states[ACPI_PROCESSOR_MAX_POWER];
+};
+static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
+
+static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
+
+#define NATIVE_CSTATE_BEYOND_HALT (2)
+
+static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
+{
+ struct acpi_processor_cx *cx = _cx;
+ long retval;
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int edx_part;
+ unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+ unsigned int num_cstate_subtype;
+
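+ /*
+ * CPUID leaf 5 (MONITOR/MWAIT) reports in EDX a 4-bit count of
+ * supported MWAIT sub-states per C-state: bits 3:0 for C0,
+ * bits 7:4 for C1, and so on.
+ */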
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+ /* Check whether this particular cx_type (in CST) is supported or not */
+ cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1;
+ edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+ num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+ retval = 0;
+ if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) {
+ retval = -1;
+ goto out;
+ }
+
+ /* The MWAIT ECX extension INTERRUPT_BREAK must be supported for C2/C3 */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) {
+ retval = -1;
+ goto out;
+ }
+
+ if (!mwait_supported[cstate_type]) {
+ mwait_supported[cstate_type] = 1;
+ printk(KERN_DEBUG
+ "Monitor-Mwait will be used to enter C-%d "
+ "state\n", cx->type);
+ }
+ snprintf(cx->desc,
+ ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
+ cx->address);
+out:
+ return retval;
+}
+
+int acpi_processor_ffh_cstate_probe(unsigned int cpu,
+ struct acpi_processor_cx *cx, struct acpi_power_register *reg)
+{
+ struct cstate_entry *percpu_entry;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ long retval;
+
+ if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
+ return -1;
+
+ if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
+ return -1;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ percpu_entry->states[cx->index].eax = 0;
+ percpu_entry->states[cx->index].ecx = 0;
+
+ /* Make sure we are running on the right CPU */
+
+ retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx);
+ if (retval == 0) {
+ /* Use the hint in CST */
+ percpu_entry->states[cx->index].eax = cx->address;
+ percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
+ }
+
+ /*
+ * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
+ * then we should skip checking BM_STS for this C-state.
+ * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
+ */
+ if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
+ cx->bm_sts_skip = 1;
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
+
+/*
+ * This uses the MONITOR/MWAIT instructions (available on P4 processors
+ * with PNI) to obviate the IPI otherwise needed to trigger a need_resched
+ * check: we execute MONITOR against need_resched and enter an optimized
+ * wait state through MWAIT. Whenever someone changes need_resched, we
+ * are woken up from MWAIT without an IPI.
+ *
+ * Starting with Core Duo processors, MWAIT can also take hints based on
+ * CPU capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+ if (!need_resched()) {
+ if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+ clflush((void *)&current_thread_info()->flags);
+
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
+ }
+}
+
+void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cstate_entry *percpu_entry;
+
+ percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
+ mwait_idle_with_hints(percpu_entry->states[cx->index].eax,
+ percpu_entry->states[cx->index].ecx);
+}
+EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
+
+static int __init ffh_cstate_init(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ if (c->x86_vendor != X86_VENDOR_INTEL)
+ return -1;
+
+ cpu_cstate_entry = alloc_percpu(struct cstate_entry);
+ return 0;
+}
+
+static void __exit ffh_cstate_exit(void)
+{
+ free_percpu(cpu_cstate_entry);
+ cpu_cstate_entry = NULL;
+}
+
+arch_initcall(ffh_cstate_init);
+__exitcall(ffh_cstate_exit);
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
new file mode 100644
index 00000000000..6a564ac67ef
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -0,0 +1,59 @@
+#
+# arch/x86/kernel/acpi/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+
+always := wakeup.bin
+targets := wakeup.elf wakeup.lds
+
+wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
+
+# The link order of the video-*.o modules can matter. In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+wakeup-y += video-vga.o
+wakeup-y += video-vesa.o
+wakeup-y += video-bios.o
+
+targets += $(wakeup-y)
+
+bootsrc := $(src)/../../../boot
+
+# ---------------------------------------------------------------------------
+
+# How to compile the 16-bit code. Note we always compile for -march=i386;
+# that way we can complain to the user if the CPU is insufficient.
+# Compile with _SETUP since this is similar to the boot-time setup code.
+KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
+ -I$(srctree)/$(bootsrc) \
+ $(cflags-y) \
+ -Wall -Wstrict-prototypes \
+ -march=i386 -mregparm=3 \
+ -include $(srctree)/$(bootsrc)/code16gcc.h \
+ -fno-strict-aliasing -fomit-frame-pointer \
+ $(call cc-option, -ffreestanding) \
+ $(call cc-option, -fno-toplevel-reorder,\
+ $(call cc-option, -fno-unit-at-a-time)) \
+ $(call cc-option, -fno-stack-protector) \
+ $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_CFLAGS += $(call cc-option, -m32)
+KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+GCOV_PROFILE := n
+
+WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
+
+LDFLAGS_wakeup.elf := -T
+
+CPPFLAGS_wakeup.lds += -P -C
+
+$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
+ $(call if_changed,ld)
+
+OBJCOPYFLAGS_wakeup.bin := -O binary
+
+$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
+ $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 00000000000..f51eb0bb56c
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
new file mode 100644
index 00000000000..dc59ebee69d
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/copy.S
@@ -0,0 +1 @@
+#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 00000000000..6206033ba20
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
new file mode 100644
index 00000000000..7deabc144a2
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-bios.c
@@ -0,0 +1 @@
+#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
new file mode 100644
index 00000000000..328ad209f11
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-mode.c
@@ -0,0 +1 @@
+#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
new file mode 100644
index 00000000000..9dbb9672226
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-vesa.c
@@ -0,0 +1 @@
+#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
new file mode 100644
index 00000000000..bcc81255f37
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-vga.c
@@ -0,0 +1 @@
+#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c
new file mode 100644
index 00000000000..883962d9eef
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakemain.c
@@ -0,0 +1,81 @@
+#include "wakeup.h"
+#include "boot.h"
+
+static void udelay(int loops)
+{
+ while (loops--)
+ io_delay(); /* Approximately 1 us */
+}
+
+static void beep(unsigned int hz)
+{
+ u8 enable;
+
+ if (!hz) {
+ enable = 0x00; /* Turn off speaker */
+ } else {
+ u16 div = 1193181/hz; /* PIT input clock: ~1.193182 MHz */
+
+ outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */
+ io_delay();
+ outb(div, 0x42); /* LSB of counter */
+ io_delay();
+ outb(div >> 8, 0x42); /* MSB of counter */
+ io_delay();
+
+ enable = 0x03; /* Turn on speaker */
+ }
+ inb(0x61); /* Dummy read of System Control Port B */
+ io_delay();
+ outb(enable, 0x61); /* Enable timer 2 output to speaker */
+ io_delay();
+}
+
+#define DOT_HZ 880
+#define DASH_HZ 587
+#define US_PER_DOT 125000
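+/* 880 Hz and 587 Hz are roughly the notes A5 and D5; a dot lasts 125 ms
+ and, per standard Morse timing, a dash lasts three dots. */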
+
+/* Okay, this is totally silly, but it's kind of fun. */
+static void send_morse(const char *pattern)
+{
+ char s;
+
+ while ((s = *pattern++)) {
+ switch (s) {
+ case '.':
+ beep(DOT_HZ);
+ udelay(US_PER_DOT);
+ beep(0);
+ udelay(US_PER_DOT);
+ break;
+ case '-':
+ beep(DASH_HZ);
+ udelay(US_PER_DOT * 3);
+ beep(0);
+ udelay(US_PER_DOT);
+ break;
+ default: /* Assume it's a space */
+ udelay(US_PER_DOT * 3);
+ break;
+ }
+ }
+}
+
+void main(void)
+{
+ /* Kill machine if structures are wrong */
+ if (wakeup_header.real_magic != 0x12345678)
+ while (1);
+
+ if (wakeup_header.realmode_flags & 4)
+ send_morse("...-");
+
+ if (wakeup_header.realmode_flags & 1)
+ asm volatile("lcallw $0xc000,$3");
+
+ if (wakeup_header.realmode_flags & 2) {
+ /* Need to call BIOS */
+ probe_cards(0);
+ set_mode(wakeup_header.video_mode);
+ }
+}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
new file mode 100644
index 00000000000..b4fd836e405
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -0,0 +1,170 @@
+/*
+ * ACPI wakeup real mode startup stub
+ */
+#include <asm/segment.h>
+#include <asm/msr-index.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/processor-flags.h>
+#include "wakeup.h"
+
+ .code16
+ .section ".jump", "ax"
+ .globl _start
+_start:
+ cli
+ jmp wakeup_code
+
+/* This should match the structure in wakeup.h */
+ .section ".header", "a"
+ .globl wakeup_header
+wakeup_header:
+video_mode: .short 0 /* Video mode number */
+pmode_return: .byte 0x66, 0xea /* ljmpl */
+ .long 0 /* offset goes here */
+ .short __KERNEL_CS
+pmode_cr0: .long 0 /* Saved %cr0 */
+pmode_cr3: .long 0 /* Saved %cr3 */
+pmode_cr4: .long 0 /* Saved %cr4 */
+pmode_efer: .quad 0 /* Saved EFER */
+pmode_gdt: .quad 0
+pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
+pmode_behavior: .long 0 /* Wakeup behavior flags */
+realmode_flags: .long 0
+real_magic: .long 0
+trampoline_segment: .word 0
+_pad1: .byte 0
+wakeup_jmp: .byte 0xea /* ljmpw */
+wakeup_jmp_off: .word 3f
+wakeup_jmp_seg: .word 0
+wakeup_gdt: .quad 0, 0, 0
+signature: .long WAKEUP_HEADER_SIGNATURE
+
+ .text
+ .code16
+wakeup_code:
+ cld
+
+ /* Apparently some dimwit BIOS programmers don't know how to
+ program a PM to RM transition, and we might end up here with
+ junk in the data segment descriptor registers. The only way
+ to repair that is to go into PM and fix it ourselves... */
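+ /* Selector 16 = wakeup_gdt[2], the big-real-mode data segment the
+ kernel set up before suspend; selector 8 (used in the ljmpw below)
+ is the matching code segment, wakeup_gdt[1]. */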
+ movw $16, %cx
+ lgdtl %cs:wakeup_gdt
+ movl %cr0, %eax
+ orb $X86_CR0_PE, %al
+ movl %eax, %cr0
+ jmp 1f
+1: ljmpw $8, $2f
+2:
+ movw %cx, %ds
+ movw %cx, %es
+ movw %cx, %ss
+ movw %cx, %fs
+ movw %cx, %gs
+
+ andb $~X86_CR0_PE, %al
+ movl %eax, %cr0
+ jmp wakeup_jmp
+3:
+ /* Set up segments */
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %ss
+ lidtl wakeup_idt
+
+ movl $wakeup_stack_end, %esp
+
+ /* Clear the EFLAGS */
+ pushl $0
+ popfl
+
+ /* Check header signature... */
+ movl signature, %eax
+ cmpl $WAKEUP_HEADER_SIGNATURE, %eax
+ jne bogus_real_magic
+
+ /* Check we really have everything... */
+ movl end_signature, %eax
+ cmpl $WAKEUP_END_SIGNATURE, %eax
+ jne bogus_real_magic
+
+ /* Call the C code */
+ calll main
+
+ /* Restore MISC_ENABLE before entering protected mode, in case
+ BIOS decided to clear XD_DISABLE during S3. */
+ movl pmode_behavior, %eax
+ btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+ jnc 1f
+
+ movl pmode_misc_en, %eax
+ movl pmode_misc_en + 4, %edx
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ wrmsr
+1:
+
+ /* Do any other stuff... */
+
+#ifndef CONFIG_64BIT
+ /* This could also be done in C code... */
+ movl pmode_cr3, %eax
+ movl %eax, %cr3
+
+ movl pmode_cr4, %ecx
+ jecxz 1f
+ movl %ecx, %cr4
+1:
+ movl pmode_efer, %eax
+ movl pmode_efer + 4, %edx
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jz 1f
+ movl $MSR_EFER, %ecx
+ wrmsr
+1:
+
+ lgdtl pmode_gdt
+
+ /* This really couldn't be done in C: it re-enables paging. */
+ movl pmode_cr0, %eax
+ movl %eax, %cr0
+ jmp pmode_return
+#else
+ pushw $0
+ pushw trampoline_segment
+ pushw $0
+ lret
+#endif
+
+bogus_real_magic:
+1:
+ hlt
+ jmp 1b
+
+ .data
+ .balign 8
+
+ /* This is the standard real-mode IDT */
+wakeup_idt:
+ .word 0xffff /* limit */
+ .long 0 /* address */
+ .word 0
+
+ .globl HEAP, heap_end
+HEAP:
+ .long wakeup_heap
+heap_end:
+ .long wakeup_stack
+
+ .bss
+wakeup_heap:
+ .space 2048
+wakeup_stack:
+ .space 2048
+wakeup_stack_end:
+
+ .section ".signature","a"
+end_signature:
+ .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
new file mode 100644
index 00000000000..97a29e1430e
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -0,0 +1,48 @@
+/*
+ * Definitions for the wakeup data structure at the head of the
+ * wakeup code.
+ */
+
+#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/* This must match the layout in wakeup.S */
+struct wakeup_header {
+ u16 video_mode; /* Video mode number */
+ u16 _jmp1; /* ljmpl opcode, 32-bit only */
+ u32 pmode_entry; /* Protected mode resume point, 32-bit only */
+ u16 _jmp2; /* CS value, 32-bit only */
+ u32 pmode_cr0; /* Protected mode cr0 */
+ u32 pmode_cr3; /* Protected mode cr3 */
+ u32 pmode_cr4; /* Protected mode cr4 */
+ u32 pmode_efer_low; /* Protected mode EFER */
+ u32 pmode_efer_high;
+ u64 pmode_gdt;
+ u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
+ u32 pmode_misc_en_high;
+ u32 pmode_behavior; /* Wakeup routine behavior flags */
+ u32 realmode_flags;
+ u32 real_magic;
+ u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
+ u8 _pad1;
+ u8 wakeup_jmp;
+ u16 wakeup_jmp_off;
+ u16 wakeup_jmp_seg;
+ u64 wakeup_gdt[3];
+ u32 signature; /* To check we have correct structure */
+} __attribute__((__packed__));
+
+extern struct wakeup_header wakeup_header;
+#endif
+
+#define WAKEUP_HEADER_OFFSET 8
+#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
+#define WAKEUP_END_SIGNATURE 0x65a22c82
+
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
+
+#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
new file mode 100644
index 00000000000..d4f8010a5b1
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -0,0 +1,62 @@
+/*
+ * wakeup.lds.S
+ *
+ * Linker script for the real-mode wakeup code
+ */
+#undef i386
+#include "wakeup.h"
+
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+ENTRY(_start)
+
+SECTIONS
+{
+ . = 0;
+ .jump : {
+ *(.jump)
+ } = 0x90909090
+
+ . = WAKEUP_HEADER_OFFSET;
+ .header : {
+ *(.header)
+ }
+
+ . = ALIGN(16);
+ .text : {
+ *(.text*)
+ } = 0x90909090
+
+ . = ALIGN(16);
+ .rodata : {
+ *(.rodata*)
+ }
+
+ .videocards : {
+ video_cards = .;
+ *(.videocards)
+ video_cards_end = .;
+ }
+
+ . = ALIGN(16);
+ .data : {
+ *(.data*)
+ }
+
+ . = ALIGN(16);
+ .bss : {
+ __bss_start = .;
+ *(.bss)
+ __bss_end = .;
+ }
+
+ .signature : {
+ *(.signature)
+ }
+
+ _end = .;
+
+ /DISCARD/ : {
+ *(.note*)
+ }
+}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
new file mode 100644
index 00000000000..103b6ab368d
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -0,0 +1,133 @@
+/*
+ * sleep.c - x86-specific ACPI sleep support.
+ *
+ * Copyright (C) 2001-2003 Patrick Mochel
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
+ */
+
+#include <linux/acpi.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/dmi.h>
+#include <linux/cpumask.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+
+#include "realmode/wakeup.h"
+#include "sleep.h"
+
+unsigned long acpi_realmode_flags;
+
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+static char temp_stack[4096];
+#endif
+
+/**
+ * acpi_suspend_lowlevel - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int acpi_suspend_lowlevel(void)
+{
+ struct wakeup_header *header;
+ /* address in low memory of the wakeup routine. */
+ char *acpi_realmode;
+
+ acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
+
+ header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
+ if (header->signature != WAKEUP_HEADER_SIGNATURE) {
+ printk(KERN_ERR "wakeup header does not match\n");
+ return -EINVAL;
+ }
+
+ header->video_mode = saved_video_mode;
+
+ header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+
+ /*
+ * Set up the wakeup GDT. We set these up as Big Real Mode,
+ * that is, with limits set to 4 GB. At least the Lenovo
+ * Thinkpad X61 is known to need this for the video BIOS
+ * initialization quirk to work; this is likely to also
+ * be the case for other laptops or integrated video devices.
+ */
+
+ /* GDT[0]: GDT self-pointer. The limit goes in the low word, the
+ physical base in bits 47:16, so this entry doubles as the lgdt
+ operand describing the GDT itself. */
+ header->wakeup_gdt[0] =
+ (u64)(sizeof(header->wakeup_gdt) - 1) +
+ ((u64)__pa(&header->wakeup_gdt) << 16);
+ /* GDT[1]: big real mode-like code segment */
+ header->wakeup_gdt[1] =
+ GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
+ /* GDT[2]: big real mode-like data segment */
+ header->wakeup_gdt[2] =
+ GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
+
+#ifndef CONFIG_64BIT
+ store_gdt((struct desc_ptr *)&header->pmode_gdt);
+
+ if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
+ &header->pmode_efer_high))
+ header->pmode_efer_low = header->pmode_efer_high = 0;
+#endif /* !CONFIG_64BIT */
+
+ header->pmode_cr0 = read_cr0();
+ header->pmode_cr4 = read_cr4_safe();
+ header->pmode_behavior = 0;
+ if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+ &header->pmode_misc_en_low,
+ &header->pmode_misc_en_high))
+ header->pmode_behavior |=
+ (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
+ header->realmode_flags = acpi_realmode_flags;
+ header->real_magic = 0x12345678;
+
+#ifndef CONFIG_64BIT
+ header->pmode_entry = (u32)&wakeup_pmode_return;
+ header->pmode_cr3 = (u32)__pa(&initial_page_table);
+ saved_magic = 0x12345678;
+#else /* CONFIG_64BIT */
+ header->trampoline_segment = trampoline_address() >> 4;
+#ifdef CONFIG_SMP
+ stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
+ early_gdt_descr.address =
+ (unsigned long)get_cpu_gdt_table(smp_processor_id());
+ initial_gs = per_cpu_offset(smp_processor_id());
+#endif
+ initial_code = (unsigned long)wakeup_long64;
+ saved_magic = 0x123456789abcdef0L;
+#endif /* CONFIG_64BIT */
+
+ do_suspend_lowlevel();
+ return 0;
+}
+
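+/*
+ * "acpi_sleep=" takes a comma-separated option list; for example,
+ * acpi_sleep=s3_bios,s3_mode enables both S3 video-restore quirks.
+ */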
+static int __init acpi_sleep_setup(char *str)
+{
+ while ((str != NULL) && (*str != '\0')) {
+ if (strncmp(str, "s3_bios", 7) == 0)
+ acpi_realmode_flags |= 1;
+ if (strncmp(str, "s3_mode", 7) == 0)
+ acpi_realmode_flags |= 2;
+ if (strncmp(str, "s3_beep", 7) == 0)
+ acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_nohwsig", 10) == 0)
+ acpi_no_s4_hw_signature();
+#endif
+ if (strncmp(str, "nonvs", 5) == 0)
+ acpi_nvs_nosave();
+ if (strncmp(str, "old_ordering", 12) == 0)
+ acpi_old_suspend_ordering();
+ str = strchr(str, ',');
+ if (str != NULL)
+ str += strspn(str, ", \t");
+ }
+ return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
new file mode 100644
index 00000000000..416d4be13fe
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -0,0 +1,15 @@
+/*
+ * Variables and functions used by the code in sleep.c
+ */
+
+#include <asm/trampoline.h>
+
+extern unsigned long saved_video_mode;
+extern long saved_magic;
+
+extern int wakeup_pmode_return;
+
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+extern void wakeup_long64(void);
+
+extern void do_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
new file mode 100644
index 00000000000..13ab720573e
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -0,0 +1,100 @@
+ .section .text..page_aligned
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+
+# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+
+ .code32
+ ALIGN
+
+ENTRY(wakeup_pmode_return)
+wakeup_pmode_return:
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+
+ # reload the gdt, as we need the full 32 bit address
+ lgdt saved_gdt
+ lidt saved_idt
+ lldt saved_ldt
+ ljmp $(__KERNEL_CS), $1f
+1:
+ movl %cr3, %eax
+ movl %eax, %cr3
+ wbinvd
+
+ # and restore the stack ... but you need gdt for this to work
+ movl saved_context_esp, %esp
+
+ movl %cs:saved_magic, %eax
+ cmpl $0x12345678, %eax
+ jne bogus_magic
+
+ # jump to place where we left off
+ movl saved_eip, %eax
+ jmp *%eax
+
+bogus_magic:
+ jmp bogus_magic
+
+
+
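+# Save the callee-saved registers, flags, stack pointer and descriptor
+# tables; saved_eip (set to ret_point) is where execution resumes on wakeup.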
+save_registers:
+ sgdt saved_gdt
+ sidt saved_idt
+ sldt saved_ldt
+ str saved_tss
+
+ leal 4(%esp), %eax
+ movl %eax, saved_context_esp
+ movl %ebx, saved_context_ebx
+ movl %ebp, saved_context_ebp
+ movl %esi, saved_context_esi
+ movl %edi, saved_context_edi
+ pushfl
+ popl saved_context_eflags
+
+ movl $ret_point, saved_eip
+ ret
+
+
+restore_registers:
+ movl saved_context_ebp, %ebp
+ movl saved_context_ebx, %ebx
+ movl saved_context_esi, %esi
+ movl saved_context_edi, %edi
+ pushl saved_context_eflags
+ popfl
+ ret
+
+ENTRY(do_suspend_lowlevel)
+ call save_processor_state
+ call save_registers
+ pushl $3
+ call acpi_enter_sleep_state
+ addl $4, %esp
+
+# In case of S3 failure, we'll emerge here. Jump
+# to ret_point to recover
+ jmp ret_point
+ .p2align 4,,7
+ret_point:
+ call restore_registers
+ call restore_processor_state
+ ret
+
+.data
+ALIGN
+ENTRY(saved_magic) .long 0
+ENTRY(saved_eip) .long 0
+
+# saved registers
+saved_gdt: .long 0,0
+saved_idt: .long 0,0
+saved_ldt: .long 0
+saved_tss: .long 0
+
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
new file mode 100644
index 00000000000..8ea5164cbd0
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -0,0 +1,124 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/pgtable_types.h>
+#include <asm/page_types.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+
+# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+
+.code64
+ /*
+ * Hooray, we are in Long 64-bit mode (but still running in low memory)
+ */
+ENTRY(wakeup_long64)
+ movq saved_magic, %rax
+ movq $0x123456789abcdef0, %rdx
+ cmpq %rdx, %rax
+ jne bogus_64_magic
+
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movq saved_rsp, %rsp
+
+ movq saved_rbx, %rbx
+ movq saved_rdi, %rdi
+ movq saved_rsi, %rsi
+ movq saved_rbp, %rbp
+
+ movq saved_rip, %rax
+ jmp *%rax
+ENDPROC(wakeup_long64)
+
+bogus_64_magic:
+ jmp bogus_64_magic
+
+ENTRY(do_suspend_lowlevel)
+ subq $8, %rsp
+ xorl %eax, %eax
+ call save_processor_state
+
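+ /*
+ * Stash the register state in struct saved_context via the
+ * asm-offsets pt_regs_* constants; resume_point restores it
+ * after wakeup.
+ */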
+ movq $saved_context, %rax
+ movq %rsp, pt_regs_sp(%rax)
+ movq %rbp, pt_regs_bp(%rax)
+ movq %rsi, pt_regs_si(%rax)
+ movq %rdi, pt_regs_di(%rax)
+ movq %rbx, pt_regs_bx(%rax)
+ movq %rcx, pt_regs_cx(%rax)
+ movq %rdx, pt_regs_dx(%rax)
+ movq %r8, pt_regs_r8(%rax)
+ movq %r9, pt_regs_r9(%rax)
+ movq %r10, pt_regs_r10(%rax)
+ movq %r11, pt_regs_r11(%rax)
+ movq %r12, pt_regs_r12(%rax)
+ movq %r13, pt_regs_r13(%rax)
+ movq %r14, pt_regs_r14(%rax)
+ movq %r15, pt_regs_r15(%rax)
+ pushfq
+ popq pt_regs_flags(%rax)
+
+ movq $resume_point, saved_rip(%rip)
+
+ movq %rsp, saved_rsp
+ movq %rbp, saved_rbp
+ movq %rbx, saved_rbx
+ movq %rdi, saved_rdi
+ movq %rsi, saved_rsi
+
+ addq $8, %rsp
+ movl $3, %edi
+ xorl %eax, %eax
+ call acpi_enter_sleep_state
+ /* in case something went wrong, restore the machine status and go on */
+ jmp resume_point
+
+ .align 4
+resume_point:
+ /* We don't restore %rax, it must be 0 anyway */
+ movq $saved_context, %rax
+ movq saved_context_cr4(%rax), %rbx
+ movq %rbx, %cr4
+ movq saved_context_cr3(%rax), %rbx
+ movq %rbx, %cr3
+ movq saved_context_cr2(%rax), %rbx
+ movq %rbx, %cr2
+ movq saved_context_cr0(%rax), %rbx
+ movq %rbx, %cr0
+ pushq pt_regs_flags(%rax)
+ popfq
+ movq pt_regs_sp(%rax), %rsp
+ movq pt_regs_bp(%rax), %rbp
+ movq pt_regs_si(%rax), %rsi
+ movq pt_regs_di(%rax), %rdi
+ movq pt_regs_bx(%rax), %rbx
+ movq pt_regs_cx(%rax), %rcx
+ movq pt_regs_dx(%rax), %rdx
+ movq pt_regs_r8(%rax), %r8
+ movq pt_regs_r9(%rax), %r9
+ movq pt_regs_r10(%rax), %r10
+ movq pt_regs_r11(%rax), %r11
+ movq pt_regs_r12(%rax), %r12
+ movq pt_regs_r13(%rax), %r13
+ movq pt_regs_r14(%rax), %r14
+ movq pt_regs_r15(%rax), %r15
+
+ xorl %eax, %eax
+ addq $8, %rsp
+ jmp restore_processor_state
+ENDPROC(do_suspend_lowlevel)
+
+.data
+ENTRY(saved_rbp) .quad 0
+ENTRY(saved_rsi) .quad 0
+ENTRY(saved_rdi) .quad 0
+ENTRY(saved_rbx) .quad 0
+
+ENTRY(saved_rip) .quad 0
+ENTRY(saved_rsp) .quad 0
+
+ENTRY(saved_magic) .quad 0
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
new file mode 100644
index 00000000000..63b8ab524f2
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -0,0 +1,12 @@
+/*
+ * Assembly wrapper that carries the real-mode binary as a transport
+ * object until it is copied to low memory.
+ */
+#include <asm/page_types.h>
+
+ .section ".x86_trampoline","a"
+ .balign PAGE_SIZE
+ .globl acpi_wakeup_code
+acpi_wakeup_code:
+ .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
+ .size acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
new file mode 100644
index 00000000000..1f84794f075
--- /dev/null
+++ b/arch/x86/kernel/alternative.c
@@ -0,0 +1,742 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/stringify.h>
+#include <linux/kprobes.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/memory.h>
+#include <linux/stop_machine.h>
+#include <linux/slab.h>
+#include <asm/alternative.h>
+#include <asm/sections.h>
+#include <asm/pgtable.h>
+#include <asm/mce.h>
+#include <asm/nmi.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+
+#define MAX_PATCH_LEN (255-1)
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int smp_alt_once;
+
+static int __init bootonly(char *str)
+{
+ smp_alt_once = 1;
+ return 1;
+}
+__setup("smp-alt-boot", bootonly);
+#else
+#define smp_alt_once 1
+#endif
+
+static int __initdata_or_module debug_alternative;
+
+static int __init debug_alt(char *str)
+{
+ debug_alternative = 1;
+ return 1;
+}
+__setup("debug-alternative", debug_alt);
+
+static int noreplace_smp;
+
+static int __init setup_noreplace_smp(char *str)
+{
+ noreplace_smp = 1;
+ return 1;
+}
+__setup("noreplace-smp", setup_noreplace_smp);
+
+#ifdef CONFIG_PARAVIRT
+static int __initdata_or_module noreplace_paravirt = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+ noreplace_paravirt = 1;
+ return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+#endif
+
+#define DPRINTK(fmt, args...) \
+ do { if (debug_alternative) printk(KERN_DEBUG fmt, args); } while (0)
+
+/*
+ * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
+ * that correspond to that nop. Getting from one nop to the next, we
+ * add to the array the offset that is equal to the sum of all sizes of
+ * nops preceding the one we are after.
+ *
+ * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
+ * nice symmetry of sizes of the previous nops.
+ */
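+/*
+ * So, for example, intel_nops[3] below is intelnops + 1 + 2: past the
+ * 1-byte and 2-byte nops, pointing straight at the 3-byte one.
+ */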
+#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
+static const unsigned char intelnops[] =
+{
+ GENERIC_NOP1,
+ GENERIC_NOP2,
+ GENERIC_NOP3,
+ GENERIC_NOP4,
+ GENERIC_NOP5,
+ GENERIC_NOP6,
+ GENERIC_NOP7,
+ GENERIC_NOP8,
+ GENERIC_NOP5_ATOMIC
+};
+static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ intelnops,
+ intelnops + 1,
+ intelnops + 1 + 2,
+ intelnops + 1 + 2 + 3,
+ intelnops + 1 + 2 + 3 + 4,
+ intelnops + 1 + 2 + 3 + 4 + 5,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+#ifdef K8_NOP1
+static const unsigned char k8nops[] =
+{
+ K8_NOP1,
+ K8_NOP2,
+ K8_NOP3,
+ K8_NOP4,
+ K8_NOP5,
+ K8_NOP6,
+ K8_NOP7,
+ K8_NOP8,
+ K8_NOP5_ATOMIC
+};
+static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ k8nops,
+ k8nops + 1,
+ k8nops + 1 + 2,
+ k8nops + 1 + 2 + 3,
+ k8nops + 1 + 2 + 3 + 4,
+ k8nops + 1 + 2 + 3 + 4 + 5,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
+static const unsigned char k7nops[] =
+{
+ K7_NOP1,
+ K7_NOP2,
+ K7_NOP3,
+ K7_NOP4,
+ K7_NOP5,
+ K7_NOP6,
+ K7_NOP7,
+ K7_NOP8,
+ K7_NOP5_ATOMIC
+};
+static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ k7nops,
+ k7nops + 1,
+ k7nops + 1 + 2,
+ k7nops + 1 + 2 + 3,
+ k7nops + 1 + 2 + 3 + 4,
+ k7nops + 1 + 2 + 3 + 4 + 5,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+#ifdef P6_NOP1
+static const unsigned char __initconst_or_module p6nops[] =
+{
+ P6_NOP1,
+ P6_NOP2,
+ P6_NOP3,
+ P6_NOP4,
+ P6_NOP5,
+ P6_NOP6,
+ P6_NOP7,
+ P6_NOP8,
+ P6_NOP5_ATOMIC
+};
+static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+{
+ NULL,
+ p6nops,
+ p6nops + 1,
+ p6nops + 1 + 2,
+ p6nops + 1 + 2 + 3,
+ p6nops + 1 + 2 + 3 + 4,
+ p6nops + 1 + 2 + 3 + 4 + 5,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+};
+#endif
+
+/* Initialize these to a safe default */
+#ifdef CONFIG_X86_64
+const unsigned char * const *ideal_nops = p6_nops;
+#else
+const unsigned char * const *ideal_nops = intel_nops;
+#endif
+
+void __init arch_init_ideal_nops(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ /*
+ * Due to a decoder implementation quirk, some
+ * specific Intel CPUs actually perform better with
+ * the "k8_nops" than with the SDM-recommended NOPs.
+ */
+ if (boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model >= 0x0f &&
+ boot_cpu_data.x86_model != 0x1c &&
+ boot_cpu_data.x86_model != 0x26 &&
+ boot_cpu_data.x86_model != 0x27 &&
+ boot_cpu_data.x86_model < 0x30) {
+ ideal_nops = k8_nops;
+ } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
+ ideal_nops = p6_nops;
+ } else {
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ ideal_nops = intel_nops;
+#endif
+ }
+ break;
+
+ default:
+#ifdef CONFIG_X86_64
+ ideal_nops = k8_nops;
+#else
+ if (boot_cpu_has(X86_FEATURE_K8))
+ ideal_nops = k8_nops;
+ else if (boot_cpu_has(X86_FEATURE_K7))
+ ideal_nops = k7_nops;
+ else
+ ideal_nops = intel_nops;
+#endif
+ }
+}
+
+/* Use this to add nops to a buffer, then text_poke the whole buffer. */
+static void __init_or_module add_nops(void *insns, unsigned int len)
+{
+ while (len > 0) {
+ unsigned int noplen = len;
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+ memcpy(insns, ideal_nops[noplen], noplen);
+ insns += noplen;
+ len -= noplen;
+ }
+}
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
+void *text_poke_early(void *addr, const void *opcode, size_t len);
+
+/* Replace instructions with better alternatives for this CPU type.
+ This runs before SMP is initialized to avoid SMP problems with
+ self-modifying code. This implies that asymmetric systems where
+ APs have fewer capabilities than the boot processor are not handled.
+ Tough. Make sure you disable such features by hand. */
+
+void __init_or_module apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
+{
+ struct alt_instr *a;
+ u8 *instr, *replacement;
+ u8 insnbuf[MAX_PATCH_LEN];
+
+ DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ /*
+ * The scan order should be from start to end. A later scanned
+ * alternative code can overwrite a previous scanned alternative code.
+ * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+ * patch code.
+ *
+ * So be careful if you want to change the scan order to any other
+ * order.
+ */
+ for (a = start; a < end; a++) {
+ instr = (u8 *)&a->instr_offset + a->instr_offset;
+ replacement = (u8 *)&a->repl_offset + a->repl_offset;
+ BUG_ON(a->replacementlen > a->instrlen);
+ BUG_ON(a->instrlen > sizeof(insnbuf));
+ BUG_ON(a->cpuid >= NCAPINTS*32);
+ if (!boot_cpu_has(a->cpuid))
+ continue;
+
+ memcpy(insnbuf, replacement, a->replacementlen);
+
+ /* 0xe8 is a relative call; fix up its 32-bit offset. */
+ if (*insnbuf == 0xe8 && a->replacementlen == 5)
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+
+ text_poke_early(instr, insnbuf, a->instrlen);
+ }
+}
+
+#ifdef CONFIG_SMP
+
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
+{
+ const s32 *poff;
+
+ mutex_lock(&text_mutex);
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
+ continue;
+ /* turn DS segment override prefix into lock prefix */
+ if (*ptr == 0x3e)
+ text_poke(ptr, ((unsigned char []){0xf0}), 1);
+ }
+ mutex_unlock(&text_mutex);
+}
+
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
+{
+ const s32 *poff;
+
+ if (noreplace_smp)
+ return;
+
+ mutex_lock(&text_mutex);
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
+ continue;
+ /* turn lock prefix into DS segment override prefix */
+ if (*ptr == 0xf0)
+ text_poke(ptr, ((unsigned char []){0x3E}), 1);
+ }
+ mutex_unlock(&text_mutex);
+}
+
+struct smp_alt_module {
+ /* module that owns these lock prefixes (NULL for the core kernel) */
+ struct module *mod;
+ char *name;
+
+ /* ptrs to lock prefixes */
+ const s32 *locks;
+ const s32 *locks_end;
+
+ /* .text segment, needed to avoid patching init code ;) */
+ u8 *text;
+ u8 *text_end;
+
+ struct list_head next;
+};
+static LIST_HEAD(smp_alt_modules);
+static DEFINE_MUTEX(smp_alt);
+static int smp_mode = 1; /* protected by smp_alt */
+
+void __init_or_module alternatives_smp_module_add(struct module *mod,
+ char *name,
+ void *locks, void *locks_end,
+ void *text, void *text_end)
+{
+ struct smp_alt_module *smp;
+
+ if (noreplace_smp)
+ return;
+
+ if (smp_alt_once) {
+ if (boot_cpu_has(X86_FEATURE_UP))
+ alternatives_smp_unlock(locks, locks_end,
+ text, text_end);
+ return;
+ }
+
+ smp = kzalloc(sizeof(*smp), GFP_KERNEL);
+ if (!smp)
+ return; /* we'll run the (safe but slow) SMP code then ... */
+
+ smp->mod = mod;
+ smp->name = name;
+ smp->locks = locks;
+ smp->locks_end = locks_end;
+ smp->text = text;
+ smp->text_end = text_end;
+ DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
+ __func__, smp->locks, smp->locks_end,
+ smp->text, smp->text_end, smp->name);
+
+ mutex_lock(&smp_alt);
+ list_add_tail(&smp->next, &smp_alt_modules);
+ if (boot_cpu_has(X86_FEATURE_UP))
+ alternatives_smp_unlock(smp->locks, smp->locks_end,
+ smp->text, smp->text_end);
+ mutex_unlock(&smp_alt);
+}
+
+void __init_or_module alternatives_smp_module_del(struct module *mod)
+{
+ struct smp_alt_module *item;
+
+ if (smp_alt_once || noreplace_smp)
+ return;
+
+ mutex_lock(&smp_alt);
+ list_for_each_entry(item, &smp_alt_modules, next) {
+ if (mod != item->mod)
+ continue;
+ list_del(&item->next);
+ mutex_unlock(&smp_alt);
+ DPRINTK("%s: %s\n", __func__, item->name);
+ kfree(item);
+ return;
+ }
+ mutex_unlock(&smp_alt);
+}
+
+bool skip_smp_alternatives;
+void alternatives_smp_switch(int smp)
+{
+ struct smp_alt_module *mod;
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * Older binutils section handling bug prevented
+ * alternatives-replacement from working reliably.
+ *
+ * If this still occurs then you should see a hang
+ * or crash shortly after this line:
+ */
+ printk("lockdep: fixing up alternatives.\n");
+#endif
+
+ if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
+ return;
+ BUG_ON(!smp && (num_online_cpus() > 1));
+
+ mutex_lock(&smp_alt);
+
+ /*
+ * Avoid unnecessary switches because it forces JIT based VMs to
+ * throw away all cached translations, which can be quite costly.
+ */
+ if (smp == smp_mode) {
+ /* nothing */
+ } else if (smp) {
+ printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+ clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
+ list_for_each_entry(mod, &smp_alt_modules, next)
+ alternatives_smp_lock(mod->locks, mod->locks_end,
+ mod->text, mod->text_end);
+ } else {
+ printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+ set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
+ list_for_each_entry(mod, &smp_alt_modules, next)
+ alternatives_smp_unlock(mod->locks, mod->locks_end,
+ mod->text, mod->text_end);
+ }
+ smp_mode = smp;
+ mutex_unlock(&smp_alt);
+}
+
+/* Return 1 if the address range is reserved for smp-alternatives */
+int alternatives_text_reserved(void *start, void *end)
+{
+ struct smp_alt_module *mod;
+ const s32 *poff;
+ u8 *text_start = start;
+ u8 *text_end = end;
+
+ list_for_each_entry(mod, &smp_alt_modules, next) {
+ if (mod->text > text_end || mod->text_end < text_start)
+ continue;
+ for (poff = mod->locks; poff < mod->locks_end; poff++) {
+ const u8 *ptr = (const u8 *)poff + *poff;
+
+ if (text_start <= ptr && text_end > ptr)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT
+void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
+{
+ struct paravirt_patch_site *p;
+ char insnbuf[MAX_PATCH_LEN];
+
+ if (noreplace_paravirt)
+ return;
+
+ for (p = start; p < end; p++) {
+ unsigned int used;
+
+ BUG_ON(p->len > MAX_PATCH_LEN);
+ /* prep the buffer with the original instructions */
+ memcpy(insnbuf, p->instr, p->len);
+ used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
+ (unsigned long)p->instr, p->len);
+
+ BUG_ON(used > p->len);
+
+ /* Pad the rest with nops */
+ add_nops(insnbuf + used, p->len - used);
+ text_poke_early(p->instr, insnbuf, p->len);
+ }
+}
+extern struct paravirt_patch_site __start_parainstructions[],
+ __stop_parainstructions[];
+#endif /* CONFIG_PARAVIRT */
+
+void __init alternative_instructions(void)
+{
+ /* The patching is not fully atomic, so try to avoid local interruptions
+ that might execute the code being patched.
+ Other CPUs are not running. */
+ stop_nmi();
+
+ /*
+ * Don't stop machine check exceptions while patching.
+ * MCEs only happen when something got corrupted and in this
+ * case we must do something about the corruption.
+ * Ignoring it is worse than an unlikely patching race.
+ * Also machine checks tend to be broadcast and if one CPU
+ * goes into machine check the others follow quickly, so we don't
+ * expect a machine check to cause undue problems during code
+ * patching.
+ */
+
+ apply_alternatives(__alt_instructions, __alt_instructions_end);
+
+ /* switch to patch-once-at-boottime-only mode and free the
+ * tables in case we know the number of CPUs will never ever
+ * change */
+#ifdef CONFIG_HOTPLUG_CPU
+ if (num_possible_cpus() < 2)
+ smp_alt_once = 1;
+#endif
+
+#ifdef CONFIG_SMP
+ if (smp_alt_once) {
+ if (1 == num_possible_cpus()) {
+ printk(KERN_INFO "SMP alternatives: switching to UP code\n");
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
+ set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
+
+ alternatives_smp_unlock(__smp_locks, __smp_locks_end,
+ _text, _etext);
+ }
+ } else {
+ alternatives_smp_module_add(NULL, "core kernel",
+ __smp_locks, __smp_locks_end,
+ _text, _etext);
+
+ /* Only switch to UP mode if we don't immediately boot others */
+ if (num_present_cpus() == 1 || setup_max_cpus <= 1)
+ alternatives_smp_switch(0);
+ }
+#endif
+ apply_paravirt(__parainstructions, __parainstructions_end);
+
+ if (smp_alt_once)
+ free_init_pages("SMP alternatives",
+ (unsigned long)__smp_locks,
+ (unsigned long)__smp_locks_end);
+
+ restart_nmi();
+}
+
+/**
+ * text_poke_early - Update instructions on a live kernel at boot time
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * When you use this code to patch more than one byte of an instruction
+ * you need to make sure that other CPUs cannot execute this code in parallel.
+ * Also no thread must be currently preempted in the middle of these
+ * instructions. And on the local CPU you need to be protected against NMI or MCE
+ * handlers seeing an inconsistent instruction while you patch.
+ */
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
+ size_t len)
+{
+ unsigned long flags;
+ local_irq_save(flags);
+ memcpy(addr, opcode, len);
+ sync_core();
+ local_irq_restore(flags);
+ /* Could also do a CLFLUSH here to speed up CPU recovery; but
+ that causes hangs on some VIA CPUs. */
+ return addr;
+}
+
+/**
+ * text_poke - Update instructions on a live kernel
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Note: Must be called under text_mutex.
+ */
+void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+{
+ unsigned long flags;
+ char *vaddr;
+ struct page *pages[2];
+ int i;
+
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ BUG_ON(!pages[0]);
+ local_irq_save(flags);
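+ /*
+ * Patch through a temporary writable alias: map the target page(s)
+ * at the FIX_TEXT_POKE* fixmap slots, copy through that mapping,
+ * then tear it down again.
+ */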
+ set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
+ if (pages[1])
+ set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
+ vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
+ memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
+ clear_fixmap(FIX_TEXT_POKE0);
+ if (pages[1])
+ clear_fixmap(FIX_TEXT_POKE1);
+ local_flush_tlb();
+ sync_core();
+ /* Could also do a CLFLUSH here to speed up CPU recovery; but
+ that causes hangs on some VIA CPUs. */
+ for (i = 0; i < len; i++)
+ BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+ local_irq_restore(flags);
+ return addr;
+}
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from the immediate-values patching code.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+ struct text_poke_param *params;
+ int nparams;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+ struct text_poke_params *tpp = data;
+ struct text_poke_param *p;
+ int i;
+
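+ /*
+ * One CPU wins atomic_dec_and_test() and performs all the pokes;
+ * the others spin until wrote_text flips, then every CPU flushes
+ * its icache and serializes below.
+ */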
+ if (atomic_dec_and_test(&stop_machine_first)) {
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ text_poke(p->addr, p->opcode, p->len);
+ }
+ smp_wmb(); /* Make sure other cpus see that this has run */
+ wrote_text = 1;
+ } else {
+ while (!wrote_text)
+ cpu_relax();
+ smp_mb(); /* Load wrote_text before following execution */
+ }
+
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ flush_icache_range((unsigned long)p->addr,
+ (unsigned long)p->addr + p->len);
+ }
+ /*
+ * Intel Architecture Software Developer's Manual section 7.1.3 specifies
+ * that a core serializing instruction such as "cpuid" should be
+ * executed on _each_ core before the new instruction is made visible.
+ */
+ sync_core();
+ return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify a multi-byte instruction by using stop_machine() on SMP. This
+ * allows the caller to poke/set multi-byte text on SMP. Only code that is
+ * never executed from NMI/MCE context may be modified this way, since
+ * stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+ struct text_poke_params tpp;
+ struct text_poke_param p;
+
+ p.addr = addr;
+ p.opcode = opcode;
+ p.len = len;
+ tpp.params = &p;
+ tpp.nparams = 1;
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ /* Use __stop_machine() because the caller already got online_cpus. */
+ __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
+ return addr;
+}
+
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instructions by using stop_machine() on SMP. Since
+ * stop_machine() is a heavyweight operation, it is better to aggregate
+ * text_poke requests and perform them in a single call where possible.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+ struct text_poke_params tpp = {.params = params, .nparams = n};
+
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
+}
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
new file mode 100644
index 00000000000..b1e7c7f7a0a
--- /dev/null
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -0,0 +1,898 @@
+/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ *
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows the use of PCI devices that only support 32-bit addresses on
+ * systems with more than 4GB of memory.
+ *
+ * See Documentation/DMA-API-HOWTO.txt for the interface specification.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU General Public License v2 only.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitmap.h>
+#include <linux/kdebug.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <linux/syscore_ops.h>
+#include <linux/io.h>
+#include <linux/gfp.h>
+#include <linux/atomic.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/cacheflush.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+static unsigned long iommu_bus_base; /* GART remapping area (physical) */
+static unsigned long iommu_size; /* size of remapping area bytes */
+static unsigned long iommu_pages; /* .. and in pages */
+
+static u32 *iommu_gatt_base; /* Remapping table */
+
+static dma_addr_t bad_dma_addr;
+
+/*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when an mapping is reused. With it true the GART is
+ * flushed for every mapping. Problem is that doing the lazy flush seems
+ * to trigger bugs with some popular PCI cards, in particular 3ware (but
+ * has been also also seen with Qlogic at least).
+ */
+static int iommu_fullflush = 1;
+
+/* Allocation bitmap for the remapping area: */
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+/* Guarded by iommu_bitmap_lock: */
+static unsigned long *iommu_gart_bitmap;
+
+static u32 gart_unmapped_entry;
+
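+/*
+ * GART PTE layout, as encoded below: physical address bits 31:12 are
+ * stored in PTE bits 31:12, and physical bits 39:32 are folded into
+ * PTE bits 11:4, next to the valid and coherent flags.
+ */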
+#define GPTE_VALID 1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+ (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR (1ULL << 40)
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit; /* protected by iommu_bitmap_lock */
+static bool need_flush; /* global flush state. set for each gart wrap */
+
+static unsigned long alloc_iommu(struct device *dev, int size,
+ unsigned long align_mask)
+{
+ unsigned long offset, flags;
+ unsigned long boundary_size;
+ unsigned long base_index;
+
+ base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
+ PAGE_SIZE) >> PAGE_SHIFT;
+ boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
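+ /*
+ * Lazy flushing: scan forward from next_bit first and only wrap
+ * to the start of the bitmap on failure, marking the GART for a
+ * flush whenever a region may be reused.
+ */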
+ spin_lock_irqsave(&iommu_bitmap_lock, flags);
+ offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
+ size, base_index, boundary_size, align_mask);
+ if (offset == -1) {
+ need_flush = true;
+ offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
+ size, base_index, boundary_size,
+ align_mask);
+ }
+ if (offset != -1) {
+ next_bit = offset+size;
+ if (next_bit >= iommu_pages) {
+ next_bit = 0;
+ need_flush = true;
+ }
+ }
+ if (iommu_fullflush)
+ need_flush = true;
+ spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+
+ return offset;
+}
+
+static void free_iommu(unsigned long offset, int size)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu_bitmap_lock, flags);
+ bitmap_clear(iommu_gart_bitmap, offset, size);
+ if (offset >= next_bit)
+ next_bit = offset + size;
+ spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+}
+
+/*
+ * Use global flush state to avoid races with multiple flushers.
+ */
+static void flush_gart(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&iommu_bitmap_lock, flags);
+ if (need_flush) {
+ amd_flush_garts();
+ need_flush = false;
+ }
+ spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+}
+
+#ifdef CONFIG_IOMMU_LEAK
+/* Debugging aid for drivers that don't free their IOMMU tables */
+static int leak_trace;
+static int iommu_leak_pages = 20;
+
+static void dump_leak(void)
+{
+ static int dump;
+
+ if (dump)
+ return;
+ dump = 1;
+
+ show_stack(NULL, NULL);
+ debug_dma_dump_mappings(NULL);
+}
+#endif
+
+static void iommu_full(struct device *dev, size_t size, int dir)
+{
+ /*
+ * Ran out of IOMMU space for this operation. This is very bad.
+ * Unfortunately the drivers cannot handle this operation properly.
+ * Return some non-mapped, prereserved space in the aperture and
+ * let the Northbridge deal with it. This will result in garbage
+ * in the IO operation. When the size exceeds the prereserved space,
+ * memory corruption will occur or random memory will be DMAed
+ * out. Hopefully no network devices use single mappings that big.
+ */
+
+ dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
+
+ if (size > PAGE_SIZE*EMERGENCY_PAGES) {
+ if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+ panic("PCI-DMA: Memory would be corrupted\n");
+ if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
+ panic("PCI-DMA: Random memory would be DMAed\n");
+ }
+#ifdef CONFIG_IOMMU_LEAK
+ dump_leak();
+#endif
+}
+
+static inline int
+need_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+ return force_iommu || !dma_capable(dev, addr, size);
+}
+
+static inline int
+nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
+{
+ return !dma_capable(dev, addr, size);
+}
+
+/* Map a single contiguous physical area into the IOMMU.
+ * The caller needs to check whether the IOMMU is needed and to flush.
+ */
+static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
+ size_t size, int dir, unsigned long align_mask)
+{
+ unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
+ unsigned long iommu_page;
+ int i;
+
+ if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+ return bad_dma_addr;
+
+ iommu_page = alloc_iommu(dev, npages, align_mask);
+ if (iommu_page == -1) {
+ if (!nonforced_iommu(dev, phys_mem, size))
+ return phys_mem;
+ if (panic_on_overflow)
+ panic("dma_map_area overflow %lu bytes\n", size);
+ iommu_full(dev, size, dir);
+ return bad_dma_addr;
+ }
+
+ for (i = 0; i < npages; i++) {
+ iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
+ phys_mem += PAGE_SIZE;
+ }
+ return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
+}
+
+/* Map a single area into the IOMMU */
+static dma_addr_t gart_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ unsigned long bus;
+ phys_addr_t paddr = page_to_phys(page) + offset;
+
+ if (!dev)
+ dev = &x86_dma_fallback_dev;
+
+ if (!need_iommu(dev, paddr, size))
+ return paddr;
+
+ bus = dma_map_area(dev, paddr, size, dir, 0);
+ flush_gart();
+
+ return bus;
+}
+
+/*
+ * Free a DMA mapping.
+ */
+static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ unsigned long iommu_page;
+ int npages;
+ int i;
+
+ if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+ dma_addr >= iommu_bus_base + iommu_size)
+ return;
+
+ iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+ npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
+ for (i = 0; i < npages; i++) {
+ iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
+ }
+ free_iommu(iommu_page, npages);
+}
+
+/*
+ * Wrapper for pci_unmap_single working with scatterlists.
+ */
+static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sg, s, nents, i) {
+ if (!s->dma_length || !s->length)
+ break;
+ gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL);
+ }
+}
+
+/* Fallback for dma_map_sg in case of overflow */
+static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
+ int nents, int dir)
+{
+ struct scatterlist *s;
+ int i;
+
+#ifdef CONFIG_IOMMU_DEBUG
+ pr_debug("dma_map_sg overflow\n");
+#endif
+
+ for_each_sg(sg, s, nents, i) {
+ unsigned long addr = sg_phys(s);
+
+ if (nonforced_iommu(dev, addr, s->length)) {
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
+ if (addr == bad_dma_addr) {
+ if (i > 0)
+ gart_unmap_sg(dev, sg, i, dir, NULL);
+ nents = 0;
+ sg[0].dma_length = 0;
+ break;
+ }
+ }
+ s->dma_address = addr;
+ s->dma_length = s->length;
+ }
+ flush_gart();
+
+ return nents;
+}
+
+/* Map multiple scatterlist entries contiguously into the first. */
+static int __dma_map_cont(struct device *dev, struct scatterlist *start,
+ int nelems, struct scatterlist *sout,
+ unsigned long pages)
+{
+ unsigned long iommu_start = alloc_iommu(dev, pages, 0);
+ unsigned long iommu_page = iommu_start;
+ struct scatterlist *s;
+ int i;
+
+ if (iommu_start == -1)
+ return -1;
+
+ for_each_sg(start, s, nelems, i) {
+ unsigned long pages, addr;
+ unsigned long phys_addr = s->dma_address;
+
+ BUG_ON(s != start && s->offset);
+ if (s == start) {
+ sout->dma_address = iommu_bus_base;
+ sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
+ sout->dma_length = s->length;
+ } else {
+ sout->dma_length += s->length;
+ }
+
+ addr = phys_addr;
+ pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
+ while (pages--) {
+ iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
+ addr += PAGE_SIZE;
+ iommu_page++;
+ }
+ }
+ BUG_ON(iommu_page - iommu_start != pages);
+
+ return 0;
+}
+
+static inline int
+dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
+ struct scatterlist *sout, unsigned long pages, int need)
+{
+ if (!need) {
+ BUG_ON(nelems != 1);
+ sout->dma_address = start->dma_address;
+ sout->dma_length = start->length;
+ return 0;
+ }
+ return __dma_map_cont(dev, start, nelems, sout, pages);
+}
+
+/*
+ * DMA map all entries in a scatterlist.
+ * Merge chunks that have page-aligned sizes into a contiguous mapping.
+ */
+static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ struct scatterlist *s, *ps, *start_sg, *sgmap;
+ int need = 0, nextneed, i, out, start;
+ unsigned long pages = 0;
+ unsigned int seg_size;
+ unsigned int max_seg_size;
+
+ if (nents == 0)
+ return 0;
+
+ if (!dev)
+ dev = &x86_dma_fallback_dev;
+
+ out = 0;
+ start = 0;
+ start_sg = sg;
+ sgmap = sg;
+ seg_size = 0;
+ max_seg_size = dma_get_max_seg_size(dev);
+ ps = NULL; /* shut up gcc */
+
+ for_each_sg(sg, s, nents, i) {
+ dma_addr_t addr = sg_phys(s);
+
+ s->dma_address = addr;
+ BUG_ON(s->length == 0);
+
+ nextneed = need_iommu(dev, addr, s->length);
+
+ /* Handle the previous not yet processed entries */
+ if (i > start) {
+ /*
+ * Can only merge when the last chunk ends on a
+ * page boundary and the new one doesn't have an
+ * offset.
+ */
+ if (!iommu_merge || !nextneed || !need || s->offset ||
+ (s->length + seg_size > max_seg_size) ||
+ (ps->offset + ps->length) % PAGE_SIZE) {
+ if (dma_map_cont(dev, start_sg, i - start,
+ sgmap, pages, need) < 0)
+ goto error;
+ out++;
+
+ seg_size = 0;
+ sgmap = sg_next(sgmap);
+ pages = 0;
+ start = i;
+ start_sg = s;
+ }
+ }
+
+ seg_size += s->length;
+ need = nextneed;
+ pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
+ ps = s;
+ }
+ if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+ goto error;
+ out++;
+ flush_gart();
+ if (out < nents) {
+ sgmap = sg_next(sgmap);
+ sgmap->dma_length = 0;
+ }
+ return out;
+
+error:
+ flush_gart();
+ gart_unmap_sg(dev, sg, out, dir, NULL);
+
+ /* When it was forced or merged, try again in a dumb way */
+ if (force_iommu || iommu_merge) {
+ out = dma_map_sg_nonforce(dev, sg, nents, dir);
+ if (out > 0)
+ return out;
+ }
+ if (panic_on_overflow)
+ panic("dma_map_sg: overflow on %lu pages\n", pages);
+
+ iommu_full(dev, pages << PAGE_SHIFT, dir);
+ for_each_sg(sg, s, nents, i)
+ s->dma_address = bad_dma_addr;
+ return 0;
+}
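+
+/*
+ * Drivers do not call gart_map_sg() directly; they go through the
+ * generic DMA API, which dispatches via gart_dma_ops (installed in
+ * gart_iommu_init() below). A minimal caller-side sketch:
+ *
+ *	int n = dma_map_sg(dev, sglist, nents, DMA_TO_DEVICE);
+ *	if (n == 0)
+ *		return -ENOMEM;	/* mapping failed */
+ *	/* program the device using sg_dma_address()/sg_dma_len() */
+ *	dma_unmap_sg(dev, sglist, nents, DMA_TO_DEVICE);
+ *
+ * Note that n may be smaller than nents when entries were merged.
+ */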
+
+/* allocate and map a coherent mapping */
+static void *
+gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+ gfp_t flag)
+{
+ dma_addr_t paddr;
+ unsigned long align_mask;
+ struct page *page;
+
+ if (force_iommu && !(flag & GFP_DMA)) {
+ flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ page = alloc_pages(flag | __GFP_ZERO, get_order(size));
+ if (!page)
+ return NULL;
+
+ align_mask = (1UL << get_order(size)) - 1;
+ paddr = dma_map_area(dev, page_to_phys(page), size,
+ DMA_BIDIRECTIONAL, align_mask);
+
+ flush_gart();
+ if (paddr != bad_dma_addr) {
+ *dma_addr = paddr;
+ return page_address(page);
+ }
+ __free_pages(page, get_order(size));
+ } else
+ return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+
+ return NULL;
+}
+
+/* free a coherent mapping */
+static void
+gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr)
+{
+ gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return (dma_addr == bad_dma_addr);
+}
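+
+/*
+ * The matching single-buffer pattern on the driver side (sketch; the
+ * dma_mapping_error() call ends up in gart_mapping_error() above):
+ *
+ *	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_FROM_DEVICE);
+ *	if (dma_mapping_error(dev, handle))
+ *		return -ENOMEM;
+ *	...
+ *	dma_unmap_single(dev, handle, len, DMA_FROM_DEVICE);
+ */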
+
+static int no_agp;
+
+static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
+{
+ unsigned long a;
+
+ if (!iommu_size) {
+ iommu_size = aper_size;
+ if (!no_agp)
+ iommu_size /= 2;
+ }
+
+ a = aper + iommu_size;
+ iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
+
+ if (iommu_size < 64*1024*1024) {
+ pr_warning(
+ "PCI-DMA: Warning: Small IOMMU %luMB."
+ " Consider increasing the AGP aperture in BIOS\n",
+ iommu_size >> 20);
+ }
+
+ return iommu_size;
+}
+
+static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
+{
+ unsigned aper_size = 0, aper_base_32, aper_order;
+ u64 aper_base;
+
+ pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
+ aper_order = (aper_order >> 1) & 7;
+
+ aper_base = aper_base_32 & 0x7fff;
+ aper_base <<= 25;
+
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ if (aper_base + aper_size > 0x100000000UL || !aper_size)
+ aper_base = 0;
+
+ *size = aper_size;
+ return aper_base;
+}
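+
+/*
+ * Worked example with hypothetical register values: if GARTAPERTURECTL
+ * reads 0x4, then aper_order = (0x4 >> 1) & 7 = 2 and the aperture is
+ * (32MB << 2) = 128MB. If GARTAPERTUREBASE reads 0x40, the base is
+ * (0x40 & 0x7fff) << 25 = 0x80000000, i.e. the aperture sits at 2GB.
+ */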
+
+static void enable_gart_translations(void)
+{
+ int i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
+
+ enable_gart_translation(dev, __pa(agp_gatt_table));
+ }
+
+ /* Flush the GART-TLB to remove stale entries */
+ amd_flush_garts();
+}
+
+/*
+ * If fix_up_north_bridges is set, the north bridges have to be fixed up on
+ * resume in the same way as they are handled in gart_iommu_hole_init().
+ */
+static bool fix_up_north_bridges;
+static u32 aperture_order;
+static u32 aperture_alloc;
+
+void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
+{
+ fix_up_north_bridges = true;
+ aperture_order = aper_order;
+ aperture_alloc = aper_alloc;
+}
+
+static void gart_fixup_northbridges(void)
+{
+ int i;
+
+ if (!fix_up_north_bridges)
+ return;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ pr_info("PCI-DMA: Restoring GART aperture settings\n");
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
+
+ /*
+ * Don't enable translations just yet. That is the next
+ * step. Restore the pre-suspend aperture settings.
+ */
+ gart_set_size_and_enable(dev, aperture_order);
+ pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
+ }
+}
+
+static void gart_resume(void)
+{
+ pr_info("PCI-DMA: Resuming GART IOMMU\n");
+
+ gart_fixup_northbridges();
+
+ enable_gart_translations();
+}
+
+static struct syscore_ops gart_syscore_ops = {
+ .resume = gart_resume,
+
+};
+
+/*
+ * Private Northbridge GATT initialization in case we cannot use the
+ * AGP driver for some reason.
+ */
+static __init int init_amd_gatt(struct agp_kern_info *info)
+{
+ unsigned aper_size, gatt_size, new_aper_size;
+ unsigned aper_base, new_aper_base;
+ struct pci_dev *dev;
+ void *gatt;
+ int i;
+
+ pr_info("PCI-DMA: Disabling AGP.\n");
+
+ aper_size = aper_base = info->aper_size = 0;
+ dev = NULL;
+ for (i = 0; i < amd_nb_num(); i++) {
+ dev = node_to_amd_nb(i)->misc;
+ new_aper_base = read_aperture(dev, &new_aper_size);
+ if (!new_aper_base)
+ goto nommu;
+
+ if (!aper_base) {
+ aper_size = new_aper_size;
+ aper_base = new_aper_base;
+ }
+ if (aper_size != new_aper_size || aper_base != new_aper_base)
+ goto nommu;
+ }
+ if (!aper_base)
+ goto nommu;
+
+ info->aper_base = aper_base;
+ info->aper_size = aper_size >> 20;
+
+ gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
+ gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(gatt_size));
+ if (!gatt)
+ panic("Cannot allocate GATT table");
+ if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
+ panic("Could not set GART PTEs to uncacheable pages");
+
+ agp_gatt_table = gatt;
+
+ register_syscore_ops(&gart_syscore_ops);
+
+ flush_gart();
+
+ pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
+ aper_base, aper_size>>10);
+
+ return 0;
+
+ nommu:
+ /* Should not happen anymore */
+ pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
+ "falling back to iommu=soft.\n");
+ return -1;
+}
+
+static struct dma_map_ops gart_dma_ops = {
+ .map_sg = gart_map_sg,
+ .unmap_sg = gart_unmap_sg,
+ .map_page = gart_map_page,
+ .unmap_page = gart_unmap_page,
+ .alloc_coherent = gart_alloc_coherent,
+ .free_coherent = gart_free_coherent,
+ .mapping_error = gart_mapping_error,
+};
+
+static void gart_iommu_shutdown(void)
+{
+ struct pci_dev *dev;
+ int i;
+
+ /* don't shut it down if AGP is installed */
+ if (!no_agp)
+ return;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ for (i = 0; i < amd_nb_num(); i++) {
+ u32 ctl;
+
+ dev = node_to_amd_nb(i)->misc;
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+
+ ctl &= ~GARTEN;
+
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+ }
+}
+
+int __init gart_iommu_init(void)
+{
+ struct agp_kern_info info;
+ unsigned long iommu_start;
+ unsigned long aper_base, aper_size;
+ unsigned long start_pfn, end_pfn;
+ unsigned long scratch;
+ long i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return 0;
+
+#ifndef CONFIG_AGP_AMD64
+ no_agp = 1;
+#else
+ /* Makefile puts PCI initialization via subsys_initcall first. */
+ /* Add other AMD AGP bridge drivers here */
+ no_agp = no_agp ||
+ (agp_amd64_init() < 0) ||
+ (agp_copy_info(agp_bridge, &info) < 0);
+#endif
+
+ if (no_iommu ||
+ (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
+ !gart_iommu_aperture ||
+ (no_agp && init_amd_gatt(&info) < 0)) {
+ if (max_pfn > MAX_DMA32_PFN) {
+ pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
+ pr_warning("falling back to iommu=soft.\n");
+ }
+ return 0;
+ }
+
+ /* need to map that range */
+ aper_size = info.aper_size << 20;
+ aper_base = info.aper_base;
+ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+
+ if (end_pfn > max_low_pfn_mapped) {
+ start_pfn = (aper_base>>PAGE_SHIFT);
+ init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+ }
+
+ pr_info("PCI-DMA: using GART IOMMU.\n");
+ iommu_size = check_iommu_size(info.aper_base, aper_size);
+ iommu_pages = iommu_size >> PAGE_SHIFT;
+
+ iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(iommu_pages/8));
+ if (!iommu_gart_bitmap)
+ panic("Cannot allocate iommu bitmap\n");
+
+#ifdef CONFIG_IOMMU_LEAK
+ if (leak_trace) {
+ int ret;
+
+ ret = dma_debug_resize_entries(iommu_pages);
+ if (ret)
+ pr_debug("PCI-DMA: Cannot trace all the entries\n");
+ }
+#endif
+
+ /*
+ * Out of IOMMU space handling.
+ * Reserve some invalid pages at the beginning of the GART.
+ */
+ bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+
+ pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+ iommu_size >> 20);
+
+ agp_memory_reserved = iommu_size;
+ iommu_start = aper_size - iommu_size;
+ iommu_bus_base = info.aper_base + iommu_start;
+ bad_dma_addr = iommu_bus_base;
+ iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+ /*
+ * Unmap the IOMMU part of the GART. The alias of the page is
+ * always mapped with cache enabled and there is no full cache
+ * coherency across the GART remapping. The unmapping avoids
+ * automatic prefetches from the CPU allocating cache lines in
+ * there. All CPU accesses are done via the direct mapping to
+ * the backing memory. The GART address is only used by PCI
+ * devices.
+ */
+ set_memory_np((unsigned long)__va(iommu_bus_base),
+ iommu_size >> PAGE_SHIFT);
+ /*
+ * Tricky. The GART table remaps the physical memory range,
+ * so the CPU won't notice potential aliases, and if the memory
+ * is remapped to UC later on, we might surprise the PCI devices
+ * with a stray writeout of a cacheline. So play it safe and
+ * do an explicit, full-scale wbinvd() _after_ having marked all
+ * the pages as Not-Present:
+ */
+ wbinvd();
+
+ /*
+ * Now all caches are flushed and we can safely enable
+ * GART hardware. Doing it early leaves the possibility
+ * of stale cache entries that can lead to GART PTE
+ * errors.
+ */
+ enable_gart_translations();
+
+ /*
+ * Try to work around a bug (thanks to BenH):
+ * Set unmapped entries to a scratch page instead of 0.
+ * Any prefetches that hit unmapped entries then won't cause a bus
+ * abort. (A P2P bridge may be prefetching on DMA reads.)
+ */
+ scratch = get_zeroed_page(GFP_KERNEL);
+ if (!scratch)
+ panic("Cannot allocate iommu scratch page");
+ gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
+ for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
+ iommu_gatt_base[i] = gart_unmapped_entry;
+
+ flush_gart();
+ dma_ops = &gart_dma_ops;
+ x86_platform.iommu_shutdown = gart_iommu_shutdown;
+ swiotlb = 0;
+
+ return 0;
+}
+
+void __init gart_parse_options(char *p)
+{
+ int arg;
+
+#ifdef CONFIG_IOMMU_LEAK
+ if (!strncmp(p, "leak", 4)) {
+ leak_trace = 1;
+ p += 4;
+ if (*p == '=')
+ ++p;
+ if (isdigit(*p) && get_option(&p, &arg))
+ iommu_leak_pages = arg;
+ }
+#endif
+ if (isdigit(*p) && get_option(&p, &arg))
+ iommu_size = arg;
+ if (!strncmp(p, "fullflush", 9))
+ iommu_fullflush = 1;
+ if (!strncmp(p, "nofullflush", 11))
+ iommu_fullflush = 0;
+ if (!strncmp(p, "noagp", 5))
+ no_agp = 1;
+ if (!strncmp(p, "noaperture", 10))
+ fix_aperture = 0;
+ /* duplicated from pci-dma.c */
+ if (!strncmp(p, "force", 5))
+ gart_iommu_aperture_allowed = 1;
+ if (!strncmp(p, "allowed", 7))
+ gart_iommu_aperture_allowed = 1;
+ if (!strncmp(p, "memaper", 7)) {
+ fallback_aper_force = 1;
+ p += 7;
+ if (*p == '=') {
+ ++p;
+ if (get_option(&p, &arg))
+ fallback_aper_order = arg;
+ }
+ }
+}
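+
+/*
+ * Example: assuming the usual "iommu=" boot-parameter wiring in
+ * pci-dma.c, booting with
+ *
+ *	iommu=fullflush,noagp
+ *
+ * flushes the GART TLB after every mapping change and skips the AGP
+ * driver, so the private GATT is set up in init_amd_gatt().
+ */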
+IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 00000000000..be16854591c
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,282 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivatives.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/amd_nb.h>
+
+static u32 *flush_words;
+
+const struct pci_device_id amd_nb_misc_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
+ {}
+};
+EXPORT_SYMBOL(amd_nb_misc_ids);
+
+static struct pci_device_id amd_nb_link_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
+ {}
+};
+
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
+struct amd_northbridge_info amd_northbridges;
+EXPORT_SYMBOL(amd_northbridges);
+
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+ const struct pci_device_id *ids)
+{
+ do {
+ dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+ if (!dev)
+ break;
+ } while (!pci_match_id(ids, dev));
+ return dev;
+}
+
+int amd_cache_northbridges(void)
+{
+ u16 i = 0;
+ struct amd_northbridge *nb;
+ struct pci_dev *misc, *link;
+
+ if (amd_nb_num())
+ return 0;
+
+ misc = NULL;
+ while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+ i++;
+
+ if (i == 0)
+ return 0;
+
+ nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+ if (!nb)
+ return -ENOMEM;
+
+ amd_northbridges.nb = nb;
+ amd_northbridges.num = i;
+
+ link = misc = NULL;
+ for (i = 0; i != amd_nb_num(); i++) {
+ node_to_amd_nb(i)->misc = misc =
+ next_northbridge(misc, amd_nb_misc_ids);
+ node_to_amd_nb(i)->link = link =
+ next_northbridge(link, amd_nb_link_ids);
+ }
+
+ /* some CPU families (e.g. family 0x11) do not support GART */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_GART;
+
+ /*
+ * Some CPU families support L3 Cache Index Disable. There are some
+ * limitations because of E382 and E388 on family 0x10.
+ */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model >= 0x8 &&
+ (boot_cpu_data.x86_model > 0x9 ||
+ boot_cpu_data.x86_mask >= 0x1))
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
+
+ /* L3 cache partitioning is supported on family 0x15 */
+ if (boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
+
+/*
+ * Ignores subdevice/subvendor, but as far as I can figure out
+ * they're useless anyway.
+ */
+bool __init early_is_amd_nb(u32 device)
+{
+ const struct pci_device_id *id;
+ u32 vendor = device & 0xffff;
+
+ device >>= 16;
+ for (id = amd_nb_misc_ids; id->vendor; id++)
+ if (vendor == id->vendor && device == id->device)
+ return true;
+ return false;
+}
+
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+ u32 address;
+ u64 base, msr;
+ unsigned segn_busn_bits;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return NULL;
+
+ /* assume all cpus from fam10h have mmconfig */
+ if (boot_cpu_data.x86 < 0x10)
+ return NULL;
+
+ address = MSR_FAM10H_MMIO_CONF_BASE;
+ rdmsrl(address, msr);
+
+ /* mmconfig is not enabled */
+ if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+ return NULL;
+
+ base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+ segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ res->flags = IORESOURCE_MEM;
+ res->start = base;
+ res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+ return res;
+}
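+
+/*
+ * Worked example with a hypothetical MSR value: segn_busn_bits = 8
+ * gives a window of 1 << (8 + 20) bytes = 256MB, i.e. 256 buses with
+ * 1MB of config space each (32 devices * 8 functions * 4KB).
+ */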
+
+int amd_get_subcaches(int cpu)
+{
+ struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
+ unsigned int mask;
+ int cuid;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return 0;
+
+ pci_read_config_dword(link, 0x1d4, &mask);
+
+ cuid = cpu_data(cpu).compute_unit_id;
+ return (mask >> (4 * cuid)) & 0xf;
+}
+
+int amd_set_subcaches(int cpu, int mask)
+{
+ static unsigned int reset, ban;
+ struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ unsigned int reg;
+ int cuid;
+
+ if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
+ return -EINVAL;
+
+ /* if necessary, collect reset state of L3 partitioning and BAN mode */
+ if (reset == 0) {
+ pci_read_config_dword(nb->link, 0x1d4, &reset);
+ pci_read_config_dword(nb->misc, 0x1b8, &ban);
+ ban &= 0x180000;
+ }
+
+ /* deactivate BAN mode if any subcaches are to be disabled */
+ if (mask != 0xf) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
+ }
+
+ cuid = cpu_data(cpu).compute_unit_id;
+ mask <<= 4 * cuid;
+ mask |= (0xf ^ (1 << cuid)) << 26;
+
+ pci_write_config_dword(nb->link, 0x1d4, mask);
+
+ /* reset BAN mode if L3 partitioning returned to reset state */
+ pci_read_config_dword(nb->link, 0x1d4, &reg);
+ if (reg == reset) {
+ pci_read_config_dword(nb->misc, 0x1b8, &reg);
+ reg &= ~0x180000;
+ pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
+ }
+
+ return 0;
+}
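+
+/*
+ * Usage sketch with hypothetical values: each compute unit owns a
+ * 4-bit field in register 0x1d4, selected by its compute_unit_id.
+ *
+ *	int mask = amd_get_subcaches(cpu);	/* e.g. 0xf: all four on */
+ *	amd_set_subcaches(cpu, mask & ~0x1);	/* turn off subcache 0 */
+ */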
+
+static int amd_cache_gart(void)
+{
+ u16 i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return 0;
+
+ flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+ if (!flush_words) {
+ amd_northbridges.flags &= ~AMD_NB_GART;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i != amd_nb_num(); i++)
+ pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ &flush_words[i]);
+
+ return 0;
+}
+
+void amd_flush_garts(void)
+{
+ int flushed, i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(gart_lock);
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return;
+
+ /* Avoid races between AGP and IOMMU. In theory it's not needed,
+ but I'm not sure the hardware won't lose flush requests
+ when another is pending. This whole thing is so expensive anyway
+ that the extra serialization doesn't matter. -AK */
+ spin_lock_irqsave(&gart_lock, flags);
+ flushed = 0;
+ for (i = 0; i < amd_nb_num(); i++) {
+ pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ flush_words[i] | 1);
+ flushed++;
+ }
+ for (i = 0; i < amd_nb_num(); i++) {
+ u32 w;
+ /* Make sure the hardware actually executed the flush */
+ for (;;) {
+ pci_read_config_dword(node_to_amd_nb(i)->misc,
+ 0x9c, &w);
+ if (!(w & 1))
+ break;
+ cpu_relax();
+ }
+ }
+ spin_unlock_irqrestore(&gart_lock, flags);
+ if (!flushed)
+ printk("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(amd_flush_garts);
+
+static __init int init_amd_nbs(void)
+{
+ int err = 0;
+
+ err = amd_cache_northbridges();
+
+ if (err < 0)
+ printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+
+ if (amd_cache_gart() < 0)
+ printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
+ "GART support disabled.\n");
+
+ return err;
+}
+
+/* This has to go after the PCI subsystem */
+fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 00000000000..afdc3f756de
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,431 @@
+/*
+ * apb_timer.c: Driver for Langwell APB timers
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * Langwell is the south complex of Intel Moorestown MID platform. There are
+ * eight external timers in total that can be used by the operating system.
+ * The timer information, such as frequency and addresses, is provided to the
+ * OS via SFI tables.
+ * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
+ * individual redirection table entries (RTE).
+ * Unlike HPET, there is no master counter, therefore one of the timers is
+ * used as the clocksource. The overall allocation looks like:
+ * - timer 0 - NR_CPUs for per cpu timer
+ * - one timer for clocksource
+ * - one timer for watchdog driver.
+ * It is also worth noting that the APB timer does not support true one-shot
+ * mode; free-running mode is used here to emulate it.
+ * APB timer can also be used as broadcast timer along with per cpu local APIC
+ * timer, but by default APB timer has higher rating than local APIC timers.
+ */
+
+#include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/sfi.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#include <asm/fixmap.h>
+#include <asm/apb_timer.h>
+#include <asm/mrst.h>
+#include <asm/time.h>
+
+#define APBT_CLOCKEVENT_RATING 110
+#define APBT_CLOCKSOURCE_RATING 250
+
+#define APBT_CLOCKEVENT0_NUM (0)
+#define APBT_CLOCKSOURCE_NUM (2)
+
+static phys_addr_t apbt_address;
+static int apb_timer_block_enabled;
+static void __iomem *apbt_virt_address;
+
+/*
+ * Common DW APB timer info
+ */
+static unsigned long apbt_freq;
+
+struct apbt_dev {
+ struct dw_apb_clock_event_device *timer;
+ unsigned int num;
+ int cpu;
+ unsigned int irq;
+ char name[10];
+};
+
+static struct dw_apb_clocksource *clocksource_apbt;
+
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
+{
+ return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
+}
+
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
+
+static inline void apbt_set_mapping(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ int phy_cs_timer_id = 0;
+
+ if (apbt_virt_address) {
+ pr_debug("APBT base already mapped\n");
+ return;
+ }
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return;
+ }
+ apbt_address = (phys_addr_t)mtmr->phys_addr;
+ if (!apbt_address) {
+ printk(KERN_WARNING "No timer base from SFI, use default\n");
+ apbt_address = APBT_DEFAULT_BASE;
+ }
+ apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+ if (!apbt_virt_address) {
+ pr_debug("Failed mapping APBT phy address at %lu\n",\
+ (unsigned long)apbt_address);
+ goto panic_noapbt;
+ }
+ apbt_freq = mtmr->freq_hz;
+ sfi_free_mtmr(mtmr);
+
+ /* Now figure out the physical timer id for clocksource device */
+ mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
+ if (mtmr == NULL)
+ goto panic_noapbt;
+
+ /* Now figure out the physical timer id */
+ pr_debug("Use timer %d for clocksource\n",
+ (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+ phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+ APBTMRS_REG_SIZE;
+
+ clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+ "apbt0", apbt_virt_address + phy_cs_timer_id *
+ APBTMRS_REG_SIZE, apbt_freq);
+ return;
+
+panic_noapbt:
+ panic("Failed to setup APB system timer\n");
+
+}
+
+static inline void apbt_clear_mapping(void)
+{
+ iounmap(apbt_virt_address);
+ apbt_virt_address = NULL;
+}
+
+/*
+ * APBT timer interrupt enable / disable
+ */
+static inline int is_apbt_capable(void)
+{
+ return apbt_virt_address ? 1 : 0;
+}
+
+static int __init apbt_clockevent_register(void)
+{
+ struct sfi_timer_table_entry *mtmr;
+ struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
+
+ mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+ if (mtmr == NULL) {
+ printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+ APBT_CLOCKEVENT0_NUM);
+ return -ENODEV;
+ }
+
+ adev->num = smp_processor_id();
+ adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+ mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+ APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
+ adev_virt_addr(adev), 0, apbt_freq);
+ /* Firmware does EOI handling for us. */
+ adev->timer->eoi = NULL;
+
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+ global_clock_event = &adev->timer->ced;
+ printk(KERN_DEBUG "%s clockevent registered as global\n",
+ global_clock_event->name);
+ }
+
+ dw_apb_clockevent_register(adev->timer);
+
+ sfi_free_mtmr(mtmr);
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+ /* timer0 irq has been setup early */
+ if (adev->irq == 0)
+ return;
+
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+ /* APB timer irqs are set up as mp_irqs, timer is edge type */
+ __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
+}
+
+/* Should be called on each secondary CPU, from that CPU's bring-up path */
+void apbt_setup_secondary_clock(void)
+{
+ struct apbt_dev *adev;
+ int cpu;
+
+ /* Don't register boot CPU clockevent */
+ cpu = smp_processor_id();
+ if (!cpu)
+ return;
+
+ adev = &__get_cpu_var(cpu_apbt_dev);
+ if (!adev->timer) {
+ adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+ APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+ adev->irq, apbt_freq);
+ adev->timer->eoi = NULL;
+ } else {
+ dw_apb_clockevent_resume(adev->timer);
+ }
+
+ printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+ cpu, adev->name, adev->cpu);
+
+ apbt_setup_irq(adev);
+ dw_apb_clockevent_register(adev->timer);
+
+ return;
+}
+
+/*
+ * This notifier handler processes CPU hotplug events. In case of S0i3, nonboot
+ * CPUs are disabled/enabled frequently; for performance reasons, we keep the
+ * per-CPU timer irq registered so that we do not need to do free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable the percpu clockevent
+ * device without the notifier chain. Currently, cpu 0 may get interrupts from
+ * other cpu timers during the offline process due to the ordering of
+ * notification. The extra interrupt is harmless.
+ */
+static int apbt_cpuhp_notify(struct notifier_block *n,
+ unsigned long action, void *hcpu)
+{
+ unsigned long cpu = (unsigned long)hcpu;
+ struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+ switch (action & 0xf) {
+ case CPU_DEAD:
+ dw_apb_clockevent_pause(adev->timer);
+ if (system_state == SYSTEM_RUNNING) {
+ pr_debug("skipping APBT CPU %lu offline\n", cpu);
+ } else if (adev) {
+ pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
+ dw_apb_clockevent_stop(adev->timer);
+ }
+ break;
+ default:
+ pr_debug("APBT notified %lu, no action\n", action);
+ }
+ return NOTIFY_OK;
+}
+
+static __init int apbt_late_init(void)
+{
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+ !apb_timer_block_enabled)
+ return 0;
+ /* This notifier should be called after workqueue is ready */
+ hotcpu_notifier(apbt_cpuhp_notify, -20);
+ return 0;
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {}
+
+#endif /* CONFIG_SMP */
+
+static int apbt_clocksource_register(void)
+{
+ u64 start, now;
+ cycle_t t1;
+
+ /* Start the counter, use timer 2 as source, timer 0/1 for event */
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ /* Verify whether apbt counter works */
+ t1 = dw_apb_clocksource_read(clocksource_apbt);
+ rdtscll(start);
+
+ /*
+ * We don't know the TSC frequency yet, but waiting for
+ * 200000 TSC cycles is safe:
+ * 4 GHz == 50us
+ * 1 GHz == 200us
+ */
+ do {
+ rep_nop();
+ rdtscll(now);
+ } while ((now - start) < 200000UL);
+
+ /* APBT is the only always-on clocksource, it has to work! */
+ if (t1 == dw_apb_clocksource_read(clocksource_apbt))
+ panic("APBT counter not counting. APBT disabled\n");
+
+ dw_apb_clocksource_register(clocksource_apbt);
+
+ return 0;
+}
+
+/*
+ * Early setup of the APB timer: only timer 0 is used for booting; we then
+ * switch to per-CPU timers if possible.
+ * Panics if the setup fails, since this is the only platform timer on
+ * Moorestown.
+ */
+void __init apbt_time_init(void)
+{
+#ifdef CONFIG_SMP
+ int i;
+ struct sfi_timer_table_entry *p_mtmr;
+ unsigned int percpu_timer;
+ struct apbt_dev *adev;
+#endif
+
+ if (apb_timer_block_enabled)
+ return;
+ apbt_set_mapping();
+ if (!apbt_virt_address)
+ goto out_noapbt;
+ /*
+ * Read the frequency and check for a sane value; for the ESL model
+ * we extend the possible clock range to allow time scaling.
+ */
+
+ if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
+ pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
+ goto out_noapbt;
+ }
+ if (apbt_clocksource_register()) {
+ pr_debug("APBT has failed to register clocksource\n");
+ goto out_noapbt;
+ }
+ if (!apbt_clockevent_register())
+ apb_timer_block_enabled = 1;
+ else {
+ pr_debug("APBT has failed to register clockevent\n");
+ goto out_noapbt;
+ }
+#ifdef CONFIG_SMP
+ /* the kernel cmdline disables the apb timer, so we use lapic timers */
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+ printk(KERN_INFO "apbt: disabled per cpu timer\n");
+ return;
+ }
+ pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
+ if (num_possible_cpus() <= sfi_mtimer_num) {
+ percpu_timer = 1;
+ apbt_num_timers_used = num_possible_cpus();
+ } else {
+ percpu_timer = 0;
+ apbt_num_timers_used = 1;
+ }
+ pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
+
+ /* here we set up per CPU timer data structure */
+ for (i = 0; i < apbt_num_timers_used; i++) {
+ adev = &per_cpu(cpu_apbt_dev, i);
+ adev->num = i;
+ adev->cpu = i;
+ p_mtmr = sfi_get_mtmr(i);
+ if (p_mtmr)
+ adev->irq = p_mtmr->irq;
+ else
+ printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
+ snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
+ }
+#endif
+
+ return;
+
+out_noapbt:
+ apbt_clear_mapping();
+ apb_timer_block_enabled = 0;
+ panic("failed to enable APB timer\n");
+}
+
+/* called before apb_timer_enable, use early map */
+unsigned long apbt_quick_calibrate(void)
+{
+ int i, scale;
+ u64 old, new;
+ cycle_t t1, t2;
+ unsigned long khz = 0;
+ u32 loop, shift;
+
+ apbt_set_mapping();
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ /* check if the timer can count down, otherwise return */
+ old = dw_apb_clocksource_read(clocksource_apbt);
+ i = 10000;
+ while (--i) {
+ if (old != dw_apb_clocksource_read(clocksource_apbt))
+ break;
+ }
+ if (!i)
+ goto failed;
+
+ /* count 16 ms */
+ loop = (apbt_freq / 1000) << 4;
+
+ /* restart the timer to ensure it won't get to 0 in the calibration */
+ dw_apb_clocksource_start(clocksource_apbt);
+
+ old = dw_apb_clocksource_read(clocksource_apbt);
+ old += loop;
+
+ t1 = __native_read_tsc();
+
+ do {
+ new = dw_apb_clocksource_read(clocksource_apbt);
+ } while (new < old);
+
+ t2 = __native_read_tsc();
+
+ shift = 5;
+ if (unlikely(loop >> shift == 0)) {
+ printk(KERN_INFO
+ "APBT TSC calibration failed, not enough resolution\n");
+ return 0;
+ }
+ scale = (int)div_u64((t2 - t1), loop >> shift);
+ khz = (scale * (apbt_freq / 1000)) >> shift;
+ printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
+ return khz;
+failed:
+ return 0;
+}
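+
+/*
+ * Numeric sketch of the calibration above, with hypothetical values:
+ * apbt_freq = 14318000 Hz gives loop = (14318000/1000) << 4 = 229088
+ * APB ticks, roughly 16 ms. If the TSC advances by 22908800 cycles in
+ * that window, then with shift = 5:
+ *
+ *	scale = 22908800 / (229088 >> 5) = 22908800 / 7159 = 3200
+ *	khz   = (3200 * 14318) >> 5 = 1431800
+ *
+ * i.e. a 1.43 GHz TSC.
+ */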
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
new file mode 100644
index 00000000000..6e76c191a83
--- /dev/null
+++ b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,523 @@
+/*
+ * Firmware replacement code.
+ *
+ * Work around broken BIOSes that don't set an aperture, only set the
+ * aperture in the AGP bridge, or set one that is too small.
+ *
+ * If all else fails, map the aperture over some low memory. This is cheaper than
+ * doing bounce buffering. The memory is lost. This is done at early boot
+ * because only the bootmem allocator can allocate 32+MB.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/ioport.h>
+#include <linux/suspend.h>
+#include <linux/kmemleak.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+
+/*
+ * Use 512M as the goal, in case kexec will load a big kernel
+ * that does in-place decompression and could overlap with
+ * the gart aperture that is in use.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area becomes e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
+ * So don't place the gart iommu below 512M; leave that space for kernel
+ * code, to be safe.
+ */
+#define GART_MIN_ADDR (512ULL << 20)
+#define GART_MAX_ADDR (1ULL << 32)
+
+int gart_iommu_aperture;
+int gart_iommu_aperture_disabled __initdata;
+int gart_iommu_aperture_allowed __initdata;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata;
+
+int fix_aperture __initdata = 1;
+
+static struct resource gart_resource = {
+ .name = "GART",
+ .flags = IORESOURCE_MEM,
+};
+
+static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
+{
+ gart_resource.start = aper_base;
+ gart_resource.end = aper_base + aper_size - 1;
+ insert_resource(&iomem_resource, &gart_resource);
+}
+
+/* This code runs before the PCI subsystem is initialized, so just
+ access the northbridge directly. */
+
+static u32 __init allocate_aperture(void)
+{
+ u32 aper_size;
+ unsigned long addr;
+
+ /* aper_size should be <= 1G */
+ if (fallback_aper_order > 5)
+ fallback_aper_order = 5;
+ aper_size = (32 * 1024 * 1024) << fallback_aper_order;
+
+ /*
+ * Aperture has to be naturally aligned. This means a 2GB aperture
+ * won't have much chance of finding a place in the lower 4GB of
+ * memory. Unfortunately we cannot move it up because that would
+ * make the IOMMU useless.
+ */
+ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
+ aper_size, aper_size);
+ if (!addr || addr + aper_size > GART_MAX_ADDR) {
+ printk(KERN_ERR
+ "Cannot allocate aperture memory hole (%lx,%uK)\n",
+ addr, aper_size>>10);
+ return 0;
+ }
+ memblock_reserve(addr, aper_size);
+ /*
+ * Kmemleak should not scan this block as it may not be mapped via the
+ * kernel direct mapping.
+ */
+ kmemleak_ignore(phys_to_virt(addr));
+ printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
+ aper_size >> 10, addr);
+ insert_aperture_resource((u32)addr, aper_size);
+ register_nosave_region(addr >> PAGE_SHIFT,
+ (addr+aper_size) >> PAGE_SHIFT);
+
+ return (u32)addr;
+}
+
+
+/* Find a PCI capability */
+static u32 __init find_cap(int bus, int slot, int func, int cap)
+{
+ int bytes;
+ u8 pos;
+
+ if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
+ PCI_STATUS_CAP_LIST))
+ return 0;
+
+ pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
+ for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
+ u8 id;
+
+ pos &= ~3;
+ id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+ pos = read_pci_config_byte(bus, slot, func,
+ pos+PCI_CAP_LIST_NEXT);
+ }
+ return 0;
+}
+
+/* Read a standard AGPv3 bridge header */
+static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
+{
+ u32 apsize;
+ u32 apsizereg;
+ int nbits;
+ u32 aper_low, aper_hi;
+ u64 aper;
+ u32 old_order;
+
+ printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
+ if (apsizereg == 0xffffffff) {
+ printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+ return 0;
+ }
+
+ /* old_order could be the value from NB gart setting */
+ old_order = *order;
+
+ apsize = apsizereg & 0xfff;
+ /* Some BIOSes use weird encodings not in the AGPv3 table. */
+ if (apsize & 0xff)
+ apsize |= 0xf00;
+ nbits = hweight16(apsize);
+ *order = 7 - nbits;
+ if ((int)*order < 0) /* < 32MB */
+ *order = 0;
+
+ aper_low = read_pci_config(bus, slot, func, 0x10);
+ aper_hi = read_pci_config(bus, slot, func, 0x14);
+ aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+ /*
+ * On some sick chips, APSIZE is 0. This means it wants 4G,
+ * so double-check the order, and trust the AMD NB settings:
+ */
+ printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
+ aper, 32 << old_order);
+ if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
+ printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
+ 32 << *order, apsizereg);
+ *order = old_order;
+ }
+
+ printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
+ aper, 32 << *order, apsizereg);
+
+ if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
+ return 0;
+ return (u32)aper;
+}
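+
+/*
+ * APSIZE decoding example (hypothetical value): apsizereg = 0xf30
+ * gives apsize = 0xf30, nbits = hweight16(0xf30) = 6, and therefore
+ * *order = 7 - 6 = 1, i.e. a 64MB aperture (32MB << 1).
+ */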
+
+/*
+ * Look for an AGP bridge. Windows only expects the aperture in the
+ * AGP bridge, and some BIOSes forget to initialize the Northbridge too.
+ * Work around this here.
+ *
+ * Do a PCI bus scan by hand because we're running before the PCI
+ * subsystem.
+ *
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
+ * generically. It's probably overkill to always scan all slots because
+ * the AGP bridges should always be on their own bus in the HT hierarchy,
+ * but do it here for future safety.
+ */
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+ int bus, slot, func;
+
+ /* Poor man's PCI discovery */
+ for (bus = 0; bus < 256; bus++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class, cap;
+ u8 type;
+ class = read_pci_config(bus, slot, func,
+ PCI_CLASS_REVISION);
+ if (class == 0xffffffff)
+ break;
+
+ switch (class >> 16) {
+ case PCI_CLASS_BRIDGE_HOST:
+ case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+ /* AGP bridge? */
+ cap = find_cap(bus, slot, func,
+ PCI_CAP_ID_AGP);
+ if (!cap)
+ break;
+ *valid_agp = 1;
+ return read_agp(bus, slot, func, cap,
+ order);
+ }
+
+ /* No multi-function device? */
+ type = read_pci_config_byte(bus, slot, func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+ printk(KERN_INFO "No AGP bridge found\n");
+
+ return 0;
+}
+
+static int gart_fix_e820 __initdata = 1;
+
+static int __init parse_gart_mem(char *p)
+{
+ if (!p)
+ return -EINVAL;
+
+ if (!strncmp(p, "off", 3))
+ gart_fix_e820 = 0;
+ else if (!strncmp(p, "on", 2))
+ gart_fix_e820 = 1;
+
+ return 0;
+}
+early_param("gart_fix_e820", parse_gart_mem);
+
+void __init early_gart_iommu_check(void)
+{
+ /*
+ * In case the GART is already enabled, especially for kexec/kdump:
+ * the previous kernel may have left it enabled, and the memset called
+ * by allocate_aperture/__alloc_bootmem_nopanic can then cause a restart.
+ * Or the second kernel may have the GART hole at a different position, and
+ * the new kernel could use as RAM a hole that is still in use by the
+ * GART set up by the first kernel.
+ * Or the BIOS forgot to put that region in the reserved map.
+ * Try to update e820 to mark that region as reserved.
+ */
+ u32 agp_aper_order = 0;
+ int i, fix, slot, valid_agp = 0;
+ u32 ctl;
+ u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
+ u64 aper_base = 0, last_aper_base = 0;
+ int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* This is mostly a duplicate of gart_iommu_hole_init() */
+ search_agp_bridge(&agp_aper_order, &valid_agp);
+
+ fix = 0;
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ aper_enabled = ctl & GARTEN;
+ aper_order = (ctl >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ if (last_valid) {
+ if ((aper_order != last_aper_order) ||
+ (aper_base != last_aper_base) ||
+ (aper_enabled != last_aper_enabled)) {
+ fix = 1;
+ break;
+ }
+ }
+
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ last_aper_enabled = aper_enabled;
+ last_valid = 1;
+ }
+ }
+
+ if (!fix && !aper_enabled)
+ return;
+
+ if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
+ fix = 1;
+
+ if (gart_fix_e820 && !fix && aper_enabled) {
+ if (e820_any_mapped(aper_base, aper_base + aper_size,
+ E820_RAM)) {
+ /* reserve it, so we can reuse it in second kernel */
+ printk(KERN_INFO "update e820 for GART\n");
+ e820_add_region(aper_base, aper_size, E820_RESERVED);
+ update_e820();
+ }
+ }
+
+ if (valid_agp)
+ return;
+
+ /* disable them all at first */
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ }
+ }
+
+}
+
+static int __initdata printed_gart_size_msg;
+
+int __init gart_iommu_hole_init(void)
+{
+ u32 agp_aper_base = 0, agp_aper_order = 0;
+ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
+ u64 aper_base, last_aper_base = 0;
+ int fix, slot, valid_agp = 0;
+ int i, node;
+
+ if (gart_iommu_aperture_disabled || !fix_aperture ||
+ !early_pci_allowed())
+ return -ENODEV;
+
+ printk(KERN_INFO "Checking aperture...\n");
+
+ if (!fallback_aper_force)
+ agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
+ fix = 0;
+ node = 0;
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus;
+ int dev_base, dev_limit;
+ u32 ctl;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ iommu_detected = 1;
+ gart_iommu_aperture = 1;
+ x86_init.iommu.iommu_init = gart_iommu_init;
+
+ ctl = read_pci_config(bus, slot, 3,
+ AMD64_GARTAPERTURECTL);
+
+ /*
+ * Before we do anything else disable the GART. It may
+ * still be enabled if we boot into a crash-kernel here.
+ * Reconfiguring the GART while it is enabled could have
+ * unknown side-effects.
+ */
+ ctl &= ~GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+
+ aper_order = (ctl >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
+ node, aper_base, aper_size >> 20);
+ node++;
+
+ if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+ if (valid_agp && agp_aper_base &&
+ agp_aper_base == aper_base &&
+ agp_aper_order == aper_order) {
+ /* the NB and AGP settings are the same */
+ if (!no_iommu &&
+ max_pfn > MAX_DMA32_PFN &&
+ !printed_gart_size_msg) {
+ printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
+ printk(KERN_ERR "please increase GART size in your BIOS setup\n");
+ printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ printed_gart_size_msg = 1;
+ }
+ } else {
+ fix = 1;
+ goto out;
+ }
+ }
+
+ if ((last_aper_order && aper_order != last_aper_order) ||
+ (last_aper_base && aper_base != last_aper_base)) {
+ fix = 1;
+ goto out;
+ }
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ }
+ }
+
+out:
+ if (!fix && !fallback_aper_force) {
+ if (last_aper_base) {
+ unsigned long n = (32 * 1024 * 1024) << last_aper_order;
+
+ insert_aperture_resource((u32)last_aper_base, n);
+ return 1;
+ }
+ return 0;
+ }
+
+ if (!fallback_aper_force) {
+ aper_alloc = agp_aper_base;
+ aper_order = agp_aper_order;
+ }
+
+ if (aper_alloc) {
+ /* Got the aperture from the AGP bridge */
+ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
+ force_iommu ||
+ valid_agp ||
+ fallback_aper_force) {
+ printk(KERN_INFO
+ "Your BIOS doesn't leave a aperture memory hole\n");
+ printk(KERN_INFO
+ "Please enable the IOMMU option in the BIOS setup\n");
+ printk(KERN_INFO
+ "This costs you %d MB of RAM\n",
+ 32 << fallback_aper_order);
+
+ aper_order = fallback_aper_order;
+ aper_alloc = allocate_aperture();
+ if (!aper_alloc) {
+ /*
+ * Could disable AGP and IOMMU here, but it's
+ * probably not worth it. But the later users
+ * cannot deal with bad apertures and turning
+ * on the aperture over memory causes very
+ * strange problems, so it's better to panic
+ * early.
+ */
+ panic("Not enough memory for aperture");
+ }
+ } else {
+ return 0;
+ }
+
+ /* Fix up the north bridges */
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+ int bus, dev_base, dev_limit;
+
+ /*
+ * Don't enable translation yet but enable GART IO and CPU
+ * accesses and set DISTLBWALKPRB since GART table memory is UC.
+ */
+ u32 ctl = aper_order << 1;
+
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
+ }
+ }
+
+ set_up_gart_resume(aper_order, aper_alloc);
+
+ return 1;
+}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
new file mode 100644
index 00000000000..0ae0323b1f9
--- /dev/null
+++ b/arch/x86/kernel/apic/Makefile
@@ -0,0 +1,27 @@
+#
+# Makefile for local APIC drivers and for the IO-APIC code
+#
+
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
+obj-y += hw_nmi.o
+
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_SMP) += ipi.o
+
+ifeq ($(CONFIG_X86_64),y)
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
+obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
+obj-y += apic_flat_64.o
+endif
+
+# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_SUMMIT) += summit_32.o
+obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
+obj-$(CONFIG_X86_ES7000) += es7000_32.o
+
+# For 32bit, probe_32 needs to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
new file mode 100644
index 00000000000..2eec05b6d1b
--- /dev/null
+++ b/arch/x86/kernel/apic/apic.c
@@ -0,0 +1,2464 @@
+/*
+ * Local APIC handling, local APIC timers
+ *
+ * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively.
+ * Maciej W. Rozycki : Various updates and fixes.
+ * Mikael Pettersson : Power Management for UP-APIC.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi_pmtmr.h>
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/timex.h>
+#include <linux/i8253.h>
+#include <linux/dmar.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/perf_event.h>
+#include <asm/x86_init.h>
+#include <asm/pgalloc.h>
+#include <linux/atomic.h>
+#include <asm/mpspec.h>
+#include <asm/i8259.h>
+#include <asm/proto.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/desc.h>
+#include <asm/hpet.h>
+#include <asm/idle.h>
+#include <asm/mtrr.h>
+#include <asm/time.h>
+#include <asm/smp.h>
+#include <asm/mce.h>
+#include <asm/tsc.h>
+#include <asm/hypervisor.h>
+
+unsigned int num_processors;
+
+unsigned disabled_cpus __cpuinitdata;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+
+/*
+ * The highest APIC ID seen during enumeration.
+ */
+unsigned int max_physical_apicid;
+
+/*
+ * Bitmask of physically existing CPUs:
+ */
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
+#ifdef CONFIG_X86_32
+
+/*
+ * On x86_32, the mapping between cpu and logical apicid may vary
+ * depending on apic in use. The following early percpu variable is
+ * used for the mapping. This is where the behaviors of x86_64 and 32
+ * actually diverge. Let's keep it ugly for now.
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+
+/*
+ * Knob to control our willingness to enable the local APIC.
+ *
+ * +1=force-enable
+ */
+static int force_enable_local_apic __initdata;
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+ force_enable_local_apic = 1;
+ return 0;
+}
+early_param("lapic", parse_lapic);
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline void imcr_pic_to_apic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go through APIC */
+ outb(0x01, 0x23);
+}
+
+static inline void imcr_apic_to_pic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go directly to BSP */
+ outb(0x00, 0x23);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+static int apic_calibrate_pmtmr __initdata;
+static __init int setup_apicpmtimer(char *s)
+{
+ apic_calibrate_pmtmr = 1;
+ notsc_setup(NULL);
+ return 0;
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+#endif
+
+int x2apic_mode;
+#ifdef CONFIG_X86_X2APIC
+/* x2apic enabled before OS handover */
+int x2apic_preenabled;
+static int x2apic_disabled;
+static int nox2apic;
+static __init int setup_nox2apic(char *str)
+{
+ if (x2apic_enabled()) {
+ int apicid = native_apic_msr_read(APIC_ID);
+
+ if (apicid >= 255) {
+ pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+
+ pr_warning("x2apic already enabled. will disable it\n");
+ } else
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+
+ nox2apic = 1;
+
+ return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+#endif
+
+unsigned long mp_lapic_addr;
+int disable_apic;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
+static int disable_apic_timer __initdata;
+/* Local APIC timer works in C2 */
+int local_apic_timer_c2_ok;
+EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+
+int first_system_vector = 0xfe;
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+unsigned int apic_verbosity;
+
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
+static struct resource lapic_resource = {
+ .name = "Local APIC",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
+unsigned int lapic_timer_frequency = 0;
+
+static void apic_pm_activate(void);
+
+static unsigned long apic_phys;
+
+/*
+ * Get the LAPIC version
+ */
+static inline int lapic_get_version(void)
+{
+ return GET_APIC_VERSION(apic_read(APIC_LVR));
+}
+
+/*
+ * Check if the APIC is integrated or a separate chip
+ */
+static inline int lapic_is_integrated(void)
+{
+#ifdef CONFIG_X86_64
+ return 1;
+#else
+ return APIC_INTEGRATED(lapic_get_version());
+#endif
+}
+
+/*
+ * Check whether this is a modern or a first-generation APIC
+ */
+static int modern_apic(void)
+{
+ /* AMD systems use old APIC versions, so check the CPU */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+ boot_cpu_data.x86 >= 0xf)
+ return 1;
+ return lapic_get_version() >= 0x14;
+}
+
+/*
+ * Right after this call the apic becomes NOOP-driven,
+ * so apic->write/read don't do anything.
+ */
+static void __init apic_disable(void)
+{
+ pr_info("APIC: switched to apic NOOP\n");
+ apic = &apic_noop;
+}
+
+void native_apic_wait_icr_idle(void)
+{
+ while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
+ cpu_relax();
+}
+
+u32 native_safe_apic_wait_icr_idle(void)
+{
+ u32 send_status;
+ int timeout;
+
+ timeout = 0;
+ do {
+ send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+ if (!send_status)
+ break;
+ inc_irq_stat(icr_read_retry_count);
+ udelay(100);
+ } while (timeout++ < 1000);
+
+ return send_status;
+}
+
+void native_apic_icr_write(u32 low, u32 id)
+{
+ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+ apic_write(APIC_ICR, low);
+}
+
+u64 native_apic_icr_read(void)
+{
+ u32 icr1, icr2;
+
+ icr2 = apic_read(APIC_ICR2);
+ icr1 = apic_read(APIC_ICR);
+
+ return icr1 | ((u64)icr2 << 32);
+}
+
+#ifdef CONFIG_X86_32
+/**
+ * get_physical_broadcast - Get number of physical broadcast IDs
+ */
+int get_physical_broadcast(void)
+{
+ return modern_apic() ? 0xff : 0xf;
+}
+#endif
+
+/**
+ * lapic_get_maxlvt - get the maximum number of local vector table entries
+ */
+int lapic_get_maxlvt(void)
+{
+ unsigned int v;
+
+ v = apic_read(APIC_LVR);
+ /*
+ * - we always have APIC integrated on 64bit mode
+ * - 82489DXs do not report # of LVT entries
+ */
+ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
+}
+
+/*
+ * Local APIC timer
+ */
+
+/* Clock divisor */
+#define APIC_DIVISOR 16
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
+{
+ unsigned int lvtt_value, tmp_value;
+
+ lvtt_value = LOCAL_TIMER_VECTOR;
+ if (!oneshot)
+ lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+ if (!lapic_is_integrated())
+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+
+ if (!irqen)
+ lvtt_value |= APIC_LVT_MASKED;
+
+ apic_write(APIC_LVTT, lvtt_value);
+
+ /*
+ * Divide PICLK by 16
+ */
+ tmp_value = apic_read(APIC_TDCR);
+ apic_write(APIC_TDCR,
+ (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+ APIC_TDR_DIV_16);
+
+ if (!oneshot)
+ apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
+}
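+
+/*
+ * Example with hypothetical numbers: for a 1 GHz bus clock and a wanted
+ * interrupt every 1000000 bus clocks, periodic mode writes an initial
+ * count of clocks / APIC_DIVISOR = 1000000 / 16 = 62500 to APIC_TMICT.
+ */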
+
+/*
+ * Setup extended LVT, AMD specific
+ *
+ * Software should use the LVT offsets the BIOS provides. The offsets
+ * are determined by the subsystems using it like those for MCE
+ * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
+ * are supported. Beginning with family 10h at least 4 offsets are
+ * available.
+ *
+ * Since the offsets must be consistent for all cores, we keep track
+ * of the LVT offsets in software and reserve the offset for the same
+ * vector also to be used on other cores. An offset is freed by
+ * setting the entry to APIC_EILVT_MASKED.
+ *
+ * If the BIOS is right, there should be no conflicts. Otherwise a
+ * "[Firmware Bug]: ..." error message is generated. However, if
+ * software does not properly determine the offsets, it is not
+ * necessarily a BIOS bug.
+ */
+
+static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
+
+static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
+{
+ return (old & APIC_EILVT_MASKED)
+ || (new == APIC_EILVT_MASKED)
+ || ((new & ~APIC_EILVT_MASKED) == old);
+}
+
+static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
+{
+ unsigned int rsvd; /* 0: uninitialized */
+
+ if (offset >= APIC_EILVT_NR_MAX)
+ return ~0;
+
+ rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
+ do {
+ if (rsvd &&
+ !eilvt_entry_is_changeable(rsvd, new))
+ /* may not change if vectors are different */
+ return rsvd;
+ rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
+ } while (rsvd != new);
+
+ return new;
+}
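+
+/*
+ * The cmpxchg loop above is a lock-free reservation: each iteration
+ * re-reads the current owner and either detects a conflicting,
+ * unchangeable reservation (returning it so the caller can report the
+ * clash) or retries until our value is the one stored. No spinlock is
+ * needed even though several cores may race here.
+ */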
+
+/*
+ * If mask=1, the LVT entry does not generate interrupts, while mask=0
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
+ */
+
+int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
+{
+ unsigned long reg = APIC_EILVTn(offset);
+ unsigned int new, old, reserved;
+
+ new = (mask << 16) | (msg_type << 8) | vector;
+ old = apic_read(reg);
+ reserved = reserve_eilvt_offset(offset, new);
+
+ if (reserved != new) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on another cpu\n",
+ smp_processor_id(), reg, offset, new, reserved);
+ return -EINVAL;
+ }
+
+ if (!eilvt_entry_is_changeable(old, new)) {
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on this cpu\n",
+ smp_processor_id(), reg, offset, new, old);
+ return -EBUSY;
+ }
+
+ apic_write(reg, new);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
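+
+/*
+ * Usage sketch (hypothetical values, not part of this file): a
+ * subsystem such as the IBS driver would reserve its BIOS-assigned
+ * offset with
+ *
+ *	setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0);
+ *
+ * and free it again by installing a masked entry:
+ *
+ *	setup_APIC_eilvt(offset, 0, 0, 1);
+ */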
+
+/*
+ * Program the next event, relative to now
+ */
+static int lapic_next_event(unsigned long delta,
+ struct clock_event_device *evt)
+{
+ apic_write(APIC_TMICT, delta);
+ return 0;
+}
+
+/*
+ * Setup the lapic timer in periodic or oneshot mode
+ */
+static void lapic_timer_setup(enum clock_event_mode mode,
+ struct clock_event_device *evt)
+{
+ unsigned long flags;
+ unsigned int v;
+
+ /* Lapic used as a dummy for broadcast? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+ return;
+
+ local_irq_save(flags);
+
+ switch (mode) {
+ case CLOCK_EVT_MODE_PERIODIC:
+ case CLOCK_EVT_MODE_ONESHOT:
+ __setup_APIC_LVTT(lapic_timer_frequency,
+ mode != CLOCK_EVT_MODE_PERIODIC, 1);
+ break;
+ case CLOCK_EVT_MODE_UNUSED:
+ case CLOCK_EVT_MODE_SHUTDOWN:
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, v);
+ apic_write(APIC_TMICT, 0);
+ break;
+ case CLOCK_EVT_MODE_RESUME:
+ /* Nothing to do here */
+ break;
+ }
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Local APIC timer broadcast function
+ */
+static void lapic_timer_broadcast(const struct cpumask *mask)
+{
+#ifdef CONFIG_SMP
+ apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
+#endif
+}
+
+
+/*
+ * The local apic timer can be used for any function which is CPU local.
+ */
+static struct clock_event_device lapic_clockevent = {
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+ | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_mode = lapic_timer_setup,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
+};
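+
+/*
+ * A sketch of the generic clockevents arithmetic this relies on: with
+ * .shift = 32 the core converts a relative time in nanoseconds to
+ * timer ticks as ticks = (ns * mult) >> 32, and the calibration code
+ * below derives mult from the measured ticks-per-jiffy via div_sc().
+ */
+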
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+/*
+ * Setup the local APIC timer for this CPU. Copy the initialized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void __cpuinit setup_APIC_timer(void)
+{
+ struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+
+ if (this_cpu_has(X86_FEATURE_ARAT)) {
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
+ /* Make LAPIC timer preferable over percpu HPET */
+ lapic_clockevent.rating = 150;
+ }
+
+ memcpy(levt, &lapic_clockevent, sizeof(*levt));
+ levt->cpumask = cpumask_of(smp_processor_id());
+
+ clockevents_register_device(levt);
+}
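+
+/*
+ * Background on the ARAT check above: CPUs with the Always Running
+ * APIC Timer feature keep the LAPIC timer ticking even in deep
+ * C-states, so the C3STOP workaround (falling back to a broadcast
+ * device) is unnecessary and the rating can be raised above that of
+ * the per-cpu HPET channels.
+ */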
+
+/*
+ * In this function we calibrate the APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once since we want to have local timer
+ * irqs synchronous. CPUs connected by the same APIC bus have the very same bus
+ * frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a
+ * wrap-around to find out that a tick has elapsed. I have a box where the PIT
+ * readout is broken, so it never gets out of the wait loop again. This was
+ * also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
+#define LAPIC_CAL_LOOPS (HZ/10)
+
+static __initdata int lapic_cal_loops = -1;
+static __initdata long lapic_cal_t1, lapic_cal_t2;
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
+
+/*
+ * Temporary interrupt handler.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
+{
+ unsigned long long tsc = 0;
+ long tapic = apic_read(APIC_TMCCT);
+ unsigned long pm = acpi_pm_read_early();
+
+ if (cpu_has_tsc)
+ rdtscll(tsc);
+
+ switch (lapic_cal_loops++) {
+ case 0:
+ lapic_cal_t1 = tapic;
+ lapic_cal_tsc1 = tsc;
+ lapic_cal_pm1 = pm;
+ lapic_cal_j1 = jiffies;
+ break;
+
+ case LAPIC_CAL_LOOPS:
+ lapic_cal_t2 = tapic;
+ lapic_cal_tsc2 = tsc;
+ if (pm < lapic_cal_pm1)
+ pm += ACPI_PM_OVRRUN;
+ lapic_cal_pm2 = pm;
+ lapic_cal_j2 = jiffies;
+ break;
+ }
+}
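+
+/*
+ * The handler above runs once per tick of the global clock event
+ * device; samples are taken on the iterations where the loop counter
+ * reads 0 and LAPIC_CAL_LOOPS, so the measurement window spans
+ * exactly LAPIC_CAL_LOOPS ticks, i.e. HZ/10 ticks or 100 ms.
+ */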
+
+static int __init
+calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+{
+ const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
+ const long pm_thresh = pm_100ms / 100;
+ unsigned long mult;
+ u64 res;
+
+#ifndef CONFIG_X86_PM_TIMER
+ return -1;
+#endif
+
+ apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+
+ /* Check if the PM timer is available */
+ if (!deltapm)
+ return -1;
+
+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+ if (deltapm > (pm_100ms - pm_thresh) &&
+ deltapm < (pm_100ms + pm_thresh)) {
+ apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ return 0;
+ }
+
+ res = (((u64)deltapm) * mult) >> 22;
+ do_div(res, 1000000);
+ pr_warning("APIC calibration not consistent "
+ "with PM-Timer: %ldms instead of 100ms\n",(long)res);
+
+ /* Correct the lapic counter value */
+ res = (((u64)(*delta)) * pm_100ms);
+ do_div(res, deltapm);
+ pr_info("APIC delta adjusted to PM-Timer: "
+ "%lu (%ld)\n", (unsigned long)res, *delta);
+ *delta = (long)res;
+
+ /* Correct the tsc counter value */
+ if (cpu_has_tsc) {
+ res = (((u64)(*deltatsc)) * pm_100ms);
+ do_div(res, deltapm);
+ apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
+ "PM-Timer: %lu (%ld)\n",
+ (unsigned long)res, *deltatsc);
+ *deltatsc = (long)res;
+ }
+
+ return 0;
+}
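+
+/*
+ * Worked numbers for the check above (illustrative): PMTMR_TICKS_PER_SEC
+ * is 3579545, so pm_100ms is roughly 357954 PM timer ticks and
+ * pm_thresh allows about 1% deviation. clocksource_hz2mult(hz, 22)
+ * yields roughly (10^9 << 22) / hz, hence (deltapm * mult) >> 22
+ * converts the delta to nanoseconds and the do_div by 10^6 produces
+ * the milliseconds printed in the warning.
+ */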
+
+static int __init calibrate_APIC_clock(void)
+{
+ struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+ void (*real_handler)(struct clock_event_device *dev);
+ unsigned long deltaj;
+ long delta, deltatsc;
+ int pm_referenced = 0;
+
+ /*
+ * Check if the lapic timer has already been calibrated by a platform
+ * specific routine, such as the tsc calibration code. If so, just fill
+ * in the clockevent structure and return.
+ */
+
+ if (lapic_timer_frequency) {
+ apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
+ lapic_timer_frequency);
+ lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
+ TICK_NSEC, lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+ return 0;
+ }
+
+ local_irq_disable();
+
+ /* Replace the global interrupt handler */
+ real_handler = global_clock_event->event_handler;
+ global_clock_event->event_handler = lapic_cal_handler;
+
+ /*
+ * Setup the APIC counter to maximum. There is no way the lapic
+ * can underflow in the 100ms detection time frame
+ */
+ __setup_APIC_LVTT(0xffffffff, 0, 0);
+
+ /* Let the interrupts run */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+ cpu_relax();
+
+ local_irq_disable();
+
+ /* Restore the real event handler */
+ global_clock_event->event_handler = real_handler;
+
+ /* Build delta t1-t2 as apic timer counts down */
+ delta = lapic_cal_t1 - lapic_cal_t2;
+ apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+
+ deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
+
+ /* we trust the PM based calibration if possible */
+ pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
+ &delta, &deltatsc);
+
+ /* Calculate the scaled math multiplication factor */
+ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
+ lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+
+ lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
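+
+ /*
+ * The conversion above: delta is the number of divided-by-16 timer
+ * ticks seen over LAPIC_CAL_LOOPS jiffies, so multiplying by
+ * APIC_DIVISOR and dividing by the loop count yields bus clocks per
+ * jiffy. E.g. a 1 GHz bus at HZ=1000 gives delta = 6250000 over 100
+ * loops and a result of 1000000.
+ */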
+
+ apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
+ apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
+ apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
+ lapic_timer_frequency);
+
+ if (cpu_has_tsc) {
+ apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
+ "%ld.%04ld MHz.\n",
+ (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ }
+
+ apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
+ "%u.%04u MHz.\n",
+ lapic_timer_frequency / (1000000 / HZ),
+ lapic_timer_frequency % (1000000 / HZ));
+
+ /*
+ * Do a sanity check on the APIC calibration result
+ */
+ if (lapic_timer_frequency < (1000000 / HZ)) {
+ local_irq_enable();
+ pr_warning("APIC frequency too slow, disabling apic timer\n");
+ return -1;
+ }
+
+ levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+ /*
+ * PM timer calibration failed or not turned on
+ * so let's try APIC timer based calibration
+ */
+ if (!pm_referenced) {
+ apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+
+ /*
+ * Setup the apic timer manually
+ */
+ levt->event_handler = lapic_cal_handler;
+ lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+ lapic_cal_loops = -1;
+
+ /* Let the interrupts run */
+ local_irq_enable();
+
+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS)
+ cpu_relax();
+
+ /* Stop the lapic timer */
+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+
+ /* Jiffies delta */
+ deltaj = lapic_cal_j2 - lapic_cal_j1;
+ apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+
+ /* Check if the jiffies result is consistent */
+ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
+ apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ else
+ levt->features |= CLOCK_EVT_FEAT_DUMMY;
+ } else
+ local_irq_enable();
+
+ if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
+ pr_warning("APIC timer disabled due to verification failure\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+ /*
+ * The local apic timer can be disabled via the kernel
+ * commandline or from the CPU detection code. Register the lapic
+ * timer as a dummy clock event source on SMP systems, so the
+ * broadcast mechanism is used. On UP systems simply ignore it.
+ */
+ if (disable_apic_timer) {
+ pr_info("Disabling APIC timer\n");
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() > 1) {
+ lapic_clockevent.mult = 1;
+ setup_APIC_timer();
+ }
+ return;
+ }
+
+ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+ "calibrating APIC timer ...\n");
+
+ if (calibrate_APIC_clock()) {
+ /* No broadcast on UP ! */
+ if (num_possible_cpus() > 1)
+ setup_APIC_timer();
+ return;
+ }
+
+ /*
+ * If nmi_watchdog is set to IO_APIC, we need the
+ * PIT/HPET going. Otherwise register lapic as a dummy
+ * device.
+ */
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+ /* Setup the lapic or request the broadcast */
+ setup_APIC_timer();
+}
+
+void __cpuinit setup_secondary_APIC_clock(void)
+{
+ setup_APIC_timer();
+}
+
+/*
+ * The guts of the apic timer interrupt
+ */
+static void local_apic_timer_interrupt(void)
+{
+ int cpu = smp_processor_id();
+ struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
+
+ /*
+ * Normally we should not be here till the LAPIC has been initialized, but
+ * in some cases like kdump it's possible that there is a pending LAPIC
+ * timer interrupt from the previous kernel's context which is delivered in
+ * the new kernel the moment interrupts are enabled.
+ *
+ * Interrupts are enabled early and the LAPIC is set up much later, hence
+ * it's possible that when we get here evt->event_handler is NULL.
+ * Check for event_handler being NULL and discard the interrupt as
+ * spurious.
+ */
+ if (!evt->event_handler) {
+ pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
+ /* Switch it off */
+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+ return;
+ }
+
+ /*
+ * the NMI deadlock-detector uses this.
+ */
+ inc_irq_stat(apic_timer_irqs);
+
+ evt->event_handler(evt);
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ * interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ */
+ ack_APIC_irq();
+ /*
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't, timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ irq_enter();
+ exit_idle();
+ local_apic_timer_interrupt();
+ irq_exit();
+
+ set_irq_regs(old_regs);
+}
+
+int setup_profiling_timer(unsigned int multiplier)
+{
+ return -EINVAL;
+}
+
+/*
+ * Local APIC start and shutdown
+ */
+
+/**
+ * clear_local_APIC - shutdown the local APIC
+ *
+ * This is called when a CPU is disabled and before rebooting, so the state of
+ * the local APIC has no dangling leftovers. Also used to clean out any BIOS
+ * leftovers during boot.
+ */
+void clear_local_APIC(void)
+{
+ int maxlvt;
+ u32 v;
+
+ /* APIC hasn't been mapped yet */
+ if (!x2apic_mode && !apic_phys)
+ return;
+
+ maxlvt = lapic_get_maxlvt();
+ /*
+ * Masking an LVT entry can trigger a local APIC error
+ * if the vector is zero. Mask LVTERR first to prevent this.
+ */
+ if (maxlvt >= 3) {
+ v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+ apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+ }
+ /*
+ * Careful: we have to set masks only first to deassert
+ * any level-triggered sources.
+ */
+ v = apic_read(APIC_LVTT);
+ apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT1);
+ apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
+ if (maxlvt >= 4) {
+ v = apic_read(APIC_LVTPC);
+ apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
+ }
+
+ /* let's not touch this if we didn't frob it */
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5) {
+ v = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+ }
+#endif
+#ifdef CONFIG_X86_MCE_INTEL
+ if (maxlvt >= 6) {
+ v = apic_read(APIC_LVTCMCI);
+ if (!(v & APIC_LVT_MASKED))
+ apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+ }
+#endif
+
+ /*
+ * Clean APIC state for other OSs:
+ */
+ apic_write(APIC_LVTT, APIC_LVT_MASKED);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
+ apic_write(APIC_LVT1, APIC_LVT_MASKED);
+ if (maxlvt >= 3)
+ apic_write(APIC_LVTERR, APIC_LVT_MASKED);
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, APIC_LVT_MASKED);
+
+ /* Integrated APIC (!82489DX) ? */
+ if (lapic_is_integrated()) {
+ if (maxlvt > 3)
+ /* Clear ESR due to Pentium errata 3AP and 11AP */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+}
+
+/**
+ * disable_local_APIC - clear and disable the local APIC
+ */
+void disable_local_APIC(void)
+{
+ unsigned int value;
+
+ /* APIC hasn't been mapped yet */
+ if (!x2apic_mode && !apic_phys)
+ return;
+
+ clear_local_APIC();
+
+ /*
+ * Disable APIC (implies clearing of registers
+ * for 82489DX!).
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write(APIC_SPIV, value);
+
+#ifdef CONFIG_X86_32
+ /*
+ * When LAPIC was disabled by the BIOS and enabled by the kernel,
+ * restore the disabled state.
+ */
+ if (enabled_via_apicbase) {
+ unsigned int l, h;
+
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_ENABLE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
+#endif
+}
+
+/*
+ * If Linux enabled the LAPIC against the BIOS default, shut it down before
+ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
+ * not power off. Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
+ */
+void lapic_shutdown(void)
+{
+ unsigned long flags;
+
+ if (!cpu_has_apic && !apic_from_smp_config())
+ return;
+
+ local_irq_save(flags);
+
+#ifdef CONFIG_X86_32
+ if (!enabled_via_apicbase)
+ clear_local_APIC();
+ else
+#endif
+ disable_local_APIC();
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This is to verify that we're looking at a real local APIC.
+ * Check these against your board if the CPUs aren't getting
+ * started for no apparent reason.
+ */
+int __init verify_local_APIC(void)
+{
+ unsigned int reg0, reg1;
+
+ /*
+ * The version register is read-only in a real APIC.
+ */
+ reg0 = apic_read(APIC_LVR);
+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
+ apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+ reg1 = apic_read(APIC_LVR);
+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+
+ /*
+ * The two version reads above should print the same
+ * numbers. If the second one is different, then we
+ * poke at a non-APIC.
+ */
+ if (reg1 != reg0)
+ return 0;
+
+ /*
+ * Check if the version looks reasonable.
+ */
+ reg1 = GET_APIC_VERSION(reg0);
+ if (reg1 == 0x00 || reg1 == 0xff)
+ return 0;
+ reg1 = lapic_get_maxlvt();
+ if (reg1 < 0x02 || reg1 == 0xff)
+ return 0;
+
+ /*
+ * The ID register is read/write in a real APIC.
+ */
+ reg0 = apic_read(APIC_ID);
+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+ apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
+ reg1 = apic_read(APIC_ID);
+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+ apic_write(APIC_ID, reg0);
+ if (reg1 != (reg0 ^ apic->apic_id_mask))
+ return 0;
+
+ /*
+ * The next two are just to see if we have sane values.
+ * They're only really relevant if we're in Virtual Wire
+ * compatibility mode, but most boxes are these days.
+ */
+ reg0 = apic_read(APIC_LVT0);
+ apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
+ reg1 = apic_read(APIC_LVT1);
+ apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
+
+ return 1;
+}
+
+/**
+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs
+ */
+void __init sync_Arb_IDs(void)
+{
+ /*
+ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1. Not
+ * needed on AMD.
+ */
+ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC |
+ APIC_INT_LEVELTRIG | APIC_DM_INIT);
+}
+
+/*
+ * An initial setup of the virtual wire mode.
+ */
+void __init init_bsp_APIC(void)
+{
+ unsigned int value;
+
+ /*
+ * Don't do the setup now if we have an SMP BIOS as the
+ * through-I/O-APIC virtual wire mode might be active.
+ */
+ if (smp_found_config || !cpu_has_apic)
+ return;
+
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+
+ /*
+ * Enable APIC.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+
+#ifdef CONFIG_X86_32
+ /* This bit is reserved on P4/Xeon and should be cleared */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 15))
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+ else
+#endif
+ value |= APIC_SPIV_FOCUS_DISABLED;
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write(APIC_SPIV, value);
+
+ /*
+ * Set up the virtual wire mode.
+ */
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ value = APIC_DM_NMI;
+ if (!lapic_is_integrated()) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write(APIC_LVT1, value);
+}
+
+static void __cpuinit lapic_setup_esr(void)
+{
+ unsigned int oldvalue, value, maxlvt;
+
+ if (!lapic_is_integrated()) {
+ pr_info("No ESR for 82489DX.\n");
+ return;
+ }
+
+ if (apic->disable_esr) {
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ pr_info("Leaving ESR disabled.\n");
+ return;
+ }
+
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write(APIC_LVTERR, value);
+
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+}
+
+/**
+ * setup_local_APIC - setup the local APIC
+ *
+ * Used to set up the local APIC while initializing the BSP or bringing up APs.
+ * Always called with preemption disabled.
+ */
+void __cpuinit setup_local_APIC(void)
+{
+ int cpu = smp_processor_id();
+ unsigned int value, queued;
+ int i, j, acked = 0;
+ unsigned long long tsc = 0, ntsc;
+ long long max_loops = cpu_khz;
+
+ if (cpu_has_tsc)
+ rdtscll(tsc);
+
+ if (disable_apic) {
+ disable_ioapic_support();
+ return;
+ }
+
+#ifdef CONFIG_X86_32
+ /* Pound the ESR really hard over the head with a big hammer - mbligh */
+ if (lapic_is_integrated() && apic->disable_esr) {
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ }
+#endif
+ perf_events_lapic_init();
+
+ /*
+ * Double-check whether this APIC is really registered.
+ * This is meaningless in clustered apic mode, so we skip it.
+ */
+ BUG_ON(!apic->apic_id_registered());
+
+ /*
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+ apic->init_apic_ldr();
+
+#ifdef CONFIG_X86_32
+ /*
+ * APIC LDR is initialized. If logical_apicid mapping was
+ * initialized during get_smp_config(), make sure it matches the
+ * actual value.
+ */
+ i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+ WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
+ /* always use the value from LDR */
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ logical_smp_processor_id();
+
+ /*
+ * Some NUMA implementations (NUMAQ) don't initialize apicid to
+ * node mapping during NUMA init. Now that logical apicid is
+ * guaranteed to be known, give it another chance. This is already
+ * a bit too late - percpu allocation has already happened without
+ * proper NUMA affinity.
+ */
+ if (apic->x86_32_numa_cpu_node)
+ set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
+ apic->x86_32_numa_cpu_node(cpu));
+#endif
+
+ /*
+ * Set Task Priority to 'accept all'. We never change this
+ * later on.
+ */
+ value = apic_read(APIC_TASKPRI);
+ value &= ~APIC_TPRI_MASK;
+ apic_write(APIC_TASKPRI, value);
+
+ /*
+ * After a crash, we no longer service the interrupts and a pending
+ * interrupt from previous kernel might still have ISR bit set.
+ *
+ * Most probably by now the CPU has serviced that pending interrupt and
+ * it might not have done the ack_APIC_irq() because it thought the
+ * interrupt came from i8259 as ExtInt. The LAPIC did not get an EOI so it
+ * does not clear the ISR bit and the cpu thinks it has already serviced
+ * the interrupt. Hence a vector might get locked. It was noticed
+ * for the timer irq (vector 0x31). Issue an extra EOI to clear ISR.
+ */
+ do {
+ queued = 0;
+ for (i = APIC_ISR_NR - 1; i >= 0; i--)
+ queued |= apic_read(APIC_IRR + i*0x10);
+
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j)) {
+ ack_APIC_irq();
+ acked++;
+ }
+ }
+ }
+ if (acked > 256) {
+ printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+ acked);
+ break;
+ }
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ } while (queued && max_loops > 0);
+ WARN_ON(max_loops <= 0);
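+
+ /*
+ * Note on the bound above: max_loops starts at cpu_khz and, once a
+ * TSC reading is available, is re-derived as (cpu_khz << 10) minus
+ * the TSC cycles spent so far, so the drain loop is limited to
+ * roughly one second before the WARN_ON() fires.
+ */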
+
+ /*
+ * Now that we are all set up, enable the APIC
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ /*
+ * Enable APIC
+ */
+ value |= APIC_SPIV_APIC_ENABLED;
+
+#ifdef CONFIG_X86_32
+ /*
+ * Some unknown Intel IO/APIC (or APIC) errata is biting us with
+ * certain networking cards. If high frequency interrupts are
+ * happening on a particular IOAPIC pin, plus the IOAPIC routing
+ * entry is masked/unmasked at a high rate as well then sooner or
+ * later IOAPIC line gets 'stuck', no more interrupts are received
+ * from the device. If focus CPU is disabled then the hang goes
+ * away, oh well :-(
+ *
+ * [ This bug can be reproduced easily with a level-triggered
+ * PCI Ne2000 networking cards and PII/PIII processors, dual
+ * BX chipset. ]
+ */
+ /*
+ * Actually disabling the focus CPU check just makes the hang less
+ * frequent as it makes the interrupt distribution model more
+ * like LRU than MRU (the short-term load is more even across CPUs).
+ * See also the comment in end_level_ioapic_irq(). --macro
+ */
+
+ /*
+ * - enable focus processor (bit==0)
+ * - 64bit mode always use processor focus
+ * so no need to set it
+ */
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+#endif
+
+ /*
+ * Set spurious IRQ vector
+ */
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write(APIC_SPIV, value);
+
+ /*
+ * Set up LVT0, LVT1:
+ *
+ * set up through-local-APIC on the BP's LINT0. This is not
+ * strictly necessary in pure symmetric-IO mode, but sometimes
+ * we delegate interrupts to the 8259A.
+ */
+ /*
+ * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+ */
+ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
+ if (!cpu && (pic_mode || !value)) {
+ value = APIC_DM_EXTINT;
+ apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
+ } else {
+ value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+ apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
+ }
+ apic_write(APIC_LVT0, value);
+
+ /*
+ * only the BP should see the LINT1 NMI signal, obviously.
+ */
+ if (!cpu)
+ value = APIC_DM_NMI;
+ else
+ value = APIC_DM_NMI | APIC_LVT_MASKED;
+ if (!lapic_is_integrated()) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write(APIC_LVT1, value);
+
+#ifdef CONFIG_X86_MCE_INTEL
+ /* Recheck CMCI information after local APIC is up on CPU #0 */
+ if (!cpu)
+ cmci_recheck();
+#endif
+}
+
+void __cpuinit end_local_APIC_setup(void)
+{
+ lapic_setup_esr();
+
+#ifdef CONFIG_X86_32
+ {
+ unsigned int value;
+ /* Disable the local apic timer */
+ value = apic_read(APIC_LVTT);
+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, value);
+ }
+#endif
+
+ apic_pm_activate();
+}
+
+void __init bsp_end_local_APIC_setup(void)
+{
+ end_local_APIC_setup();
+
+ /*
+ * Now that local APIC setup is completed for BP, configure the fault
+ * handling for interrupt remapping.
+ */
+ if (intr_remapping_enabled)
+ enable_drhd_fault_handling();
+}
+
+#ifdef CONFIG_X86_X2APIC
+/*
+ * The CPU does not allow dropping straight from x2apic back to xapic mode:
+ * both enable bits have to be cleared first (fully disabling the APIC)
+ * before xapic mode can be re-enabled.
+ */
+static inline void __disable_x2apic(u64 msr)
+{
+ wrmsrl(MSR_IA32_APICBASE,
+ msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+}
+
+static __init void disable_x2apic(void)
+{
+ u64 msr;
+
+ if (!cpu_has_x2apic)
+ return;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE) {
+ u32 x2apic_id = read_apic_id();
+
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ pr_info("Disabling x2apic\n");
+ __disable_x2apic(msr);
+
+ if (nox2apic) {
+ clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ }
+
+ x2apic_disabled = 1;
+ x2apic_mode = 0;
+
+ register_lapic_address(mp_lapic_addr);
+ }
+}
+
+void check_x2apic(void)
+{
+ if (x2apic_enabled()) {
+ pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
+ x2apic_preenabled = x2apic_mode = 1;
+ }
+}
+
+void enable_x2apic(void)
+{
+ u64 msr;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (x2apic_disabled) {
+ __disable_x2apic(msr);
+ return;
+ }
+
+ if (!x2apic_mode)
+ return;
+
+ if (!(msr & X2APIC_ENABLE)) {
+ printk_once(KERN_INFO "Enabling x2apic\n");
+ wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ }
+}
+#endif /* CONFIG_X86_X2APIC */
+
+int __init enable_IR(void)
+{
+#ifdef CONFIG_IRQ_REMAP
+ if (!intr_remapping_supported()) {
+ pr_debug("intr-remapping not supported\n");
+ return -1;
+ }
+
+ if (!x2apic_preenabled && skip_ioapic_setup) {
+ pr_info("Skipped enabling intr-remap because of skipping "
+ "io-apic setup\n");
+ return -1;
+ }
+
+ return enable_intr_remapping();
+#endif
+ return -1;
+}
+
+void __init enable_IR_x2apic(void)
+{
+ unsigned long flags;
+ int ret, x2apic_enabled = 0;
+ int dmar_table_init_ret;
+
+ dmar_table_init_ret = dmar_table_init();
+ if (dmar_table_init_ret && !x2apic_supported())
+ return;
+
+ ret = save_ioapic_entries();
+ if (ret) {
+ pr_info("Saving IO-APIC state failed: %d\n", ret);
+ return;
+ }
+
+ local_irq_save(flags);
+ legacy_pic->mask_all();
+ mask_ioapic_entries();
+
+ if (x2apic_preenabled && nox2apic)
+ disable_x2apic();
+
+ if (dmar_table_init_ret)
+ ret = -1;
+ else
+ ret = enable_IR();
+
+ if (!x2apic_supported())
+ goto skip_x2apic;
+
+ if (ret < 0) {
+ /* IR is required if there is APIC ID > 255 even when running
+ * under KVM
+ */
+ if (max_physical_apicid > 255 ||
+ !hypervisor_x2apic_available()) {
+ if (x2apic_preenabled)
+ disable_x2apic();
+ goto skip_x2apic;
+ }
+ /*
+ * without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode
+ */
+ x2apic_force_phys();
+ }
+
+ if (ret == IRQ_REMAP_XAPIC_MODE) {
+ pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
+ goto skip_x2apic;
+ }
+
+ x2apic_enabled = 1;
+
+ if (x2apic_supported() && !x2apic_mode) {
+ x2apic_mode = 1;
+ enable_x2apic();
+ pr_info("Enabled x2apic\n");
+ }
+
+skip_x2apic:
+ if (ret < 0) /* IR enabling failed */
+ restore_ioapic_entries();
+ legacy_pic->restore_mask();
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+static int __init detect_init_APIC(void)
+{
+ if (!cpu_has_apic) {
+ pr_info("No local APIC present\n");
+ return -1;
+ }
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ return 0;
+}
+#else
+
+static int __init apic_verify(void)
+{
+ u32 features, h, l;
+
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid'
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ pr_warning("Could not enable APIC!\n");
+ return -1;
+ }
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /* The BIOS may have set up the APIC at some other address */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+ pr_info("Found and enabled local APIC!\n");
+ return 0;
+}
+
+int __init apic_force_enable(unsigned long addr)
+{
+ u32 h, l;
+
+ if (disable_apic)
+ return -1;
+
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
+ return apic_verify();
+}
+
+/*
+ * Detect and initialize APIC
+ */
+static int __init detect_init_APIC(void)
+{
+ /* Disabled by kernel option? */
+ if (disable_apic)
+ return -1;
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
+ (boot_cpu_data.x86 >= 15))
+ break;
+ goto no_apic;
+ case X86_VENDOR_INTEL:
+ if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
+ (boot_cpu_data.x86 == 5 && cpu_has_apic))
+ break;
+ goto no_apic;
+ default:
+ goto no_apic;
+ }
+
+ if (!cpu_has_apic) {
+ /*
+ * Over-ride BIOS and try to enable the local APIC only if
+ * "lapic" specified.
+ */
+ if (!force_enable_local_apic) {
+ pr_info("Local APIC disabled by BIOS -- "
+ "you can enable it with \"lapic\"\n");
+ return -1;
+ }
+ if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
+ return -1;
+ } else {
+ if (apic_verify())
+ return -1;
+ }
+
+ apic_pm_activate();
+
+ return 0;
+
+no_apic:
+ pr_info("No local APIC present or hardware disabled\n");
+ return -1;
+}
+#endif
+
+/**
+ * init_apic_mappings - initialize APIC mappings
+ */
+void __init init_apic_mappings(void)
+{
+ unsigned int new_apicid;
+
+ if (x2apic_mode) {
+ boot_cpu_physical_apicid = read_apic_id();
+ return;
+ }
+
+ /* If no local APIC can be found return early */
+ if (!smp_found_config && detect_init_APIC()) {
+ /* let's NOP'ify apic operations */
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ } else {
+ apic_phys = mp_lapic_addr;
+
+ /*
+ * acpi lapic path already maps that address in
+ * acpi_register_lapic_address()
+ */
+ if (!acpi_lapic && !smp_found_config)
+ register_lapic_address(apic_phys);
+ }
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ new_apicid = read_apic_id();
+ if (boot_cpu_physical_apicid != new_apicid) {
+ boot_cpu_physical_apicid = new_apicid;
+ /*
+ * yeah -- we lie about apic_version
+ * in case the apic was disabled via a boot option,
+ * but it's not a problem for an SMP-compiled kernel
+ * since smp_sanity_check is prepared for such a case
+ * and disables smp mode
+ */
+ apic_version[new_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
+void __init register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ if (!x2apic_mode) {
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, mp_lapic_addr);
+ }
+ if (boot_cpu_physical_apicid == -1U) {
+ boot_cpu_physical_apicid = read_apic_id();
+ apic_version[boot_cpu_physical_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
+int apic_version[MAX_LOCAL_APIC];
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+ if (disable_apic) {
+ pr_info("Apic disabled\n");
+ return -1;
+ }
+#ifdef CONFIG_X86_64
+ if (!cpu_has_apic) {
+ disable_apic = 1;
+ pr_info("Apic disabled by BIOS\n");
+ return -1;
+ }
+#else
+ if (!smp_found_config && !cpu_has_apic)
+ return -1;
+
+ /*
+ * Complain if the BIOS pretends there is one.
+ */
+ if (!cpu_has_apic &&
+ APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
+ boot_cpu_physical_apicid);
+ return -1;
+ }
+#endif
+
+ default_setup_apic_routing();
+
+ verify_local_APIC();
+ connect_bsp_APIC();
+
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+#else
+ /*
+ * Hack: In case of kdump, after a crash, kernel might be booting
+ * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+ * might be zero if read from MP tables. Get it from LAPIC.
+ */
+# ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = read_apic_id();
+# endif
+#endif
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+ setup_local_APIC();
+
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Now enable IO-APICs, actually call clear_IO_APIC
+ * We need clear_IO_APIC before enabling error vector
+ */
+ if (!skip_ioapic_setup && nr_ioapics)
+ enable_IO_APIC();
+#endif
+
+ bsp_end_local_APIC_setup();
+
+#ifdef CONFIG_X86_IO_APIC
+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+ else {
+ nr_ioapics = 0;
+ }
+#endif
+
+ x86_init.timers.setup_percpu_clockev();
+ return 0;
+}
+
+/*
+ * Local APIC interrupts
+ */
+
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+void smp_spurious_interrupt(struct pt_regs *regs)
+{
+ u32 v;
+
+ irq_enter();
+ exit_idle();
+ /*
+ * Check if this really is a spurious interrupt and ACK it
+ * if it is a vectored one. Just in case...
+ * Spurious interrupts should not be ACKed.
+ */
+ v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+ ack_APIC_irq();
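+
+ /*
+ * The index math above: the 256 ISR bits live in eight 32-bit
+ * registers spaced 0x10 apart, so (vector & ~0x1f) >> 1 computes
+ * (vector / 32) * 0x10, the offset of the right register, while
+ * vector & 0x1f selects the bit within it.
+ */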
+
+ inc_irq_stat(irq_spurious_count);
+
+ /* see sw-dev-man vol 3, chapter 7.4.13.5 */
+ pr_info("spurious APIC interrupt on CPU#%d, "
+ "should never happen.\n", smp_processor_id());
+ irq_exit();
+}
+
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+void smp_error_interrupt(struct pt_regs *regs)
+{
+ u32 v0, v1;
+ u32 i = 0;
+ static const char * const error_interrupt_reason[] = {
+ "Send CS error", /* APIC Error Bit 0 */
+ "Receive CS error", /* APIC Error Bit 1 */
+ "Send accept error", /* APIC Error Bit 2 */
+ "Receive accept error", /* APIC Error Bit 3 */
+ "Redirectable IPI", /* APIC Error Bit 4 */
+ "Send illegal vector", /* APIC Error Bit 5 */
+ "Received illegal vector", /* APIC Error Bit 6 */
+ "Illegal register address", /* APIC Error Bit 7 */
+ };
+
+ irq_enter();
+ exit_idle();
+ /* First tickle the hardware, only then report what went on. -- REW */
+ v0 = apic_read(APIC_ESR);
+ apic_write(APIC_ESR, 0);
+ v1 = apic_read(APIC_ESR);
+ ack_APIC_irq();
+ atomic_inc(&irq_err_count);
+
+ apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
+ smp_processor_id(), v0, v1);
+
+ v1 = v1 & 0xff;
+ while (v1) {
+ if (v1 & 0x1)
+ apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ i++;
+ v1 >>= 1;
+ }
+
+ apic_printk(APIC_DEBUG, KERN_CONT "\n");
+
+ irq_exit();
+}
+
+/**
+ * connect_bsp_APIC - attach the APIC to the interrupt system
+ */
+void __init connect_bsp_APIC(void)
+{
+#ifdef CONFIG_X86_32
+ if (pic_mode) {
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+ /*
+ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
+ * local APIC to INT and NMI lines.
+ */
+ apic_printk(APIC_VERBOSE, "leaving PIC mode, "
+ "enabling APIC mode.\n");
+ imcr_pic_to_apic();
+ }
+#endif
+ if (apic->enable_apic_mode)
+ apic->enable_apic_mode();
+}
+
+/**
+ * disconnect_bsp_APIC - detach the APIC from the interrupt system
+ * @virt_wire_setup: indicates whether virtual wire mode is selected
+ *
+ * Virtual wire mode is necessary to deliver legacy interrupts even when the
+ * APIC is disabled.
+ */
+void disconnect_bsp_APIC(int virt_wire_setup)
+{
+ unsigned int value;
+
+#ifdef CONFIG_X86_32
+ if (pic_mode) {
+ /*
+ * Put the board back into PIC mode (has an effect only on
+ * certain older boards). Note that APIC interrupts, including
+ * IPIs, won't work beyond this point! The only exception are
+ * INIT IPIs.
+ */
+ apic_printk(APIC_VERBOSE, "disabling APIC mode, "
+ "entering PIC mode.\n");
+ imcr_apic_to_pic();
+ return;
+ }
+#endif
+
+ /* Go back to Virtual Wire compatibility mode */
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /*
+ * For LVT0 make it edge triggered, active high,
+ * external and enabled
+ */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /*
+ * For LVT1 make it edge triggered, active high,
+ * nmi and enabled
+ */
+ value = apic_read(APIC_LVT1);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write(APIC_LVT1, value);
+}
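+
+/*
+ * Recap of the virtual wire setup above: LVT0 programmed for ExtINT
+ * lets the 8259A deliver its interrupts through the local APIC, and
+ * LVT1 as NMI keeps the NMI path alive, so legacy interrupts keep
+ * working even with the APIC otherwise out of the picture.
+ */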
+
+void __cpuinit generic_processor_info(int apicid, int version)
+{
+ int cpu, max = nr_cpu_ids;
+ bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
+ phys_cpu_present_map);
+
+ /*
+ * If the boot cpu has not been detected yet, then only allow up to
+ * nr_cpu_ids - 1 processors and keep one slot free for the boot cpu
+ */
+ if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
+ apicid != boot_cpu_physical_apicid) {
+ int thiscpu = max + disabled_cpus - 1;
+
+ pr_warning(
+ "ACPI: NR_CPUS/possible_cpus limit of %i almost"
+ " reached. Keeping one slot for boot cpu."
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+ disabled_cpus++;
+ return;
+ }
+
+ if (num_processors >= nr_cpu_ids) {
+ int thiscpu = max + disabled_cpus;
+
+ pr_warning(
+ "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+ disabled_cpus++;
+ return;
+ }
+
+ num_processors++;
+ if (apicid == boot_cpu_physical_apicid) {
+ /*
+ * x86_bios_cpu_apicid is required to have processors listed
+ * in the same order as logical cpu numbers. Hence the first
+ * entry is the BSP, and so on.
+ * boot_cpu_init() already holds bit 0 in cpu_present_mask
+ * for the BSP.
+ */
+ cpu = 0;
+ } else
+ cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+ /*
+ * Validate version
+ */
+ if (version == 0x0) {
+ pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
+ cpu, apicid);
+ version = 0x10;
+ }
+ apic_version[apicid] = version;
+
+ if (version != apic_version[boot_cpu_physical_apicid]) {
+ pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
+ apic_version[boot_cpu_physical_apicid], cpu, version);
+ }
+
+ physid_set(apicid, phys_cpu_present_map);
+ if (apicid > max_physical_apicid)
+ max_physical_apicid = apicid;
+
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
+#endif
+#ifdef CONFIG_X86_32
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ apic->x86_32_early_logical_apicid(cpu);
+#endif
+ set_cpu_possible(cpu, true);
+ set_cpu_present(cpu, true);
+}
+
+int hard_smp_processor_id(void)
+{
+ return read_apic_id();
+}
+
+void default_init_apic_ldr(void)
+{
+ unsigned long val;
+
+ apic_write(APIC_DFR, APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+ apic_write(APIC_LDR, val);
+}
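+
+/*
+ * A sketch of the resulting addressing: in flat logical mode the LDR
+ * holds one bit per CPU in its top byte, so CPU n answers to logical
+ * ID 1 << n. This is also why flat logical addressing cannot cover
+ * more than eight CPUs.
+ */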
+
+/*
+ * Power management
+ */
+#ifdef CONFIG_PM
+
+static struct {
+ /*
+ * 'active' is true if the local APIC was enabled by us and
+ * not the BIOS; this signifies that we are also responsible
+ * for disabling it before entering apm/acpi suspend
+ */
+ int active;
+ /* r/w apic fields */
+ unsigned int apic_id;
+ unsigned int apic_taskpri;
+ unsigned int apic_ldr;
+ unsigned int apic_dfr;
+ unsigned int apic_spiv;
+ unsigned int apic_lvtt;
+ unsigned int apic_lvtpc;
+ unsigned int apic_lvt0;
+ unsigned int apic_lvt1;
+ unsigned int apic_lvterr;
+ unsigned int apic_tmict;
+ unsigned int apic_tdcr;
+ unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(void)
+{
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ maxlvt = lapic_get_maxlvt();
+
+ apic_pm_state.apic_id = apic_read(APIC_ID);
+ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+ apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+ apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+ apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+ if (maxlvt >= 4)
+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+ apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ if (maxlvt >= 5)
+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
+
+ local_irq_save(flags);
+ disable_local_APIC();
+
+ if (intr_remapping_enabled)
+ disable_intr_remapping();
+
+ local_irq_restore(flags);
+ return 0;
+}
+
+static void lapic_resume(void)
+{
+ unsigned int l, h;
+ unsigned long flags;
+ int maxlvt;
+
+ if (!apic_pm_state.active)
+ return;
+
+ local_irq_save(flags);
+ if (intr_remapping_enabled) {
+ /*
+ * IO-APIC and PIC have their own resume routines.
+ * We just mask them here to make sure the interrupt
+ * subsystem is completely quiet while we enable x2apic
+ * and interrupt-remapping.
+ */
+ mask_ioapic_entries();
+ legacy_pic->mask_all();
+ }
+
+ if (x2apic_mode)
+ enable_x2apic();
+ else {
+ /*
+ * Make sure the APICBASE points to the right address
+ *
+ * FIXME! This will be wrong if we ever support suspend on
+ * SMP! We'll need to do this as part of the CPU restore!
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ }
+
+ maxlvt = lapic_get_maxlvt();
+ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+ apic_write(APIC_ID, apic_pm_state.apic_id);
+ apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+ apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+ apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
+ if (maxlvt >= 5)
+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+ if (maxlvt >= 4)
+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+ apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+
+ if (intr_remapping_enabled)
+ reenable_intr_remapping(x2apic_mode);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * This device has no shutdown method - fully functioning local APICs
+ * are needed on every CPU up until machine_halt/restart/poweroff.
+ */
+
+static struct syscore_ops lapic_syscore_ops = {
+ .resume = lapic_resume,
+ .suspend = lapic_suspend,
+};
+
+static void __cpuinit apic_pm_activate(void)
+{
+ apic_pm_state.active = 1;
+}
+
+static int __init init_lapic_sysfs(void)
+{
+ /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+ if (cpu_has_apic)
+ register_syscore_ops(&lapic_syscore_ops);
+
+ return 0;
+}
+
+/* local apic needs to resume before other devices access its registers. */
+core_initcall(init_lapic_sysfs);
+
+#else /* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
+
+#ifdef CONFIG_X86_64
+
+static int __cpuinit apic_cluster_num(void)
+{
+ int i, clusters, zeros;
+ unsigned id;
+ u16 *bios_cpu_apicid;
+ DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+ bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
+ bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
+
+ for (i = 0; i < nr_cpu_ids; i++) {
+ /* are we being called early in kernel startup? */
+ if (bios_cpu_apicid) {
+ id = bios_cpu_apicid[i];
+ } else if (i < nr_cpu_ids) {
+ if (cpu_present(i))
+ id = per_cpu(x86_bios_cpu_apicid, i);
+ else
+ continue;
+ } else
+ break;
+
+ if (id != BAD_APICID)
+ __set_bit(APIC_CLUSTERID(id), clustermap);
+ }
+
+ /* Problem: Partially populated chassis may not have CPUs in some of
+ * the APIC clusters they have been allocated. Only present CPUs have
+ * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
+ * Since clusters are allocated sequentially, count zeros only if
+ * they are bounded by ones.
+ */
+ clusters = 0;
+ zeros = 0;
+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+ if (test_bit(i, clustermap)) {
+ clusters += 1 + zeros;
+ zeros = 0;
+ } else
+ ++zeros;
+ }
+
+ return clusters;
+}
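+
+/*
+ * The bookkeeping above: the cluster number is extracted from the
+ * APIC ID by APIC_CLUSTERID(), and partially populated chassis can
+ * leave holes in the allocation, which is why runs of zeros between
+ * set bits are counted as clusters as well.
+ */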
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+ if (multi)
+ return 0;
+ pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+ multi = 1;
+ return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+ {
+ .callback = set_multi,
+ .ident = "IBM System Summit2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+ },
+ },
+ {}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+ if (multi_checked)
+ return;
+
+ dmi_check_system(multi_dmi_table);
+ multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check for them.
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+ dmi_check_multi();
+ if (multi)
+ return 1;
+
+ if (!is_vsmp_box())
+ return 0;
+
+ /*
+ * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
+ */
+ if (apic_cluster_num() > 1)
+ return 1;
+
+ return 0;
+}
+#endif
+
+/*
+ * APIC command line parameters
+ */
+static int __init setup_disableapic(char *arg)
+{
+ disable_apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
+ return 0;
+}
+early_param("disableapic", setup_disableapic);
+
+/* same as disableapic, for compatibility */
+static int __init setup_nolapic(char *arg)
+{
+ return setup_disableapic(arg);
+}
+early_param("nolapic", setup_nolapic);
+
+static int __init parse_lapic_timer_c2_ok(char *arg)
+{
+ local_apic_timer_c2_ok = 1;
+ return 0;
+}
+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
+
+static int __init parse_disable_apic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+early_param("nolapic_timer", parse_nolapic_timer);
+
+static int __init apic_set_verbosity(char *arg)
+{
+ if (!arg) {
+#ifdef CONFIG_X86_64
+ skip_ioapic_setup = 0;
+ return 0;
+#endif
+ return -EINVAL;
+ }
+
+ if (strcmp("debug", arg) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", arg) == 0)
+ apic_verbosity = APIC_VERBOSE;
+ else {
+ pr_warning("APIC Verbosity level %s not recognised"
+ " use apic=verbose or apic=debug\n", arg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+early_param("apic", apic_set_verbosity);
+
+static int __init lapic_insert_resource(void)
+{
+ if (!apic_phys)
+ return -1;
+
+ /* Put local APIC into the resource map. */
+ lapic_resource.start = apic_phys;
+ lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+ insert_resource(&iomem_resource, &lapic_resource);
+
+ return 0;
+}
+
+/*
+ * We need to call insert_resource() after e820_reserve_resources(),
+ * which uses request_resource().
+ */
+late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
new file mode 100644
index 00000000000..8c3cdded6f2
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -0,0 +1,396 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Flat APIC subarch code.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+
+static struct apic apic_physflat;
+static struct apic apic_flat;
+
+struct apic __read_mostly *apic = &apic_flat;
+EXPORT_SYMBOL_GPL(apic);
+
+static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 1;
+}
+
+static const struct cpumask *flat_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_clear(retmask);
+ cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+void flat_init_apic_ldr(void)
+{
+ unsigned long val;
+ unsigned long num, id;
+
+ num = smp_processor_id();
+ id = 1UL << num;
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write(APIC_LDR, val);
+}
+
+static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+
+ _flat_send_IPI_mask(mask, vector);
+}
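+
+/*
+ * Because each CPU's logical ID is 1 << cpu (set up in
+ * flat_init_apic_ldr() above), the low word of a cpumask is itself a
+ * valid logical destination mask, so flat_send_IPI_mask() can hand
+ * cpumask_bits(cpumask)[0] straight to the IPI helper. It also means
+ * flat logical mode tops out at eight CPUs, hence the physflat
+ * variant further down.
+ */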
+
+static void
+ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ int cpu = smp_processor_id();
+
+ if (cpu < BITS_PER_LONG)
+ clear_bit(cpu, &mask);
+
+ _flat_send_IPI_mask(mask, vector);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+ int cpu = smp_processor_id();
+#ifdef CONFIG_HOTPLUG_CPU
+ int hotplug = 1;
+#else
+ int hotplug = 0;
+#endif
+ if (hotplug || vector == NMI_VECTOR) {
+ if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+ unsigned long mask = cpumask_bits(cpu_online_mask)[0];
+
+ if (cpu < BITS_PER_LONG)
+ clear_bit(cpu, &mask);
+
+ _flat_send_IPI_mask(mask, vector);
+ }
+ } else if (num_online_cpus() > 1) {
+ __default_send_IPI_shortcut(APIC_DEST_ALLBUT,
+ vector, apic->dest_logical);
+ }
+}
+
+static void flat_send_IPI_all(int vector)
+{
+ if (vector == NMI_VECTOR) {
+ flat_send_IPI_mask(cpu_online_mask, vector);
+ } else {
+ __default_send_IPI_shortcut(APIC_DEST_ALLINC,
+ vector, apic->dest_logical);
+ }
+}
+
+static unsigned int flat_get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ id = (((x)>>24) & 0xFFu);
+
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = ((id & 0xFFu)<<24);
+ return x;
+}
+
+static unsigned int read_xapic_id(void)
+{
+ unsigned int id;
+
+ id = flat_get_apic_id(apic_read(APIC_ID));
+ return id;
+}
+
+static int flat_apic_id_registered(void)
+{
+ return physid_isset(read_xapic_id(), phys_cpu_present_map);
+}
+
+static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
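+
+/*
+ * Example: with two logical CPUs per package, index_msb == 1, so
+ * initial APIC IDs 2 and 3 both yield physical package ID 1.
+ */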
+
+static int flat_probe(void)
+{
+ return 1;
+}
+
+static struct apic apic_flat = {
+ .name = "flat",
+ .probe = flat_probe,
+ .acpi_madt_oem_check = flat_acpi_madt_oem_check,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = flat_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = flat_vector_allocation_domain,
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = flat_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFu << 24,
+
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = flat_send_IPI_mask,
+ .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = flat_send_IPI_allbutself,
+ .send_IPI_all = flat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+/*
+ * Physflat mode is used when there are more than 8 CPUs on a system.
+ * We cannot use logical delivery in this case because the mask
+ * overflows, so use physical mode.
+ */
+static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+#ifdef CONFIG_ACPI
+ /*
+ * Quirk: some x86_64 machines can only use physical APIC mode
+ * regardless of how many processors are present (x86_64 ES7000
+ * is an example).
+ */
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+		printk(KERN_DEBUG "system APIC can only use physical flat mode\n");
+ return 1;
+ }
+
+ if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
+		printk(KERN_DEBUG "IBM Summit detected, will use physical APIC mode\n");
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+static const struct cpumask *physflat_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+ default_send_IPI_mask_sequence_phys(cpumask, vector);
+}
+
+static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+ int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpumask, vector);
+}
+
+static void physflat_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static void physflat_send_IPI_all(int vector)
+{
+ physflat_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_first(cpumask);
+ if ((unsigned)cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ else
+ return BAD_APICID;
+}
+
+static unsigned int
+physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+static int physflat_probe(void)
+{
+ if (apic == &apic_physflat || num_possible_cpus() > 8)
+ return 1;
+
+ return 0;
+}
+
+static struct apic apic_physflat = {
+
+ .name = "physical flat",
+ .probe = physflat_probe,
+ .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
+ .apic_id_registered = flat_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = physflat_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = physflat_vector_allocation_domain,
+ /* not needed, but shouldn't hurt: */
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = flat_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = flat_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFu << 24,
+
+ .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = physflat_send_IPI_mask,
+ .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = physflat_send_IPI_allbutself,
+ .send_IPI_all = physflat_send_IPI_all,
+ .send_IPI_self = apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+
+/*
+ * We need to check for physflat first, so this order is important.
+ */
+apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 00000000000..775b82bc655
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,191 @@
+/*
+ * NOOP APIC driver.
+ *
+ * Does almost nothing and should be substituted by a real apic driver
+ * via the probe routine.
+ *
+ * However, if the apic is disabled (for some reason) we try not to
+ * uglify the callers' code, and still allow calling (some) apic
+ * routines like self-ipi, etc...
+ */
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+
+static void noop_init_apic_ldr(void) { }
+static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
+static void noop_send_IPI_allbutself(int vector) { }
+static void noop_send_IPI_all(int vector) { }
+static void noop_send_IPI_self(int vector) { }
+static void noop_apic_wait_icr_idle(void) { }
+static void noop_apic_icr_write(u32 low, u32 id) { }
+
+static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+{
+ return -1;
+}
+
+static u32 noop_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static u64 noop_apic_icr_read(void)
+{
+ return 0;
+}
+
+static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return 0;
+}
+
+static unsigned int noop_get_apic_id(unsigned long x)
+{
+ return 0;
+}
+
+static int noop_probe(void)
+{
+ /*
+	 * The NOOP apic should never be
+	 * enabled via the probe routine
+ */
+ return 0;
+}
+
+static int noop_apic_id_registered(void)
+{
+ /*
+	 * If we were being really "pedantic"
+	 * we would pass read_apic_id() here,
+	 * but since NOOP assumes APIC ID = 0,
+	 * let's save a few cycles
+ */
+ return physid_isset(0, phys_cpu_present_map);
+}
+
+static const struct cpumask *noop_target_cpus(void)
+{
+ /* only BSP here */
+ return cpumask_of(0);
+}
+
+static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return physid_isset(apicid, *map);
+}
+
+static unsigned long noop_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ if (cpu != 0)
+ pr_warning("APIC: Vector allocated for non-BSP cpu\n");
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static u32 noop_apic_read(u32 reg)
+{
+ WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+ return 0;
+}
+
+static void noop_apic_write(u32 reg, u32 v)
+{
+ WARN_ON_ONCE(cpu_has_apic && !disable_apic);
+}
+
+struct apic apic_noop = {
+ .name = "noop",
+ .probe = noop_probe,
+ .acpi_madt_oem_check = NULL,
+
+ .apic_id_registered = noop_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = noop_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = noop_check_apicid_used,
+ .check_apicid_present = noop_check_apicid_present,
+
+ .vector_allocation_domain = noop_vector_allocation_domain,
+ .init_apic_ldr = noop_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+
+ .phys_pkg_id = noop_phys_pkg_id,
+
+ .mps_oem_check = NULL,
+
+ .get_apic_id = noop_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = noop_send_IPI_mask,
+ .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = noop_send_IPI_allbutself,
+ .send_IPI_all = noop_send_IPI_all,
+ .send_IPI_self = noop_send_IPI_self,
+
+ .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
+
+ /* should be safe */
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = NULL,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = noop_apic_read,
+ .write = noop_apic_write,
+ .icr_read = noop_apic_icr_read,
+ .icr_write = noop_apic_icr_write,
+ .wait_icr_idle = noop_apic_wait_icr_idle,
+ .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+ .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
+#endif
+};
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 00000000000..09d3d8c1cd9
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,294 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+
+#include <asm/numachip/numachip_csr.h>
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/apic_flat_64.h>
+
+static int numachip_system __read_mostly;
+
+static struct apic apic_numachip __read_mostly;
+
+static unsigned int get_apic_id(unsigned long x)
+{
+ unsigned long value;
+ unsigned int id;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+
+ return id;
+}
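+
+/*
+ * The effective NumaChip APIC ID is a composite: the local xAPIC
+ * supplies the low 8 bits (bits 31:24 of the APIC_ID register) while
+ * the Fam10h NODE_ID MSR read above supplies bits 13:8, extending the
+ * ID space across nodes.
+ */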
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = ((id & 0xffU) << 24);
+ return x;
+}
+
+static unsigned int read_xapic_id(void)
+{
+ return get_apic_id(apic_read(APIC_ID));
+}
+
+static int numachip_apic_id_registered(void)
+{
+ return physid_isset(read_xapic_id(), phys_cpu_present_map);
+}
+
+static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+static const struct cpumask *numachip_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+ union numachip_csr_g3_ext_irq_gen int_gen;
+
+ int_gen.s._destination_apic_id = phys_apicid;
+ int_gen.s._vector = 0;
+ int_gen.s._msgtype = APIC_DM_INIT >> 8;
+ int_gen.s._index = 0;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
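+	/*
+	 * The SIPI vector is the physical page number of the trampoline:
+	 * e.g. a trampoline at 0x9a000 gives start_rip >> 12 == 0x9a, so
+	 * the woken CPU starts executing at physical address 0x9a000.
+	 */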
+ int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
+ int_gen.s._vector = start_rip >> 12;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
+ atomic_set(&init_deasserted, 1);
+ return 0;
+}
+
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+ union numachip_csr_g3_ext_irq_gen int_gen;
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
+ int_gen.s._destination_apic_id = apicid;
+ int_gen.s._vector = vector;
+ int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
+ int_gen.s._index = 0;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+}
+
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ numachip_send_IPI_one(cpu, vector);
+}
+
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_all(int vector)
+{
+ numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static void numachip_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
+
+static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_first(cpumask);
+ if (likely((unsigned)cpu < nr_cpu_ids))
+ return per_cpu(x86_cpu_to_apicid, cpu);
+
+ return BAD_APICID;
+}
+
+static unsigned int
+numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+static int __init numachip_probe(void)
+{
+ return apic == &apic_numachip;
+}
+
+static void __init map_csrs(void)
+{
+ printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
+ NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+
+ printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
+ NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
+ init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+}
+
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ c->phys_proc_id = node;
+ per_cpu(cpu_llc_id, smp_processor_id()) = node;
+}
+
+static int __init numachip_system_init(void)
+{
+ unsigned int val;
+
+ if (!numachip_system)
+ return 0;
+
+ x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+
+ map_csrs();
+
+ val = read_lcsr(CSR_G0_NODE_IDS);
+ printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
+
+ return 0;
+}
+early_initcall(numachip_system_init);
+
+static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "NUMASC", 6)) {
+ numachip_system = 1;
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct apic apic_numachip __refconst = {
+
+ .name = "NumaConnect system",
+ .probe = numachip_probe,
+ .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = numachip_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = numachip_vector_allocation_domain,
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = numachip_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xffU << 24,
+
+ .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+apic_driver(apic_numachip);
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
new file mode 100644
index 00000000000..521bead0113
--- /dev/null
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -0,0 +1,278 @@
+/*
+ * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs.
+ *
+ * Drives the local APIC in "clustered mode".
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/smp.h>
+
+#include <asm/apicdef.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+static unsigned bigsmp_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static int bigsmp_apic_id_registered(void)
+{
+ return 1;
+}
+
+static const struct cpumask *bigsmp_target_cpus(void)
+{
+#ifdef CONFIG_SMP
+ return cpu_online_mask;
+#else
+ return cpumask_of(0);
+#endif
+}
+
+static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return 0;
+}
+
+static unsigned long bigsmp_check_apicid_present(int bit)
+{
+ return 1;
+}
+
+static int bigsmp_early_logical_apicid(int cpu)
+{
+ /* on bigsmp, logical apicid is the same as physical */
+ return early_per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+static inline unsigned long calculate_ldr(int cpu)
+{
+ unsigned long val, id;
+
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ id = per_cpu(x86_bios_cpu_apicid, cpu);
+ val |= SET_APIC_LOGICAL_ID(id);
+
+ return val;
+}
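+
+/*
+ * Note that unlike the flat model's one-bit-per-CPU encoding, bigsmp
+ * places the CPU's full 8-bit BIOS APIC ID in the logical destination
+ * field; actual interrupt delivery uses physical mode (irq_dest_mode
+ * is 0 in apic_bigsmp below).
+ */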
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends setting DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void bigsmp_init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static void bigsmp_setup_apic_routing(void)
+{
+ printk(KERN_INFO
+ "Enabling APIC mode: Physflat. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+static int bigsmp_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids)
+ return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
+
+ return BAD_APICID;
+}
+
+static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ physids_promote(0xFFL, retmap);
+}
+
+static int bigsmp_check_phys_apicid_present(int phys_apicid)
+{
+ return 1;
+}
+
+/* As we are using single CPU as destination, pick only one CPU here */
+static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ int cpu = cpumask_first(cpumask);
+
+ if (cpu < nr_cpu_ids)
+ return cpu_physical_id(cpu);
+ return BAD_APICID;
+}
+
+static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ return cpu_physical_id(cpu);
+ }
+ return BAD_APICID;
+}
+
+static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_phys(mask, vector);
+}
+
+static void bigsmp_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static void bigsmp_send_IPI_all(int vector)
+{
+ bigsmp_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int dmi_bigsmp; /* can be set by dmi scanners */
+
+static int hp_ht_bigsmp(const struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
+ dmi_bigsmp = 1;
+
+ return 0;
+}
+
+
+static const struct dmi_system_id bigsmp_dmi_table[] = {
+ { hp_ht_bigsmp, "HP ProLiant DL760 G2",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
+ }
+ },
+
+ { hp_ht_bigsmp, "HP ProLiant DL740",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
+ }
+ },
+ { } /* NULL entry stops DMI scanning */
+};
+
+static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static int probe_bigsmp(void)
+{
+ if (def_to_bigsmp)
+ dmi_bigsmp = 1;
+ else
+ dmi_check_system(bigsmp_dmi_table);
+
+ return dmi_bigsmp;
+}
+
+static struct apic apic_bigsmp = {
+
+ .name = "bigsmp",
+ .probe = probe_bigsmp,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = bigsmp_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPU: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = bigsmp_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = bigsmp_check_apicid_used,
+ .check_apicid_present = bigsmp_check_apicid_present,
+
+ .vector_allocation_domain = bigsmp_vector_allocation_domain,
+ .init_apic_ldr = bigsmp_init_apic_ldr,
+
+ .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
+ .setup_apic_routing = bigsmp_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = bigsmp_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = bigsmp_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = bigsmp_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = bigsmp_send_IPI_allbutself,
+ .send_IPI_all = bigsmp_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
+};
+
+void __init generic_bigsmp_probe(void)
+{
+ unsigned int cpu;
+
+ if (!probe_bigsmp())
+ return;
+
+ apic = &apic_bigsmp;
+
+ for_each_possible_cpu(cpu) {
+ if (early_per_cpu(x86_cpu_to_logical_apicid,
+ cpu) == BAD_APICID)
+ continue;
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
+ bigsmp_early_logical_apicid(cpu);
+ }
+
+ pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name);
+}
+
+apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
new file mode 100644
index 00000000000..5d513bc47b6
--- /dev/null
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -0,0 +1,755 @@
+/*
+ * Written by: Garry Forsgren, Unisys Corporation
+ * Natalie Protasevich, Unisys Corporation
+ *
+ * This file contains the code to configure and interface
+ * with Unisys ES7000 series hardware system manager.
+ *
+ * Copyright (c) 2003 Unisys Corporation.
+ * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
+ *
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Unisys Corporation, Township Line & Union Meeting
+ * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
+ *
+ * http://www.unisys.com
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/nmi.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#include <asm/apicdef.h>
+#include <linux/atomic.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+
+/*
+ * ES7000 chipsets
+ */
+
+#define NON_UNISYS 0
+#define ES7000_CLASSIC 1
+#define ES7000_ZORRO 2
+
+#define MIP_REG 1
+#define MIP_PSAI_REG 4
+
+#define MIP_BUSY 1
+#define MIP_SPIN 0xf0000
+#define MIP_VALID 0x0100000000000000ULL
+#define MIP_SW_APIC 0x1020b
+
+#define MIP_PORT(val) ((val >> 32) & 0xffff)
+
+#define MIP_RD_LO(val) (val & 0xffffffff)
+
+struct mip_reg {
+ unsigned long long off_0x00;
+ unsigned long long off_0x08;
+ unsigned long long off_0x10;
+ unsigned long long off_0x18;
+ unsigned long long off_0x20;
+ unsigned long long off_0x28;
+ unsigned long long off_0x30;
+ unsigned long long off_0x38;
+};
+
+struct mip_reg_info {
+ unsigned long long mip_info;
+ unsigned long long delivery_info;
+ unsigned long long host_reg;
+ unsigned long long mip_reg;
+};
+
+struct psai {
+ unsigned long long entry_type;
+ unsigned long long addr;
+ unsigned long long bep_addr;
+};
+
+#ifdef CONFIG_ACPI
+
+struct es7000_oem_table {
+ struct acpi_table_header Header;
+ u32 OEMTableAddr;
+ u32 OEMTableSize;
+};
+
+static unsigned long oem_addrX;
+static unsigned long oem_size;
+
+#endif
+
+/*
+ * ES7000 Globals
+ */
+
+static volatile unsigned long *psai;
+static struct mip_reg *mip_reg;
+static struct mip_reg *host_reg;
+static int mip_port;
+static unsigned long mip_addr;
+static unsigned long host_addr;
+
+int es7000_plat;
+
+/*
+ * GSI override for ES7000 platforms.
+ */
+
+
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+{
+ unsigned long vect = 0, psaival = 0;
+
+ if (psai == NULL)
+ return -1;
+
+ vect = ((unsigned long)__pa(eip)/0x1000) << 16;
+ psaival = (0x1000000 | vect | cpu);
+
+ while (*psai & 0x1000000)
+ ;
+
+ *psai = psaival;
+
+ return 0;
+}
+
+static int es7000_apic_is_cluster(void)
+{
+ /* MPENTIUMIII */
+ if (boot_cpu_data.x86 == 6 &&
+ (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
+ return 1;
+
+ return 0;
+}
+
+static void setup_unisys(void)
+{
+ /*
+ * Determine the generation of the ES7000 currently running.
+ *
+ * es7000_plat = 1 if the machine is a 5xx ES7000 box
+ * es7000_plat = 2 if the machine is a x86_64 ES7000 box
+ *
+ */
+ if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
+ es7000_plat = ES7000_ZORRO;
+ else
+ es7000_plat = ES7000_CLASSIC;
+}
+
+/*
+ * Parse the OEM Table:
+ */
+static int parse_unisys_oem(char *oemptr)
+{
+ int i;
+ int success = 0;
+ unsigned char type, size;
+ unsigned long val;
+ char *tp = NULL;
+ struct psai *psaip = NULL;
+ struct mip_reg_info *mi;
+ struct mip_reg *host, *mip;
+
+ tp = oemptr;
+
+ tp += 8;
+
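+	/*
+	 * The 8-byte table header was skipped above; what follows is a
+	 * sequence of records, each starting with a one-byte type and a
+	 * one-byte length that covers the whole record (hence the
+	 * tp -= 2 rewind before stepping by 'size' below).
+	 */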
+ for (i = 0; i <= 6; i++) {
+ type = *tp++;
+ size = *tp++;
+ tp -= 2;
+ switch (type) {
+ case MIP_REG:
+ mi = (struct mip_reg_info *)tp;
+ val = MIP_RD_LO(mi->host_reg);
+ host_addr = val;
+ host = (struct mip_reg *)val;
+ host_reg = __va(host);
+ val = MIP_RD_LO(mi->mip_reg);
+ mip_port = MIP_PORT(mi->mip_info);
+ mip_addr = val;
+ mip = (struct mip_reg *)val;
+ mip_reg = __va(mip);
+ pr_debug("host_reg = 0x%lx\n",
+ (unsigned long)host_reg);
+ pr_debug("mip_reg = 0x%lx\n",
+ (unsigned long)mip_reg);
+ success++;
+ break;
+ case MIP_PSAI_REG:
+ psaip = (struct psai *)tp;
+ if (tp != NULL) {
+ if (psaip->addr)
+ psai = __va(psaip->addr);
+ else
+ psai = NULL;
+ success++;
+ }
+ break;
+ default:
+ break;
+ }
+ tp += size;
+ }
+
+ if (success < 2)
+ es7000_plat = NON_UNISYS;
+ else
+ setup_unisys();
+
+ return es7000_plat;
+}
+
+#ifdef CONFIG_ACPI
+static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
+{
+ struct acpi_table_header *header = NULL;
+ struct es7000_oem_table *table;
+ acpi_size tbl_size;
+ acpi_status ret;
+ int i = 0;
+
+ for (;;) {
+ ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
+ if (!ACPI_SUCCESS(ret))
+ return -1;
+
+ if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
+ break;
+
+ early_acpi_os_unmap_memory(header, tbl_size);
+ }
+
+ table = (void *)header;
+
+ oem_addrX = table->OEMTableAddr;
+ oem_size = table->OEMTableSize;
+
+ early_acpi_os_unmap_memory(header, tbl_size);
+
+ *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
+
+ return 0;
+}
+
+static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+{
+ if (!oem_addr)
+ return;
+
+ __acpi_unmap_table((char *)oem_addr, oem_size);
+}
+
+static int es7000_check_dsdt(void)
+{
+ struct acpi_table_header header;
+
+ if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
+ !strncmp(header.oem_id, "UNISYS", 6))
+ return 1;
+ return 0;
+}
+
+static int es7000_acpi_ret;
+
+/* Hook from generic ACPI tables.c */
+static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ unsigned long oem_addr = 0;
+ int check_dsdt;
+ int ret = 0;
+
+	/* Check the DSDT first, to avoid clearing the fixmap for oem_addr */
+ check_dsdt = es7000_check_dsdt();
+
+ if (!find_unisys_acpi_oem_table(&oem_addr)) {
+ if (check_dsdt) {
+ ret = parse_unisys_oem((char *)oem_addr);
+ } else {
+ setup_unisys();
+ ret = 1;
+ }
+ /*
+ * we need to unmap it
+ */
+ unmap_unisys_acpi_oem_table(oem_addr);
+ }
+
+ es7000_acpi_ret = ret;
+
+ return ret && !es7000_apic_is_cluster();
+}
+
+static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
+{
+ int ret = es7000_acpi_ret;
+
+ return ret && es7000_apic_is_cluster();
+}
+
+#else /* !CONFIG_ACPI: */
+static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return 0;
+}
+
+static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
+{
+ return 0;
+}
+#endif /* !CONFIG_ACPI */
+
+static void es7000_spin(int n)
+{
+ int i = 0;
+
+ while (i++ < n)
+ rep_nop();
+}
+
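+/*
+ * Handshake with the system management processor: wait for the host
+ * register's VALID flag to clear, copy the request in, ring the
+ * doorbell via the MIP port, then spin until the MIP register's VALID
+ * flag is set and the status word can be read back.
+ */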
+static int es7000_mip_write(struct mip_reg *mip_reg)
+{
+ int status = 0;
+ int spin;
+
+ spin = MIP_SPIN;
+ while ((host_reg->off_0x38 & MIP_VALID) != 0) {
+ if (--spin <= 0) {
+ WARN(1, "Timeout waiting for Host Valid Flag\n");
+ return -1;
+ }
+ es7000_spin(MIP_SPIN);
+ }
+
+ memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
+ outb(1, mip_port);
+
+ spin = MIP_SPIN;
+
+ while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
+ if (--spin <= 0) {
+ WARN(1, "Timeout waiting for MIP Valid Flag\n");
+ return -1;
+ }
+ es7000_spin(MIP_SPIN);
+ }
+
+ status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
+ mip_reg->off_0x38 &= ~MIP_VALID;
+
+ return status;
+}
+
+static void es7000_enable_apic_mode(void)
+{
+ struct mip_reg es7000_mip_reg;
+ int mip_status;
+
+ if (!es7000_plat)
+ return;
+
+ pr_info("Enabling APIC mode.\n");
+ memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
+ es7000_mip_reg.off_0x00 = MIP_SW_APIC;
+ es7000_mip_reg.off_0x38 = MIP_VALID;
+
+ while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
+ WARN(1, "Command failed, status = %x\n", mip_status);
+}
+
+static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_clear(retmask);
+ cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+}
+
+
+static void es7000_wait_for_init_deassert(atomic_t *deassert)
+{
+ while (!atomic_read(deassert))
+ cpu_relax();
+}
+
+static unsigned int es7000_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_phys(mask, vector);
+}
+
+static void es7000_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
+}
+
+static void es7000_send_IPI_all(int vector)
+{
+ es7000_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int es7000_apic_id_registered(void)
+{
+ return 1;
+}
+
+static const struct cpumask *target_cpus_cluster(void)
+{
+ return cpu_all_mask;
+}
+
+static const struct cpumask *es7000_target_cpus(void)
+{
+ return cpumask_of(smp_processor_id());
+}
+
+static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return 0;
+}
+
+static unsigned long es7000_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static int es7000_early_logical_apicid(int cpu)
+{
+ /* on es7000, logical apicid is the same as physical */
+ return early_per_cpu(x86_bios_cpu_apicid, cpu);
+}
+
+static unsigned long calculate_ldr(int cpu)
+{
+ unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
+
+ return SET_APIC_LOGICAL_ID(id);
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends setting DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void es7000_init_apic_ldr_cluster(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_CLUSTER);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static void es7000_init_apic_ldr(void)
+{
+ unsigned long val;
+ int cpu = smp_processor_id();
+
+ apic_write(APIC_DFR, APIC_DFR_FLAT);
+ val = calculate_ldr(cpu);
+ apic_write(APIC_LDR, val);
+}
+
+static void es7000_setup_apic_routing(void)
+{
+ int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
+
+ pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
+ (apic_version[apic] == 0x14) ?
+ "Physical Cluster" : "Logical Cluster",
+ nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
+}
+
+static int es7000_cpu_present_to_apicid(int mps_cpu)
+{
+ if (!mps_cpu)
+ return boot_cpu_physical_apicid;
+ else if (mps_cpu < nr_cpu_ids)
+ return per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static int cpu_id;
+
+static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
+{
+ physid_set_mask_of_physid(cpu_id, retmap);
+ ++cpu_id;
+}
+
+static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ physids_promote(0xFFL, retmap);
+}
+
+static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
+{
+ boot_cpu_physical_apicid = read_apic_id();
+ return 1;
+}
+
+static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ unsigned int round = 0;
+ int cpu, uninitialized_var(apicid);
+
+ /*
+ * The cpus in the mask must all be on the apic cluster.
+ */
+ for_each_cpu(cpu, cpumask) {
+ int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
+ WARN(1, "Not a valid mask!");
+
+ return BAD_APICID;
+ }
+ apicid = new_apicid;
+ round++;
+ }
+ return apicid;
+}
+
+static unsigned int
+es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
+ const struct cpumask *andmask)
+{
+ int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
+ cpumask_var_t cpumask;
+
+ if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+ return apicid;
+
+ cpumask_and(cpumask, inmask, andmask);
+ cpumask_and(cpumask, cpumask, cpu_online_mask);
+ apicid = es7000_cpu_mask_to_apicid(cpumask);
+
+ free_cpumask_var(cpumask);
+
+ return apicid;
+}
+
+static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static int probe_es7000(void)
+{
+ /* probed later in mptable/ACPI hooks */
+ return 0;
+}
+
+static int es7000_mps_ret;
+static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
+ char *productid)
+{
+ int ret = 0;
+
+ if (mpc->oemptr) {
+ struct mpc_oemtable *oem_table =
+ (struct mpc_oemtable *)mpc->oemptr;
+
+ if (!strncmp(oem, "UNISYS", 6))
+ ret = parse_unisys_oem((char *)oem_table);
+ }
+
+ es7000_mps_ret = ret;
+
+ return ret && !es7000_apic_is_cluster();
+}
+
+static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
+ char *productid)
+{
+ int ret = es7000_mps_ret;
+
+ return ret && es7000_apic_is_cluster();
+}
+
+/* We've been warned by a false positive warning. Use __refdata to keep calm. */
+static struct apic __refdata apic_es7000_cluster = {
+
+ .name = "es7000",
+ .probe = probe_es7000,
+ .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
+ .apic_id_registered = es7000_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all procs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = target_cpus_cluster,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = es7000_check_apicid_used,
+ .check_apicid_present = es7000_check_apicid_present,
+
+ .vector_allocation_domain = es7000_vector_allocation_domain,
+ .init_apic_ldr = es7000_init_apic_ldr_cluster,
+
+ .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
+ .setup_apic_routing = es7000_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
+ .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = es7000_check_phys_apicid_present,
+ .enable_apic_mode = es7000_enable_apic_mode,
+ .phys_pkg_id = es7000_phys_pkg_id,
+ .mps_oem_check = es7000_mps_oem_check_cluster,
+
+ .get_apic_id = es7000_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = es7000_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = es7000_send_IPI_allbutself,
+ .send_IPI_all = es7000_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
+
+ .trampoline_phys_low = 0x467,
+ .trampoline_phys_high = 0x469,
+
+ .wait_for_init_deassert = NULL,
+
+ /* Nothing to do for most platforms, since cleared by the INIT cycle: */
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = es7000_early_logical_apicid,
+};
+
+static struct apic __refdata apic_es7000 = {
+
+ .name = "es7000",
+ .probe = probe_es7000,
+ .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
+ .apic_id_registered = es7000_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ /* phys delivery to target CPUs: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = es7000_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = 0,
+ .check_apicid_used = es7000_check_apicid_used,
+ .check_apicid_present = es7000_check_apicid_present,
+
+ .vector_allocation_domain = es7000_vector_allocation_domain,
+ .init_apic_ldr = es7000_init_apic_ldr,
+
+ .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
+ .setup_apic_routing = es7000_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
+ .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = es7000_check_phys_apicid_present,
+ .enable_apic_mode = es7000_enable_apic_mode,
+ .phys_pkg_id = es7000_phys_pkg_id,
+ .mps_oem_check = es7000_mps_oem_check,
+
+ .get_apic_id = es7000_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = es7000_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = es7000_send_IPI_allbutself,
+ .send_IPI_all = es7000_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = 0x467,
+ .trampoline_phys_high = 0x469,
+
+ .wait_for_init_deassert = es7000_wait_for_init_deassert,
+
+ /* Nothing to do for most platforms, since cleared by the INIT cycle: */
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = es7000_early_logical_apicid,
+};
+
+/*
+ * Need to check for es7000 followed by es7000_cluster, so this order
+ * in apic_drivers is important.
+ */
+apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 00000000000..31cb9ae992b
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,90 @@
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
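+/*
+ * The sample period is watchdog_thresh seconds expressed in CPU
+ * cycles: cpu_khz * 1000 is the clock rate in Hz (cycles per second).
+ * E.g. at 2 GHz (cpu_khz == 2000000) with watchdog_thresh == 10 this
+ * yields 2 * 10^10 cycles, i.e. a 10 second period.
+ */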
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
+{
+ return (u64)(cpu_khz) * 1000 * watchdog_thresh;
+}
+#endif
+
+#ifdef arch_trigger_all_cpu_backtrace
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
+void arch_trigger_all_cpu_backtrace(void)
+{
+ int i;
+
+ if (test_and_set_bit(0, &backtrace_flag))
+ /*
+ * If there is already a trigger_all_cpu_backtrace() in progress
+		 * (backtrace_flag == 1), don't output duplicate cpu dump info.
+ */
+ return;
+
+ cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+ apic->send_IPI_all(NMI_VECTOR);
+
+ /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpumask_empty(to_cpumask(backtrace_mask)))
+ break;
+ mdelay(1);
+ }
+
+ clear_bit(0, &backtrace_flag);
+ smp_mb__after_clear_bit();
+}
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ int cpu;
+
+ cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+ arch_spin_lock(&lock);
+ printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
+ arch_spin_unlock(&lock);
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ return NMI_HANDLED;
+ }
+
+ return NMI_DONE;
+}
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+ register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
+ 0, "arch_bt");
+ return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
+#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
new file mode 100644
index 00000000000..9a15d4b9055
--- /dev/null
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -0,0 +1,4043 @@
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/syscore_ops.h>
+#include <linux/msi.h>
+#include <linux/htirq.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h> /* time_after() */
+#include <linux/slab.h>
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+#include <linux/bootmem.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
+
+#include <asm/idle.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/dma.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/msidef.h>
+#include <asm/hypertransport.h>
+#include <asm/setup.h>
+#include <asm/irq_remapping.h>
+#include <asm/hpet.h>
+#include <asm/hw_irq.h>
+
+#include <asm/apic.h>
+
+#define __apicdebuginit(type) static type __init
+#define for_each_irq_pin(entry, head) \
+ for (entry = head; entry; entry = entry->next)
+
+/*
+ * Is the SiS APIC rmw bug present ?
+ * -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(vector_lock);
+
+static struct ioapic {
+ /*
+ * # of IRQ routing registers
+ */
+ int nr_registers;
+ /*
+ * Saved state during suspend/resume, or while enabling intr-remap.
+ */
+ struct IO_APIC_route_entry *saved_registers;
+ /* I/O APIC config */
+ struct mpc_ioapic mp_config;
+ /* IO APIC gsi routing info */
+ struct mp_ioapic_gsi gsi_config;
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} ioapics[MAX_IO_APICS];
+
+#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
+
+int mpc_ioapic_id(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicid;
+}
+
+unsigned int mpc_ioapic_addr(int ioapic_idx)
+{
+ return ioapics[ioapic_idx].mp_config.apicaddr;
+}
+
+struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+{
+ return &ioapics[ioapic_idx].gsi_config;
+}
+
+int nr_ioapics;
+
+/* The one past the highest gsi number used */
+u32 gsi_top;
+
+/* MP IRQ source entries */
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+/* GSI interrupts */
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+int skip_ioapic_setup;
+
+/**
+ * disable_ioapic_support() - disables ioapic support at runtime
+ */
+void disable_ioapic_support(void)
+{
+#ifdef CONFIG_PCI
+ noioapicquirk = 1;
+ noioapicreroute = -1;
+#endif
+ skip_ioapic_setup = 1;
+}
+
+static int __init parse_noapic(char *str)
+{
+ /* disable IO-APIC */
+ disable_ioapic_support();
+ return 0;
+}
+early_param("noapic", parse_noapic);
+
+static int io_apic_setup_irq_pin(unsigned int irq, int node,
+ struct io_apic_irq_attr *attr);
+
+/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+void mp_save_irq(struct mpc_intsrc *m)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
+ return;
+ }
+
+ memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+struct irq_pin_list {
+ int apic, pin;
+ struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *alloc_irq_pin_list(int node)
+{
+ return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
+}
+
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
+
+int __init arch_early_irq_init(void)
+{
+ struct irq_cfg *cfg;
+ int count, node, i;
+
+ if (!legacy_pic->nr_legacy_irqs)
+ io_apic_irqs = ~0UL;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ ioapics[i].saved_registers =
+ kzalloc(sizeof(struct IO_APIC_route_entry) *
+ ioapics[i].nr_registers, GFP_KERNEL);
+ if (!ioapics[i].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
+ }
+
+ cfg = irq_cfgx;
+ count = ARRAY_SIZE(irq_cfgx);
+ node = cpu_to_node(0);
+
+ /* Make sure the legacy interrupts are marked in the bitmap */
+ irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
+
+ for (i = 0; i < count; i++) {
+ irq_set_chip_data(i, &cfg[i]);
+ zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
+ /*
+ * For legacy IRQ's, start with assigning irq0 to irq15 to
+ * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
+ */
+ if (i < legacy_pic->nr_legacy_irqs) {
+ cfg[i].vector = IRQ0_VECTOR + i;
+ cpumask_set_cpu(0, cfg[i].domain);
+ }
+ }
+
+ return 0;
+}
+
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq_get_chip_data(irq);
+}
+
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+{
+ struct irq_cfg *cfg;
+
+ cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+ if (!cfg)
+ return NULL;
+ if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
+ goto out_cfg;
+ if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ goto out_domain;
+ return cfg;
+out_domain:
+ free_cpumask_var(cfg->domain);
+out_cfg:
+ kfree(cfg);
+ return NULL;
+}
+
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
+{
+ if (!cfg)
+ return;
+ irq_set_chip_data(at, NULL);
+ free_cpumask_var(cfg->domain);
+ free_cpumask_var(cfg->old_domain);
+ kfree(cfg);
+}
+
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+{
+ int res = irq_alloc_desc_at(at, node);
+ struct irq_cfg *cfg;
+
+ if (res < 0) {
+ if (res != -EEXIST)
+ return NULL;
+ cfg = irq_get_chip_data(at);
+ if (cfg)
+ return cfg;
+ }
+
+ cfg = alloc_irq_cfg(at, node);
+ if (cfg)
+ irq_set_chip_data(at, cfg);
+ else
+ irq_free_desc(at);
+ return cfg;
+}
+
+static int alloc_irq_from(unsigned int from, int node)
+{
+ return irq_alloc_desc_from(from, node);
+}
+
+static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
+{
+ free_irq_cfg(at, cfg);
+ irq_free_desc(at);
+}
+
+struct io_apic {
+ unsigned int index;
+ unsigned int unused[3];
+ unsigned int data;
+ unsigned int unused2[11];
+ unsigned int eoi;
+};
+
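+/*
+ * Register window layout implied by the padding above: the index
+ * register sits at offset 0x00, the data window at 0x10 and the EOI
+ * register at 0x40. An access writes the register number to 'index'
+ * and then reads or writes 'data'.
+ */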
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
+}
+
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(vector, &io_apic->eoi);
+}
+
+unsigned int native_ioapic_read(unsigned int apic, unsigned int reg)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ return readl(&io_apic->data);
+}
+
+void native_ioapic_write(unsigned int apic, unsigned int reg,
+ unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
+ */
+void native_ioapic_modify(unsigned int apic, unsigned int reg,
+ unsigned int value)
+{
+ struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+ if (sis_apic_bug)
+ writel(reg, &io_apic->index);
+ writel(value, &io_apic->data);
+}
+
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
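+
+/*
+ * The union overlays one 64-bit routing entry on the two 32-bit words
+ * the hardware exposes: w1 corresponds to I/O APIC register
+ * 0x10 + 2 * pin and w2 to 0x11 + 2 * pin, as read and written below.
+ */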
+
+static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ return eu.entry;
+}
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ eu.entry = __ioapic_read_entry(apic, pin);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ */
+static void
+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ union entry_union eu = {{0, 0}};
+
+ eu.entry = e;
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __ioapic_write_entry(apic, pin, e);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+ unsigned long flags;
+ union entry_union eu = { .entry.mask = 1 };
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static int
+__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+ struct irq_pin_list **last, *entry;
+
+ /* don't allow duplicates */
+ last = &cfg->irq_2_pin;
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ if (entry->apic == apic && entry->pin == pin)
+ return 0;
+ last = &entry->next;
+ }
+
+ entry = alloc_irq_pin_list(node);
+ if (!entry) {
+		printk(KERN_ERR "cannot allocate irq_pin_list (%d,%d,%d)\n",
+ node, apic, pin);
+ return -ENOMEM;
+ }
+ entry->apic = apic;
+ entry->pin = pin;
+
+ *last = entry;
+ return 0;
+}
+
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+ if (__add_pin_to_irq_node(cfg, node, apic, pin))
+		panic("IO-APIC: failed to add irq-pin. Cannot proceed\n");
+}
+
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ /* every one is different, right? */
+ return;
+ }
+ }
+
+ /* old apic/pin didn't exist, so just add new ones */
+ add_pin_to_irq_node(cfg, node, newapic, newpin);
+}
+
+static void __io_apic_modify_irq(struct irq_pin_list *entry,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ unsigned int reg, pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+ reg &= mask_and;
+ reg |= mask_or;
+ io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+ if (final)
+ final(entry);
+}
+
+static void io_apic_modify_irq(struct irq_cfg *cfg,
+ int mask_and, int mask_or,
+ void (*final)(struct irq_pin_list *entry))
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ __io_apic_modify_irq(entry, mask_and, mask_or, final);
+}
+
+static void io_apic_sync(struct irq_pin_list *entry)
+{
+ /*
+ * Synchronize the IO-APIC and the CPU by doing
+ * a dummy read from the IO-APIC
+ */
+ struct io_apic __iomem *io_apic;
+ io_apic = io_apic_base(entry->apic);
+ readl(&io_apic->data);
+}
+
+static void mask_ioapic(struct irq_cfg *cfg)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void mask_ioapic_irq(struct irq_data *data)
+{
+ mask_ioapic(data->chip_data);
+}
+
+static void __unmask_ioapic(struct irq_cfg *cfg)
+{
+ io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
+
+static void unmask_ioapic(struct irq_cfg *cfg)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_ioapic(cfg);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_ioapic_irq(struct irq_data *data)
+{
+ unmask_ioapic(data->chip_data);
+}
+
+/*
+ * IO-APIC versions below 0x20 don't support the EOI register.
+ * For the record, here is the information about various versions:
+ * 0Xh 82489DX
+ * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
+ * 30h-FFh Reserved
+ *
+ *     Some of the Intel ICH specs (ICH2 to ICH5) document the io-apic
+ *     version as 0x2. This is a documentation error; these ICH chips
+ *     actually use io-apics of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+ */
+static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg)
+{
+ if (mpc_ioapic_ver(apic) >= 0x20) {
+ /*
+ * Intr-remapping uses pin number as the virtual vector
+ * in the RTE. Actual vector is programmed in
+ * intr-remapping table entry. Hence for the io-apic
+ * EOI we use the pin number.
+ */
+ if (cfg && irq_remapped(cfg))
+ io_apic_eoi(apic, pin);
+ else
+ io_apic_eoi(apic, vector);
+ } else {
+ struct IO_APIC_route_entry entry, entry1;
+
+ entry = entry1 = __ioapic_read_entry(apic, pin);
+
+ /*
+ * Mask the entry and change the trigger mode to edge.
+ */
+ entry1.mask = 1;
+ entry1.trigger = IOAPIC_EDGE;
+
+ __ioapic_write_entry(apic, pin, entry1);
+
+ /*
+ * Restore the previous level triggered entry.
+ */
+ __ioapic_write_entry(apic, pin, entry);
+ }
+}
+
+static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+
+ /*
+ * Make sure the entry is masked and re-read the contents to check
+ * if it is a level triggered pin and if the remote-IRR is set.
+ */
+ if (!entry.mask) {
+ entry.mask = 1;
+ ioapic_write_entry(apic, pin, entry);
+ entry = ioapic_read_entry(apic, pin);
+ }
+
+ if (entry.irr) {
+ unsigned long flags;
+
+ /*
+ * Make sure the trigger mode is set to level. Explicit EOI
+ * doesn't clear the remote-IRR if the trigger mode is not
+ * set to level.
+ */
+ if (!entry.trigger) {
+ entry.trigger = IOAPIC_LEVEL;
+ ioapic_write_entry(apic, pin, entry);
+ }
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_pin(apic, pin, entry.vector, NULL);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ /*
+ * Clear the rest of the bits in the IO-APIC RTE except for the mask
+ * bit.
+ */
+ ioapic_mask_entry(apic, pin);
+ entry = ioapic_read_entry(apic, pin);
+ if (entry.irr)
+		printk(KERN_ERR "Unable to reset IRR for apic: %d, pin: %d\n",
+ mpc_ioapic_id(apic), pin);
+}
+
+static void clear_IO_APIC (void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ clear_IO_APIC_pin(apic, pin);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Support for broken MP BIOSes: enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries[MAX_PIRQS] = {
+ [0 ... MAX_PIRQS - 1] = -1
+};
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Save all the IO-APIC RTEs
+ */
+int save_ioapic_entries(void)
+{
+ int apic, pin;
+ int err = 0;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!ioapics[apic].saved_registers) {
+ err = -ENOMEM;
+ continue;
+ }
+
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ ioapics[apic].saved_registers[pin] =
+ ioapic_read_entry(apic, pin);
+ }
+
+ return err;
+}
+
+/*
+ * Mask all IO APIC entries.
+ */
+void mask_ioapic_entries(void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!ioapics[apic].saved_registers)
+ continue;
+
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapics[apic].saved_registers[pin];
+ if (!entry.mask) {
+ entry.mask = 1;
+ ioapic_write_entry(apic, pin, entry);
+ }
+ }
+ }
+}
+
+/*
+ * Restore the IO APIC entries which were saved in the ioapic structure.
+ */
+int restore_ioapic_entries(void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (!ioapics[apic].saved_registers)
+ continue;
+
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ ioapic_write_entry(apic, pin,
+ ioapics[apic].saved_registers[pin]);
+ }
+ return 0;
+}
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int ioapic_idx, int pin, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].irqtype == type &&
+ (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
+ mp_irqs[i].dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].dstirq == pin)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+
+ return mp_irqs[i].dstirq;
+ }
+ return -1;
+}
+
+static int __init find_isa_irq_apic(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ if (test_bit(lbus, mp_bus_not_pci) &&
+ (mp_irqs[i].irqtype == type) &&
+ (mp_irqs[i].srcbusirq == irq))
+ break;
+ }
+
+ if (i < mp_irq_entries) {
+ int ioapic_idx;
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
+ return ioapic_idx;
+ }
+
+ return -1;
+}
+
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
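+/* 0x4d0 covers IRQ0-7, 0x4d1 covers IRQ8-15; a set bit means level triggered. */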
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < legacy_pic->nr_legacy_irqs) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+
+#endif
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (1)
+#define default_PCI_polarity(idx) (1)
+
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx) (1)
+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
+
+static int irq_polarity(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+ int polarity;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
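+	/* Bits 0-1 of the MP irqflag encode polarity; bits 2-3 the trigger mode. */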
+ switch (mp_irqs[idx].irqflag & 3)
+ {
+	case 0: /* conforms, i.e. bus-type dependent polarity */
+ if (test_bit(bus, mp_bus_not_pci))
+ polarity = default_ISA_polarity(idx);
+ else
+ polarity = default_PCI_polarity(idx);
+ break;
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ return polarity;
+}
+
+static int irq_trigger(int idx)
+{
+ int bus = mp_irqs[idx].srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch ((mp_irqs[idx].irqflag>>2) & 3)
+ {
+	case 0: /* conforms, i.e. bus-type dependent */
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /* set before the switch */
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ trigger = default_MCA_trigger(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+#endif
+ break;
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
+ return trigger;
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+ int irq;
+ int bus = mp_irqs[idx].srcbus;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].dstirq != pin)
+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+ if (test_bit(bus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].srcbusirq;
+ } else {
+ u32 gsi = gsi_cfg->gsi_base + pin;
+
+ if (gsi >= NR_IRQS_LEGACY)
+ irq = gsi;
+ else
+ irq = gsi_top + gsi;
+ }
+
+#ifdef CONFIG_X86_32
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
+ }
+#endif
+
+ return irq;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int ioapic_idx, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG,
+ "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_printk(APIC_VERBOSE,
+ "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
+ break;
+
+ if (!test_bit(lbus, mp_bus_not_pci) &&
+ !mp_irqs[i].irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+
+ if (!(ioapic_idx || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ set_io_apic_irq_attr(irq_attr, ioapic_idx,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ return irq;
+ }
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0) {
+ set_io_apic_irq_attr(irq_attr, ioapic_idx,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ best_guess = irq;
+ }
+ }
+ }
+ return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+void lock_vector_lock(void)
+{
+	/*
+	 * Used to ensure that the online set of cpus does not change
+	 * during assign_irq_vector().
+	 */
+ raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ raw_spin_unlock(&vector_lock);
+}
+
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
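+	/*
+	 * Vectors are handed out in steps of 8 from a rotating offset, so
+	 * each 16-vector priority level (vector >> 4) receives at most two
+	 * IRQs per pass over the vector space.
+	 */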
+ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+ static int current_offset = VECTOR_OFFSET_START % 8;
+ unsigned int old_vector;
+ int cpu, err;
+ cpumask_var_t tmp_mask;
+
+ if (cfg->move_in_progress)
+ return -EBUSY;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ old_vector = cfg->vector;
+ if (old_vector) {
+ cpumask_and(tmp_mask, mask, cpu_online_mask);
+ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+ if (!cpumask_empty(tmp_mask)) {
+ free_cpumask_var(tmp_mask);
+ return 0;
+ }
+ }
+
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
+ int new_cpu;
+ int vector, offset;
+
+ apic->vector_allocation_domain(cpu, tmp_mask);
+
+ vector = current_vector;
+ offset = current_offset;
+next:
+ vector += 8;
+ if (vector >= first_system_vector) {
+ /* If out of vectors on large boxen, must share them. */
+ offset = (offset + 1) % 8;
+ vector = FIRST_EXTERNAL_VECTOR + offset;
+ }
+ if (unlikely(current_vector == vector))
+ continue;
+
+ if (test_bit(vector, used_vectors))
+ goto next;
+
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ goto next;
+ /* Found one! */
+ current_vector = vector;
+ current_offset = offset;
+ if (old_vector) {
+ cfg->move_in_progress = 1;
+ cpumask_copy(cfg->old_domain, cfg->domain);
+ }
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ per_cpu(vector_irq, new_cpu)[vector] = irq;
+ cfg->vector = vector;
+ cpumask_copy(cfg->domain, tmp_mask);
+ err = 0;
+ break;
+ }
+ free_cpumask_var(tmp_mask);
+ return err;
+}
+
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ int err;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector(irq, cfg, mask);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return err;
+}
+
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
+{
+ int cpu, vector;
+
+ BUG_ON(!cfg->vector);
+
+ vector = cfg->vector;
+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ per_cpu(vector_irq, cpu)[vector] = -1;
+
+ cfg->vector = 0;
+ cpumask_clear(cfg->domain);
+
+ if (likely(!cfg->move_in_progress))
+ return;
+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] != irq)
+ continue;
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ break;
+ }
+ }
+ cfg->move_in_progress = 0;
+}
+
+void __setup_vector_irq(int cpu)
+{
+ /* Initialize vector_irq on a new cpu */
+ int irq, vector;
+ struct irq_cfg *cfg;
+
+ /*
+ * vector_lock will make sure that we don't run into irq vector
+ * assignments that might be happening on another cpu in parallel,
+ * while we setup our initial vector to irq mappings.
+ */
+ raw_spin_lock(&vector_lock);
+ /* Mark the inuse vectors */
+ for_each_active_irq(irq) {
+ cfg = irq_get_chip_data(irq);
+ if (!cfg)
+ continue;
+ /*
+ * If it is a legacy IRQ handled by the legacy PIC, this cpu
+ * will be part of the irq_cfg's domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
+ cpumask_set_cpu(cpu, cfg->domain);
+
+ if (!cpumask_test_cpu(cpu, cfg->domain))
+ continue;
+ vector = cfg->vector;
+ per_cpu(vector_irq, cpu)[vector] = irq;
+ }
+ /* Mark the free vectors */
+ for (vector = 0; vector < NR_VECTORS; ++vector) {
+ irq = per_cpu(vector_irq, cpu)[vector];
+ if (irq < 0)
+ continue;
+
+ cfg = irq_cfg(irq);
+ if (!cpumask_test_cpu(cpu, cfg->domain))
+ per_cpu(vector_irq, cpu)[vector] = -1;
+ }
+ raw_spin_unlock(&vector_lock);
+}
+
+static struct irq_chip ioapic_chip;
+
+#ifdef CONFIG_X86_32
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs are edge default
+ */
+ return 0;
+}
+#else
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ return 1;
+}
+#endif
+
+static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
+ unsigned long trigger)
+{
+ struct irq_chip *chip = &ioapic_chip;
+ irq_flow_handler_t hdl;
+ bool fasteoi;
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL) {
+ irq_set_status_flags(irq, IRQ_LEVEL);
+ fasteoi = true;
+ } else {
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ fasteoi = false;
+ }
+
+ if (irq_remapped(cfg)) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ irq_remap_modify_chip_defaults(chip);
+ fasteoi = trigger != 0;
+ }
+
+ hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+ irq_set_chip_and_handler_name(irq, chip, hdl,
+ fasteoi ? "fasteoi" : "edge");
+}
+
+
+static int setup_ir_ioapic_entry(int irq,
+ struct IR_IO_APIC_route_entry *entry,
+ unsigned int destination, int vector,
+ struct io_apic_irq_attr *attr)
+{
+ int index;
+ struct irte irte;
+ int ioapic_id = mpc_ioapic_id(attr->ioapic);
+ struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
+
+ if (!iommu) {
+ pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
+ return -ENODEV;
+ }
+
+ index = alloc_irte(iommu, irq, 1);
+ if (index < 0) {
+ pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
+ return -ENOMEM;
+ }
+
+ prepare_irte(&irte, vector, destination);
+
+ /* Set source-id of interrupt request */
+ set_ioapic_sid(&irte, ioapic_id);
+
+ modify_irte(irq, &irte);
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+ "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+ "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+ "Avail:%X Vector:%02X Dest:%08X "
+ "SID:%04X SQ:%X SVT:%X)\n",
+ attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
+ irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+ irte.avail, irte.vector, irte.dest_id,
+ irte.sid, irte.sq, irte.svt);
+
+ memset(entry, 0, sizeof(*entry));
+
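+	/*
+	 * The 16-bit IRTE index is split across the RTE: bit 15 goes into
+	 * index2, bits 0-14 into index.
+	 */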
+ entry->index2 = (index >> 15) & 0x1;
+ entry->zero = 0;
+ entry->format = 1;
+ entry->index = (index & 0x7fff);
+ /*
+	 * The IO-APIC RTE will be configured with the virtual vector.
+	 * The irq handler will do the explicit EOI to the io-apic.
+ */
+ entry->vector = attr->ioapic_pin;
+ entry->mask = 0; /* enable IRQ */
+ entry->trigger = attr->trigger;
+ entry->polarity = attr->polarity;
+
+	/*
+	 * Mask level triggered irqs.
+	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+	 */
+ if (attr->trigger)
+ entry->mask = 1;
+
+ return 0;
+}
+
+static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
+ unsigned int destination, int vector,
+ struct io_apic_irq_attr *attr)
+{
+ if (intr_remapping_enabled)
+ return setup_ir_ioapic_entry(irq,
+ (struct IR_IO_APIC_route_entry *)entry,
+ destination, vector, attr);
+
+ memset(entry, 0, sizeof(*entry));
+
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->dest = destination;
+ entry->vector = vector;
+ entry->mask = 0; /* enable IRQ */
+ entry->trigger = attr->trigger;
+ entry->polarity = attr->polarity;
+
+ /*
+ * Mask level triggered irqs.
+ * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+ */
+ if (attr->trigger)
+ entry->mask = 1;
+
+ return 0;
+}
+
+static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
+ struct io_apic_irq_attr *attr)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned int dest;
+
+ if (!IO_APIC_IRQ(irq))
+ return;
+ /*
+ * For legacy irqs, cfg->domain starts with cpu 0 for legacy
+ * controllers like 8259. Now that IO-APIC can handle this irq, update
+ * the cfg->domain.
+ */
+ if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
+ apic->vector_allocation_domain(0, cfg->domain);
+
+ if (assign_irq_vector(irq, cfg, apic->target_cpus()))
+ return;
+
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG
+ "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
+ "IRQ %d Mode:%i Active:%i Dest:%d)\n",
+ attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
+ cfg->vector, irq, attr->trigger, attr->polarity, dest);
+
+ if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
+ pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
+ mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
+ __clear_irq_vector(irq, cfg);
+
+ return;
+ }
+
+ ioapic_register_intr(irq, cfg, attr->trigger);
+ if (irq < legacy_pic->nr_legacy_irqs)
+ legacy_pic->mask(irq);
+
+ ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
+}
+
+static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
+{
+ if (idx != -1)
+ return false;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic_idx), pin);
+ return true;
+}
+
+static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
+{
+ int idx, node = cpu_to_node(0);
+ struct io_apic_irq_attr attr;
+ unsigned int pin, irq;
+
+ for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
+ idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+ if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
+ continue;
+
+ irq = pin_2_irq(idx, ioapic_idx, pin);
+
+ if ((ioapic_idx > 0) && (irq > 16))
+ continue;
+
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(ioapic_idx, irq))
+ continue;
+
+ set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
+ irq_polarity(idx));
+
+ io_apic_setup_irq_pin(irq, node, &attr);
+ }
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+ unsigned int ioapic_idx;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ __io_apic_setup_irqs(ioapic_idx);
+}
+
+/*
+ * For a GSI that is not on the first ioapic but cannot use
+ * acpi_register_gsi(), like the special SCI on the IBM x3330.
+ */
+void setup_IO_APIC_irq_extra(u32 gsi)
+{
+ int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
+ struct io_apic_irq_attr attr;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic_idx = mp_find_ioapic(gsi);
+ if (ioapic_idx < 0)
+ return;
+
+ pin = mp_find_ioapic_pin(ioapic_idx, gsi);
+ idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+ if (idx == -1)
+ return;
+
+ irq = pin_2_irq(idx, ioapic_idx, pin);
+
+	/* Only handle the non-legacy irqs on secondary ioapics */
+ if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
+ return;
+
+ set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
+ irq_polarity(idx));
+
+ io_apic_setup_irq_pin_once(irq, node, &attr);
+}
+
+/*
+ * Set up the timer pin, possibly with the 8259A-master behind.
+ */
+static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
+ unsigned int pin, int vector)
+{
+ struct IO_APIC_route_entry entry;
+
+ if (intr_remapping_enabled)
+ return;
+
+ memset(&entry, 0, sizeof(entry));
+
+ /*
+ * We use logical delivery to get the timer IRQ
+ * to the first CPU.
+ */
+ entry.dest_mode = apic->irq_dest_mode;
+ entry.mask = 0; /* don't mask IRQ for edge */
+ entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+ entry.delivery_mode = apic->irq_delivery_mode;
+ entry.polarity = 0;
+ entry.trigger = 0;
+ entry.vector = vector;
+
+ /*
+ * The timer IRQ doesn't have to know that behind the
+ * scene we may have a 8259A-master in AEOI mode ...
+ */
+ irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+ "edge");
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_idx, pin, entry);
+}
+
+__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+{
+ int i;
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+ union IO_APIC_reg_03 reg_03;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk("\n");
+ printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
+
+ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
+ printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
+ reg_01.bits.entries);
+
+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
+ reg_01.bits.version);
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+ * but the value of reg_02 is read as the previous read register
+ * value, so ignore it if reg_02 == reg_01.
+ */
+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ }
+
+ /*
+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+ * or reg_03, but the value of reg_0[23] is read as the previous read
+ * register value, so ignore it if reg_03 == reg_0[12].
+ */
+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+ reg_03.raw != reg_01.raw) {
+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ }
+
+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+ if (intr_remapping_enabled) {
+ printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
+ " Pol Stat Indx2 Zero Vect:\n");
+ } else {
+ printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+ " Stat Dmod Deli Vect:\n");
+ }
+
+ for (i = 0; i <= reg_01.bits.entries; i++) {
+ if (intr_remapping_enabled) {
+ struct IO_APIC_route_entry entry;
+ struct IR_IO_APIC_route_entry *ir_entry;
+
+ entry = ioapic_read_entry(ioapic_idx, i);
+ ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
+ printk(KERN_DEBUG " %02x %04X ",
+ i,
+ ir_entry->index
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %X %02X\n",
+ ir_entry->format,
+ ir_entry->mask,
+ ir_entry->trigger,
+ ir_entry->irr,
+ ir_entry->polarity,
+ ir_entry->delivery_status,
+ ir_entry->index2,
+ ir_entry->zero,
+ ir_entry->vector
+ );
+ } else {
+ struct IO_APIC_route_entry entry;
+
+ entry = ioapic_read_entry(ioapic_idx, i);
+ printk(KERN_DEBUG " %02x %02X ",
+ i,
+ entry.dest
+ );
+ printk("%1d %1d %1d %1d %1d "
+ "%1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
+ }
+}
+
+__apicdebuginit(void) print_IO_APICs(void)
+{
+ int ioapic_idx;
+ struct irq_cfg *cfg;
+ unsigned int irq;
+ struct irq_chip *chip;
+
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx),
+ ioapics[ioapic_idx].nr_registers);
+
+ /*
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
+ */
+ printk(KERN_INFO "testing the IO APIC.......................\n");
+
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ print_IO_APIC(ioapic_idx);
+
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ for_each_active_irq(irq) {
+ struct irq_pin_list *entry;
+
+ chip = irq_get_chip(irq);
+ if (chip != &ioapic_chip)
+ continue;
+
+ cfg = irq_get_chip_data(irq);
+ if (!cfg)
+ continue;
+ entry = cfg->irq_2_pin;
+ if (!entry)
+ continue;
+ printk(KERN_DEBUG "IRQ%d ", irq);
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ printk("-> %d:%d", entry->apic, entry->pin);
+ printk("\n");
+ }
+
+ printk(KERN_INFO ".................................... done.\n");
+}
+
+__apicdebuginit(void) print_APIC_field(int base)
+{
+ int i;
+
+ printk(KERN_DEBUG);
+
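+	/* A field is 256 bits: 8 consecutive 32-bit registers, 0x10 apart. */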
+ for (i = 0; i < 8; i++)
+ printk(KERN_CONT "%08x", apic_read(base + i*0x10));
+
+ printk(KERN_CONT "\n");
+}
+
+__apicdebuginit(void) print_local_APIC(void *dummy)
+{
+ unsigned int i, v, ver, maxlvt;
+ u64 icr;
+
+ printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
+ v = apic_read(APIC_LVR);
+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_LDR);
+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ }
+ v = apic_read(APIC_SPIV);
+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+ printk(KERN_DEBUG "... APIC ISR field:\n");
+ print_APIC_field(APIC_ISR);
+ printk(KERN_DEBUG "... APIC TMR field:\n");
+ print_APIC_field(APIC_TMR);
+ printk(KERN_DEBUG "... APIC IRR field:\n");
+ print_APIC_field(APIC_IRR);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+ v = apic_read(APIC_LVTT);
+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) { /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) { /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
+ printk("\n");
+}
+
+__apicdebuginit(void) print_local_APICs(int maxcpu)
+{
+ int cpu;
+
+ if (!maxcpu)
+ return;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
+ preempt_enable();
+}
+
+__apicdebuginit(void) print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+ printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
+
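+	/* OCW3 0x0b selects the ISR for the next read; 0x0a restores the IRR. */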
+	outb(0x0b, 0xa0);
+	outb(0x0b, 0x20);
+	v = inb(0xa0) << 8 | inb(0x20);
+	outb(0x0a, 0xa0);
+	outb(0x0a, 0x20);
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+
+static int __initdata show_lapic = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+__apicdebuginit(int) print_ICs(void)
+{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
+ print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!cpu_has_apic && !apic_from_smp_config())
+ return 0;
+
+ print_local_APICs(show_lapic);
+ print_IO_APICs();
+
+ return 0;
+}
+
+late_initcall(print_ICs);
+
+
+/* Where, if anywhere, the i8259 is connected in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+void __init enable_IO_APIC(void)
+{
+ int i8259_apic, i8259_pin;
+ int apic;
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+ int pin;
+ /* See if any of the pins is in ExtINT mode */
+ for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ struct IO_APIC_route_entry entry;
+ entry = ioapic_read_entry(apic, pin);
+
+ /* If the interrupt line is enabled and in ExtInt mode
+ * I have found the pin where the i8259 is connected.
+ */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
+ }
+ }
+ }
+ found_i8259:
+	/* Look to see if the MP table has reported the ExtINT */
+	/*
+	 * If we could not find the appropriate pin by looking at the ioapic,
+	 * the i8259 is probably not connected to the ioapic, but give the
+	 * mptable a chance anyway.
+	 */
+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+ /* Trust the MP table if nothing is setup in the hardware */
+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ ioapic_i8259.pin = i8259_pin;
+ ioapic_i8259.apic = i8259_apic;
+ }
+ /* Complain if the MP table and the hardware disagree */
+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ {
+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+ }
+
+ /*
+ * Do not trust the IO-APIC being empty at bootup
+ */
+ clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+ /*
+ * Clear the IO-APIC before rebooting:
+ */
+ clear_IO_APIC();
+
+ if (!legacy_pic->nr_legacy_irqs)
+ return;
+
+ /*
+ * If the i8259 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrupts can be delivered.
+ *
+ * With interrupt-remapping, for now we will use virtual wire A mode,
+ * as virtual wire B is a little more complex (we need to configure both
+ * IOAPIC RTE as well as interrupt-remapping table entry).
+ * As this gets called during crash dump, keep this simple for now.
+ */
+ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+ struct IO_APIC_route_entry entry;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = dest_ExtINT; /* ExtInt */
+ entry.vector = 0;
+ entry.dest = read_apic_id();
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+ }
+
+ /*
+ * Use virtual wire A mode when interrupt remapping is enabled.
+ */
+ if (cpu_has_apic || apic_from_smp_config())
+ disconnect_bsp_APIC(!intr_remapping_enabled &&
+ ioapic_i8259.pin != -1);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
+{
+ union IO_APIC_reg_00 reg_00;
+ physid_mask_t phys_id_present_map;
+ int ioapic_idx;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ /*
+ * This is broken; anything with a real cpu count has to
+ * circumvent this idiocy regardless.
+ */
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+ /* Read the register 0 value */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mpc_ioapic_id(ioapic_idx);
+
+ if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ reg_00.bits.ID);
+ ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
+ }
+
+ /*
+ * Sanity check, is the ID really free? Every APIC in a
+ * system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (apic->check_apicid_used(&phys_id_present_map,
+ mpc_ioapic_id(ioapic_idx))) {
+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < get_physical_broadcast(); i++)
+ if (!physid_isset(i, phys_id_present_map))
+ break;
+ if (i >= get_physical_broadcast())
+ panic("Max APIC ID exceeded!\n");
+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+ i);
+ physid_set(i, phys_id_present_map);
+ ioapics[ioapic_idx].mp_config.apicid = i;
+ } else {
+ physid_mask_t tmp;
+ apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
+ &tmp);
+ apic_printk(APIC_VERBOSE, "Setting %d in the "
+ "phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ physids_or(phys_id_present_map, phys_id_present_map, tmp);
+ }
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mpc_ioapic_id(ioapic_idx))
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].dstapic == old_id)
+ mp_irqs[i].dstapic
+ = mpc_ioapic_id(ioapic_idx);
+
+ /*
+ * Update the ID register according to the right value
+ * from the MPC table if they are different.
+ */
+ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
+ continue;
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
+
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
+ printk("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE, " ok.\n");
+ }
+}
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+ if (acpi_ioapic)
+ return;
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ setup_ioapic_ids_from_mpc_nocheck();
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+ no_timer_check = 1;
+ return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ * back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+ unsigned long t1 = jiffies;
+ unsigned long flags;
+
+ if (no_timer_check)
+ return 1;
+
+ local_save_flags(flags);
+ local_irq_enable();
+ /* Let ten ticks pass... */
+ mdelay((10 * 1000) / HZ);
+ local_irq_restore(flags);
+
+ /*
+ * Expect a few ticks at least, to be sure some possible
+ * glue logic does not lock up after one or two first
+ * ticks in a non-ExtINT mode. Also the local APIC
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+
+ /* jiffies wrap? */
+ if (time_after(jiffies, t1 + 4))
+ return 1;
+ return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge-triggered IRQs need to resend any interrupt that was delayed,
+ * but this is now handled in the device-independent code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need to
+ * return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+
+static unsigned int startup_ioapic_irq(struct irq_data *data)
+{
+ int was_pending = 0, irq = data->irq;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < legacy_pic->nr_legacy_irqs) {
+ legacy_pic->mask(irq);
+ if (legacy_pic->irq_pending(irq))
+ was_pending = 1;
+ }
+ __unmask_ioapic(data->chip_data);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+}
+
+static int ioapic_retrigger_irq(struct irq_data *data)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned long flags;
+
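+	/* Retrigger by sending the vector as an IPI to one cpu in the domain. */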
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+
+#ifdef CONFIG_SMP
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
+
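+	/*
+	 * The high RTE word (register 0x11 + 2*pin) holds the destination,
+	 * the low word (0x10 + 2*pin) the vector and control bits.
+	 */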
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ /*
+ * With interrupt-remapping, destination information comes
+ * from interrupt-remapping table entry.
+ */
+ if (!irq_remapped(cfg))
+ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+ reg |= vector;
+ io_apic_modify(apic, 0x10 + pin*2, reg);
+ }
+}
+
+/*
+ * Either sets data->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
+ * leaves data->affinity untouched.
+ */
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id)
+{
+ struct irq_cfg *cfg = data->chip_data;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return -1;
+
+ if (assign_irq_vector(data->irq, data->chip_data, mask))
+ return -1;
+
+ cpumask_copy(data->affinity, mask);
+
+ *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
+ return 0;
+}
+
+static int
+ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ unsigned int dest, irq = data->irq;
+ unsigned long flags;
+ int ret;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (!ret) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, data->chip_data);
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return ret;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update (of vector and cpu destination) of the IRTE, followed by a
+ * flush of the hardware cache.
+ *
+ * For level triggered, we eliminate the io-apic RTE modification (with
+ * the updated vector information) by using a virtual vector (the io-apic
+ * pin number). The real vector used to interrupt the cpu comes from the
+ * interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of IRTE, the same mechanism
+ * is used to migrate MSI irq's in the presence of interrupt-remapping.
+ */
+static int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
+ struct irte irte;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return -EINVAL;
+
+ if (get_irte(irq, &irte))
+ return -EBUSY;
+
+ if (assign_irq_vector(irq, cfg, mask))
+ return -EBUSY;
+
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
+
+ irte.vector = cfg->vector;
+ irte.dest_id = IRTE_DEST(dest);
+
+ /*
+ * Atomically updates the IRTE with the new destination, vector
+ * and flushes the interrupt entry cache.
+ */
+ modify_irte(irq, &irte);
+
+ /*
+ * After this point, all the interrupts will start arriving
+ * at the new destination. So, time to cleanup the previous
+ * vector allocation.
+ */
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
+
+ cpumask_copy(data->affinity, mask);
+ return 0;
+}
+
+#else
+static inline int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ return 0;
+}
+#endif
+
+asmlinkage void smp_irq_move_cleanup_interrupt(void)
+{
+ unsigned vector, me;
+
+ ack_APIC_irq();
+ irq_enter();
+ exit_idle();
+
+ me = smp_processor_id();
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ unsigned int irq;
+ unsigned int irr;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ irq = __this_cpu_read(vector_irq[vector]);
+
+ if (irq == -1)
+ continue;
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ cfg = irq_cfg(irq);
+ raw_spin_lock(&desc->lock);
+
+ /*
+ * Check if the irq migration is in progress. If so, we
+ * haven't received the cleanup request yet for this irq.
+ */
+ if (cfg->move_in_progress)
+ goto unlock;
+
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ goto unlock;
+
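+		/* The IRR is 8 x 32-bit registers, spaced 0x10 apart. */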
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ /*
+		 * Check if the vector that needs to be cleaned up is
+		 * registered in the cpu's IRR. If so, then this is not
+		 * the best time to clean it up. Let's clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to myself.
+ */
+ if (irr & (1 << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ goto unlock;
+ }
+ __this_cpu_write(vector_irq[vector], -1);
+unlock:
+ raw_spin_unlock(&desc->lock);
+ }
+
+ irq_exit();
+}
+
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+{
+ unsigned me;
+
+ if (likely(!cfg->move_in_progress))
+ return;
+
+ me = smp_processor_id();
+
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ send_cleanup_vector(cfg);
+}
+
+static void irq_complete_move(struct irq_cfg *cfg)
+{
+ __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+ struct irq_cfg *cfg = irq_get_chip_data(irq);
+
+ if (!cfg)
+ return;
+
+ __irq_complete_move(cfg, cfg->vector);
+}
+#else
+static inline void irq_complete_move(struct irq_cfg *cfg) { }
+#endif
+
+static void ack_apic_edge(struct irq_data *data)
+{
+ irq_complete_move(data->chip_data);
+ irq_move_irq(data);
+ ack_APIC_irq();
+}
+
+atomic_t irq_mis_count;
+
+static void ack_apic_level(struct irq_data *data)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ int i, do_unmask_irq = 0, irq = data->irq;
+ unsigned long v;
+
+ irq_complete_move(cfg);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ /* If we are moving the irq we need to mask it */
+ if (unlikely(irqd_is_setaffinity_pending(data))) {
+ do_unmask_irq = 1;
+ mask_ioapic(cfg);
+ }
+#endif
+
+ /*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ *
+	 * Also, when a cpu goes offline, fixup_irqs() will forward any
+	 * unhandled interrupt on the offlined cpu to the new cpu
+	 * destination that is handling the corresponding interrupt. This
+	 * interrupt forwarding is done via IPIs. Hence, in this case a
+	 * level-triggered io-apic interrupt will also be seen as an edge
+	 * interrupt in the IRR, and we can't rely on the cpu's EOI being
+	 * broadcast to the IO-APICs to clear the remote IRR of the
+	 * level-triggered interrupt. Hence, on IO-APICs supporting the
+	 * EOI register we do an explicit EOI to clear the remote IRR, and
+	 * on IO-APICs which don't have an EOI register we use the above
+	 * logic (mask+edge followed by unmask+level) from Manfred Spraul
+	 * to clear the remote IRR.
+ */
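+	/*
+	 * The TMR shares the IRR layout of 8 x 32-bit registers at a 0x10
+	 * stride: ((i & ~0x1f) >> 1) == (i / 32) * 0x10.
+	 */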
+ i = cfg->vector;
+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+ /*
+ * We must acknowledge the irq before we move it or the acknowledge will
+ * not propagate properly.
+ */
+ ack_APIC_irq();
+
+ /*
+ * Tail end of clearing remote IRR bit (either by delivering the EOI
+ * message via io-apic EOI register write or simulating it using
+	 * mask+edge followed by unmask+level logic) manually when the
+ * level triggered interrupt is seen as the edge triggered interrupt
+ * at the cpu.
+ */
+ if (!(v & (1 << (i & 0x1f)))) {
+ atomic_inc(&irq_mis_count);
+
+ eoi_ioapic_irq(irq, cfg);
+ }
+
+	/* Now we can move and re-enable the irq */
+ if (unlikely(do_unmask_irq)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occasions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still set the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two effects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+	 * Based on failed experiments of reprogramming the
+	 * ioapic entry from outside of irq context (starting
+	 * with masking the ioapic entry and then polling until
+	 * Remote IRR was clear before reprogramming the
+	 * ioapic), I don't trust the Remote IRR bit to be
+	 * completely accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and is causing problems then it is a hardware bug
+ * and you can go talk to the chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(cfg))
+ irq_move_masked_irq(data);
+ unmask_ioapic(cfg);
+ }
+}
+
+#ifdef CONFIG_IRQ_REMAP
+static void ir_ack_apic_edge(struct irq_data *data)
+{
+ ack_APIC_irq();
+}
+
+static void ir_ack_apic_level(struct irq_data *data)
+{
+ ack_APIC_irq();
+ eoi_ioapic_irq(data->irq, data->chip_data);
+}
+
+static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
+{
+ seq_printf(p, " IR-%s", data->chip->name);
+}
+
+static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+{
+ chip->irq_print_chip = ir_print_prefix;
+ chip->irq_ack = ir_ack_apic_edge;
+ chip->irq_eoi = ir_ack_apic_level;
+
+#ifdef CONFIG_SMP
+ chip->irq_set_affinity = ir_ioapic_set_affinity;
+#endif
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+static struct irq_chip ioapic_chip __read_mostly = {
+ .name = "IO-APIC",
+ .irq_startup = startup_ioapic_irq,
+ .irq_mask = mask_ioapic_irq,
+ .irq_unmask = unmask_ioapic_irq,
+ .irq_ack = ack_apic_edge,
+ .irq_eoi = ack_apic_level,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = ioapic_set_affinity,
+#endif
+ .irq_retrigger = ioapic_retrigger_irq,
+};
+
+static inline void init_IO_APIC_traps(void)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
+
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+ for_each_active_irq(irq) {
+ cfg = irq_get_chip_data(irq);
+ if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
+ /*
+ * Hmm.. We don't have an entry for this,
+ * so default to an old-fashioned 8259
+ * interrupt if we can..
+ */
+ if (irq < legacy_pic->nr_legacy_irqs)
+ legacy_pic->make_irq(irq);
+ else
+ /* Strange. Oh, well.. */
+ irq_set_chip(irq, &no_irq_chip);
+ }
+ }
+}
+
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void mask_lapic_irq(struct irq_data *data)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void unmask_lapic_irq(struct irq_data *data)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq(struct irq_data *data)
+{
+ ack_APIC_irq();
+}
+
+static struct irq_chip lapic_chip __read_mostly = {
+ .name = "local-APIC",
+ .irq_mask = mask_lapic_irq,
+ .irq_unmask = unmask_lapic_irq,
+ .irq_ack = ack_lapic_irq,
+};
+
+static void lapic_register_intr(int irq)
+{
+ irq_clear_status_flags(irq, IRQ_LEVEL);
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+ "edge");
+}
+
+/*
+ * This looks a bit hackish but it's about the only way of sending
+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
+ * not support the ExtINT mode, unfortunately. We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA. --macro
+ */
+static inline void __init unlock_ExtINT_logic(void)
+{
+ int apic, pin, i;
+ struct IO_APIC_route_entry entry0, entry1;
+ unsigned char save_control, save_freq_select;
+
+ pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ apic = find_isa_irq_apic(8, mp_INT);
+ if (apic == -1) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ entry0 = ioapic_read_entry(apic, pin);
+ clear_IO_APIC_pin(apic, pin);
+
+ memset(&entry1, 0, sizeof(entry1));
+
+ entry1.dest_mode = 0; /* physical delivery */
+ entry1.mask = 0; /* unmask IRQ now */
+ entry1.dest = hard_smp_processor_id();
+ entry1.delivery_mode = dest_ExtINT;
+ entry1.polarity = entry0.polarity;
+ entry1.trigger = 0;
+ entry1.vector = 0;
+
+ ioapic_write_entry(apic, pin, entry1);
+
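+	/*
+	 * Program the RTC to raise periodic interrupts (rate select 6,
+	 * i.e. 1024 Hz) so that INTA cycles reach the 8259A glue logic
+	 * while we poll below.
+	 */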
+ save_control = CMOS_READ(RTC_CONTROL);
+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+ RTC_FREQ_SELECT);
+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
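+	/*
+	 * Poll for up to a second; each observed periodic-interrupt flag
+	 * also drains the budget faster, so we exit soon after the
+	 * interrupts start flowing.
+	 */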
+ i = 100;
+ while (i-- > 0) {
+ mdelay(10);
+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+ i -= 10;
+ }
+
+ CMOS_WRITE(save_control, RTC_CONTROL);
+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+ clear_IO_APIC_pin(apic, pin);
+
+ ioapic_write_entry(apic, pin, entry0);
+}
+
+static int disable_timer_pin_1 __initdata;
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+int timer_through_8259 __initdata;
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for all platforms.
+ */
+static inline void __init check_timer(void)
+{
+ struct irq_cfg *cfg = irq_get_chip_data(0);
+ int node = cpu_to_node(0);
+ int apic1, pin1, apic2, pin2;
+ unsigned long flags;
+ int no_pin1 = 0;
+
+ local_irq_save(flags);
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ legacy_pic->mask(0);
+ assign_irq_vector(0, cfg, apic->target_cpus());
+
+ /*
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
+ */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ legacy_pic->init(1);
+
+ pin1 = find_isa_irq_pin(0, mp_INT);
+ apic1 = find_isa_irq_apic(0, mp_INT);
+ pin2 = ioapic_i8259.pin;
+ apic2 = ioapic_i8259.apic;
+
+ apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+ "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
+
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ if (intr_remapping_enabled)
+ panic("BIOS bug: timer not connected to IO-APIC");
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
+
+ if (pin1 != -1) {
+ /*
+ * Ok, does IRQ0 through the IOAPIC work?
+ */
+ if (no_pin1) {
+ add_pin_to_irq_node(cfg, node, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ } else {
+			/*
+			 * For an edge trigger, setup_ioapic_irq already
+			 * leaves it unmasked, so we only need to unmask
+			 * if it is level-triggered.
+			 * Do we really have a level-triggered timer?
+			 */
+ int idx;
+ idx = find_irq_entry(apic1, pin1, mp_INT);
+ if (idx != -1 && irq_trigger(idx))
+ unmask_ioapic(cfg);
+ }
+ if (timer_irq_works()) {
+ if (disable_timer_pin_1 > 0)
+ clear_IO_APIC_pin(0, pin1);
+ goto out;
+ }
+ if (intr_remapping_enabled)
+ panic("timer doesn't work through Interrupt-remapped IO-APIC");
+ local_irq_disable();
+ clear_IO_APIC_pin(apic1, pin1);
+ if (!no_pin1)
+ apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+ "(IRQ0) through the 8259A ...\n");
+ apic_printk(APIC_QUIET, KERN_INFO
+ "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ legacy_pic->unmask(0);
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ timer_through_8259 = 1;
+ goto out;
+ }
+ /*
+ * Cleanup, just in case ...
+ */
+ local_irq_disable();
+ legacy_pic->mask(0);
+ clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ }
+
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as Virtual Wire IRQ...\n");
+
+ lapic_register_intr(0);
+ apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
+ legacy_pic->unmask(0);
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ goto out;
+ }
+ local_irq_disable();
+ legacy_pic->mask(0);
+ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+
+ apic_printk(APIC_QUIET, KERN_INFO
+ "...trying to set up timer as ExtINT IRQ...\n");
+
+ legacy_pic->init(0);
+ legacy_pic->make_irq(0);
+ apic_write(APIC_LVT0, APIC_DM_EXTINT);
+
+ unlock_ExtINT_logic();
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ goto out;
+ }
+ local_irq_disable();
+ apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ if (x2apic_preenabled)
+ apic_printk(APIC_QUIET, KERN_INFO
+ "Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
+ "report. Then try booting with the 'noapic' option.\n");
+out:
+ local_irq_restore(flags);
+}
+
+/*
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices. However there may be an I/O APIC pin available for
+ * this interrupt regardless. The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A. In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table. With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default. We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor. Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now. No actual device should request
+ * it anyway. --macro
+ */
+#define PIC_IRQS (1UL << PIC_CASCADE_IR)
+
+void __init setup_IO_APIC(void)
+{
+
+ /*
+ * calling enable_IO_APIC() is moved to setup_local_APIC for BP
+ */
+ io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+
+ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ /*
+ * Set up IO-APIC IRQ routing.
+ */
+ x86_init.mpparse.setup_ioapic_ids();
+
+ sync_Arb_IDs();
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ if (legacy_pic->nr_legacy_irqs)
+ check_timer();
+}
+
+/*
+ * Called after all the initialization is done. If we didn't find any
+ * APIC bugs then we can allow the modify fast path
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+ if (sis_apic_bug == -1)
+ sis_apic_bug = 0;
+ return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
+static void resume_ioapic_id(int ioapic_idx)
+{
+ unsigned long flags;
+ union IO_APIC_reg_00 reg_00;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
+ reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void ioapic_resume(void)
+{
+ int ioapic_idx;
+
+ for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--)
+ resume_ioapic_id(ioapic_idx);
+
+ restore_ioapic_entries();
+}
+
+static struct syscore_ops ioapic_syscore_ops = {
+ .suspend = save_ioapic_entries,
+ .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_ops(void)
+{
+ register_syscore_ops(&ioapic_syscore_ops);
+
+ return 0;
+}
+
+device_initcall(ioapic_init_ops);
+
+/*
+ * Dynamic irq allocation and deallocation
+ */
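+/*
+ * Returns the allocated irq (at or above nr_irqs_gsi) on success,
+ * 0 on failure.
+ */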
+unsigned int create_irq_nr(unsigned int from, int node)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int ret = 0;
+ int irq;
+
+ if (from < nr_irqs_gsi)
+ from = nr_irqs_gsi;
+
+ irq = alloc_irq_from(from, node);
+ if (irq < 0)
+ return 0;
+ cfg = alloc_irq_cfg(irq, node);
+ if (!cfg) {
+ free_irq_at(irq, NULL);
+ return 0;
+ }
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+ ret = irq;
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (ret) {
+ irq_set_chip_data(irq, cfg);
+ irq_clear_status_flags(irq, IRQ_NOREQUEST);
+ } else {
+ free_irq_at(irq, cfg);
+ }
+ return ret;
+}
+
+int create_irq(void)
+{
+ int node = cpu_to_node(0);
+ unsigned int irq_want;
+ int irq;
+
+ irq_want = nr_irqs_gsi;
+ irq = create_irq_nr(irq_want, node);
+
+ if (irq == 0)
+ irq = -1;
+
+ return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+ struct irq_cfg *cfg = irq_get_chip_data(irq);
+ unsigned long flags;
+
+ irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
+
+ if (irq_remapped(cfg))
+ free_irte(irq);
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ __clear_irq_vector(irq, cfg);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ free_irq_at(irq, cfg);
+}
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+ struct msi_msg *msg, u8 hpet_id)
+{
+ struct irq_cfg *cfg;
+ int err;
+ unsigned dest;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ if (err)
+ return err;
+
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+
+ if (irq_remapped(cfg)) {
+ struct irte irte;
+ int ir_index;
+ u16 sub_handle;
+
+ ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+ BUG_ON(ir_index == -1);
+
+ prepare_irte(&irte, cfg->vector, dest);
+
+ /* Set source-id of interrupt request */
+ if (pdev)
+ set_msi_sid(&irte, pdev);
+ else
+ set_hpet_sid(&irte, hpet_id);
+
+ modify_irte(irq, &irte);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->data = sub_handle;
+ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+ MSI_ADDR_IR_SHV |
+ MSI_ADDR_IR_INDEX1(ir_index) |
+ MSI_ADDR_IR_INDEX2(ir_index);
+ } else {
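+		/*
+		 * Plain (non-remapped) MSI: the address word selects the
+		 * target (destination ID, dest mode, redirection hint) and
+		 * the data word carries the vector and delivery mode.
+		 */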
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
+ }
+ return err;
+}
+
+#ifdef CONFIG_SMP
+static int
+msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ struct msi_msg msg;
+ unsigned int dest;
+
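+	/*
+	 * Pick the new vector/destination, then patch both into the
+	 * device's cached MSI message.
+	 */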
+ if (__ioapic_set_affinity(data, mask, &dest))
+ return -1;
+
+ __get_cached_msi_msg(data->msi_desc, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ __write_msi_msg(data->msi_desc, &msg);
+
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+ .name = "PCI-MSI",
+ .irq_unmask = unmask_msi_irq,
+ .irq_mask = mask_msi_irq,
+ .irq_ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = msi_set_affinity,
+#endif
+ .irq_retrigger = ioapic_retrigger_irq,
+};
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ */
+static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
+{
+ struct intel_iommu *iommu;
+ int index;
+
+ iommu = map_dev_to_ir(dev);
+ if (!iommu) {
+ printk(KERN_ERR
+ "Unable to map PCI %s to iommu\n", pci_name(dev));
+ return -ENOENT;
+ }
+
+ index = alloc_irte(iommu, irq, nvec);
+ if (index < 0) {
+ printk(KERN_ERR
+ "Unable to allocate %d IRTE for PCI %s\n", nvec,
+ pci_name(dev));
+ return -ENOSPC;
+ }
+ return index;
+}
+
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+{
+ struct irq_chip *chip = &msi_chip;
+ struct msi_msg msg;
+ int ret;
+
+ ret = msi_compose_msg(dev, irq, &msg, -1);
+ if (ret < 0)
+ return ret;
+
+ irq_set_msi_desc(irq, msidesc);
+ write_msi_msg(irq, &msg);
+
+ if (irq_remapped(irq_get_chip_data(irq))) {
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ irq_remap_modify_chip_defaults(chip);
+ }
+
+ irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+
+ return 0;
+}
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ int node, ret, sub_handle, index = 0;
+ unsigned int irq, irq_want;
+ struct msi_desc *msidesc;
+ struct intel_iommu *iommu = NULL;
+
+ /* x86 doesn't support multiple MSI yet */
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
+ node = dev_to_node(&dev->dev);
+ irq_want = nr_irqs_gsi;
+ sub_handle = 0;
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = create_irq_nr(irq_want, node);
+ if (irq == 0)
+ return -1;
+ irq_want = irq + 1;
+ if (!intr_remapping_enabled)
+ goto no_ir;
+
+ if (!sub_handle) {
+ /*
+			 * allocate the consecutive block of IRTEs
+ * for 'nvec'
+ */
+ index = msi_alloc_irte(dev, irq, nvec);
+ if (index < 0) {
+ ret = index;
+ goto error;
+ }
+ } else {
+ iommu = map_dev_to_ir(dev);
+ if (!iommu) {
+ ret = -ENOENT;
+ goto error;
+ }
+ /*
+ * setup the mapping between the irq and the IRTE
+			 * base index, with the sub_handle pointing to the
+ * appropriate interrupt remap table entry.
+ */
+ set_irte_irq(irq, iommu, index, sub_handle);
+ }
+no_ir:
+ ret = setup_msi_irq(dev, msidesc, irq);
+ if (ret < 0)
+ goto error;
+ sub_handle++;
+ }
+ return 0;
+
+error:
+ destroy_irq(irq);
+ return ret;
+}
+
+void native_teardown_msi_irq(unsigned int irq)
+{
+ destroy_irq(irq);
+}
+
+#ifdef CONFIG_DMAR_TABLE
+#ifdef CONFIG_SMP
+static int
+dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest, irq = data->irq;
+ struct msi_msg msg;
+
+ if (__ioapic_set_affinity(data, mask, &dest))
+ return -1;
+
+ dmar_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+ msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
+
+ dmar_msi_write(irq, &msg);
+
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+static struct irq_chip dmar_msi_type = {
+ .name = "DMAR_MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = dmar_msi_set_affinity,
+#endif
+ .irq_retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg, -1);
+ if (ret < 0)
+ return ret;
+ dmar_msi_write(irq, &msg);
+ irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+ "edge");
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_HPET_TIMER
+
+#ifdef CONFIG_SMP
+static int hpet_msi_set_affinity(struct irq_data *data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ struct msi_msg msg;
+ unsigned int dest;
+
+ if (__ioapic_set_affinity(data, mask, &dest))
+ return -1;
+
+ hpet_msi_read(data->handler_data, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ hpet_msi_write(data->handler_data, &msg);
+
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+static struct irq_chip hpet_msi_type = {
+ .name = "HPET_MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = hpet_msi_set_affinity,
+#endif
+ .irq_retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
+{
+ struct irq_chip *chip = &hpet_msi_type;
+ struct msi_msg msg;
+ int ret;
+
+ if (intr_remapping_enabled) {
+ struct intel_iommu *iommu = map_hpet_to_ir(id);
+ int index;
+
+ if (!iommu)
+ return -1;
+
+ index = alloc_irte(iommu, irq, 1);
+ if (index < 0)
+ return -1;
+ }
+
+ ret = msi_compose_msg(NULL, irq, &msg, id);
+ if (ret < 0)
+ return ret;
+
+ hpet_msi_write(irq_get_handler_data(irq), &msg);
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ if (irq_remapped(irq_get_chip_data(irq)))
+ irq_remap_modify_chip_defaults(chip);
+
+ irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+ return 0;
+}
+#endif
+
+#endif /* CONFIG_PCI_MSI */
+/*
+ * HyperTransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+ struct ht_irq_msg msg;
+ fetch_ht_irq_msg(irq, &msg);
+
+ msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
+ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+ msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
+ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+ write_ht_irq_msg(irq, &msg);
+}
+
+static int
+ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg *cfg = data->chip_data;
+ unsigned int dest;
+
+ if (__ioapic_set_affinity(data, mask, &dest))
+ return -1;
+
+ target_ht_irq(data->irq, dest, cfg->vector);
+ return 0;
+}
+
+#endif
+
+static struct irq_chip ht_irq_chip = {
+ .name = "PCI-HT",
+ .irq_mask = mask_ht_irq,
+ .irq_unmask = unmask_ht_irq,
+ .irq_ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+ .irq_set_affinity = ht_set_affinity,
+#endif
+ .irq_retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+ struct irq_cfg *cfg;
+ int err;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ if (!err) {
+ struct ht_irq_msg msg;
+ unsigned dest;
+
+ dest = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus());
+
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+ msg.address_lo =
+ HT_IRQ_LOW_BASE |
+ HT_IRQ_LOW_DEST_ID(dest) |
+ HT_IRQ_LOW_VECTOR(cfg->vector) |
+ ((apic->irq_dest_mode == 0) ?
+ HT_IRQ_LOW_DM_PHYSICAL :
+ HT_IRQ_LOW_DM_LOGICAL) |
+ HT_IRQ_LOW_RQEOI_EDGE |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ HT_IRQ_LOW_MT_FIXED :
+ HT_IRQ_LOW_MT_ARBITRATED) |
+ HT_IRQ_LOW_IRQ_MASKED;
+
+ write_ht_irq_msg(irq, &msg);
+
+ irq_set_chip_and_handler_name(irq, &ht_irq_chip,
+ handle_edge_irq, "edge");
+
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+ }
+ return err;
+}
+#endif /* CONFIG_HT_IRQ */
+
+static int
+io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
+{
+ struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
+ int ret;
+
+ if (!cfg)
+ return -EINVAL;
+ ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
+ if (!ret)
+ setup_ioapic_irq(irq, cfg, attr);
+ return ret;
+}
+
+int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+ struct io_apic_irq_attr *attr)
+{
+ unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
+ int ret;
+
+ /* Avoid redundant programming */
+ if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
+ pr_debug("Pin %d-%d already programmed\n",
+ mpc_ioapic_id(ioapic_idx), pin);
+ return 0;
+ }
+ ret = io_apic_setup_irq_pin(irq, node, attr);
+ if (!ret)
+ set_bit(pin, ioapics[ioapic_idx].pin_programmed);
+ return ret;
+}
+
+static int __init io_apic_get_redir_entries(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	/* The register returns the maximum redirection entry index
+	 * supported, which is one less than the total number of
+	 * redirection entries.
+ */
+ return reg_01.bits.entries + 1;
+}
+
+static void __init probe_nr_irqs_gsi(void)
+{
+ int nr;
+
+ nr = gsi_top + NR_IRQS_LEGACY;
+ if (nr > nr_irqs_gsi)
+ nr_irqs_gsi = nr;
+
+ printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
+}
+
+int get_nr_irqs_gsi(void)
+{
+ return nr_irqs_gsi;
+}
+
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+ nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+ nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+ /*
+	 * reserve 16 dynamic irqs per GSI for MSI and HT
+ */
+ nr += nr_irqs_gsi * 16;
+#endif
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
+ return NR_IRQS_LEGACY;
+}
+
+int io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int node;
+
+ if (!IO_APIC_IRQ(irq)) {
+		apic_printk(APIC_QUIET, KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ irq_attr->ioapic);
+ return -EINVAL;
+ }
+
+ node = dev ? dev_to_node(dev) : cpu_to_node(0);
+
+ return io_apic_setup_irq_pin_once(irq, node, irq_attr);
+}
+
+#ifdef CONFIG_X86_32
+static int __init io_apic_get_unique_id(int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+ physid_mask_t tmp;
+ unsigned long flags;
+ int i = 0;
+
+ /*
+	 * The P4 platform supports up to 256 APIC IDs on two separate APIC
+	 * buses (one for LAPICs, one for IOAPICs), whereas its predecessors
+	 * only support up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= get_physical_broadcast()) {
+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (apic->check_apicid_used(&apic_id_map, apic_id)) {
+
+ for (i = 0; i < get_physical_broadcast(); i++) {
+ if (!apic->check_apicid_used(&apic_id_map, i))
+ break;
+ }
+
+ if (i == get_physical_broadcast())
+ panic("Max apic_id exceeded!\n");
+
+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ apic->apicid_to_cpu_present(apic_id, &tmp);
+ physids_or(apic_id_map, apic_id_map, tmp);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id) {
+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+ return -1;
+ }
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
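+/*
+ * Pre-xAPIC Intel parts share one APIC bus between LAPICs and IO-APICs,
+ * so IO-APIC IDs must be made unique system-wide; xAPIC parts can keep
+ * the BIOS-provided ID as-is.
+ */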
+static u8 __init io_apic_unique_id(u8 id)
+{
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return io_apic_get_unique_id(nr_ioapics, id);
+ else
+ return id;
+}
+#else
+static u8 __init io_apic_unique_id(u8 id)
+{
+ int i;
+ DECLARE_BITMAP(used, 256);
+
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ __set_bit(mpc_ioapic_id(i), used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+}
+#endif
+
+static int __init io_apic_get_version(int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.version;
+}
+
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_trigger(idx);
+ *polarity = irq_polarity(idx);
+ return 0;
+}
+
+/*
+ * This function currently is only a helper for the i386 smp boot process,
+ * where we need to reprogram the ioredtbls to cater for the cpus which
+ * have come online, so the mask in all cases should simply be
+ * apic->target_cpus().
+ */
+#ifdef CONFIG_SMP
+void __init setup_ioapic_dest(void)
+{
+ int pin, ioapic, irq, irq_entry;
+ const struct cpumask *mask;
+ struct irq_data *idata;
+
+ if (skip_ioapic_setup == 1)
+ return;
+
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
+ for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
+
+ if ((ioapic > 0) && (irq > 16))
+ continue;
+
+ idata = irq_get_irq_data(irq);
+
+ /*
+ * Honour affinities which have been set in early boot
+ */
+ if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
+ mask = idata->affinity;
+ else
+ mask = apic->target_cpus();
+
+ if (intr_remapping_enabled)
+ ir_ioapic_set_affinity(idata, mask, false);
+ else
+ ioapic_set_affinity(idata, mask, false);
+ }
+
+}
+#endif
+
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+
+static struct resource *ioapic_resources;
+
+static struct resource * __init ioapic_setup_resources(int nr_ioapics)
+{
+ unsigned long n;
+ struct resource *res;
+ char *mem;
+ int i;
+
+ if (nr_ioapics <= 0)
+ return NULL;
+
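+	/*
+	 * A single bootmem chunk holds the resource array followed by the
+	 * name strings.
+	 */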
+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+ n *= nr_ioapics;
+
+ mem = alloc_bootmem(n);
+ res = (void *)mem;
+
+ mem += sizeof(struct resource) * nr_ioapics;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
+ }
+
+ ioapic_resources = res;
+
+ return res;
+}
+
+void __init native_ioapic_init_mappings(void)
+{
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ struct resource *ioapic_res;
+ int i;
+
+ ioapic_res = ioapic_setup_resources(nr_ioapics);
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mpc_ioapic_addr(i);
+#ifdef CONFIG_X86_32
+ if (!ioapic_phys) {
+ printk(KERN_ERR
+ "WARNING: bogus zero IO-APIC "
+ "address found in MPTABLE, "
+ "disabling IO/APIC support!\n");
+ smp_found_config = 0;
+ skip_ioapic_setup = 1;
+ goto fake_ioapic_page;
+ }
+#endif
+ } else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
+ ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
+ ioapic_phys);
+ idx++;
+
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
+ ioapic_res++;
+ }
+
+ probe_nr_irqs_gsi();
+}
+
+void __init ioapic_insert_resources(void)
+{
+ int i;
+ struct resource *r = ioapic_resources;
+
+ if (!r) {
+ if (nr_ioapics > 0)
+ printk(KERN_ERR
+ "IO APIC resources couldn't be allocated.\n");
+ return;
+ }
+
+ for (i = 0; i < nr_ioapics; i++) {
+ insert_resource(&iomem_resource, r);
+ r++;
+ }
+}
+
+int mp_find_ioapic(u32 gsi)
+{
+ int i = 0;
+
+ if (nr_ioapics == 0)
+ return -1;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+ if ((gsi >= gsi_cfg->gsi_base)
+ && (gsi <= gsi_cfg->gsi_end))
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
+}
+
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
+{
+ struct mp_ioapic_gsi *gsi_cfg;
+
+ if (WARN_ON(ioapic == -1))
+ return -1;
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if (WARN_ON(gsi > gsi_cfg->gsi_end))
+ return -1;
+
+ return gsi - gsi_cfg->gsi_base;
+}
+
+static __init int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
+ "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
+ return 1;
+ }
+ if (!address) {
+ printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+ int idx = 0;
+ int entries;
+ struct mp_ioapic_gsi *gsi_cfg;
+
+ if (bad_ioapic(address))
+ return;
+
+ idx = nr_ioapics;
+
+ ioapics[idx].mp_config.type = MP_IOAPIC;
+ ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+ ioapics[idx].mp_config.apicaddr = address;
+
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
+ ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
+
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ entries = io_apic_get_redir_entries(idx);
+ gsi_cfg = mp_ioapic_gsi_routing(idx);
+ gsi_cfg->gsi_base = gsi_base;
+ gsi_cfg->gsi_end = gsi_base + entries - 1;
+
+ /*
+ * The number of IO-APIC IRQ registers (== #pins):
+ */
+ ioapics[idx].nr_registers = entries;
+
+ if (gsi_cfg->gsi_end >= gsi_top)
+ gsi_top = gsi_cfg->gsi_end + 1;
+
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
+ mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+
+ nr_ioapics++;
+}
+
+/* Enable IOAPIC early just for system timer */
+void __init pre_init_apic_IRQ0(void)
+{
+ struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+
+ printk(KERN_INFO "Early APIC setup for system timer0\n");
+#ifndef CONFIG_SMP
+ physid_set_mask_of_physid(boot_cpu_physical_apicid,
+ &phys_cpu_present_map);
+#endif
+ setup_local_APIC();
+
+ io_apic_setup_irq_pin(0, 0, &attr);
+ irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+ "edge");
+}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
new file mode 100644
index 00000000000..cce91bf2667
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi.c
@@ -0,0 +1,167 @@
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/proto.h>
+#include <asm/ipi.h>
+
+void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector)
+{
+ unsigned long query_cpu;
+ unsigned long flags;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicast to each CPU instead.
+ * - mbligh
+ */
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int query_cpu;
+ unsigned long flags;
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid,
+ query_cpu), vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_X86_32
+
+void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
+ int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+	 * to an arbitrary mask, so I do a unicast to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask)
+ __default_send_IPI_dest_field(
+ early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
+ int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+ unsigned int this_cpu = smp_processor_id();
+
+ /* See Hack comment above */
+
+ local_irq_save(flags);
+ for_each_cpu(query_cpu, mask) {
+ if (query_cpu == this_cpu)
+ continue;
+ __default_send_IPI_dest_field(
+ early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+ vector, apic->dest_logical);
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * This is only used on smaller machines.
+ */
+void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
+{
+ unsigned long mask = cpumask_bits(cpumask)[0];
+ unsigned long flags;
+
+ if (WARN_ONCE(!mask, "empty IPI mask"))
+ return;
+
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
+ __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ local_irq_restore(flags);
+}
+
+void default_send_IPI_allbutself(int vector)
+{
+ /*
+	 * If there are no other CPUs in the system then we get an APIC send
+	 * error if we try to broadcast, so avoid sending IPIs in this case.
+ */
+ if (!(num_online_cpus() > 1))
+ return;
+
+ __default_local_send_IPI_allbutself(vector);
+}
+
+void default_send_IPI_all(int vector)
+{
+ __default_local_send_IPI_all(vector);
+}
+
+void default_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical);
+}
+
+/* must come after the send_IPI functions above for inlining */
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+ int apicid, cpuid;
+
+ if (!cpu_has_apic)
+ return 0;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
+#endif
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
new file mode 100644
index 00000000000..c4a61ca1349
--- /dev/null
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -0,0 +1,541 @@
+/*
+ * Written by: Patricia Gaughen, IBM Corporation
+ *
+ * Copyright (C) 2002, IBM Corp.
+ * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <gone@us.ibm.com>
+ */
+#include <linux/nodemask.h>
+#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/numa.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/processor.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/numaq.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/ipi.h>
+
+int found_numaq;
+
+/*
+ * Have to match translation table entries to main table entries by counter,
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+struct mpc_trans {
+ unsigned char mpc_type;
+ unsigned char trans_len;
+ unsigned char trans_type;
+ unsigned char trans_quad;
+ unsigned char trans_global;
+ unsigned char trans_local;
+ unsigned short trans_reserved;
+};
+
+static int mpc_record;
+
+static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+int quad_local_to_mp_bus_id[NR_CPUS/4][4];
+
+
+static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
+{
+ struct eachquadmem *eq = scd->eq + node;
+ u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
+ u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
+ int ret;
+
+ node_set(node, numa_nodes_parsed);
+ ret = numa_add_memblk(node, start, end);
+ BUG_ON(ret < 0);
+}
+
+/*
+ * Function: smp_dump_qct()
+ *
+ * Description: gets memory layout from the quad config table. This
+ * function also updates numa_nodes_parsed with the nodes (quads) present.
+ */
+static void __init smp_dump_qct(void)
+{
+ struct sys_cfg_data *scd;
+ int node;
+
+ scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
+
+ for_each_node(node) {
+ if (scd->quads_present31_0 & (1 << node))
+ numaq_register_node(node, scd);
+ }
+}
+
+void __cpuinit numaq_tsc_disable(void)
+{
+ if (!found_numaq)
+ return;
+
+ if (num_online_nodes() > 1) {
+ printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+ setup_clear_cpu_cap(X86_FEATURE_TSC);
+ }
+}
+
+static void __init numaq_tsc_init(void)
+{
+ numaq_tsc_disable();
+}
+
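+/*
+ * NUMA-Q logical APIC IDs: quad number in the high nibble, with the
+ * low nibble derived from the physical APIC ID (ID 0 maps to bit 0).
+ */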
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+ return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+/* x86_quirks member */
+static int mpc_apic_id(struct mpc_cpu *m)
+{
+ int quad = translation_table[mpc_record]->trans_quad;
+ int logical_apicid = generate_logical_apicid(quad, m->apicid);
+
+ printk(KERN_DEBUG
+ "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+ m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
+ (m->cpufeature & CPU_MODEL_MASK) >> 4,
+ m->apicver, quad, logical_apicid);
+
+ return logical_apicid;
+}
+
+/* x86_quirks member */
+static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
+{
+ int quad = translation_table[mpc_record]->trans_quad;
+ int local = translation_table[mpc_record]->trans_local;
+
+ mp_bus_id_to_node[m->busid] = quad;
+ mp_bus_id_to_local[m->busid] = local;
+
+ printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
+}
+
+/* x86_quirks member */
+static void mpc_oem_pci_bus(struct mpc_bus *m)
+{
+ int quad = translation_table[mpc_record]->trans_quad;
+ int local = translation_table[mpc_record]->trans_local;
+
+ quad_local_to_mp_bus_id[quad][local] = m->busid;
+}
+
+/*
+ * Called from mpparse code.
+ * mode = 0: prescan
+ * mode = 1: one mpc entry scanned
+ */
+static void numaq_mpc_record(unsigned int mode)
+{
+ if (!mode)
+ mpc_record = 0;
+ else
+ mpc_record++;
+}
+
+static void __init MP_translation_info(struct mpc_trans *m)
+{
+ printk(KERN_INFO
+ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+ m->trans_local);
+
+ if (mpc_record >= MAX_MPC_ENTRY)
+ printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+ else
+ translation_table[mpc_record] = m; /* stash this for later */
+
+ if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+ node_set_online(m->trans_quad);
+}
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+ int sum = 0;
+
+ while (len--)
+ sum += *mp++;
+
+ return sum & 0xFF;
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+static void __init smp_read_mpc_oem(struct mpc_table *mpc)
+{
+ struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
+ int count = sizeof(*oemtable); /* the header size */
+ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+ mpc_record = 0;
+ printk(KERN_INFO
+ "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
+
+ if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
+ printk(KERN_WARNING
+ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+ oemtable->signature[0], oemtable->signature[1],
+ oemtable->signature[2], oemtable->signature[3]);
+ return;
+ }
+
+ if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
+ printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+ return;
+ }
+
+ while (count < oemtable->length) {
+ switch (*oemptr) {
+ case MP_TRANSLATION:
+ {
+ struct mpc_trans *m = (void *)oemptr;
+
+ MP_translation_info(m);
+ oemptr += sizeof(*m);
+ count += sizeof(*m);
+ ++mpc_record;
+ break;
+ }
+ default:
+ printk(KERN_WARNING
+ "Unrecognised OEM table entry type! - %d\n",
+ (int)*oemptr);
+ return;
+ }
+ }
+}
+
+static __init void early_check_numaq(void)
+{
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ early_get_smp_config();
+
+ if (found_numaq) {
+ x86_init.mpparse.mpc_record = numaq_mpc_record;
+ x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
+ x86_init.mpparse.mpc_apic_id = mpc_apic_id;
+ x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
+ x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
+ x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
+ x86_init.timers.tsc_pre_init = numaq_tsc_init;
+ x86_init.pci.init = pci_numaq_init;
+ }
+}
+
+int __init numaq_numa_init(void)
+{
+ early_check_numaq();
+ if (!found_numaq)
+ return -ENOENT;
+ smp_dump_qct();
+
+ return 0;
+}
+
+#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
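+/* NUMA-Q APIC IDs are 4 bits wide, held in bits 27:24 of the ID register. */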
+static inline unsigned int numaq_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0x0F;
+}
+
+static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_logical(mask, vector);
+}
+
+static inline void numaq_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
+}
+
+static inline void numaq_send_IPI_all(int vector)
+{
+ numaq_send_IPI_mask(cpu_online_mask, vector);
+}
+
+#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
+#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
+
+/*
+ * Because we use NMIs rather than the INIT-STARTUP sequence to
+ * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
+ */
+static inline void numaq_smp_callin_clear_local_apic(void)
+{
+ clear_local_APIC();
+}
+
+static inline const struct cpumask *numaq_target_cpus(void)
+{
+ return cpu_all_mask;
+}
+
+static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return physid_isset(apicid, *map);
+}
+
+static inline unsigned long numaq_check_apicid_present(int bit)
+{
+ return physid_isset(bit, phys_cpu_present_map);
+}
+
+static inline int numaq_apic_id_registered(void)
+{
+ return 1;
+}
+
+static inline void numaq_init_apic_ldr(void)
+{
+ /* Already done in NUMA-Q firmware */
+}
+
+static inline void numaq_setup_apic_routing(void)
+{
+ printk(KERN_INFO
+ "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
+ nr_ioapics);
+}
+
+/*
+ * Skip adding the timer int on secondary nodes, which causes
+ * a small but painful rift in the time-space continuum.
+ */
+static inline int numaq_multi_timer_check(int apic, int irq)
+{
+ return apic != 0 && irq == 0;
+}
+
+static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
+{
+ /* We don't have a good way to do this yet - hack */
+ return physids_promote(0xFUL, retmap);
+}
+
+/*
+ * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
+ * cpu to APIC ID relation to properly interact with the intelligent
+ * mode of the cluster controller.
+ */
+static inline int numaq_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < 60)
+ return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
+ else
+ return BAD_APICID;
+}
+
+static inline int numaq_apicid_to_node(int logical_apicid)
+{
+ return logical_apicid >> 4;
+}
+
+static int numaq_numa_cpu_node(int cpu)
+{
+ int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ if (logical_apicid != BAD_APICID)
+ return numaq_apicid_to_node(logical_apicid);
+ return NUMA_NO_NODE;
+}
+
+static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
+{
+ int node = numaq_apicid_to_node(logical_apicid);
+ int cpu = __ffs(logical_apicid & 0xf);
+
+ physid_set_mask_of_physid(cpu + 4*node, retmap);
+}
+
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+
+static inline int numaq_check_phys_apicid_present(int phys_apicid)
+{
+ return 1;
+}
+
+/*
+ * We use physical apicids here, not logical, so just return the default
+ * physical broadcast to stop people from breaking us
+ */
+static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ return 0x0F;
+}
+
+static inline unsigned int
+numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ return 0x0F;
+}
+
+/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
+static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return cpuid_apic >> index_msb;
+}
+
+static int
+numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+ if (strncmp(oem, "IBM NUMA", 8))
+ printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
+ else
+ found_numaq = 1;
+
+ return found_numaq;
+}
+
+static int probe_numaq(void)
+{
+ /* already know from get_memcfg_numaq() */
+ return found_numaq;
+}
+
+static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_clear(retmask);
+ cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+}
+
+static void numaq_setup_portio_remap(void)
+{
+ int num_quads = num_online_nodes();
+
+ if (num_quads <= 1)
+ return;
+
+ printk(KERN_INFO
+ "Remapping cross-quad port I/O for %d quads\n", num_quads);
+
+ xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
+
+ printk(KERN_INFO
+ "xquad_portio vaddr 0x%08lx, len %08lx\n",
+ (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
+}
+
+/* Use __refdata to silence a false-positive section-mismatch warning. */
+static struct apic __refdata apic_numaq = {
+
+ .name = "NUMAQ",
+ .probe = probe_numaq,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = numaq_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* physical delivery on LOCAL quad: */
+ .irq_dest_mode = 0,
+
+ .target_cpus = numaq_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = numaq_check_apicid_used,
+ .check_apicid_present = numaq_check_apicid_present,
+
+ .vector_allocation_domain = numaq_vector_allocation_domain,
+ .init_apic_ldr = numaq_init_apic_ldr,
+
+ .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
+ .setup_apic_routing = numaq_setup_apic_routing,
+ .multi_timer_check = numaq_multi_timer_check,
+ .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
+ .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
+ .setup_portio_remap = numaq_setup_portio_remap,
+ .check_phys_apicid_present = numaq_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = numaq_phys_pkg_id,
+ .mps_oem_check = numaq_mps_oem_check,
+
+ .get_apic_id = numaq_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = numaq_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = numaq_send_IPI_allbutself,
+ .send_IPI_all = numaq_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
+ .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
+
+	/* We don't do anything here because we use NMIs to boot instead */
+ .wait_for_init_deassert = NULL,
+
+ .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
+ .x86_32_numa_cpu_node = numaq_numa_cpu_node,
+};
+
+apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
new file mode 100644
index 00000000000..0787bb3412f
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -0,0 +1,270 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+int no_broadcast = DEFAULT_SEND_IPI;
+
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ pr_info("Using %s mode\n",
+ no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
+ return 1;
+}
+__setup("no_ipi_broadcast=", no_ipi_broadcast);
+
+static int __init print_ipi_mode(void)
+{
+ pr_info("Using IPI %s mode\n",
+ no_broadcast ? "No-Shortcut" : "Shortcut");
+ return 0;
+}
+late_initcall(print_ipi_mode);
+
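+/*
+ * Flat logical destination mode: one LDR bit per CPU, which is why the
+ * default driver handles at most 8 CPUs.
+ */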
+static int default_x86_32_early_logical_apicid(int cpu)
+{
+ return 1 << cpu;
+}
+
+static void setup_apic_flat_routing(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ printk(KERN_INFO
+ "Enabling APIC mode: Flat. Using %d I/O APICs\n",
+ nr_ioapics);
+#endif
+}
+
+static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ /*
+ * Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_clear(retmask);
+ cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+}
+
+/* should be called last. */
+static int probe_default(void)
+{
+ return 1;
+}
+
+static struct apic apic_default = {
+
+ .name = "default",
+ .probe = probe_default,
+ .acpi_madt_oem_check = NULL,
+ .apic_id_registered = default_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = default_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = default_check_apicid_used,
+ .check_apicid_present = default_check_apicid_present,
+
+ .vector_allocation_domain = default_vector_allocation_domain,
+ .init_apic_ldr = default_init_apic_ldr,
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map,
+ .setup_apic_routing = setup_apic_flat_routing,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = default_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = default_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0x0F << 24,
+
+ .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = default_send_IPI_mask_logical,
+ .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
+ .send_IPI_allbutself = default_send_IPI_allbutself,
+ .send_IPI_all = default_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
+};
+
+apic_driver(apic_default);
+
+struct apic *apic = &apic_default;
+EXPORT_SYMBOL_GPL(apic);
+
+static int cmdline_apic __initdata;
+static int __init parse_apic(char *arg)
+{
+ struct apic **drv;
+
+ if (!arg)
+ return -EINVAL;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!strcmp((*drv)->name, arg)) {
+ apic = *drv;
+ cmdline_apic = 1;
+ return 0;
+ }
+ }
+
+ /* Parsed again by __setup for debug/verbose */
+ return 0;
+}
+early_param("apic", parse_apic);
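+
+/*
+ * Editorial sketch (not part of this patch): __apicdrivers and
+ * __apicdrivers_end bound a linker-built table that apic_driver()
+ * fills, so parse_apic() above can match a boot parameter such as
+ * "apic=bigsmp" against every registered driver.  A minimal analogue
+ * of that table walk, with hypothetical names:
+ */
+#if 0	/* illustrative only, never compiled */
+struct demo_driver { const char *name; int (*probe)(void); };
+static int demo_probe(void) { return 1; }
+static struct demo_driver demo_drivers[] = { { "default", demo_probe } };
+
+static struct demo_driver *demo_find(const char *arg)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(demo_drivers) / sizeof(demo_drivers[0]); i++)
+		if (!strcmp(demo_drivers[i].name, arg))
+			return &demo_drivers[i];
+	return NULL;
+}
+#endif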
+
+void __init default_setup_apic_routing(void)
+{
+ int version = apic_version[boot_cpu_physical_apicid];
+
+ if (num_possible_cpus() > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+			/* If P4 and above, fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+
+#ifdef CONFIG_X86_BIGSMP
+ /*
+ * This is used to switch to bigsmp mode when
+ * - There is no apic= option specified by the user
+ * - generic_apic_probe() has chosen apic_default as the sub_arch
+ * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
+ */
+
+ if (!cmdline_apic && apic == &apic_default)
+ generic_bigsmp_probe();
+#endif
+
+ if (apic->setup_apic_routing)
+ apic->setup_apic_routing();
+}
+
+void __init generic_apic_probe(void)
+{
+ if (!cmdline_apic) {
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe()) {
+ apic = *drv;
+ break;
+ }
+ }
+ /* Not visible without early console */
+ if (drv == __apicdrivers_end)
+ panic("Didn't find an APIC driver");
+ }
+ printk(KERN_INFO "Using APIC driver %s\n", apic->name);
+}
+
+/* These functions can switch the APIC even after the initial ->probe() */
+
+int __init
+generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!((*drv)->mps_oem_check))
+ continue;
+ if (!(*drv)->mps_oem_check(mpc, oem, productid))
+ continue;
+
+ if (!cmdline_apic) {
+ apic = *drv;
+ printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if (!(*drv)->acpi_madt_oem_check)
+ continue;
+ if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
+ continue;
+
+ if (!cmdline_apic) {
+ apic = *drv;
+ printk(KERN_INFO "Switched to APIC driver `%s'.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
new file mode 100644
index 00000000000..3fe98669892
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/setup.h>
+
+static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+/*
+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
+ */
+void __init default_setup_apic_routing(void)
+{
+ struct apic **drv;
+
+ enable_IR_x2apic();
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->probe && (*drv)->probe()) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Switched APIC routing to %s.\n",
+ apic->name);
+ }
+ break;
+ }
+ }
+
+ if (is_vsmp_box()) {
+ /* need to update phys_pkg_id */
+ apic->phys_pkg_id = apicid_phys_pkg_id;
+ }
+}
+
+/* Same for both flat and physical. */
+
+void apic_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
+
+int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ struct apic **drv;
+
+ for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+ if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ if (apic != *drv) {
+ apic = *drv;
+ pr_info("Setting APIC routing to %s.\n",
+ apic->name);
+ }
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
new file mode 100644
index 00000000000..19114423c58
--- /dev/null
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -0,0 +1,556 @@
+/*
+ * IBM Summit-Specific Code
+ *
+ * Written By: Matthew Dobson, IBM Corporation
+ *
+ * Copyright (c) 2003 IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/bios_ebda.h>
+
+/*
+ * APIC driver for the IBM "Summit" chipset.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/smp.h>
+#include <asm/fixmap.h>
+#include <asm/apicdef.h>
+#include <asm/ipi.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/gfp.h>
+#include <linux/smp.h>
+
+static unsigned summit_get_apic_id(unsigned long x)
+{
+ return (x >> 24) & 0xFF;
+}
+
+static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ default_send_IPI_mask_sequence_logical(mask, vector);
+}
+
+static void summit_send_IPI_allbutself(int vector)
+{
+ default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
+}
+
+static void summit_send_IPI_all(int vector)
+{
+ summit_send_IPI_mask(cpu_online_mask, vector);
+}
+
+#include <asm/tsc.h>
+
+extern int use_cyclone;
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+static void setup_summit(void);
+#else
+static inline void setup_summit(void) {}
+#endif
+
+static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
+ char *productid)
+{
+ if (!strncmp(oem, "IBM ENSW", 8) &&
+ (!strncmp(productid, "VIGIL SMP", 9)
+ || !strncmp(productid, "EXA", 3)
+ || !strncmp(productid, "RUTHLESS SMP", 12))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+/* Hook from generic ACPI tables.c */
+static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "IBM", 3) &&
+ (!strncmp(oem_table_id, "SERVIGIL", 8)
+ || !strncmp(oem_table_id, "EXA", 3))){
+ mark_tsc_unstable("Summit based system");
+ use_cyclone = 1; /*enable cyclone-timer*/
+ setup_summit();
+ return 1;
+ }
+ return 0;
+}
+
+struct rio_table_hdr {
+ unsigned char version; /* Version number of this data structure */
+ /* Version 3 adds chassis_num & WP_index */
+ unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
+ unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
+} __attribute__((packed));
+
+struct scal_detail {
+ unsigned char node_id; /* Scalability Node ID */
+ unsigned long CBAR; /* Address of 1MB register space */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port2node; /* Node ID port connected to: 0xFF = None */
+ unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
+} __attribute__((packed));
+
+struct rio_detail {
+ unsigned char node_id; /* RIO Node ID */
+ unsigned long BBAR; /* Address of 1MB register space */
+ unsigned char type; /* Type of device */
+ unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
+ /* For CYC: Node ID of Twister that owns this CYC */
+ unsigned char port0node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char port1node; /* Node ID port connected to: 0xFF=None */
+ unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
+ unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
+ /* For CYC: 0 */
+ unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
+ /* = 0 : the XAPIC is not used, ie:*/
+ /* ints fwded to another XAPIC */
+ /* Bits1:7 Reserved */
+ /* For CYC: Bits0:7 Reserved */
+ unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
+ /* lower slot numbers/PCI bus numbers */
+ /* For CYC: No meaning */
+ unsigned char chassis_num; /* 1 based Chassis number */
+ /* For LookOut WPEGs this field indicates the */
+ /* Expansion Chassis #, enumerated from Boot */
+ /* Node WPEG external port, then Boot Node CYC */
+ /* external port, then Next Vigil chassis WPEG */
+ /* external port, etc. */
+ /* Shared Lookouts have only 1 chassis number (the */
+ /* first one assigned) */
+} __attribute__((packed));
+
+
+typedef enum {
+ CompatTwister = 0, /* Compatibility Twister */
+ AltTwister = 1, /* Alternate Twister of internal 8-way */
+ CompatCyclone = 2, /* Compatibility Cyclone */
+ AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
+ CompatWPEG = 4, /* Compatibility WPEG */
+ AltWPEG = 5, /* Second Planar WPEG */
+ LookOutAWPEG = 6, /* LookOut WPEG */
+ LookOutBWPEG = 7, /* LookOut WPEG */
+} node_type;
+
+static inline int is_WPEG(struct rio_detail *rio){
+ return (rio->type == CompatWPEG || rio->type == AltWPEG ||
+ rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
+}
+
+#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
+
+static const struct cpumask *summit_target_cpus(void)
+{
+	/* CPU_MASK_ALL (0xff) has undefined behaviour with
+	 * dest_LowestPrio mode logical clustered apic interrupt routing.
+	 * Just start on cpu 0. IRQ balancing will spread load.
+	 */
+ return cpumask_of(0);
+}
+
+static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
+{
+ return 0;
+}
+
+/* we don't use the phys_cpu_present_map to indicate apicid presence */
+static unsigned long summit_check_apicid_present(int bit)
+{
+ return 1;
+}
+
+static int summit_early_logical_apicid(int cpu)
+{
+ int count = 0;
+ u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
+ u8 my_cluster = APIC_CLUSTER(my_id);
+#ifdef CONFIG_SMP
+ u8 lid;
+ int i;
+
+ /* Create logical APIC IDs by counting CPUs already in cluster. */
+ for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
+ lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
+ if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
+ ++count;
+ }
+#endif
+	/* We only have a 4-bit-wide bitmap in cluster mode.  If a deranged
+	 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
+ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
+ return my_cluster | (1UL << count);
+}
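+
+/*
+ * Editorial sketch (not part of this patch): in xAPIC cluster mode the
+ * logical ID built above keeps the cluster nibble from the physical ID
+ * and adds a one-hot bit per CPU already counted in that cluster.  A
+ * hypothetical worked example of the encoding:
+ */
+#if 0	/* illustrative only, never compiled */
+static unsigned long summit_lid_demo(unsigned long cluster, int count)
+{
+	/* cluster 0x30, count 2 -> 0x34: third CPU seen in cluster 3 */
+	return cluster | (1UL << count);	/* count must stay below 4 */
+}
+#endif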
+
+static void summit_init_apic_ldr(void)
+{
+ int cpu = smp_processor_id();
+ unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+ unsigned long val;
+
+ apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write(APIC_LDR, val);
+}
+
+static int summit_apic_id_registered(void)
+{
+ return 1;
+}
+
+static void summit_setup_apic_routing(void)
+{
+	printk(KERN_INFO
+		"Enabling APIC mode: Summit. Using %d I/O APICs\n", nr_ioapics);
+}
+
+static int summit_cpu_present_to_apicid(int mps_cpu)
+{
+ if (mps_cpu < nr_cpu_ids)
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
+ else
+ return BAD_APICID;
+}
+
+static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
+{
+ /* For clustered we don't have a good way to do this yet - hack */
+ physids_promote(0x0FL, retmap);
+}
+
+static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
+{
+ physid_set_mask_of_physid(0, retmap);
+}
+
+static int summit_check_phys_apicid_present(int physical_apicid)
+{
+ return 1;
+}
+
+static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ unsigned int round = 0;
+ int cpu, apicid = 0;
+
+	/*
+	 * The cpus in the mask must all be in the same APIC cluster.
+	 */
+ for_each_cpu(cpu, cpumask) {
+ int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+
+ if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
+ printk("%s: Not a valid mask!\n", __func__);
+ return BAD_APICID;
+ }
+ apicid |= new_apicid;
+ round++;
+ }
+ return apicid;
+}
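+
+/*
+ * Editorial sketch (not part of this patch): logical IDs within one
+ * cluster share the cluster nibble and own distinct low bits, so ORing
+ * them forms one destination covering the whole sub-mask; IDs from
+ * different clusters would corrupt the cluster field, hence the
+ * BAD_APICID check above.  Hypothetical example:
+ */
+#if 0	/* illustrative only, never compiled */
+static unsigned int summit_or_demo(void)
+{
+	unsigned int a = 0x31, b = 0x32;	/* two CPUs of cluster 3 */
+
+	return a | b;	/* 0x33 addresses both in one IPI */
+}
+#endif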
+
+static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
+ const struct cpumask *andmask)
+{
+ int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
+ cpumask_var_t cpumask;
+
+ if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+ return apicid;
+
+ cpumask_and(cpumask, inmask, andmask);
+ cpumask_and(cpumask, cpumask, cpu_online_mask);
+ apicid = summit_cpu_mask_to_apicid(cpumask);
+
+ free_cpumask_var(cpumask);
+
+ return apicid;
+}
+
+/*
+ * cpuid returns the value latched in the HW at reset, not the APIC ID
+ * register's value. For any box whose BIOS changes APIC IDs, like
+ * clustered APIC systems, we must use hard_smp_processor_id.
+ *
+ * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
+ */
+static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+static int probe_summit(void)
+{
+ /* probed later in mptable/ACPI hooks */
+ return 0;
+}
+
+static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ /* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+	 * hyperthread was specified in the interrupt destination.
+ */
+ cpumask_clear(retmask);
+ cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
+}
+
+#ifdef CONFIG_X86_SUMMIT_NUMA
+static struct rio_table_hdr *rio_table_hdr;
+static struct scal_detail *scal_devs[MAX_NUMNODES];
+static struct rio_detail *rio_devs[MAX_NUMNODES*4];
+
+#ifndef CONFIG_X86_NUMAQ
+static int mp_bus_id_to_node[MAX_MP_BUSSES];
+#endif
+
+static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
+{
+ int twister = 0, node = 0;
+ int i, bus, num_buses;
+
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
+ if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
+ twister = rio_devs[i]->owner_id;
+ break;
+ }
+ }
+ if (i == rio_table_hdr->num_rio_dev) {
+ printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
+ return last_bus;
+ }
+
+ for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
+ if (scal_devs[i]->node_id == twister) {
+ node = scal_devs[i]->node_id;
+ break;
+ }
+ }
+ if (i == rio_table_hdr->num_scal_dev) {
+ printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
+ return last_bus;
+ }
+
+ switch (rio_devs[wpeg_num]->type) {
+ case CompatWPEG:
+ /*
+ * The Compatibility Winnipeg controls the 2 legacy buses,
+ * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
+ * a PCI-PCI bridge card is used in either slot: total 5 buses.
+ */
+ num_buses = 5;
+ break;
+ case AltWPEG:
+ /*
+ * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
+ * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
+ * the "extra" buses for each of those slots: total 7 buses.
+ */
+ num_buses = 7;
+ break;
+ case LookOutAWPEG:
+ case LookOutBWPEG:
+ /*
+ * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
+ * & the "extra" buses for each of those slots: total 9 buses.
+ */
+ num_buses = 9;
+ break;
+ default:
+ printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
+ return last_bus;
+ }
+
+ for (bus = last_bus; bus < last_bus + num_buses; bus++)
+ mp_bus_id_to_node[bus] = node;
+ return bus;
+}
+
+static int build_detail_arrays(void)
+{
+ unsigned long ptr;
+ int i, scal_detail_size, rio_detail_size;
+
+ if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
+ printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+ return 0;
+ }
+
+ switch (rio_table_hdr->version) {
+ default:
+ printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
+ return 0;
+ case 2:
+ scal_detail_size = 11;
+ rio_detail_size = 13;
+ break;
+ case 3:
+ scal_detail_size = 12;
+ rio_detail_size = 15;
+ break;
+ }
+
+ ptr = (unsigned long)rio_table_hdr + 3;
+ for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
+ scal_devs[i] = (struct scal_detail *)ptr;
+
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
+ rio_devs[i] = (struct rio_detail *)ptr;
+
+ return 1;
+}
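+
+/*
+ * Editorial sketch (not part of this patch): the detail records sit
+ * back to back with a version-dependent stride, so the walk above is
+ * plain byte arithmetic.  A hypothetical restatement (use_record() is
+ * an invented consumer):
+ */
+#if 0	/* illustrative only, never compiled */
+static void walk_packed_demo(unsigned long base, int nr, int stride)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		use_record((void *)(base + i * stride));
+}
+#endif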
+
+void setup_summit(void)
+{
+ unsigned long ptr;
+ unsigned short offset;
+ int i, next_wpeg, next_bus = 0;
+
+ /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
+ ptr = get_bios_ebda();
+ ptr = (unsigned long)phys_to_virt(ptr);
+
+ rio_table_hdr = NULL;
+ offset = 0x180;
+ while (offset) {
+ /* The block id is stored in the 2nd word */
+ if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
+ /* set the pointer past the offset & block id */
+ rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
+ break;
+ }
+ /* The next offset is stored in the 1st word. 0 means no more */
+ offset = *((unsigned short *)(ptr + offset));
+ }
+ if (!rio_table_hdr) {
+ printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
+ return;
+ }
+
+ if (!build_detail_arrays())
+ return;
+
+ /* The first Winnipeg we're looking for has an index of 0 */
+ next_wpeg = 0;
+ do {
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
+ if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
+ /* It's the Winnipeg we're looking for! */
+ next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
+ next_wpeg++;
+ break;
+ }
+ }
+ /*
+ * If we go through all Rio devices and don't find one with
+ * the next index, it means we've found all the Winnipegs,
+ * and thus all the PCI buses.
+ */
+ if (i == rio_table_hdr->num_rio_dev)
+ next_wpeg = 0;
+ } while (next_wpeg != 0);
+}
+#endif
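+
+/*
+ * Editorial sketch (not part of this patch): the EBDA scan above walks
+ * a chain of blocks whose first word is the offset of the next block
+ * and whose second word is a block id (0x4752 is little-endian ASCII
+ * "RG", for Rio Grande).  A hypothetical restatement of that loop:
+ */
+#if 0	/* illustrative only, never compiled */
+static unsigned long find_block_demo(unsigned long ebda, unsigned short id)
+{
+	unsigned short off = 0x180;	/* first block, as above */
+
+	while (off) {
+		if (*(unsigned short *)(ebda + off + 2) == id)
+			return ebda + off + 4;	/* data past offset + id */
+		off = *(unsigned short *)(ebda + off);	/* 0 terminates */
+	}
+	return 0;
+}
+#endif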
+
+static struct apic apic_summit = {
+
+ .name = "summit",
+ .probe = probe_summit,
+ .acpi_madt_oem_check = summit_acpi_madt_oem_check,
+ .apic_id_registered = summit_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ /* logical delivery broadcast to all CPUs: */
+ .irq_dest_mode = 1,
+
+ .target_cpus = summit_target_cpus,
+ .disable_esr = 1,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = summit_check_apicid_used,
+ .check_apicid_present = summit_check_apicid_present,
+
+ .vector_allocation_domain = summit_vector_allocation_domain,
+ .init_apic_ldr = summit_init_apic_ldr,
+
+ .ioapic_phys_id_map = summit_ioapic_phys_id_map,
+ .setup_apic_routing = summit_setup_apic_routing,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = summit_cpu_present_to_apicid,
+ .apicid_to_cpu_present = summit_apicid_to_cpu_present,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = summit_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = summit_phys_pkg_id,
+ .mps_oem_check = summit_mps_oem_check,
+
+ .get_apic_id = summit_get_apic_id,
+ .set_apic_id = NULL,
+ .apic_id_mask = 0xFF << 24,
+
+ .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = summit_send_IPI_mask,
+ .send_IPI_mask_allbutself = NULL,
+ .send_IPI_allbutself = summit_send_IPI_allbutself,
+ .send_IPI_all = summit_send_IPI_all,
+ .send_IPI_self = default_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+
+ .wait_for_init_deassert = default_wait_for_init_deassert,
+
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = default_inquire_remote_apic,
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+
+ .x86_32_early_logical_apicid = summit_early_logical_apicid,
+};
+
+apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
new file mode 100644
index 00000000000..50079587582
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -0,0 +1,268 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+#include <linux/cpu.h>
+
+#include <asm/smp.h>
+#include <asm/x2apic.h>
+
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled();
+}
+
+static inline u32 x2apic_cluster(int cpu)
+{
+ return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
+}
+
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
+{
+ struct cpumask *cpus_in_cluster_ptr;
+ struct cpumask *ipi_mask_ptr;
+ unsigned int cpu, this_cpu;
+ unsigned long flags;
+ u32 dest;
+
+ x2apic_wrmsr_fence();
+
+ local_irq_save(flags);
+
+ this_cpu = smp_processor_id();
+
+	/*
+	 * We are going to modify the mask, so we need our own copy and
+	 * must be sure it is manipulated with irqs off.
+	 */
+ ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+ cpumask_copy(ipi_mask_ptr, mask);
+
+ /*
+ * The idea is to send one IPI per cluster.
+ */
+ for_each_cpu(cpu, ipi_mask_ptr) {
+ unsigned long i;
+
+ cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+ dest = 0;
+
+ /* Collect cpus in cluster. */
+ for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+ if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
+ dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+ }
+
+ if (!dest)
+ continue;
+
+ __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * Cluster sibling cpus should be discarded now so that
+		 * we do not send an IPI to them a second time.
+		 */
+ cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
+ }
+
+ local_irq_restore(flags);
+}
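+
+/*
+ * Editorial sketch (not part of this patch): the loop above sends one
+ * IPI per cluster by ORing the logical IDs of the targeted cluster
+ * members and then stripping that whole cluster from the work mask.
+ * A hypothetical simulation counting IPIs over a 32-bit cpu mask
+ * (cluster_of[] is an invented lookup table):
+ */
+#if 0	/* illustrative only, never compiled */
+static int cluster_ipi_count_demo(unsigned int mask,
+				  const unsigned int *cluster_of)
+{
+	int sent = 0, cpu, i;
+
+	while (mask) {
+		unsigned int members = 0;
+
+		cpu = __builtin_ctz(mask);	/* lowest pending cpu */
+		for (i = 0; i < 32; i++)
+			if (((mask >> i) & 1) && cluster_of[i] == cluster_of[cpu])
+				members |= 1u << i;
+		sent++;				/* one IPI covers 'members' */
+		mask &= ~members;
+	}
+	return sent;
+}
+#endif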
+
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ /*
+ * We're using fixed IRQ delivery, can only return one logical APIC ID.
+ * May as well be the first.
+ */
+ int cpu = cpumask_first(cpumask);
+
+ if ((unsigned)cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+ else
+ return BAD_APICID;
+}
+
+static unsigned int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one logical APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+
+ return per_cpu(x86_cpu_to_logical_apicid, cpu);
+}
+
+static void init_x2apic_ldr(void)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+ __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+ for_each_online_cpu(cpu) {
+ if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+ continue;
+ __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ }
+}
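+
+/*
+ * Editorial sketch (not part of this patch): an x2apic LDR packs the
+ * cluster number in bits 31:16 and a one-of-16 logical ID in bits
+ * 15:0, which is what x2apic_cluster() above relies on.  Hypothetical
+ * helpers showing the split:
+ */
+#if 0	/* illustrative only, never compiled */
+static unsigned int ldr_cluster_demo(unsigned int ldr) { return ldr >> 16; }
+static unsigned int ldr_id_demo(unsigned int ldr) { return ldr & 0xffff; }
+/* e.g. ldr 0x00020001 -> cluster 2, logical id bit 0 */
+#endif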
+
+/*
+ * At CPU state changes, update the x2apic cluster sibling info.
+ */
+static int __cpuinit
+update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ unsigned int this_cpu = (unsigned long)hcpu;
+ unsigned int cpu;
+ int err = 0;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
+ GFP_KERNEL)) {
+ err = -ENOMEM;
+ } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+ GFP_KERNEL)) {
+ free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+ err = -ENOMEM;
+ }
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+ case CPU_DEAD:
+ for_each_online_cpu(cpu) {
+ if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+ continue;
+ __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ }
+ free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+ free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+ break;
+ }
+
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block __refdata x2apic_cpu_notifier = {
+ .notifier_call = update_clusterinfo,
+};
+
+static int x2apic_init_cpu_notifier(void)
+{
+ int cpu = smp_processor_id();
+
+ zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+ zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+
+ BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+
+ __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+ register_hotcpu_notifier(&x2apic_cpu_notifier);
+ return 1;
+}
+
+static int x2apic_cluster_probe(void)
+{
+ if (x2apic_mode)
+ return x2apic_init_cpu_notifier();
+ else
+ return 0;
+}
+
+static struct apic apic_x2apic_cluster = {
+
+ .name = "cluster x2apic",
+ .probe = x2apic_cluster_probe,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_LowestPrio,
+ .irq_dest_mode = 1, /* logical */
+
+ .target_cpus = x2apic_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
new file mode 100644
index 00000000000..f5373dfde21
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -0,0 +1,174 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/x2apic.h>
+
+int x2apic_phys;
+
+static struct apic apic_x2apic_phys;
+
+static int set_x2apic_phys_mode(char *arg)
+{
+ x2apic_phys = 1;
+ return 0;
+}
+early_param("x2apic_phys", set_x2apic_phys_mode);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (x2apic_phys)
+ return x2apic_enabled();
+ else
+ return 0;
+}
+
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
+{
+ unsigned long query_cpu;
+ unsigned long this_cpu;
+ unsigned long flags;
+
+ x2apic_wrmsr_fence();
+
+ local_irq_save(flags);
+
+ this_cpu = smp_processor_id();
+ for_each_cpu(query_cpu, mask) {
+ if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
+ continue;
+ __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+ vector, APIC_DEST_PHYSICAL);
+ }
+ local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+ __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ int cpu = cpumask_first(cpumask);
+
+ if ((unsigned)cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ else
+ return BAD_APICID;
+}
+
+static unsigned int
+x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+static void init_x2apic_ldr(void)
+{
+}
+
+static int x2apic_phys_probe(void)
+{
+ if (x2apic_mode && x2apic_phys)
+ return 1;
+
+ return apic == &apic_x2apic_phys;
+}
+
+static struct apic apic_x2apic_phys = {
+
+ .name = "physical x2apic",
+ .probe = x2apic_phys_probe,
+ .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+ .apic_id_registered = x2apic_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = x2apic_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = x2apic_vector_allocation_domain,
+ .init_apic_ldr = init_x2apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = x2apic_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = x2apic_set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = x2apic_send_IPI_mask,
+ .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = x2apic_send_IPI_allbutself,
+ .send_IPI_all = x2apic_send_IPI_all,
+ .send_IPI_self = x2apic_send_IPI_self,
+
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
new file mode 100644
index 00000000000..79b05b88aa1
--- /dev/null
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -0,0 +1,889 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV APIC functions (note: not an Intel compatible APIC)
+ *
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
+ */
+#include <linux/cpumask.h>
+#include <linux/hardirq.h>
+#include <linux/proc_fs.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/kdebug.h>
+#include <linux/delay.h>
+#include <linux/crash_dump.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/current.h>
+#include <asm/pgtable.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/smp.h>
+#include <asm/x86_init.h>
+#include <asm/emergency-restart.h>
+#include <asm/nmi.h>
+
+/* BMC sets a bit in this MMR non-zero before sending an NMI */
+#define UVH_NMI_MMR UVH_SCRATCH5
+#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
+#define UV_NMI_PENDING_MASK (1UL << 63)
+DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
+
+DEFINE_PER_CPU(int, x2apic_extra_bits);
+
+#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
+
+static enum uv_system_type uv_system_type;
+static u64 gru_start_paddr, gru_end_paddr;
+static union uvh_apicid uvh_apicid;
+int uv_min_hub_revision_id;
+EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+unsigned int uv_apicid_hibits;
+EXPORT_SYMBOL_GPL(uv_apicid_hibits);
+static DEFINE_SPINLOCK(uv_nmi_lock);
+
+static struct apic apic_x2apic_uv_x;
+
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+ unsigned long val, *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+ val = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+ return val;
+}
+
+static inline bool is_GRU_range(u64 start, u64 end)
+{
+ return start >= gru_start_paddr && end <= gru_end_paddr;
+}
+
+static bool uv_is_untracked_pat_range(u64 start, u64 end)
+{
+ return is_ISA_range(start, end) || is_GRU_range(start, end);
+}
+
+static int __init early_get_pnodeid(void)
+{
+ union uvh_node_id_u node_id;
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ int pnode;
+
+	/* Currently, all blades have the same revision number */
+ node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
+ uv_min_hub_revision_id = node_id.s.revision;
+
+ if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
+ uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+ if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X)
+ uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+
+ uv_hub_info->hub_revision = uv_min_hub_revision_id;
+ pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
+ return pnode;
+}
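+
+/*
+ * Editorial sketch (not part of this patch): the pnode is the node id
+ * with its low bit dropped and the result masked to n_skt bits.  A
+ * hypothetical worked example of the extraction above:
+ */
+#if 0	/* illustrative only, never compiled */
+static int pnode_demo(int node_id, int n_skt)
+{
+	/* node_id 0x22, n_skt 4: (0x22 >> 1) & 0xf == 0x1 */
+	return (node_id >> 1) & ((1 << n_skt) - 1);
+}
+#endif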
+
+static void __init early_get_apic_pnode_shift(void)
+{
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
+ if (!uvh_apicid.v)
+ /*
+ * Old bios, use default value
+ */
+ uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
+}
+
+/*
+ * Add an extra bit as dictated by bios to the destination apicid of
+ * interrupts potentially passing through the UV HUB. This prevents
+ * a deadlock between interrupts and IO port operations.
+ */
+static void __init uv_set_apicid_hibit(void)
+{
+ union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
+
+ if (is_uv1_hub()) {
+ apicid_mask.v =
+ uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+ uv_apicid_hibits =
+ apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+ }
+}
+
+static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ int pnodeid, is_uv1, is_uv2;
+
+ is_uv1 = !strcmp(oem_id, "SGI");
+ is_uv2 = !strcmp(oem_id, "SGI2");
+ if (is_uv1 || is_uv2) {
+ uv_hub_info->hub_revision =
+ is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
+ pnodeid = early_get_pnodeid();
+ early_get_apic_pnode_shift();
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
+ if (!strcmp(oem_table_id, "UVL"))
+ uv_system_type = UV_LEGACY_APIC;
+ else if (!strcmp(oem_table_id, "UVX"))
+ uv_system_type = UV_X2APIC;
+ else if (!strcmp(oem_table_id, "UVH")) {
+ __this_cpu_write(x2apic_extra_bits,
+ pnodeid << uvh_apicid.s.pnode_shift);
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ uv_set_apicid_hibit();
+ return 1;
+ }
+ }
+ return 0;
+}
+
+enum uv_system_type get_uv_system_type(void)
+{
+ return uv_system_type;
+}
+
+int is_uv_system(void)
+{
+ return uv_system_type != UV_NONE;
+}
+EXPORT_SYMBOL_GPL(is_uv_system);
+
+DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
+EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
+
+struct uv_blade_info *uv_blade_info;
+EXPORT_SYMBOL_GPL(uv_blade_info);
+
+short *uv_node_to_blade;
+EXPORT_SYMBOL_GPL(uv_node_to_blade);
+
+short *uv_cpu_to_blade;
+EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
+
+short uv_possible_blades;
+EXPORT_SYMBOL_GPL(uv_possible_blades);
+
+unsigned long sn_rtc_cycles_per_second;
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
+static const struct cpumask *uv_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+#ifdef CONFIG_SMP
+ unsigned long val;
+ int pnode;
+
+ pnode = uv_apicid_to_pnode(phys_apicid);
+ phys_apicid |= uv_apicid_hibits;
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ APIC_DM_INIT;
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ APIC_DM_STARTUP;
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+
+ atomic_set(&init_deasserted, 1);
+#endif
+ return 0;
+}
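+
+/*
+ * Editorial sketch (not part of this patch): uv_wakeup_secondary()
+ * hand-builds the INIT and STARTUP messages as a single MMR word: a
+ * send bit, the destination APIC ID, the SIPI vector derived from the
+ * start address, and the delivery mode.  A hypothetical restatement of
+ * the packing:
+ */
+#if 0	/* illustrative only, never compiled */
+static unsigned long uv_ipi_word_demo(unsigned long apicid,
+				      unsigned long start_rip,
+				      unsigned long dmode)
+{
+	return (1UL << UVH_IPI_INT_SEND_SHFT) |
+		(apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+		((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+		dmode;	/* APIC_DM_INIT or APIC_DM_STARTUP */
+}
+#endif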
+
+static void uv_send_IPI_one(int cpu, int vector)
+{
+ unsigned long apicid;
+ int pnode;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ pnode = uv_apicid_to_pnode(apicid);
+ uv_hub_send_ipi(pnode, apicid, vector);
+}
+
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ uv_send_IPI_one(cpu, vector);
+}
+
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ uv_send_IPI_one(cpu, vector);
+ }
+}
+
+static void uv_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ uv_send_IPI_one(cpu, vector);
+ }
+}
+
+static void uv_send_IPI_all(int vector)
+{
+ uv_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static int uv_apic_id_registered(void)
+{
+ return 1;
+}
+
+static void uv_init_apic_ldr(void)
+{
+}
+
+static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ int cpu = cpumask_first(cpumask);
+
+ if ((unsigned)cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
+ else
+ return BAD_APICID;
+}
+
+static unsigned int
+uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
+}
+
+static unsigned int x2apic_get_apic_id(unsigned long x)
+{
+ unsigned int id;
+
+ WARN_ON(preemptible() && num_online_cpus() > 1);
+ id = x | __this_cpu_read(x2apic_extra_bits);
+
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+	/* mask out x2apic_extra_bits? */
+ x = id;
+ return x;
+}
+
+static unsigned int uv_read_apic_id(void)
+{
+	return x2apic_get_apic_id(apic_read(APIC_ID));
+}
+
+static int uv_phys_pkg_id(int initial_apicid, int index_msb)
+{
+ return uv_read_apic_id() >> index_msb;
+}
+
+static void uv_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+
+static int uv_probe(void)
+{
+ return apic == &apic_x2apic_uv_x;
+}
+
+static struct apic __refdata apic_x2apic_uv_x = {
+
+ .name = "UV large system",
+ .probe = uv_probe,
+ .acpi_madt_oem_check = uv_acpi_madt_oem_check,
+ .apic_id_registered = uv_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = uv_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = APIC_DEST_LOGICAL,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = uv_vector_allocation_domain,
+ .init_apic_ldr = uv_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = uv_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = x2apic_get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xFFFFFFFFu,
+
+ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = uv_send_IPI_mask,
+ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = uv_send_IPI_allbutself,
+ .send_IPI_all = uv_send_IPI_all,
+ .send_IPI_self = uv_send_IPI_self,
+
+ .wakeup_secondary_cpu = uv_wakeup_secondary,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL,
+
+ .read = native_apic_msr_read,
+ .write = native_apic_msr_write,
+ .icr_read = native_x2apic_icr_read,
+ .icr_write = native_x2apic_icr_write,
+ .wait_icr_idle = native_x2apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
+};
+
+static __cpuinit void set_x2apic_extra_bits(int pnode)
+{
+ __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
+}
+
+/*
+ * Called on boot cpu.
+ */
+static __init int boot_pnode_to_blade(int pnode)
+{
+ int blade;
+
+ for (blade = 0; blade < uv_num_possible_blades(); blade++)
+ if (pnode == uv_blade_info[blade].pnode)
+ return blade;
+ BUG();
+}
+
+struct redir_addr {
+ unsigned long redirect;
+ unsigned long alias;
+};
+
+#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+
+static __initdata struct redir_addr redir_addrs[] = {
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
+};
+
+static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
+{
+ union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
+ union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
+ alias.v = uv_read_local_mmr(redir_addrs[i].alias);
+ if (alias.s.enable && alias.s.base == 0) {
+ *size = (1UL << alias.s.m_alias);
+ redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
+ *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
+ return;
+ }
+ }
+ *base = *size = 0;
+}
+
+enum map_type {map_wb, map_uc};
+
+static __init void map_high(char *id, unsigned long base, int pshift,
+ int bshift, int max_pnode, enum map_type map_type)
+{
+ unsigned long bytes, paddr;
+
+ paddr = base << pshift;
+ bytes = (1UL << bshift) * (max_pnode + 1);
+ printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
+ paddr + bytes);
+ if (map_type == map_uc)
+ init_extra_mapping_uc(paddr, bytes);
+ else
+ init_extra_mapping_wb(paddr, bytes);
+}
+
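+/*
+ * Editorial sketch (not part of this patch): each mapping covers one
+ * (1 << bshift)-sized window per pnode, e.g. bshift 28 and max_pnode 3
+ * yield 4 * 256MB = 1GB starting at base << pshift.  Hypothetical
+ * restatement of the size computation:
+ */
+#if 0	/* illustrative only, never compiled */
+static unsigned long map_bytes_demo(int bshift, int max_pnode)
+{
+	return (1UL << bshift) * (max_pnode + 1);
+}
+#endif
+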
+static __init void map_gru_high(int max_pnode)
+{
+ union uvh_rh_gam_gru_overlay_config_mmr_u gru;
+ int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
+ if (gru.s.enable) {
+ map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
+ gru_start_paddr = ((u64)gru.s.base << shift);
+ gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
+	}
+}
+
+static __init void map_mmr_high(int max_pnode)
+{
+ union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
+ int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
+ if (mmr.s.enable)
+ map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
+}
+
+static __init void map_mmioh_high(int max_pnode)
+{
+ union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
+ int shift;
+
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+ if (is_uv1_hub() && mmioh.s1.enable) {
+ shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
+ max_pnode, map_uc);
+ }
+ if (is_uv2_hub() && mmioh.s2.enable) {
+ shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
+ max_pnode, map_uc);
+ }
+}
+
+static __init void map_low_mmrs(void)
+{
+ init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
+ init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
+}
+
+static __init void uv_rtc_init(void)
+{
+ long status;
+ u64 ticks_per_sec;
+
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
+ &ticks_per_sec);
+ if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
+ printk(KERN_WARNING
+ "unable to determine platform RTC clock frequency, "
+ "guessing.\n");
+		/* BIOS gives a wrong value for the clock frequency, so guess */
+		sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+	} else {
+		sn_rtc_cycles_per_second = ticks_per_sec;
+	}
+}
+
+/*
+ * percpu heartbeat timer
+ */
+static void uv_heartbeat(unsigned long ignored)
+{
+ struct timer_list *timer = &uv_hub_info->scir.timer;
+ unsigned char bits = uv_hub_info->scir.state;
+
+ /* flip heartbeat bit */
+ bits ^= SCIR_CPU_HEARTBEAT;
+
+ /* is this cpu idle? */
+ if (idle_cpu(raw_smp_processor_id()))
+ bits &= ~SCIR_CPU_ACTIVITY;
+ else
+ bits |= SCIR_CPU_ACTIVITY;
+
+ /* update system controller interface reg */
+ uv_set_scir_bits(bits);
+
+ /* enable next timer period */
+ mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+}
+
+static void __cpuinit uv_heartbeat_enable(int cpu)
+{
+ while (!uv_cpu_hub_info(cpu)->scir.enabled) {
+ struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+
+ uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+ setup_timer(timer, uv_heartbeat, cpu);
+ timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+ add_timer_on(timer, cpu);
+ uv_cpu_hub_info(cpu)->scir.enabled = 1;
+
+ /* also ensure that boot cpu is enabled */
+ cpu = 0;
+ }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __cpuinit uv_heartbeat_disable(int cpu)
+{
+ if (uv_cpu_hub_info(cpu)->scir.enabled) {
+ uv_cpu_hub_info(cpu)->scir.enabled = 0;
+ del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+ }
+ uv_set_cpu_scir_bits(cpu, 0xff);
+}
+
+/*
+ * cpu hotplug notifier
+ */
+static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ uv_heartbeat_enable(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ uv_heartbeat_disable(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+ hotcpu_notifier(uv_scir_cpu_notify, 0);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+}
+
+static __init int uv_init_heartbeat(void)
+{
+ int cpu;
+
+ if (is_uv_system())
+ for_each_online_cpu(cpu)
+ uv_heartbeat_enable(cpu);
+ return 0;
+}
+
+late_initcall(uv_init_heartbeat);
+
+#endif /* !CONFIG_HOTPLUG_CPU */
+
+/* Direct Legacy VGA I/O traffic to designated IOH */
+int uv_set_vga_state(struct pci_dev *pdev, bool decode,
+ unsigned int command_bits, u32 flags)
+{
+ int domain, bus, rc;
+
+ PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
+ pdev->devfn, decode, command_bits, flags);
+
+ if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
+ return 0;
+
+ if ((command_bits & PCI_COMMAND_IO) == 0)
+ return 0;
+
+ domain = pci_domain_nr(pdev->bus);
+ bus = pdev->bus->number;
+
+ rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
+ PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
+
+ return rc;
+}
+
+/*
+ * Called on each cpu to initialize the per_cpu UV data area.
+ * FIXME: hotplug not supported yet
+ */
+void __cpuinit uv_cpu_init(void)
+{
+	/* CPU 0 initialization will be done via uv_system_init. */
+ if (!uv_blade_info)
+ return;
+
+ uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+
+ if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+ set_x2apic_extra_bits(uv_hub_info->pnode);
+}
+
+/*
+ * When NMI is received, print a stack trace.
+ */
+int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
+{
+ unsigned long real_uv_nmi;
+ int bid;
+
+ /*
+ * Each blade has an MMR that indicates when an NMI has been sent
+ * to cpus on the blade. If an NMI is detected, atomically
+ * clear the MMR and update a per-blade NMI count used to
+ * cause each cpu on the blade to notice a new NMI.
+ */
+ bid = uv_numa_blade_id();
+ real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+
+ if (unlikely(real_uv_nmi)) {
+ spin_lock(&uv_blade_info[bid].nmi_lock);
+ real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+ if (real_uv_nmi) {
+ uv_blade_info[bid].nmi_count++;
+ uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
+ }
+ spin_unlock(&uv_blade_info[bid].nmi_lock);
+ }
+
+ if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
+ return NMI_DONE;
+
+ __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
+
+ /*
+ * Use a lock so only one cpu prints at a time.
+ * This prevents intermixed output.
+ */
+ spin_lock(&uv_nmi_lock);
+ pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
+ dump_stack();
+ spin_unlock(&uv_nmi_lock);
+
+ return NMI_HANDLED;
+}
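+
+/*
+ * Editorial sketch (not part of this patch): the handler above uses a
+ * read / lock / re-read idiom so that, of all the cpus on a blade that
+ * race into the NMI, exactly one clears the pending bit and bumps the
+ * count.  Shape of the idiom, with invented helpers mmr_pending() and
+ * mmr_clear_pending():
+ */
+#if 0	/* illustrative only, never compiled */
+static void double_check_demo(spinlock_t *lock, int *count)
+{
+	if (mmr_pending()) {			/* cheap unlocked test */
+		spin_lock(lock);
+		if (mmr_pending()) {		/* re-test under the lock */
+			(*count)++;		/* single winner updates */
+			mmr_clear_pending();
+		}
+		spin_unlock(lock);
+	}
+}
+#endif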
+
+void uv_register_nmi_notifier(void)
+{
+ if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
+ printk(KERN_WARNING "UV NMI handler failed to register\n");
+}
+
+void uv_nmi_init(void)
+{
+ unsigned int value;
+
+ /*
+ * Unmask NMI on all cpus
+ */
+ value = apic_read(APIC_LVT1) | APIC_DM_NMI;
+ value &= ~APIC_LVT_MASKED;
+ apic_write(APIC_LVT1, value);
+}
+
+void __init uv_system_init(void)
+{
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
+ union uvh_node_id_u node_id;
+ unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
+ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
+ int gnode_extra, max_pnode = 0;
+ unsigned long mmr_base, present, paddr;
+ unsigned short pnode_mask, pnode_io_mask;
+
+ printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
+ map_low_mmrs();
+
+	m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
+ m_val = m_n_config.s.m_skt;
+ n_val = m_n_config.s.n_skt;
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+ n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
+ mmr_base =
+ uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
+ ~UV_MMR_ENABLE;
+ pnode_mask = (1 << n_val) - 1;
+ pnode_io_mask = (1 << n_io) - 1;
+
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+ gnode_upper = ((unsigned long)gnode_extra << m_val);
+ printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
+ n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
+
+ printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
+
+	for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
+		uv_possible_blades +=
+			hweight64(uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8));
+
+ /* uv_num_possible_blades() is really the hub count */
+ printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
+ is_uv1_hub() ? uv_num_possible_blades() :
+ (uv_num_possible_blades() + 1) / 2,
+ uv_num_possible_blades());
+
+ bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
+ uv_blade_info = kzalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_blade_info);
+
+ for (blade = 0; blade < uv_num_possible_blades(); blade++)
+ uv_blade_info[blade].memory_nid = -1;
+
+ get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
+
+ bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
+ uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_node_to_blade);
+ memset(uv_node_to_blade, 255, bytes);
+
+ bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
+ uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_cpu_to_blade);
+ memset(uv_cpu_to_blade, 255, bytes);
+
+ blade = 0;
+ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+ present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+ for (j = 0; j < 64; j++) {
+ if (!test_bit(j, &present))
+ continue;
+ pnode = (i * 64 + j) & pnode_mask;
+ uv_blade_info[blade].pnode = pnode;
+ uv_blade_info[blade].nr_possible_cpus = 0;
+ uv_blade_info[blade].nr_online_cpus = 0;
+ spin_lock_init(&uv_blade_info[blade].nmi_lock);
+ max_pnode = max(pnode, max_pnode);
+ blade++;
+ }
+ }
+
+ uv_bios_init();
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
+ &sn_region_size, &system_serial_number);
+ uv_rtc_init();
+
+ for_each_present_cpu(cpu) {
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
+ nid = cpu_to_node(cpu);
+ /*
+		 * apic_pnode_shift must be set before calling uv_apicid_to_pnode().
+ */
+ uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
+ uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
+ uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
+
+ uv_cpu_hub_info(cpu)->m_shift = 64 - m_val;
+ uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ?
+ (m_val == 40 ? 40 : 39) : m_val;
+
+ pnode = uv_apicid_to_pnode(apicid);
+ blade = boot_pnode_to_blade(pnode);
+ lcpu = uv_blade_info[blade].nr_possible_cpus;
+ uv_blade_info[blade].nr_possible_cpus++;
+
+		/* Record any node on the blade; blades with no cpus keep -1. */
+ uv_blade_info[blade].memory_nid = nid;
+
+ uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
+ uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
+ uv_cpu_hub_info(cpu)->m_val = m_val;
+ uv_cpu_hub_info(cpu)->n_val = n_val;
+ uv_cpu_hub_info(cpu)->numa_blade_id = blade;
+ uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
+ uv_cpu_hub_info(cpu)->pnode = pnode;
+ uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
+ uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+ uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
+ uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
+ uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
+ uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
+ uv_node_to_blade[nid] = blade;
+ uv_cpu_to_blade[cpu] = blade;
+ }
+
+ /* Add blade/pnode info for nodes without cpus */
+ for_each_online_node(nid) {
+ if (uv_node_to_blade[nid] >= 0)
+ continue;
+ paddr = node_start_pfn(nid) << PAGE_SHIFT;
+ pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr));
+ blade = boot_pnode_to_blade(pnode);
+ uv_node_to_blade[nid] = blade;
+ }
+
+ map_gru_high(max_pnode);
+ map_mmr_high(max_pnode);
+ map_mmioh_high(max_pnode & pnode_io_mask);
+
+ uv_cpu_init();
+ uv_scir_register_cpu_notifier();
+ uv_register_nmi_notifier();
+ proc_mkdir("sgi_uv", NULL);
+
+ /* register Legacy VGA I/O redirection handler */
+ pci_register_set_vga_state(uv_set_vga_state);
+
+ /*
+ * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
+ * EFI is not enabled in the kdump kernel.
+ */
+ if (is_kdump_kernel())
+ reboot_type = BOOT_ACPI;
+}
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
new file mode 100644
index 00000000000..219cf2f2078
--- /dev/null
+++ b/arch/x86/kernel/apm_32.c
@@ -0,0 +1,2453 @@
+/* -*- linux-c -*-
+ * APM BIOS driver for Linux
+ * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au)
+ *
+ * Initial development of this driver was funded by NEC Australia P/L
+ * and NEC Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * October 1995, Rik Faith (faith@cs.unc.edu):
+ * Minor enhancements and updates (to the patch set) for 1.3.x
+ * Documentation
+ * January 1996, Rik Faith (faith@cs.unc.edu):
+ * Make /proc/apm easy to format (bump driver version)
+ * March 1996, Rik Faith (faith@cs.unc.edu):
+ * Prohibit APM BIOS calls unless apm_enabled.
+ * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>)
+ * April 1996, Stephen Rothwell (sfr@canb.auug.org.au)
+ * Version 1.0 and 1.1
+ * May 1996, Version 1.2
+ * Feb 1998, Version 1.3
+ * Feb 1998, Version 1.4
+ * Aug 1998, Version 1.5
+ * Sep 1998, Version 1.6
+ * Nov 1998, Version 1.7
+ * Jan 1999, Version 1.8
+ * Jan 1999, Version 1.9
+ * Oct 1999, Version 1.10
+ * Nov 1999, Version 1.11
+ * Jan 2000, Version 1.12
+ * Feb 2000, Version 1.13
+ * Nov 2000, Version 1.14
+ * Oct 2001, Version 1.15
+ * Jan 2002, Version 1.16
+ * Oct 2002, Version 1.16ac
+ *
+ * History:
+ * 0.6b: first version in official kernel, Linux 1.3.46
+ * 0.7: changed /proc/apm format, Linux 1.3.58
+ * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59
+ * 0.9: only call bios if bios is present, Linux 1.3.72
+ * 1.0: use fixed device number, consolidate /proc/apm into this file,
+ * Linux 1.3.85
+ * 1.1: support user-space standby and suspend, power off after system
+ * halted, Linux 1.3.98
+ * 1.2: When resetting RTC after resume, take care so that the time
+ *	is only incorrect by 30-60 ms (vs. 1 s previously) (Gabor J. Toth
+ * <jtoth@princeton.edu>); improve interaction between
+ * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4
+ * 1.2a: Simple change to stop mysterious bug reports with SMP; also added
+ * levels to the printk calls. APM is not defined for SMP machines.
+ * The new replacement for it is, but Linux doesn't yet support this.
+ * Alan Cox Linux 2.1.55
+ * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's
+ * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by
+ * Dean Gaudet <dgaudet@arctic.org>.
+ * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87
+ * 1.5: Fix segment register reloading (in case of bad segments saved
+ * across BIOS call).
+ * Stephen Rothwell
+ * 1.6: Cope with compiler/assembler differences.
+ * Only try to turn off the first display device.
+ * Fix OOPS at power off with no APM BIOS by Jan Echternach
+ * <echter@informatik.uni-rostock.de>
+ * Stephen Rothwell
+ * 1.7: Modify driver's cached copy of the disabled/disengaged flags
+ * to reflect current state of APM BIOS.
+ * Chris Rankin <rankinc@bellsouth.net>
+ * Reset interrupt 0 timer to 100Hz after suspend
+ * Chad Miller <cmiller@surfsouth.com>
+ * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * Allow boot time disabling of APM
+ * Make boot messages far less verbose by default
+ * Make asm safer
+ * Stephen Rothwell
+ * 1.8: Add CONFIG_APM_RTC_IS_GMT
+ * Richard Gooch <rgooch@atnf.csiro.au>
+ * change APM_NOINTS to CONFIG_APM_ALLOW_INTS
+ * remove dependency on CONFIG_PROC_FS
+ * Stephen Rothwell
+ * 1.9: Fix small typo. <laslo@wodip.opole.pl>
+ * Try to cope with BIOS's that need to have all display
+ * devices blanked and not just the first one.
+ * Ross Paterson <ross@soi.city.ac.uk>
+ *	Fix segment limit setting; it has always been wrong as
+ * the segments needed to have byte granularity.
+ * Mark a few things __init.
+ * Add hack to allow power off of SMP systems by popular request.
+ * Use CONFIG_SMP instead of __SMP__
+ * Ignore BOUNCES for three seconds.
+ * Stephen Rothwell
+ * 1.10: Fix for Thinkpad return code.
+ * Merge 2.2 and 2.3 drivers.
+ * Remove APM dependencies in arch/i386/kernel/process.c
+ * Remove APM dependencies in drivers/char/sysrq.c
+ * Reset time across standby.
+ *	Allow more initialisation on SMP.
+ * Remove CONFIG_APM_POWER_OFF and make it boot time
+ * configurable (default on).
+ * Make debug only a boot time parameter (remove APM_DEBUG).
+ * Try to blank all devices on any error.
+ * 1.11: Remove APM dependencies in drivers/char/console.c
+ * Check nr_running to detect if we are idle (from
+ * Borislav Deianov <borislav@lix.polytechnique.fr>)
+ * Fix for bioses that don't zero the top part of the
+ * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>)
+ * (reported by Panos Katsaloulis <teras@writeme.com>).
+ * Real mode power off patch (Walter Hofmann
+ * <Walter.Hofmann@physik.stud.uni-erlangen.de>).
+ * 1.12: Remove CONFIG_SMP as the compiler will optimize
+ * the code away anyway (smp_num_cpus == 1 in UP)
+ * noted by Artur Skawina <skawina@geocities.com>.
+ * Make power off under SMP work again.
+ * Fix thinko with initial engaging of BIOS.
+ * Make sure power off only happens on CPU 0
+ * (Paul "Rusty" Russell <rusty@rustcorp.com.au>).
+ * Do error notification to user mode if BIOS calls fail.
+ * Move entrypoint offset fix to ...boot/setup.S
+ * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>).
+ * Remove smp-power-off. SMP users must now specify
+ * "apm=power-off" on the kernel command line. Suggested
+ * by Jim Avera <jima@hal.com>, modified by Alan Cox
+ * <alan@lxorguk.ukuu.org.uk>.
+ * Register the /proc/apm entry even on SMP so that
+ * scripts that check for it before doing power off
+ * work (Jim Avera <jima@hal.com>).
+ * 1.13: Changes for new pm_ interfaces (Andy Henroid
+ * <andy_henroid@yahoo.com>).
+ * Modularize the code.
+ * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS
+ * is now the way life works).
+ * Fix thinko in suspend() (wrong return).
+ * Notify drivers on critical suspend.
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
+ * modified by sfr).
+ * Disable interrupts while we are suspended (Andy Henroid
+ * <andy_henroid@yahoo.com> fixed by sfr).
+ * Make power off work on SMP again (Tony Hoyle
+ * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr.
+ * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore
+ * interval is now configurable.
+ * 1.14: Make connection version persist across module unload/load.
+ * Enable and engage power management earlier.
+ * Disengage power management on module unload.
+ * Changed to use the sysrq-register hack for registering the
+ * power off function called by magic sysrq based upon discussions
+ * in irc://irc.openprojects.net/#kernelnewbies
+ * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>).
+ * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable.
+ * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr.
+ * Work around byte swap bug in one of the Vaio's BIOS's
+ * (Marc Boucher <marc@mbsi.ca>).
+ * Exposed the disable flag to dmi so that we can handle known
+ * broken APM (Alan Cox <alan@lxorguk.ukuu.org.uk>).
+ * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin
+ * calling it - instead idle. (Alan Cox <alan@lxorguk.ukuu.org.uk>)
+ *	If an APM idle call fails, log it and idle sensibly.
+ * 1.15: Don't queue events to clients who open the device O_WRONLY.
+ * Don't expect replies from clients who open the device O_RDONLY.
+ * (Idea from Thomas Hood)
+ * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>)
+ * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.)
+ * Notify listeners of standby or suspend events before notifying
+ * drivers. Return EBUSY to ioctl() if suspend is rejected.
+ * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood)
+ * Ignore first resume after we generate our own resume event
+ * after a suspend (Thomas Hood)
+ * Daemonize now gets rid of our controlling terminal (sfr).
+ * CONFIG_APM_CPU_IDLE now just affects the default value of
+ * idle_threshold (sfr).
+ * Change name of kernel apm daemon (as it no longer idles) (sfr).
+ * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we
+ * make _all_ APM calls on the CPU#0. Fix unsafe sign bug.
+ *	TODO: determine if it's "boot CPU" or "CPU0" we want to lock to.
+ *
+ * APM 1.1 Reference:
+ *
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.1, September 1993.
+ * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
+ *
+ * [This document is available free from Intel by calling 800.628.8686 (fax
+ * 916.356.6100) or 800.548.4725; or from
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
+ * available from Microsoft by calling 206.882.8080.]
+ *
+ * APM 1.2 Reference:
+ * Intel Corporation, Microsoft Corporation. Advanced Power Management
+ * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
+ *
+ * [This document is available from Microsoft at:
+ * http://www.microsoft.com/whdc/archive/amp_12.mspx]
+ */
+
+#include <linux/module.h>
+
+#include <linux/poll.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/timer.h>
+#include <linux/fcntl.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/apm_bios.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <linux/pm.h>
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/freezer.h>
+#include <linux/smp.h>
+#include <linux/dmi.h>
+#include <linux/suspend.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h>
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+#include <asm/olpc.h>
+#include <asm/paravirt.h>
+#include <asm/reboot.h>
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+extern int (*console_blank_hook)(int);
+#endif
+
+/*
+ * The apm_bios device is one of the misc char devices.
+ * This is its minor number.
+ */
+#define APM_MINOR_DEV 134
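+
+/*
+ * Userspace reaches the driver through the matching character device
+ * node, conventionally created as (misc devices share major number 10):
+ *
+ *	mknod /dev/apm_bios c 10 134
+ */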
+
+/*
+ * Various options can be changed at boot time as follows:
+ * (We allow underscores for compatibility with the modules code)
+ * apm=on/off enable/disable APM
+ * [no-]allow[-_]ints allow interrupts during BIOS calls
+ * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call
+ * [no-]realmode[-_]power[-_]off switch to real mode before
+ * powering off
+ * [no-]debug log some debugging messages
+ * [no-]power[-_]off power off on shutdown
+ * [no-]smp Use apm even on an SMP box
+ * bounce[-_]interval=<n> number of ticks to ignore suspend
+ * bounces
+ * idle[-_]threshold=<n> System idle percentage above which to
+ * make APM BIOS idle calls. Set it to
+ * 100 to disable.
+ * idle[-_]period=<n> Period (in 1/100s of a second) over
+ * which the idle percentage is
+ * calculated.
+ */
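+
+/*
+ * A hypothetical example combining several of the options above, using
+ * the comma-separated apm= syntax:
+ *
+ *	apm=on,debug,power-off,idle-threshold=90,idle-period=50
+ *
+ * i.e. force APM on, log debug messages, power off on shutdown, and make
+ * BIOS idle calls once the system is more than 90% idle, measured over
+ * half-second periods.
+ */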
+
+/* KNOWN PROBLEM MACHINES:
+ *
+ * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant
+ * [Confirmed by TI representative]
+ * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification
+ * [Confirmed by BIOS disassembly]
+ * [This may work now ...]
+ * P: Toshiba 1950S: battery life information only gets updated after resume
+ * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking
+ * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>]
+ * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP
+ * Neale Banks <neale@lowendale.com.au> December 2000
+ *
+ * Legend: U = unusable with APM patches
+ * P = partially usable with APM patches
+ */
+
+/*
+ * Define as 1 to make the driver always call the APM BIOS busy
+ * routine even if the clock was not reported as slowed by the
+ * idle routine. Otherwise, define as 0.
+ */
+#define ALWAYS_CALL_BUSY 1
+
+/*
+ * Define to make the APM BIOS calls zero all data segment registers (so
+ * that an incorrect BIOS implementation will cause a kernel panic if it
+ * tries to write to arbitrary memory).
+ */
+#define APM_ZERO_SEGS
+
+#include <asm/apm.h>
+
+/*
+ * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend.
+ * This was patched by Chad Miller <cmiller@surfsouth.com>, original code by
+ * David Chen <chen@ctpa04.mit.edu>
+ */
+#undef INIT_TIMER_AFTER_SUSPEND
+
+#ifdef INIT_TIMER_AFTER_SUSPEND
+#include <linux/timex.h>
+#include <asm/io.h>
+#include <linux/delay.h>
+#endif
+
+/*
+ * Need to poll the APM BIOS every second
+ */
+#define APM_CHECK_TIMEOUT (HZ)
+
+/*
+ * Ignore suspend events for this amount of time after a resume
+ */
+#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
+
+/*
+ * Maximum number of events stored
+ */
+#define APM_MAX_EVENTS 20
+
+/*
+ * The per-file APM data
+ */
+struct apm_user {
+ int magic;
+ struct apm_user *next;
+ unsigned int suser: 1;
+ unsigned int writer: 1;
+ unsigned int reader: 1;
+ unsigned int suspend_wait: 1;
+ int suspend_result;
+ int suspends_pending;
+ int standbys_pending;
+ int suspends_read;
+ int standbys_read;
+ int event_head;
+ int event_tail;
+ apm_event_t events[APM_MAX_EVENTS];
+};
+
+/*
+ * The magic number in apm_user
+ */
+#define APM_BIOS_MAGIC 0x4101
+
+/*
+ * idle percentage above which bios idle calls are done
+ */
+#ifdef CONFIG_APM_CPU_IDLE
+#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
+#define DEFAULT_IDLE_THRESHOLD 95
+#else
+#define DEFAULT_IDLE_THRESHOLD 100
+#endif
+#define DEFAULT_IDLE_PERIOD (100 / 3)
+
+/*
+ * Local variables
+ */
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} apm_bios_entry;
+static int clock_slowed;
+static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
+static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
+static int set_pm_idle;
+static int suspends_pending;
+static int standbys_pending;
+static int ignore_sys_suspend;
+static int ignore_normal_resume;
+static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
+
+static bool debug __read_mostly;
+static bool smp __read_mostly;
+static int apm_disabled = -1;
+#ifdef CONFIG_SMP
+static bool power_off;
+#else
+static bool power_off = 1;
+#endif
+static bool realmode_power_off;
+#ifdef CONFIG_APM_ALLOW_INTS
+static bool allow_ints = 1;
+#else
+static bool allow_ints;
+#endif
+static bool broken_psr;
+
+static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
+static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
+static struct apm_user *user_list;
+static DEFINE_SPINLOCK(user_list_lock);
+static DEFINE_MUTEX(apm_mutex);
+
+/*
+ * Set up a segment that references the real mode segment 0x40
+ * that extends up to the end of page zero (that we have reserved).
+ * This is for buggy BIOS's that refer to (real mode) segment 0x40
+ * even though they are called in protected mode.
+ */
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+ (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
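+
+/*
+ * Decoded, bad_bios_desc is a present, writable, 32-bit data segment
+ * with byte granularity (flags 0x4092). Its base is the kernel mapping
+ * of physical 0x400, which is exactly where real-mode segment 0x40
+ * lands (0x40 << 4), and its limit of PAGE_SIZE - 0x400 - 1 = 3071
+ * bytes covers the rest of page zero.
+ */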
+
+static const char driver_version[] = "1.16ac"; /* no spaces */
+
+static struct task_struct *kapmd_task;
+
+/*
+ * APM event names taken from the APM 1.2 specification. These are
+ * the message codes that the BIOS uses to tell us about events
+ */
+static const char * const apm_event_name[] = {
+ "system standby",
+ "system suspend",
+ "normal resume",
+ "critical resume",
+ "low battery",
+ "power status change",
+ "update time",
+ "critical suspend",
+ "user standby",
+ "user suspend",
+ "system standby resume",
+ "capabilities change"
+};
+#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name)
+
+typedef struct lookup_t {
+ int key;
+ char *msg;
+} lookup_t;
+
+/*
+ * The BIOS returns a set of standard error codes in AX when the
+ * carry flag is set.
+ */
+
+static const lookup_t error_table[] = {
+/* N/A { APM_SUCCESS, "Operation succeeded" }, */
+ { APM_DISABLED, "Power management disabled" },
+ { APM_CONNECTED, "Real mode interface already connected" },
+ { APM_NOT_CONNECTED, "Interface not connected" },
+ { APM_16_CONNECTED, "16 bit interface already connected" },
+/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */
+ { APM_32_CONNECTED, "32 bit interface already connected" },
+ { APM_32_UNSUPPORTED, "32 bit interface not supported" },
+ { APM_BAD_DEVICE, "Unrecognized device ID" },
+ { APM_BAD_PARAM, "Parameter out of range" },
+ { APM_NOT_ENGAGED, "Interface not engaged" },
+ { APM_BAD_FUNCTION, "Function not supported" },
+ { APM_RESUME_DISABLED, "Resume timer disabled" },
+ { APM_BAD_STATE, "Unable to enter requested state" },
+/* N/A { APM_NO_EVENTS, "No events pending" }, */
+ { APM_NO_ERROR, "BIOS did not set a return code" },
+ { APM_NOT_PRESENT, "No APM present" }
+};
+#define ERROR_COUNT ARRAY_SIZE(error_table)
+
+/**
+ * apm_error - display an APM error
+ * @str: information string
+ * @err: APM BIOS return code
+ *
+ * Write a meaningful log entry to the kernel log in the event of
+ * an APM error. Note that this also handles (negative) kernel errors.
+ */
+
+static void apm_error(char *str, int err)
+{
+ int i;
+
+ for (i = 0; i < ERROR_COUNT; i++)
+ if (error_table[i].key == err)
+ break;
+ if (i < ERROR_COUNT)
+ printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
+ else if (err < 0)
+ printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
+ else
+ printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
+ str, err);
+}
+
+/*
+ * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and
+ * apm_info.allow_ints, we are being really paranoid here! Not only
+ * are interrupts disabled, but all the segment registers (except SS)
+ * are saved and zeroed; this means that if the BIOS tries to reference
+ * any data without explicitly loading the segment registers, the kernel
+ * will fault immediately rather than cause unforeseen problems elsewhere
+ * in the kernel. And the fault will be very obvious! :-) Doing
+ * this depends on CS referring to the same physical memory as DS so that
+ * DS can be zeroed before the call. Unfortunately, we can't do anything
+ * about the stack segment/pointer. Also, we tell the compiler that
+ * everything could change.
+ *
+ * Also, we KNOW that for the non-error case of apm_bios_call, there
+ * is no useful data returned in the low order 8 bits of eax.
+ */
+
+static inline unsigned long __apm_irq_save(void)
+{
+ unsigned long flags;
+ local_save_flags(flags);
+ if (apm_info.allow_ints) {
+ if (irqs_disabled_flags(flags))
+ local_irq_enable();
+ } else
+ local_irq_disable();
+
+ return flags;
+}
+
+#define apm_irq_save(flags) \
+ do { flags = __apm_irq_save(); } while (0)
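+
+/*
+ * The macro wrapper exists so callers can write apm_irq_save(flags)
+ * with the same shape as local_irq_save(flags), assigning into the
+ * caller's flags lvalue.
+ */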
+
+static inline void apm_irq_restore(unsigned long flags)
+{
+ if (irqs_disabled_flags(flags))
+ local_irq_disable();
+ else if (irqs_disabled())
+ local_irq_enable();
+}
+
+#ifdef APM_ZERO_SEGS
+# define APM_DECL_SEGS \
+ unsigned int saved_fs; unsigned int saved_gs;
+# define APM_DO_SAVE_SEGS \
+ savesegment(fs, saved_fs); savesegment(gs, saved_gs)
+# define APM_DO_RESTORE_SEGS \
+ loadsegment(fs, saved_fs); loadsegment(gs, saved_gs)
+#else
+# define APM_DECL_SEGS
+# define APM_DO_SAVE_SEGS
+# define APM_DO_RESTORE_SEGS
+#endif
+
+struct apm_bios_call {
+ u32 func;
+ /* In and out */
+ u32 ebx;
+ u32 ecx;
+ /* Out only */
+ u32 eax;
+ u32 edx;
+ u32 esi;
+
+ /* Error: -ENOMEM, or bits 8-15 of eax */
+ int err;
+};
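+
+/*
+ * A minimal usage sketch (mirroring apm_get_event() below): fill in
+ * func and the input registers, make the call, then read the output
+ * registers or the error code from the same struct:
+ *
+ *	struct apm_bios_call call = { .func = APM_FUNC_GET_EVENT };
+ *
+ *	if (apm_bios_call(&call))
+ *		return call.err;	// bits 8-15 of eax, or -ENOMEM
+ *	event = call.ebx;		// outputs come back in the struct
+ */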
+
+/**
+ * __apm_bios_call - Make an APM BIOS 32bit call
+ * @_call: pointer to struct apm_bios_call.
+ *
+ * Make an APM call using the 32bit protected mode interface. The
+ * caller is responsible for knowing if APM BIOS is configured and
+ * enabled. This call can disable interrupts for a long period of
+ * time on some laptops. The return value is in AH and the carry
+ * flag is loaded into AL. If there is an error, then the error
+ * code is returned in AH (bits 8-15 of eax) and this function
+ * returns non-zero.
+ *
+ * Note: this makes the call on the current CPU.
+ */
+static long __apm_bios_call(void *_call)
+{
+ APM_DECL_SEGS
+ unsigned long flags;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+ struct apm_bios_call *call = _call;
+
+ cpu = get_cpu();
+ BUG_ON(cpu != 0);
+ gdt = get_cpu_gdt_table(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ apm_irq_save(flags);
+ APM_DO_SAVE_SEGS;
+ apm_bios_call_asm(call->func, call->ebx, call->ecx,
+ &call->eax, &call->ebx, &call->ecx, &call->edx,
+ &call->esi);
+ APM_DO_RESTORE_SEGS;
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+
+ return call->eax & 0xff;
+}
+
+/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */
+static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call)
+{
+ int ret;
+
+ /* Don't bother with work_on_cpu in the common case, so we don't
+ * have to worry about OOM or overhead. */
+ if (get_cpu() == 0) {
+ ret = fn(call);
+ put_cpu();
+ } else {
+ put_cpu();
+ ret = work_on_cpu(0, fn, call);
+ }
+
+ /* work_on_cpu can fail with -ENOMEM */
+ if (ret < 0)
+ call->err = ret;
+ else
+ call->err = (call->eax >> 8) & 0xff;
+
+ return ret;
+}
+
+/**
+ * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0)
+ * @call: the apm_bios_call registers.
+ *
+ * If there is an error, it is returned in @call.err.
+ */
+static int apm_bios_call(struct apm_bios_call *call)
+{
+ return on_cpu0(__apm_bios_call, call);
+}
+
+/**
+ * __apm_bios_call_simple - Make a simple APM BIOS 32bit call
+ * @_call: pointer to struct apm_bios_call.
+ *
+ * Make a BIOS call that returns one value only, or just status.
+ * If there is an error, then the error code is returned in AH
+ * (bits 8-15 of eax) and this function returns non-zero (it can
+ * also return -ENOMEM). This is used for simpler BIOS operations.
+ * This call may hold interrupts off for a long time on some laptops.
+ *
+ * Note: this makes the call on the current CPU.
+ */
+static long __apm_bios_call_simple(void *_call)
+{
+ u8 error;
+ APM_DECL_SEGS
+ unsigned long flags;
+ int cpu;
+ struct desc_struct save_desc_40;
+ struct desc_struct *gdt;
+ struct apm_bios_call *call = _call;
+
+ cpu = get_cpu();
+ BUG_ON(cpu != 0);
+ gdt = get_cpu_gdt_table(cpu);
+ save_desc_40 = gdt[0x40 / 8];
+ gdt[0x40 / 8] = bad_bios_desc;
+
+ apm_irq_save(flags);
+ APM_DO_SAVE_SEGS;
+ error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
+ &call->eax);
+ APM_DO_RESTORE_SEGS;
+ apm_irq_restore(flags);
+ gdt[0x40 / 8] = save_desc_40;
+ put_cpu();
+ return error;
+}
+
+/**
+ * apm_bios_call_simple - make a simple APM BIOS 32bit call
+ * @func: APM function to invoke
+ * @ebx_in: EBX register value for BIOS call
+ * @ecx_in: ECX register value for BIOS call
+ * @eax: EAX register on return from the BIOS call
+ * @err: BIOS error code (bits 8-15 of EAX) on failure, or -ENOMEM
+ *
+ * Make a BIOS call that returns one value only, or just status.
+ * If there is an error, then the error code is returned in @err
+ * and this function returns non-zero. This is used for simpler
+ * BIOS operations. This call may hold interrupts off for a long
+ * time on some laptops.
+ */
+static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax,
+ int *err)
+{
+ struct apm_bios_call call;
+ int ret;
+
+ call.func = func;
+ call.ebx = ebx_in;
+ call.ecx = ecx_in;
+
+ ret = on_cpu0(__apm_bios_call_simple, &call);
+ *eax = call.eax;
+ *err = call.err;
+ return ret;
+}
+
+/**
+ * apm_driver_version - APM driver version
+ * @val: loaded with the APM version on return
+ *
+ * Retrieve the APM version supported by the BIOS. This is only
+ * supported for APM 1.1 or higher. An error indicates APM 1.0 is
+ * probably present.
+ *
+ * On entry, val should point to a value indicating the APM driver
+ * version, with the high byte being the major and the low byte the
+ * minor number, both in BCD.
+ *
+ * On return it will hold the BIOS revision supported in the
+ * same format.
+ */
+
+static int apm_driver_version(u_short *val)
+{
+ u32 eax;
+ int err;
+
+ if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err))
+ return err;
+ *val = eax;
+ return APM_SUCCESS;
+}
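+
+/*
+ * For illustration, the BCD encoding means negotiating an APM 1.2
+ * connection looks like this (apm_info.connection_version is where the
+ * driver keeps the result):
+ *
+ *	u_short ver = 0x0102;	// major 01, minor 02, both BCD
+ *
+ *	if (apm_driver_version(&ver) == APM_SUCCESS)
+ *		apm_info.connection_version = ver;
+ */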
+
+/**
+ * apm_get_event - get an APM event from the BIOS
+ * @event: pointer to the event
+ * @info: point to the event information
+ *
+ * The APM BIOS provides a polled interface for event
+ * reporting. The BIOS expects to be polled at least every second
+ * when events are pending. When a message is found the caller should
+ * poll until no more messages are present. However, this causes
+ * problems on some laptops where a suspend event notification is
+ * not cleared until it is acknowledged.
+ *
+ * Additional information is returned in the info pointer, provided
+ * that APM 1.2 is in use. If no messages are pending the value 0x80
+ * is returned (No power management events pending).
+ */
+static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
+{
+ struct apm_bios_call call;
+
+ call.func = APM_FUNC_GET_EVENT;
+ call.ebx = call.ecx = 0;
+
+ if (apm_bios_call(&call))
+ return call.err;
+
+ *event = call.ebx;
+ if (apm_info.connection_version < 0x0102)
+ *info = ~0; /* indicate info not valid */
+ else
+ *info = call.ecx;
+ return APM_SUCCESS;
+}
+
+/**
+ * set_power_state - set the power management state
+ * @what: which items to transition
+ * @state: state to transition to
+ *
+ * Request an APM change of state for one or more system devices. The
+ * processor state must be transitioned last of all. @what holds the
+ * class of device in the upper byte and the device number (0xFF for
+ * all) in the lower byte.
+ *
+ * The state holds the state to transition to, which may in fact
+ * be an acceptance of a BIOS requested state change.
+ */
+
+static int set_power_state(u_short what, u_short state)
+{
+ u32 eax;
+ int err;
+
+ if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err))
+ return err;
+ return APM_SUCCESS;
+}
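+
+/*
+ * APM device IDs carry the class in the upper byte and the unit in the
+ * lower byte, so for example:
+ *
+ *	set_power_state(0x0100, APM_STATE_STANDBY);	// first display
+ *	set_power_state(0x01ff, APM_STATE_STANDBY);	// all displays
+ *
+ * which is exactly the dev[] table that apm_console_blank() walks.
+ */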
+
+/**
+ * set_system_power_state - set system wide power state
+ * @state: which state to enter
+ *
+ * Transition the entire system into a new APM power state.
+ */
+
+static int set_system_power_state(u_short state)
+{
+ return set_power_state(APM_DEVICE_ALL, state);
+}
+
+/**
+ * apm_do_idle - perform power saving
+ *
+ * This function notifies the BIOS that the processor is (in the view
+ * of the OS) idle. It returns -1 in the event that the BIOS refuses
+ * to handle the idle request. On success the function returns 1
+ * if the BIOS did clock slowing, or 0 otherwise.
+ */
+
+static int apm_do_idle(void)
+{
+ u32 eax;
+ u8 ret = 0;
+ int idled = 0;
+ int polling;
+ int err = 0;
+
+ polling = !!(current_thread_info()->status & TS_POLLING);
+ if (polling) {
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+ }
+ if (!need_resched()) {
+ idled = 1;
+ ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
+ }
+ if (polling)
+ current_thread_info()->status |= TS_POLLING;
+
+ if (!idled)
+ return 0;
+
+ if (ret) {
+ static unsigned long t;
+
+ /* This always fails on some SMP boards running UP kernels.
+ * Only report the failure the first 5 times.
+ */
+		if (++t < 5)
+			printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err);
+ return -1;
+ }
+ clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0;
+ return clock_slowed;
+}
+
+/**
+ * apm_do_busy - inform the BIOS the CPU is busy
+ *
+ * Request that the BIOS brings the CPU back to full performance.
+ */
+
+static void apm_do_busy(void)
+{
+ u32 dummy;
+ int err;
+
+ if (clock_slowed || ALWAYS_CALL_BUSY) {
+ (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err);
+ clock_slowed = 0;
+ }
+}
+
+/*
+ * If no process has really been interested in
+ * the CPU for some time, we want to call BIOS
+ * power management - we probably want
+ * to conserve power.
+ */
+#define IDLE_CALC_LIMIT (HZ * 100)
+#define IDLE_LEAKY_MAX 16
+
+static void (*original_pm_idle)(void) __read_mostly;
+
+/**
+ * apm_cpu_idle - cpu idling for APM capable Linux
+ *
+ * This is the idling function the kernel executes when APM is available. It
+ * tries to do BIOS power management based on the average system idle time.
+ * Furthermore it calls the system default idle routine.
+ */
+
+static void apm_cpu_idle(void)
+{
+ static int use_apm_idle; /* = 0 */
+ static unsigned int last_jiffies; /* = 0 */
+ static unsigned int last_stime; /* = 0 */
+
+ int apm_idle_done = 0;
+ unsigned int jiffies_since_last_check = jiffies - last_jiffies;
+ unsigned int bucket;
+
+	printk_once(KERN_INFO "deprecated apm_cpu_idle will be deleted in 2012\n");
+recalc:
+ if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
+ use_apm_idle = 0;
+ last_jiffies = jiffies;
+ last_stime = current->stime;
+ } else if (jiffies_since_last_check > idle_period) {
+ unsigned int idle_percentage;
+
+ idle_percentage = current->stime - last_stime;
+ idle_percentage *= 100;
+ idle_percentage /= jiffies_since_last_check;
+ use_apm_idle = (idle_percentage > idle_threshold);
+ if (apm_info.forbid_idle)
+ use_apm_idle = 0;
+ last_jiffies = jiffies;
+ last_stime = current->stime;
+ }
+
+ bucket = IDLE_LEAKY_MAX;
+
+ while (!need_resched()) {
+ if (use_apm_idle) {
+ unsigned int t;
+
+ t = jiffies;
+ switch (apm_do_idle()) {
+ case 0:
+ apm_idle_done = 1;
+ if (t != jiffies) {
+ if (bucket) {
+ bucket = IDLE_LEAKY_MAX;
+ continue;
+ }
+ } else if (bucket) {
+ bucket--;
+ continue;
+ }
+ break;
+ case 1:
+ apm_idle_done = 1;
+ break;
+ default: /* BIOS refused */
+ break;
+ }
+ }
+ if (original_pm_idle)
+ original_pm_idle();
+ else
+ default_idle();
+ local_irq_disable();
+ jiffies_since_last_check = jiffies - last_jiffies;
+ if (jiffies_since_last_check > idle_period)
+ goto recalc;
+ }
+
+ if (apm_idle_done)
+ apm_do_busy();
+
+ local_irq_enable();
+}
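+
+/*
+ * Worked example of the recalculation above, assuming HZ=100 and the
+ * default tunables: idle_period is 100/3 = 33 ticks, roughly a third
+ * of a second. If 39 of the last 40 ticks were accounted to this
+ * thread's stime, the idle percentage is 39 * 100 / 40 = 97, which
+ * clears DEFAULT_IDLE_THRESHOLD (95), so BIOS idle calls are made
+ * until the next recalculation.
+ */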
+
+/**
+ * apm_power_off - ask the BIOS to power off
+ *
+ * Handle the power off sequence. This is the one piece of code we
+ * will execute even on SMP machines. In order to deal with BIOS
+ * bugs we support real mode APM BIOS power off calls. We also make
+ * the SMP call on CPU0 as some systems will only honour this call
+ * on their first cpu.
+ */
+
+static void apm_power_off(void)
+{
+ /* Some bioses don't like being called from CPU != 0 */
+ if (apm_info.realmode_power_off) {
+ set_cpus_allowed_ptr(current, cpumask_of(0));
+ machine_real_restart(MRR_APM);
+ } else {
+ (void)set_system_power_state(APM_STATE_OFF);
+ }
+}
+
+#ifdef CONFIG_APM_DO_ENABLE
+
+/**
+ * apm_enable_power_management - enable BIOS APM power management
+ * @enable: enable yes/no
+ *
+ * Enable or disable the APM BIOS power services.
+ */
+
+static int apm_enable_power_management(int enable)
+{
+ u32 eax;
+ int err;
+
+ if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
+ return APM_NOT_ENGAGED;
+ if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
+ enable, &eax, &err))
+ return err;
+ if (enable)
+ apm_info.bios.flags &= ~APM_BIOS_DISABLED;
+ else
+ apm_info.bios.flags |= APM_BIOS_DISABLED;
+ return APM_SUCCESS;
+}
+#endif
+
+/**
+ * apm_get_power_status - get current power state
+ * @status: returned status
+ * @bat: battery info
+ * @life: estimated life
+ *
+ * Obtain the current power status from the APM BIOS. We return a
+ * status which gives the rough battery status, and current power
+ * source. The bat value returned gives an estimate as a percentage
+ * of life and a status value for the battery. The estimated life,
+ * if reported, is a lifetime in seconds/minutes at current power
+ * consumption.
+ */
+
+static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
+{
+ struct apm_bios_call call;
+
+ call.func = APM_FUNC_GET_STATUS;
+ call.ebx = APM_DEVICE_ALL;
+ call.ecx = 0;
+
+ if (apm_info.get_power_status_broken)
+ return APM_32_UNSUPPORTED;
+ if (apm_bios_call(&call))
+ return call.err;
+ *status = call.ebx;
+ *bat = call.ecx;
+ if (apm_info.get_power_status_swabinminutes) {
+ *life = swab16((u16)call.edx);
+ *life |= 0x8000;
+ } else
+ *life = call.edx;
+ return APM_SUCCESS;
+}
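+
+/*
+ * A decoding sketch for the returned words, assuming the APM 1.2 bit
+ * layout: the battery percentage lives in the low byte of *bat (0xff
+ * means unknown), and bit 15 of *life selects minutes rather than
+ * seconds, which is why the swab workaround above forces it on:
+ *
+ *	percent = bat & 0xff;
+ *	in_minutes = life & 0x8000;	// else the value is in seconds
+ *	amount = life & 0x7fff;
+ */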
+
+#if 0
+static int apm_get_battery_status(u_short which, u_short *status,
+ u_short *bat, u_short *life, u_short *nbat)
+{
+	struct apm_bios_call call;
+
+	if (apm_info.connection_version < 0x0102) {
+		/* pretend we only have one battery. */
+		if (which != 1)
+			return APM_BAD_DEVICE;
+		*nbat = 1;
+		return apm_get_power_status(status, bat, life);
+	}
+
+	call.func = APM_FUNC_GET_STATUS;
+	call.ebx = 0x8000 | which;
+	call.ecx = 0;
+	if (apm_bios_call(&call))
+		return call.err;
+	*status = call.ebx;
+	*bat = call.ecx;
+	*life = call.edx;
+	*nbat = call.esi;
+	return APM_SUCCESS;
+}
+#endif
+
+/**
+ * apm_engage_power_management - enable PM on a device
+ * @device: identity of device
+ * @enable: on/off
+ *
+ * Activate or deactivate power management on either a specific device
+ * or the entire system (%APM_DEVICE_ALL).
+ */
+
+static int apm_engage_power_management(u_short device, int enable)
+{
+ u32 eax;
+ int err;
+
+ if ((enable == 0) && (device == APM_DEVICE_ALL)
+ && (apm_info.bios.flags & APM_BIOS_DISABLED))
+ return APM_DISABLED;
+ if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable,
+ &eax, &err))
+ return err;
+ if (device == APM_DEVICE_ALL) {
+ if (enable)
+ apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
+ else
+ apm_info.bios.flags |= APM_BIOS_DISENGAGED;
+ }
+ return APM_SUCCESS;
+}
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+
+/**
+ * apm_console_blank - blank the display
+ * @blank: on/off
+ *
+ * Attempt to blank the console, first by blanking just video device
+ * zero, and if that fails (some BIOSes don't support it) by blanking
+ * all video devices. Typically the BIOS will do laptop backlight and
+ * monitor powerdown for us.
+ */
+
+static int apm_console_blank(int blank)
+{
+ int error = APM_NOT_ENGAGED; /* silence gcc */
+ int i;
+ u_short state;
+ static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
+
+ state = blank ? APM_STATE_STANDBY : APM_STATE_READY;
+
+ for (i = 0; i < ARRAY_SIZE(dev); i++) {
+ error = set_power_state(dev[i], state);
+
+ if ((error == APM_SUCCESS) || (error == APM_NO_ERROR))
+ return 1;
+
+ if (error == APM_NOT_ENGAGED)
+ break;
+ }
+
+ if (error == APM_NOT_ENGAGED) {
+ static int tried;
+ int eng_error;
+ if (tried++ == 0) {
+ eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1);
+ if (eng_error) {
+ apm_error("set display", error);
+ apm_error("engage interface", eng_error);
+ return 0;
+ } else
+ return apm_console_blank(blank);
+ }
+ }
+ apm_error("set display", error);
+ return 0;
+}
+#endif
+
+static int queue_empty(struct apm_user *as)
+{
+ return as->event_head == as->event_tail;
+}
+
+static apm_event_t get_queued_event(struct apm_user *as)
+{
+ if (++as->event_tail >= APM_MAX_EVENTS)
+ as->event_tail = 0;
+ return as->events[as->event_tail];
+}
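+
+/*
+ * The event queue is a ring over events[APM_MAX_EVENTS]: event_head is
+ * the slot most recently written, event_tail the slot most recently
+ * read, and head == tail means empty. queue_event() below advances the
+ * tail on overflow, dropping the oldest event once the ring wraps.
+ */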
+
+static void queue_event(apm_event_t event, struct apm_user *sender)
+{
+ struct apm_user *as;
+
+ spin_lock(&user_list_lock);
+ if (user_list == NULL)
+ goto out;
+ for (as = user_list; as != NULL; as = as->next) {
+ if ((as == sender) || (!as->reader))
+ continue;
+ if (++as->event_head >= APM_MAX_EVENTS)
+ as->event_head = 0;
+
+ if (as->event_head == as->event_tail) {
+ static int notified;
+
+ if (notified++ == 0)
+ printk(KERN_ERR "apm: an event queue overflowed\n");
+ if (++as->event_tail >= APM_MAX_EVENTS)
+ as->event_tail = 0;
+ }
+ as->events[as->event_head] = event;
+ if (!as->suser || !as->writer)
+ continue;
+ switch (event) {
+ case APM_SYS_SUSPEND:
+ case APM_USER_SUSPEND:
+ as->suspends_pending++;
+ suspends_pending++;
+ break;
+
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ as->standbys_pending++;
+ standbys_pending++;
+ break;
+ }
+ }
+ wake_up_interruptible(&apm_waitqueue);
+out:
+ spin_unlock(&user_list_lock);
+}
+
+static void reinit_timer(void)
+{
+#ifdef INIT_TIMER_AFTER_SUSPEND
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&i8253_lock, flags);
+ /* set the clock to HZ */
+ outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
+ outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
+ udelay(10);
+ outb_p(LATCH >> 8, PIT_CH0); /* MSB */
+ udelay(10);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
+#endif
+}
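+
+/*
+ * For reference: LATCH is PIT_TICK_RATE / HZ rounded to nearest, so
+ * with HZ=100 the two data bytes written above are 11932 & 0xff and
+ * 11932 >> 8, putting channel 0 back at 1193182 / 11932, roughly 100
+ * interrupts per second.
+ */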
+
+static int suspend(int vetoable)
+{
+ int err;
+ struct apm_user *as;
+
+ dpm_suspend_start(PMSG_SUSPEND);
+
+ dpm_suspend_noirq(PMSG_SUSPEND);
+
+ local_irq_disable();
+ syscore_suspend();
+
+ local_irq_enable();
+
+ save_processor_state();
+ err = set_system_power_state(APM_STATE_SUSPEND);
+ ignore_normal_resume = 1;
+ restore_processor_state();
+
+ local_irq_disable();
+ reinit_timer();
+
+ if (err == APM_NO_ERROR)
+ err = APM_SUCCESS;
+ if (err != APM_SUCCESS)
+ apm_error("suspend", err);
+ err = (err == APM_SUCCESS) ? 0 : -EIO;
+
+ syscore_resume();
+ local_irq_enable();
+
+ dpm_resume_noirq(PMSG_RESUME);
+
+ dpm_resume_end(PMSG_RESUME);
+ queue_event(APM_NORMAL_RESUME, NULL);
+ spin_lock(&user_list_lock);
+ for (as = user_list; as != NULL; as = as->next) {
+ as->suspend_wait = 0;
+ as->suspend_result = err;
+ }
+ spin_unlock(&user_list_lock);
+ wake_up_interruptible(&apm_suspend_waitqueue);
+ return err;
+}
+
+static void standby(void)
+{
+ int err;
+
+ dpm_suspend_noirq(PMSG_SUSPEND);
+
+ local_irq_disable();
+ syscore_suspend();
+ local_irq_enable();
+
+ err = set_system_power_state(APM_STATE_STANDBY);
+ if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
+ apm_error("standby", err);
+
+ local_irq_disable();
+ syscore_resume();
+ local_irq_enable();
+
+ dpm_resume_noirq(PMSG_RESUME);
+}
+
+static apm_event_t get_event(void)
+{
+ int error;
+ apm_event_t event = APM_NO_EVENTS; /* silence gcc */
+ apm_eventinfo_t info;
+
+ static int notified;
+
+ /* we don't use the eventinfo */
+ error = apm_get_event(&event, &info);
+ if (error == APM_SUCCESS)
+ return event;
+
+ if ((error != APM_NO_EVENTS) && (notified++ == 0))
+ apm_error("get_event", error);
+
+ return 0;
+}
+
+static void check_events(void)
+{
+ apm_event_t event;
+ static unsigned long last_resume;
+ static int ignore_bounce;
+
+ while ((event = get_event()) != 0) {
+ if (debug) {
+ if (event <= NR_APM_EVENT_NAME)
+ printk(KERN_DEBUG "apm: received %s notify\n",
+ apm_event_name[event - 1]);
+ else
+ printk(KERN_DEBUG "apm: received unknown "
+ "event 0x%02x\n", event);
+ }
+ if (ignore_bounce
+ && (time_after(jiffies, last_resume + bounce_interval)))
+ ignore_bounce = 0;
+
+ switch (event) {
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ queue_event(event, NULL);
+ if (standbys_pending <= 0)
+ standby();
+ break;
+
+ case APM_USER_SUSPEND:
+#ifdef CONFIG_APM_IGNORE_USER_SUSPEND
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+#endif
+ case APM_SYS_SUSPEND:
+ if (ignore_bounce) {
+ if (apm_info.connection_version > 0x100)
+ set_system_power_state(APM_STATE_REJECT);
+ break;
+ }
+ /*
+ * If we are already processing a SUSPEND,
+ * then further SUSPEND events from the BIOS
+ * will be ignored. We also return here to
+ * cope with the fact that the Thinkpads keep
+ * sending a SUSPEND event until something else
+ * happens!
+ */
+ if (ignore_sys_suspend)
+ return;
+ ignore_sys_suspend = 1;
+ queue_event(event, NULL);
+ if (suspends_pending <= 0)
+ (void) suspend(1);
+ break;
+
+ case APM_NORMAL_RESUME:
+ case APM_CRITICAL_RESUME:
+ case APM_STANDBY_RESUME:
+ ignore_sys_suspend = 0;
+ last_resume = jiffies;
+ ignore_bounce = 1;
+ if ((event != APM_NORMAL_RESUME)
+ || (ignore_normal_resume == 0)) {
+ dpm_resume_end(PMSG_RESUME);
+ queue_event(event, NULL);
+ }
+ ignore_normal_resume = 0;
+ break;
+
+ case APM_CAPABILITY_CHANGE:
+ case APM_LOW_BATTERY:
+ case APM_POWER_STATUS_CHANGE:
+ queue_event(event, NULL);
+ /* If needed, notify drivers here */
+ break;
+
+ case APM_UPDATE_TIME:
+ break;
+
+ case APM_CRITICAL_SUSPEND:
+ /*
+ * We are not allowed to reject a critical suspend.
+ */
+ (void)suspend(0);
+ break;
+ }
+ }
+}
+
+static void apm_event_handler(void)
+{
+ static int pending_count = 4;
+ int err;
+
+ if ((standbys_pending > 0) || (suspends_pending > 0)) {
+ if ((apm_info.connection_version > 0x100) &&
+ (pending_count-- <= 0)) {
+ pending_count = 4;
+ if (debug)
+ printk(KERN_DEBUG "apm: setting state busy\n");
+ err = set_system_power_state(APM_STATE_BUSY);
+ if (err)
+ apm_error("busy", err);
+ }
+ } else
+ pending_count = 4;
+ check_events();
+}
+
+/*
+ * This is the APM thread main loop.
+ */
+
+static void apm_mainloop(void)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(&apm_waitqueue, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+ for (;;) {
+ schedule_timeout(APM_CHECK_TIMEOUT);
+ if (kthread_should_stop())
+ break;
+ /*
+ * Ok, check all events, check for idle (and mark us sleeping
+		 * so as not to count towards the load average).
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ apm_event_handler();
+ }
+ remove_wait_queue(&apm_waitqueue, &wait);
+}
+
+static int check_apm_user(struct apm_user *as, const char *func)
+{
+ if (as == NULL || as->magic != APM_BIOS_MAGIC) {
+ printk(KERN_ERR "apm: %s passed bad filp\n", func);
+ return 1;
+ }
+ return 0;
+}
+
+static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct apm_user *as;
+ int i;
+ apm_event_t event;
+
+ as = fp->private_data;
+ if (check_apm_user(as, "read"))
+ return -EIO;
+ if ((int)count < sizeof(apm_event_t))
+ return -EINVAL;
+ if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK))
+ return -EAGAIN;
+ wait_event_interruptible(apm_waitqueue, !queue_empty(as));
+ i = count;
+ while ((i >= sizeof(event)) && !queue_empty(as)) {
+ event = get_queued_event(as);
+ if (copy_to_user(buf, &event, sizeof(event))) {
+ if (i < count)
+ break;
+ return -EFAULT;
+ }
+ switch (event) {
+ case APM_SYS_SUSPEND:
+ case APM_USER_SUSPEND:
+ as->suspends_read++;
+ break;
+
+ case APM_SYS_STANDBY:
+ case APM_USER_STANDBY:
+ as->standbys_read++;
+ break;
+ }
+ buf += sizeof(event);
+ i -= sizeof(event);
+ }
+ if (i < count)
+ return count - i;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
+}
+
+static unsigned int do_poll(struct file *fp, poll_table *wait)
+{
+ struct apm_user *as;
+
+ as = fp->private_data;
+ if (check_apm_user(as, "poll"))
+ return 0;
+ poll_wait(fp, &apm_waitqueue, wait);
+ if (!queue_empty(as))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
+{
+ struct apm_user *as;
+ int ret;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "ioctl"))
+ return -EIO;
+ if (!as->suser || !as->writer)
+ return -EPERM;
+ switch (cmd) {
+ case APM_IOC_STANDBY:
+ mutex_lock(&apm_mutex);
+ if (as->standbys_read > 0) {
+ as->standbys_read--;
+ as->standbys_pending--;
+ standbys_pending--;
+ } else
+ queue_event(APM_USER_STANDBY, as);
+ if (standbys_pending <= 0)
+ standby();
+ mutex_unlock(&apm_mutex);
+ break;
+ case APM_IOC_SUSPEND:
+ mutex_lock(&apm_mutex);
+ if (as->suspends_read > 0) {
+ as->suspends_read--;
+ as->suspends_pending--;
+ suspends_pending--;
+ } else
+ queue_event(APM_USER_SUSPEND, as);
+ if (suspends_pending <= 0) {
+ ret = suspend(1);
+ mutex_unlock(&apm_mutex);
+ } else {
+ as->suspend_wait = 1;
+ mutex_unlock(&apm_mutex);
+ wait_event_interruptible(apm_suspend_waitqueue,
+ as->suspend_wait == 0);
+ ret = as->suspend_result;
+ }
+ return ret;
+ default:
+ return -ENOTTY;
+ }
+ return 0;
+}
+
+static int do_release(struct inode *inode, struct file *filp)
+{
+ struct apm_user *as;
+
+ as = filp->private_data;
+ if (check_apm_user(as, "release"))
+ return 0;
+ filp->private_data = NULL;
+ if (as->standbys_pending > 0) {
+ standbys_pending -= as->standbys_pending;
+ if (standbys_pending <= 0)
+ standby();
+ }
+ if (as->suspends_pending > 0) {
+ suspends_pending -= as->suspends_pending;
+ if (suspends_pending <= 0)