summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorAnton Arapov <anton@redhat.com>2012-08-07 11:21:50 +0200
committerAnton Arapov <anton@redhat.com>2012-08-07 12:52:25 +0200
commit1d44b6f3fcf6058fb7c960b7558766967e8028f7 (patch)
tree53d88547c973ba048d233091a3f91f3173ad01df /arch/x86/kernel
parentd91eda5d7b0383e6a0c83e0146ff141ff3b1355b (diff)
downloadkernel-uprobes-1d44b6f3fcf6058fb7c960b7558766967e8028f7.tar.gz
kernel-uprobes-1d44b6f3fcf6058fb7c960b7558766967e8028f7.tar.xz
kernel-uprobes-1d44b6f3fcf6058fb7c960b7558766967e8028f7.zip
fedora kernel: 222b075b3ff0d9e88aa9353e3c80667756ed7361v3.5.0-4
Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/acpi/Makefile9
-rw-r--r--arch/x86/kernel/acpi/boot.c29
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile59
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/copy.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-bios.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-mode.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vesa.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vga.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/wakemain.c81
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S170
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h48
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S62
-rw-r--r--arch/x86/kernel/acpi/sleep.c33
-rw-r--r--arch/x86/kernel/acpi/sleep.h2
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S12
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic/apic.c42
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c2
-rw-r--r--arch/x86/kernel/apic/apic_noop.c1
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c1
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c1
-rw-r--r--arch/x86/kernel/apic/es7000_32.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c404
-rw-r--r--arch/x86/kernel/apic/numaq_32.c1
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/summit_32.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c1
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/check.c20
-rw-r--r--arch/x86/kernel/cpu/common.c10
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c4
-rw-r--r--arch/x86/kernel/cpu/match.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c10
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c9
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl25
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c18
-rw-r--r--arch/x86/kernel/cpu/perf_event.h2
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c11
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c570
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c149
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c15
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c6
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/dumpstack.c23
-rw-r--r--arch/x86/kernel/dumpstack_32.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c2
-rw-r--r--arch/x86/kernel/e820.c53
-rw-r--r--arch/x86/kernel/entry_32.S60
-rw-r--r--arch/x86/kernel/entry_64.S60
-rw-r--r--arch/x86/kernel/ftrace.c594
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/head64.c1
-rw-r--r--arch/x86/kernel/head_32.S228
-rw-r--r--arch/x86/kernel/head_64.S84
-rw-r--r--arch/x86/kernel/hpet.c66
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/init_task.c42
-rw-r--r--arch/x86/kernel/irq_32.c8
-rw-r--r--arch/x86/kernel/kgdb.c8
-rw-r--r--arch/x86/kernel/kprobes.c4
-rw-r--r--arch/x86/kernel/kvmclock.c15
-rw-r--r--arch/x86/kernel/mca_32.c476
-rw-r--r--arch/x86/kernel/microcode_core.c31
-rw-r--r--arch/x86/kernel/mpparse.c22
-rw-r--r--arch/x86/kernel/nmi.c107
-rw-r--r--arch/x86/kernel/nmi_selftest.c17
-rw-r--r--arch/x86/kernel/paravirt.c12
-rw-r--r--arch/x86/kernel/pci-calgary_64.c8
-rw-r--r--arch/x86/kernel/pci-dma.c17
-rw-r--r--arch/x86/kernel/pci-nommu.c8
-rw-r--r--arch/x86/kernel/process.c73
-rw-r--r--arch/x86/kernel/process_32.c11
-rw-r--r--arch/x86/kernel/process_64.c19
-rw-r--r--arch/x86/kernel/ptrace.c13
-rw-r--r--arch/x86/kernel/reboot.c270
-rw-r--r--arch/x86/kernel/reboot_32.S135
-rw-r--r--arch/x86/kernel/setup.c42
-rw-r--r--arch/x86/kernel/signal.c74
-rw-r--r--arch/x86/kernel/smp.c100
-rw-r--r--arch/x86/kernel/smpboot.c227
-rw-r--r--arch/x86/kernel/tboot.c7
-rw-r--r--arch/x86/kernel/test_rodata.c10
-rw-r--r--arch/x86/kernel/time.c6
-rw-r--r--arch/x86/kernel/trampoline.c42
-rw-r--r--arch/x86/kernel/trampoline_32.S83
-rw-r--r--arch/x86/kernel/trampoline_64.S171
-rw-r--r--arch/x86/kernel/traps.c16
-rw-r--r--arch/x86/kernel/uprobes.c3
-rw-r--r--arch/x86/kernel/vmlinux.lds.S12
-rw-r--r--arch/x86/kernel/vsmp_64.c40
-rw-r--r--arch/x86/kernel/vsyscall_64.c39
-rw-r--r--arch/x86/kernel/x86_init.c8
-rw-r--r--arch/x86/kernel/xsave.c2
101 files changed, 2356 insertions, 2934 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d23d83577d6..8215e5652d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
@@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
-obj-y += trampoline.o trampoline_$(BITS).o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
@@ -48,8 +47,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
-obj-$(CONFIG_X86_32) += reboot_32.o
-obj-$(CONFIG_MCA) += mca_32.o
obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_PCI) += early-quirks.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 6f35260bb3e..163b2258147 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,14 +1,7 @@
-subdir- := realmode
-
obj-$(CONFIG_ACPI) += boot.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
endif
-$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
-
-$(obj)/realmode/wakeup.bin: FORCE
- $(Q)$(MAKE) $(build)=$(obj)/realmode
-
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7c439fe4941..b2297e58c6e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
return 0;
}
- if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
+ if (intsrc->source_irq == 0) {
if (acpi_skip_timer_override) {
- printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+ printk(PREFIX "BIOS IRQ0 override ignored.\n");
return 0;
}
- if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
+
+ if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
+ && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
}
@@ -990,7 +992,7 @@ void __init mp_config_acpi_legacy_irqs(void)
int i;
struct mpc_intsrc mp_irq;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
/*
* Fabricate the legacy ISA bus (bus #31).
*/
@@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
}
/*
- * Force ignoring BIOS IRQ0 pin2 override
+ * Force ignoring BIOS IRQ0 override
*/
static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
{
- /*
- * The ati_ixp4x0_rev() early PCI quirk should have set
- * the acpi_skip_timer_override flag already:
- */
if (!acpi_skip_timer_override) {
- WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
- pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+ pr_notice("%s detected: Ignoring BIOS IRQ0 override\n",
d->ident);
acpi_skip_timer_override = 1;
}
@@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
* is enabled. This input is incorrectly designated the
* ISA IRQ 0 via an interrupt source override even though
* it is wired to the output of the master 8259A and INTIN0
- * is not connected at all. Force ignoring BIOS IRQ0 pin2
+ * is not connected at all. Force ignoring BIOS IRQ0
* override in that cases.
*/
{
@@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
},
},
+ {
+ .callback = dmi_ignore_irq0_timer_override,
+ .ident = "FUJITSU SIEMENS",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
+ },
+ },
{}
};
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
deleted file mode 100644
index 6a564ac67ef..00000000000
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/x86/kernel/acpi/realmode/Makefile
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License. See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-always := wakeup.bin
-targets := wakeup.elf wakeup.lds
-
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
-
-# The link order of the video-*.o modules can matter. In particular,
-# video-vga.o *must* be listed first, followed by video-vesa.o.
-# Hardware-specific drivers should follow in the order they should be
-# probed, and video-bios.o should typically be last.
-wakeup-y += video-vga.o
-wakeup-y += video-vesa.o
-wakeup-y += video-bios.o
-
-targets += $(wakeup-y)
-
-bootsrc := $(src)/../../../boot
-
-# ---------------------------------------------------------------------------
-
-# How to compile the 16-bit code. Note we always compile for -march=i386,
-# that way we can complain to the user if the CPU is insufficient.
-# Compile with _SETUP since this is similar to the boot-time setup code.
-KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
- -I$(srctree)/$(bootsrc) \
- $(cflags-y) \
- -Wall -Wstrict-prototypes \
- -march=i386 -mregparm=3 \
- -include $(srctree)/$(bootsrc)/code16gcc.h \
- -fno-strict-aliasing -fomit-frame-pointer \
- $(call cc-option, -ffreestanding) \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time)) \
- $(call cc-option, -fno-stack-protector) \
- $(call cc-option, -mpreferred-stack-boundary=2)
-KBUILD_CFLAGS += $(call cc-option, -m32)
-KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
-GCOV_PROFILE := n
-
-WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
-
-LDFLAGS_wakeup.elf := -T
-
-CPPFLAGS_wakeup.lds += -P -C
-
-$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE
- $(call if_changed,ld)
-
-OBJCOPYFLAGS_wakeup.bin := -O binary
-
-$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
- $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
deleted file mode 100644
index f51eb0bb56c..00000000000
--- a/arch/x86/kernel/acpi/realmode/bioscall.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
deleted file mode 100644
index dc59ebee69d..00000000000
--- a/arch/x86/kernel/acpi/realmode/copy.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
deleted file mode 100644
index 6206033ba20..00000000000
--- a/arch/x86/kernel/acpi/realmode/regs.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
deleted file mode 100644
index 7deabc144a2..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-bios.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
deleted file mode 100644
index 328ad209f11..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-mode.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
deleted file mode 100644
index 9dbb9672226..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-vesa.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
deleted file mode 100644
index bcc81255f37..00000000000
--- a/arch/x86/kernel/acpi/realmode/video-vga.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c
deleted file mode 100644
index 883962d9eef..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakemain.c
+++ /dev/null
@@ -1,81 +0,0 @@
-#include "wakeup.h"
-#include "boot.h"
-
-static void udelay(int loops)
-{
- while (loops--)
- io_delay(); /* Approximately 1 us */
-}
-
-static void beep(unsigned int hz)
-{
- u8 enable;
-
- if (!hz) {
- enable = 0x00; /* Turn off speaker */
- } else {
- u16 div = 1193181/hz;
-
- outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */
- io_delay();
- outb(div, 0x42); /* LSB of counter */
- io_delay();
- outb(div >> 8, 0x42); /* MSB of counter */
- io_delay();
-
- enable = 0x03; /* Turn on speaker */
- }
- inb(0x61); /* Dummy read of System Control Port B */
- io_delay();
- outb(enable, 0x61); /* Enable timer 2 output to speaker */
- io_delay();
-}
-
-#define DOT_HZ 880
-#define DASH_HZ 587
-#define US_PER_DOT 125000
-
-/* Okay, this is totally silly, but it's kind of fun. */
-static void send_morse(const char *pattern)
-{
- char s;
-
- while ((s = *pattern++)) {
- switch (s) {
- case '.':
- beep(DOT_HZ);
- udelay(US_PER_DOT);
- beep(0);
- udelay(US_PER_DOT);
- break;
- case '-':
- beep(DASH_HZ);
- udelay(US_PER_DOT * 3);
- beep(0);
- udelay(US_PER_DOT);
- break;
- default: /* Assume it's a space */
- udelay(US_PER_DOT * 3);
- break;
- }
- }
-}
-
-void main(void)
-{
- /* Kill machine if structures are wrong */
- if (wakeup_header.real_magic != 0x12345678)
- while (1);
-
- if (wakeup_header.realmode_flags & 4)
- send_morse("...-");
-
- if (wakeup_header.realmode_flags & 1)
- asm volatile("lcallw $0xc000,$3");
-
- if (wakeup_header.realmode_flags & 2) {
- /* Need to call BIOS */
- probe_cards(0);
- set_mode(wakeup_header.video_mode);
- }
-}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
deleted file mode 100644
index b4fd836e405..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * ACPI wakeup real mode startup stub
- */
-#include <asm/segment.h>
-#include <asm/msr-index.h>
-#include <asm/page_types.h>
-#include <asm/pgtable_types.h>
-#include <asm/processor-flags.h>
-#include "wakeup.h"
-
- .code16
- .section ".jump", "ax"
- .globl _start
-_start:
- cli
- jmp wakeup_code
-
-/* This should match the structure in wakeup.h */
- .section ".header", "a"
- .globl wakeup_header
-wakeup_header:
-video_mode: .short 0 /* Video mode number */
-pmode_return: .byte 0x66, 0xea /* ljmpl */
- .long 0 /* offset goes here */
- .short __KERNEL_CS
-pmode_cr0: .long 0 /* Saved %cr0 */
-pmode_cr3: .long 0 /* Saved %cr3 */
-pmode_cr4: .long 0 /* Saved %cr4 */
-pmode_efer: .quad 0 /* Saved EFER */
-pmode_gdt: .quad 0
-pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
-pmode_behavior: .long 0 /* Wakeup behavior flags */
-realmode_flags: .long 0
-real_magic: .long 0
-trampoline_segment: .word 0
-_pad1: .byte 0
-wakeup_jmp: .byte 0xea /* ljmpw */
-wakeup_jmp_off: .word 3f
-wakeup_jmp_seg: .word 0
-wakeup_gdt: .quad 0, 0, 0
-signature: .long WAKEUP_HEADER_SIGNATURE
-
- .text
- .code16
-wakeup_code:
- cld
-
- /* Apparently some dimwit BIOS programmers don't know how to
- program a PM to RM transition, and we might end up here with
- junk in the data segment descriptor registers. The only way
- to repair that is to go into PM and fix it ourselves... */
- movw $16, %cx
- lgdtl %cs:wakeup_gdt
- movl %cr0, %eax
- orb $X86_CR0_PE, %al
- movl %eax, %cr0
- jmp 1f
-1: ljmpw $8, $2f
-2:
- movw %cx, %ds
- movw %cx, %es
- movw %cx, %ss
- movw %cx, %fs
- movw %cx, %gs
-
- andb $~X86_CR0_PE, %al
- movl %eax, %cr0
- jmp wakeup_jmp
-3:
- /* Set up segments */
- movw %cs, %ax
- movw %ax, %ds
- movw %ax, %es
- movw %ax, %ss
- lidtl wakeup_idt
-
- movl $wakeup_stack_end, %esp
-
- /* Clear the EFLAGS */
- pushl $0
- popfl
-
- /* Check header signature... */
- movl signature, %eax
- cmpl $WAKEUP_HEADER_SIGNATURE, %eax
- jne bogus_real_magic
-
- /* Check we really have everything... */
- movl end_signature, %eax
- cmpl $WAKEUP_END_SIGNATURE, %eax
- jne bogus_real_magic
-
- /* Call the C code */
- calll main
-
- /* Restore MISC_ENABLE before entering protected mode, in case
- BIOS decided to clear XD_DISABLE during S3. */
- movl pmode_behavior, %eax
- btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
- jnc 1f
-
- movl pmode_misc_en, %eax
- movl pmode_misc_en + 4, %edx
- movl $MSR_IA32_MISC_ENABLE, %ecx
- wrmsr
-1:
-
- /* Do any other stuff... */
-
-#ifndef CONFIG_64BIT
- /* This could also be done in C code... */
- movl pmode_cr3, %eax
- movl %eax, %cr3
-
- movl pmode_cr4, %ecx
- jecxz 1f
- movl %ecx, %cr4
-1:
- movl pmode_efer, %eax
- movl pmode_efer + 4, %edx
- movl %eax, %ecx
- orl %edx, %ecx
- jz 1f
- movl $MSR_EFER, %ecx
- wrmsr
-1:
-
- lgdtl pmode_gdt
-
- /* This really couldn't... */
- movl pmode_cr0, %eax
- movl %eax, %cr0
- jmp pmode_return
-#else
- pushw $0
- pushw trampoline_segment
- pushw $0
- lret
-#endif
-
-bogus_real_magic:
-1:
- hlt
- jmp 1b
-
- .data
- .balign 8
-
- /* This is the standard real-mode IDT */
-wakeup_idt:
- .word 0xffff /* limit */
- .long 0 /* address */
- .word 0
-
- .globl HEAP, heap_end
-HEAP:
- .long wakeup_heap
-heap_end:
- .long wakeup_stack
-
- .bss
-wakeup_heap:
- .space 2048
-wakeup_stack:
- .space 2048
-wakeup_stack_end:
-
- .section ".signature","a"
-end_signature:
- .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
deleted file mode 100644
index 97a29e1430e..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Definitions for the wakeup data structure at the head of the
- * wakeup code.
- */
-
-#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
-
-#ifndef __ASSEMBLY__
-#include <linux/types.h>
-
-/* This must match data at wakeup.S */
-struct wakeup_header {
- u16 video_mode; /* Video mode number */
- u16 _jmp1; /* ljmpl opcode, 32-bit only */
- u32 pmode_entry; /* Protected mode resume point, 32-bit only */
- u16 _jmp2; /* CS value, 32-bit only */
- u32 pmode_cr0; /* Protected mode cr0 */
- u32 pmode_cr3; /* Protected mode cr3 */
- u32 pmode_cr4; /* Protected mode cr4 */
- u32 pmode_efer_low; /* Protected mode EFER */
- u32 pmode_efer_high;
- u64 pmode_gdt;
- u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
- u32 pmode_misc_en_high;
- u32 pmode_behavior; /* Wakeup routine behavior flags */
- u32 realmode_flags;
- u32 real_magic;
- u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
- u8 _pad1;
- u8 wakeup_jmp;
- u16 wakeup_jmp_off;
- u16 wakeup_jmp_seg;
- u64 wakeup_gdt[3];
- u32 signature; /* To check we have correct structure */
-} __attribute__((__packed__));
-
-extern struct wakeup_header wakeup_header;
-#endif
-
-#define WAKEUP_HEADER_OFFSET 8
-#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
-#define WAKEUP_END_SIGNATURE 0x65a22c82
-
-/* Wakeup behavior bits */
-#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
-
-#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
deleted file mode 100644
index d4f8010a5b1..00000000000
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * wakeup.ld
- *
- * Linker script for the real-mode wakeup code
- */
-#undef i386
-#include "wakeup.h"
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(_start)
-
-SECTIONS
-{
- . = 0;
- .jump : {
- *(.jump)
- } = 0x90909090
-
- . = WAKEUP_HEADER_OFFSET;
- .header : {
- *(.header)
- }
-
- . = ALIGN(16);
- .text : {
- *(.text*)
- } = 0x90909090
-
- . = ALIGN(16);
- .rodata : {
- *(.rodata*)
- }
-
- .videocards : {
- video_cards = .;
- *(.videocards)
- video_cards_end = .;
- }
-
- . = ALIGN(16);
- .data : {
- *(.data*)
- }
-
- . = ALIGN(16);
- .bss : {
- __bss_start = .;
- *(.bss)
- __bss_end = .;
- }
-
- .signature : {
- *(.signature)
- }
-
- _end = .;
-
- /DISCARD/ : {
- *(.note*)
- }
-}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 146a49c763a..95bf99de905 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -14,8 +14,9 @@
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
+#include <asm/realmode.h>
-#include "realmode/wakeup.h"
+#include "../../realmode/rm/wakeup.h"
#include "sleep.h"
unsigned long acpi_realmode_flags;
@@ -36,13 +37,9 @@ asmlinkage void acpi_enter_s3(void)
*/
int acpi_suspend_lowlevel(void)
{
- struct wakeup_header *header;
- /* address in low memory of the wakeup routine. */
- char *acpi_realmode;
+ struct wakeup_header *header =
+ (struct wakeup_header *) __va(real_mode_header->wakeup_header);
- acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
-
- header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
if (header->signature != WAKEUP_HEADER_SIGNATURE) {
printk(KERN_ERR "wakeup header does not match\n");
return -EINVAL;
@@ -50,27 +47,6 @@ int acpi_suspend_lowlevel(void)
header->video_mode = saved_video_mode;
- header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
-
- /*
- * Set up the wakeup GDT. We set these up as Big Real Mode,
- * that is, with limits set to 4 GB. At least the Lenovo
- * Thinkpad X61 is known to need this for the video BIOS
- * initialization quirk to work; this is likely to also
- * be the case for other laptops or integrated video devices.
- */
-
- /* GDT[0]: GDT self-pointer */
- header->wakeup_gdt[0] =
- (u64)(sizeof(header->wakeup_gdt) - 1) +
- ((u64)__pa(&header->wakeup_gdt) << 16);
- /* GDT[1]: big real mode-like code segment */
- header->wakeup_gdt[1] =
- GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
- /* GDT[2]: big real mode-like data segment */
- header->wakeup_gdt[2] =
- GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
-
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -95,7 +71,6 @@ int acpi_suspend_lowlevel(void)
header->pmode_cr3 = (u32)__pa(&initial_page_table);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
- header->trampoline_segment = trampoline_address() >> 4;
#ifdef CONFIG_SMP
stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index d68677a2a01..5653a5791ec 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,8 +2,8 @@
* Variables and functions used by the code in sleep.c
*/
-#include <asm/trampoline.h>
#include <linux/linkage.h>
+#include <asm/realmode.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
deleted file mode 100644
index 63b8ab524f2..00000000000
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ /dev/null
@@ -1,12 +0,0 @@
-/*
- * Wrapper script for the realmode binary as a transport object
- * before copying to low memory.
- */
-#include <asm/page_types.h>
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .globl acpi_wakeup_code
-acpi_wakeup_code:
- .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
- .size acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e76c191a83..d5fd66f0d4c 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,7 +20,6 @@
#include <linux/bitops.h>
#include <linux/ioport.h>
#include <linux/suspend.h>
-#include <linux/kmemleak.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/iommu.h>
@@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void)
return 0;
}
memblock_reserve(addr, aper_size);
- /*
- * Kmemleak should not scan this block as it may not be mapped via the
- * kernel direct mapping.
- */
- kmemleak_ignore(phys_to_virt(addr));
printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
aper_size >> 10, addr);
insert_aperture_resource((u32)addr, aper_size);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index edc24480469..39a222e094a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,6 +35,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/irq_remapping.h>
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <asm/pgalloc.h>
@@ -1325,11 +1326,13 @@ void __cpuinit setup_local_APIC(void)
acked);
break;
}
- if (cpu_has_tsc) {
- rdtscll(ntsc);
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
- } else
- max_loops--;
+ if (queued) {
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ }
} while (queued && max_loops > 0);
WARN_ON(max_loops <= 0);
@@ -1441,8 +1444,8 @@ void __init bsp_end_local_APIC_setup(void)
* Now that local APIC setup is completed for BP, configure the fault
* handling for interrupt remapping.
*/
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
+ if (irq_remapping_enabled)
+ irq_remap_enable_fault_handling();
}
@@ -1517,7 +1520,7 @@ void enable_x2apic(void)
int __init enable_IR(void)
{
#ifdef CONFIG_IRQ_REMAP
- if (!intr_remapping_supported()) {
+ if (!irq_remapping_supported()) {
pr_debug("intr-remapping not supported\n");
return -1;
}
@@ -1528,7 +1531,7 @@ int __init enable_IR(void)
return -1;
}
- return enable_intr_remapping();
+ return irq_remapping_enable();
#endif
return -1;
}
@@ -1537,10 +1540,13 @@ void __init enable_IR_x2apic(void)
{
unsigned long flags;
int ret, x2apic_enabled = 0;
- int dmar_table_init_ret;
+ int hardware_init_ret;
+
+ /* Make sure irq_remap_ops are initialized */
+ setup_irq_remapping_ops();
- dmar_table_init_ret = dmar_table_init();
- if (dmar_table_init_ret && !x2apic_supported())
+ hardware_init_ret = irq_remapping_prepare();
+ if (hardware_init_ret && !x2apic_supported())
return;
ret = save_ioapic_entries();
@@ -1556,7 +1562,7 @@ void __init enable_IR_x2apic(void)
if (x2apic_preenabled && nox2apic)
disable_x2apic();
- if (dmar_table_init_ret)
+ if (hardware_init_ret)
ret = -1;
else
ret = enable_IR();
@@ -2176,8 +2182,8 @@ static int lapic_suspend(void)
local_irq_save(flags);
disable_local_APIC();
- if (intr_remapping_enabled)
- disable_intr_remapping();
+ if (irq_remapping_enabled)
+ irq_remapping_disable();
local_irq_restore(flags);
return 0;
@@ -2193,7 +2199,7 @@ static void lapic_resume(void)
return;
local_irq_save(flags);
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
/*
* IO-APIC and PIC have their own resume routines.
* We just mask them here to make sure the interrupt
@@ -2245,8 +2251,8 @@ static void lapic_resume(void)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
- if (intr_remapping_enabled)
- reenable_intr_remapping(x2apic_mode);
+ if (irq_remapping_enabled)
+ irq_remapping_reenable(x2apic_mode);
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 359b6899a36..0e881c46e8c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -227,6 +227,7 @@ static struct apic apic_flat = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -386,6 +387,7 @@ static struct apic apic_physflat = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 634ae6cdd5c..a6e4c6e06c0 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -181,6 +181,7 @@ struct apic apic_noop = {
.read = noop_apic_read,
.write = noop_apic_write,
+ .eoi_write = noop_apic_write,
.icr_read = noop_apic_icr_read,
.icr_write = noop_apic_icr_write,
.wait_icr_idle = noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 23e75422e01..6ec6d5d297c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -295,6 +295,7 @@ static struct apic apic_numachip __refconst = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 0cdec7065af..31fbdbfbf96 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -248,6 +248,7 @@ static struct apic apic_bigsmp = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index e42d1d3b913..db4ab1be3c7 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -678,6 +678,7 @@ static struct apic __refdata apic_es7000_cluster = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
@@ -742,6 +743,7 @@ static struct apic __refdata apic_es7000 = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e88300d8e80..5f0ff597437 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -68,23 +68,21 @@
#define for_each_irq_pin(entry, head) \
for (entry = head; entry; entry = entry->next)
-static void __init __ioapic_init_mappings(void);
-
-static unsigned int __io_apic_read (unsigned int apic, unsigned int reg);
-static void __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val);
-static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
-
-static struct io_apic_ops io_apic_ops = {
- .init = __ioapic_init_mappings,
- .read = __io_apic_read,
- .write = __io_apic_write,
- .modify = __io_apic_modify,
-};
-
-void __init set_io_apic_ops(const struct io_apic_ops *ops)
+#ifdef CONFIG_IRQ_REMAP
+static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return cfg->irq_2_iommu.iommu != NULL;
+}
+#else
+static inline bool irq_remapped(struct irq_cfg *cfg)
+{
+ return false;
+}
+static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
{
- io_apic_ops = *ops;
}
+#endif
/*
* Is the SiS APIC rmw bug present ?
@@ -142,7 +140,7 @@ int mp_irq_entries;
/* GSI interrupts */
static int nr_irqs_gsi = NR_IRQS_LEGACY;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+#ifdef CONFIG_EISA
int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
@@ -313,21 +311,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
irq_free_desc(at);
}
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
- return io_apic_ops.read(apic, reg);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
- io_apic_ops.write(apic, reg, value);
-}
-
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
- io_apic_ops.modify(apic, reg, value);
-}
-
struct io_apic {
unsigned int index;
@@ -349,14 +332,14 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
writel(vector, &io_apic->eoi);
}
-static unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
+unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
-static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -370,7 +353,7 @@ static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int va
*
* Older SiS APIC requires we rewrite the index register
*/
-static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
@@ -379,29 +362,6 @@ static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int v
writel(value, &io_apic->data);
}
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
-{
- struct irq_pin_list *entry;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- for_each_irq_pin(entry, cfg->irq_2_pin) {
- unsigned int reg;
- int pin;
-
- pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
- /* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return true;
- }
- }
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return false;
-}
-
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
@@ -875,7 +835,7 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
/*
* EISA Edge/Level control register, ELCR
*/
@@ -912,12 +872,6 @@ static int EISA_ELCR(unsigned int irq)
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) default_ISA_polarity(idx)
-
static int irq_polarity(int idx)
{
int bus = mp_irqs[idx].srcbus;
@@ -975,7 +929,7 @@ static int irq_trigger(int idx)
trigger = default_ISA_trigger(idx);
else
trigger = default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_ISA: /* ISA pin */
{
@@ -992,11 +946,6 @@ static int irq_trigger(int idx)
/* set before the switch */
break;
}
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
default:
{
printk(KERN_WARNING "broken BIOS!!\n");
@@ -1246,7 +1195,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
BUG_ON(!cfg->vector);
vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ for_each_cpu(cpu, cfg->domain)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
@@ -1254,7 +1203,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
if (likely(!cfg->move_in_progress))
return;
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for_each_cpu(cpu, cfg->old_domain) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1361,77 +1310,13 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
fasteoi ? "fasteoi" : "edge");
}
-
-static int setup_ir_ioapic_entry(int irq,
- struct IR_IO_APIC_route_entry *entry,
- unsigned int destination, int vector,
- struct io_apic_irq_attr *attr)
-{
- int index;
- struct irte irte;
- int ioapic_id = mpc_ioapic_id(attr->ioapic);
- struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
-
- if (!iommu) {
- pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
- return -ENODEV;
- }
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0) {
- pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
- return -ENOMEM;
- }
-
- prepare_irte(&irte, vector, destination);
-
- /* Set source-id of interrupt request */
- set_ioapic_sid(&irte, ioapic_id);
-
- modify_irte(irq, &irte);
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
- "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
- "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
- "Avail:%X Vector:%02X Dest:%08X "
- "SID:%04X SQ:%X SVT:%X)\n",
- attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
- irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
- irte.avail, irte.vector, irte.dest_id,
- irte.sid, irte.sq, irte.svt);
-
- memset(entry, 0, sizeof(*entry));
-
- entry->index2 = (index >> 15) & 0x1;
- entry->zero = 0;
- entry->format = 1;
- entry->index = (index & 0x7fff);
- /*
- * IO-APIC RTE will be configured with virtual vector.
- * irq handler will do the explicit EOI to the io-apic.
- */
- entry->vector = attr->ioapic_pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = attr->trigger;
- entry->polarity = attr->polarity;
-
- /* Mask level triggered irqs.
- * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
- */
- if (attr->trigger)
- entry->mask = 1;
-
- return 0;
-}
-
static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
unsigned int destination, int vector,
struct io_apic_irq_attr *attr)
{
- if (intr_remapping_enabled)
- return setup_ir_ioapic_entry(irq,
- (struct IR_IO_APIC_route_entry *)entry,
- destination, vector, attr);
+ if (irq_remapping_enabled)
+ return setup_ioapic_remapped_entry(irq, entry, destination,
+ vector, attr);
memset(entry, 0, sizeof(*entry));
@@ -1588,7 +1473,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
{
struct IO_APIC_route_entry entry;
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
return;
memset(&entry, 0, sizeof(entry));
@@ -1674,7 +1559,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
printk(KERN_DEBUG ".... IRQ redirection table:\n");
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
" Pol Stat Indx2 Zero Vect:\n");
} else {
@@ -1683,7 +1568,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
}
for (i = 0; i <= reg_01.bits.entries; i++) {
- if (intr_remapping_enabled) {
+ if (irq_remapping_enabled) {
struct IO_APIC_route_entry entry;
struct IR_IO_APIC_route_entry *ir_entry;
@@ -2050,7 +1935,7 @@ void disable_IO_APIC(void)
* IOAPIC RTE as well as interrupt-remapping table entry).
* As this gets called during crash dump, keep this simple for now.
*/
- if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
+ if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
@@ -2074,7 +1959,7 @@ void disable_IO_APIC(void)
* Use virtual wire A mode when interrupt remapping is enabled.
*/
if (cpu_has_apic || apic_from_smp_config())
- disconnect_bsp_APIC(!intr_remapping_enabled &&
+ disconnect_bsp_APIC(!irq_remapping_enabled &&
ioapic_i8259.pin != -1);
}
@@ -2390,71 +2275,6 @@ ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
return ret;
}
-#ifdef CONFIG_IRQ_REMAP
-
-/*
- * Migrate the IO-APIC irq in the presence of intr-remapping.
- *
- * For both level and edge triggered, irq migration is a simple atomic
- * update(of vector and cpu destination) of IRTE and flush the hardware cache.
- *
- * For level triggered, we eliminate the io-apic RTE modification (with the
- * updated vector information), by using a virtual vector (io-apic pin number).
- * Real vector that is used for interrupting cpu will be coming from
- * the interrupt-remapping table entry.
- *
- * As the migration is a simple atomic update of IRTE, the same mechanism
- * is used to migrate MSI irq's in the presence of interrupt-remapping.
- */
-static int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned int dest, irq = data->irq;
- struct irte irte;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return -EINVAL;
-
- if (get_irte(irq, &irte))
- return -EBUSY;
-
- if (assign_irq_vector(irq, cfg, mask))
- return -EBUSY;
-
- dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
-
- irte.vector = cfg->vector;
- irte.dest_id = IRTE_DEST(dest);
-
- /*
- * Atomically updates the IRTE with the new destination, vector
- * and flushes the interrupt entry cache.
- */
- modify_irte(irq, &irte);
-
- /*
- * After this point, all the interrupts will start arriving
- * at the new destination. So, time to cleanup the previous
- * vector allocation.
- */
- if (cfg->move_in_progress)
- send_cleanup_vector(cfg);
-
- cpumask_copy(data->affinity, mask);
- return 0;
-}
-
-#else
-static inline int
-ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- return 0;
-}
-#endif
-
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
@@ -2552,6 +2372,29 @@ static void ack_apic_edge(struct irq_data *data)
atomic_t irq_mis_count;
#ifdef CONFIG_GENERIC_PENDING_IRQ
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ for_each_irq_pin(entry, cfg->irq_2_pin) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return false;
+}
+
static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
{
/* If we are moving the irq we need to mask it */
@@ -2699,7 +2542,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
chip->irq_eoi = ir_ack_apic_level;
#ifdef CONFIG_SMP
- chip->irq_set_affinity = ir_ioapic_set_affinity;
+ chip->irq_set_affinity = set_remapped_irq_affinity;
#endif
}
#endif /* CONFIG_IRQ_REMAP */
@@ -2912,7 +2755,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
panic("BIOS bug: timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
@@ -2945,7 +2788,7 @@ static inline void __init check_timer(void)
clear_IO_APIC_pin(0, pin1);
goto out;
}
- if (intr_remapping_enabled)
+ if (irq_remapping_enabled)
panic("timer doesn't work through Interrupt-remapped IO-APIC");
local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
@@ -3169,7 +3012,7 @@ void destroy_irq(unsigned int irq)
irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
if (irq_remapped(cfg))
- free_irte(irq);
+ free_remapped_irq(irq);
raw_spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -3198,54 +3041,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
if (irq_remapped(cfg)) {
- struct irte irte;
- int ir_index;
- u16 sub_handle;
-
- ir_index = map_irq_to_irte_handle(irq, &sub_handle);
- BUG_ON(ir_index == -1);
-
- prepare_irte(&irte, cfg->vector, dest);
-
- /* Set source-id of interrupt request */
- if (pdev)
- set_msi_sid(&irte, pdev);
- else
- set_hpet_sid(&irte, hpet_id);
-
- modify_irte(irq, &irte);
+ compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
+ return err;
+ }
+ if (x2apic_enabled())
+ msg->address_hi = MSI_ADDR_BASE_HI |
+ MSI_ADDR_EXT_DEST_ID(dest);
+ else
msg->address_hi = MSI_ADDR_BASE_HI;
- msg->data = sub_handle;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
- MSI_ADDR_IR_SHV |
- MSI_ADDR_IR_INDEX1(ir_index) |
- MSI_ADDR_IR_INDEX2(ir_index);
- } else {
- if (x2apic_enabled())
- msg->address_hi = MSI_ADDR_BASE_HI |
- MSI_ADDR_EXT_DEST_ID(dest);
- else
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL:
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU:
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED:
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
- }
return err;
}
@@ -3288,33 +3111,6 @@ static struct irq_chip msi_chip = {
.irq_retrigger = ioapic_retrigger_irq,
};
-/*
- * Map the PCI dev to the corresponding remapping hardware unit
- * and allocate 'nvec' consecutive interrupt-remapping table entries
- * in it.
- */
-static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
-{
- struct intel_iommu *iommu;
- int index;
-
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- printk(KERN_ERR
- "Unable to map PCI %s to iommu\n", pci_name(dev));
- return -ENOENT;
- }
-
- index = alloc_irte(iommu, irq, nvec);
- if (index < 0) {
- printk(KERN_ERR
- "Unable to allocate %d IRTE for PCI %s\n", nvec,
- pci_name(dev));
- return -ENOSPC;
- }
- return index;
-}
-
static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
{
struct irq_chip *chip = &msi_chip;
@@ -3345,7 +3141,6 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int node, ret, sub_handle, index = 0;
unsigned int irq, irq_want;
struct msi_desc *msidesc;
- struct intel_iommu *iommu = NULL;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3359,7 +3154,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (irq == 0)
return -1;
irq_want = irq + 1;
- if (!intr_remapping_enabled)
+ if (!irq_remapping_enabled)
goto no_ir;
if (!sub_handle) {
@@ -3367,23 +3162,16 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
* allocate the consecutive block of IRTE's
* for 'nvec'
*/
- index = msi_alloc_irte(dev, irq, nvec);
+ index = msi_alloc_remapped_irq(dev, irq, nvec);
if (index < 0) {
ret = index;
goto error;
}
} else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
+ ret = msi_setup_remapped_irq(dev, irq, index,
+ sub_handle);
+ if (ret < 0)
goto error;
- }
- /*
- * setup the mapping between the irq and the IRTE
- * base index, the sub_handle pointing to the
- * appropriate interrupt remap table entry.
- */
- set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
ret = setup_msi_irq(dev, msidesc, irq);
@@ -3501,15 +3289,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
struct msi_msg msg;
int ret;
- if (intr_remapping_enabled) {
- struct intel_iommu *iommu = map_hpet_to_ir(id);
- int index;
-
- if (!iommu)
- return -1;
-
- index = alloc_irte(iommu, irq, 1);
- if (index < 0)
+ if (irq_remapping_enabled) {
+ if (!setup_hpet_msi_remapped(irq, id))
return -1;
}
@@ -3888,8 +3669,8 @@ void __init setup_ioapic_dest(void)
else
mask = apic->target_cpus();
- if (intr_remapping_enabled)
- ir_ioapic_set_affinity(idata, mask, false);
+ if (irq_remapping_enabled)
+ set_remapped_irq_affinity(idata, mask, false);
else
ioapic_set_affinity(idata, mask, false);
}
@@ -3931,12 +3712,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
return res;
}
-void __init ioapic_and_gsi_init(void)
-{
- io_apic_ops.init();
-}
-
-static void __init __ioapic_init_mappings(void)
+void __init native_io_apic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 00d2422ca7c..f00a68cca37 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -530,6 +530,7 @@ static struct apic __refdata apic_numaq = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index ff2c1b9aac4..1b291da09e6 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -142,6 +142,7 @@ static struct apic apic_default = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index fea000b27f0..659897c0075 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -546,6 +546,7 @@ static struct apic apic_summit = {
.read = native_apic_mem_read,
.write = native_apic_mem_write,
+ .eoi_write = native_apic_mem_write,
.icr_read = native_apic_icr_read,
.icr_write = native_apic_icr_write,
.wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 48f3103b3c9..ff35cff0e1a 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -260,6 +260,7 @@ static struct apic apic_x2apic_cluster = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 991e315f422..c17e982db27 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -172,6 +172,7 @@ static struct apic apic_x2apic_phys = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 87bfa69e216..c6d03f7a440 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -404,6 +404,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.read = native_apic_msr_read,
.write = native_apic_msr_write,
+ .eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 2245eb5bd06..a899ba2add0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -2401,7 +2401,7 @@ static void __exit apm_exit(void)
* (pm_idle), Wait for all processors to update cached/local
* copies of pm_idle before proceeding.
*/
- cpu_idle_wait();
+ kick_all_cpus_sync();
}
if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
&& (apm_info.connection_version > 0x0100)) {
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 5da1269e8dd..e2dbcb7dabd 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,21 +27,29 @@ static int num_scan_areas;
static __init int set_corruption_check(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- memory_corruption_check = simple_strtol(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ memory_corruption_check = val;
+ return 0;
}
early_param("memory_corruption_check", set_corruption_check);
static __init int set_corruption_check_period(char *arg)
{
- char *end;
+ ssize_t ret;
+ unsigned long val;
- corruption_check_period = simple_strtoul(arg, &end, 10);
+ ret = kstrtoul(arg, 10, &val);
+ if (ret)
+ return ret;
- return (*end == 0) ? 0 : -EINVAL;
+ corruption_check_period = val;
+ return 0;
}
early_param("memory_corruption_check_period", set_corruption_check_period);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cf79302198a..6b9333b429b 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr)
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
+static DEFINE_PER_CPU(u32, debug_stack_use_ctr);
+
void debug_stack_set_zero(void)
{
+ this_cpu_inc(debug_stack_use_ctr);
load_idt((const struct desc_ptr *)&nmi_idt_descr);
}
void debug_stack_reset(void)
{
- load_idt((const struct desc_ptr *)&idt_descr);
+ if (WARN_ON(!this_cpu_read(debug_stack_use_ctr)))
+ return;
+ if (this_cpu_dec_return(debug_stack_use_ctr) == 0)
+ load_idt((const struct desc_ptr *)&idt_descr);
}
#else /* CONFIG_X86_64 */
@@ -1185,7 +1191,7 @@ void __cpuinit cpu_init(void)
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(numa_node) == 0 &&
+ if (cpu != 0 && this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b8f3653dddb..9a7c90d80bc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -615,14 +615,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
new_l2 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid >> index_msb;
+ l2_id = c->apicid & ~((1 << index_msb) - 1);
break;
case 3:
new_l3 = this_leaf.size/1024;
num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
index_msb = get_count_order(
num_threads_sharing);
- l3_id = c->apicid >> index_msb;
+ l3_id = c->apicid & ~((1 << index_msb) - 1);
break;
default:
break;
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 5502b289341..36565373af8 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -23,7 +23,7 @@
* %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
*
* Arrays used to match for this should also be declared using
- * MODULE_DEVICE_TABLE(x86_cpu, ...)
+ * MODULE_DEVICE_TABLE(x86cpu, ...)
*
* This always matches against the boot cpu, assuming models and features are
* consistent over all CPUs.
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 507ea58688e..cd8b166a173 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
struct mce m;
/* Only corrected MC is reported */
- if (!corrected)
+ if (!corrected || !(mem_err->validation_bits &
+ CPER_MEM_VALID_PHYSICAL_ADDRESS))
return;
mce_setup(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1ccd453903d..413c2ced887 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -126,6 +126,16 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
USER
),
+ MCESEV(
+ KEEP, "HT thread notices Action required: instruction fetch error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 61604aefc40..c46ed494f00 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -591,7 +591,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- percpu_inc(mce_poll_count);
+ this_cpu_inc(mce_poll_count);
mce_gather_info(&m, NULL);
@@ -649,16 +649,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static int mce_no_way_out(struct mce *m, char **msg)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
{
- int i;
+ int i, ret = 0;
for (i = 0; i < banks; i++) {
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+ if (m->status & MCI_STATUS_VAL)
+ __set_bit(i, validp);
if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
- return 1;
+ ret = 1;
}
- return 0;
+ return ret;
}
/*
@@ -1021,11 +1023,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
atomic_inc(&mce_entry);
- percpu_inc(mce_exception_count);
+ this_cpu_inc(mce_exception_count);
if (!banks)
goto out;
@@ -1035,7 +1038,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
final = &__get_cpu_var(mces_seen);
*final = m;
- no_way_out = mce_no_way_out(&m, &msg);
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks);
barrier();
@@ -1055,6 +1059,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
order = mce_start(&no_way_out);
for (i = 0; i < banks; i++) {
__clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
if (!mce_banks[i].ctl)
continue;
@@ -1180,6 +1186,7 @@ void mce_notify_process(void)
{
unsigned long pfn;
struct mce_info *mi = mce_find_info();
+ int flags = MF_ACTION_REQUIRED;
if (!mi)
mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
@@ -1194,8 +1201,9 @@ void mce_notify_process(void)
* doomed. We still need to mark the page as poisoned and alert any
* other users of the page.
*/
- if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
- mi->restartable == 0) {
+ if (!mi->restartable)
+ flags |= MF_MUST_KILL;
+ if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
pr_err("Memory error not recovered");
force_sig(SIGBUS, current);
}
@@ -1245,15 +1253,15 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mce_start_timer(unsigned long data)
+static void mce_timer_fn(unsigned long data)
{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long iv;
WARN_ON(smp_processor_id() != data);
@@ -1266,13 +1274,14 @@ static void mce_start_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(mce_next_interval);
+ iv = __this_cpu_read(mce_next_interval);
if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
+ iv = max(iv / 2, (unsigned long) HZ/100);
else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ __this_cpu_write(mce_next_interval, iv);
- t->expires = jiffies + *n;
+ t->expires = jiffies + iv;
add_timer_on(t, smp_processor_id());
}
@@ -1439,6 +1448,43 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
if (c->x86 == 6 && banks > 0)
mce_banks[0].ctl = 0;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 val, hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ for (i = 0; i < ARRAY_SIZE(msrs); i++) {
+ rdmsrl(msrs[i], val);
+
+ /* CntP bit set? */
+ if (val & BIT_64(62)) {
+ val &= ~BIT_64(62);
+ wrmsrl(msrs[i], val);
+ }
+ }
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1513,17 +1559,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(mce_next_interval);
+ unsigned long iv = check_interval * HZ;
- setup_timer(t, mce_start_timer, smp_processor_id());
+ setup_timer(t, mce_timer_fn, smp_processor_id());
if (mce_ignore_ce)
return;
- *n = check_interval * HZ;
- if (!*n)
+ __this_cpu_write(mce_next_interval, iv);
+ if (!iv)
return;
- t->expires = round_jiffies(jiffies + *n);
+ t->expires = round_jiffies(jiffies + iv);
add_timer_on(t, smp_processor_id());
}
@@ -2233,7 +2279,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED_FROZEN:
if (!mce_ignore_ce && check_interval) {
t->expires = round_jiffies(jiffies +
- __get_cpu_var(mce_next_interval));
+ per_cpu(mce_next_interval, cpu));
add_timer_on(t, cpu);
}
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 2c1d178be46..f4873a64f46 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -421,10 +421,10 @@ RW_ATTR(threshold_limit);
RW_ATTR(error_count);
static struct attribute *default_attrs[] = {
- &interrupt_enable.attr,
&threshold_limit.attr,
&error_count.attr,
- NULL
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
};
#define to_block(k) container_of(k, struct threshold_block, kobj)
@@ -501,6 +501,11 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
+ if (b->interrupt_capable)
+ threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
+ else
+ threshold_ktype.default_attrs[2] = NULL;
+
INIT_LIST_HEAD(&b->miscj);
if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
index dfea390e160..c7b3fe2d72e 100644
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
#
# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
#
@@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
print OUT "#include <asm/cpufeature.h>\n\n";
print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
+%features = ();
+$err = 0;
+
while (defined($line = <IN>)) {
if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
$macro = $1;
- $feature = $2;
+ $feature = "\L$2";
$tail = $3;
if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
- $feature = $1;
+ $feature = "\L$1";
}
- if ($feature ne '') {
- printf OUT "\t%-32s = \"%s\",\n",
- "[$macro]", "\L$feature";
+ next if ($feature eq '');
+
+ if ($features{$feature}++) {
+ print STDERR "$in: duplicate feature name: $feature\n";
+ $err++;
}
+ printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature;
}
}
print OUT "};\n";
close(IN);
close(OUT);
+
+if ($err) {
+ unlink($out);
+ exit(1);
+}
+
+exit(0);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ac140c7be39..bdda2e6c673 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -266,7 +266,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
if (align > max_align)
align = max_align;
- sizek = 1 << align;
+ sizek = 1UL << align;
if (debug_print) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bb8e03407e1..c4706cf9c01 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -484,9 +484,6 @@ static int __x86_pmu_event_init(struct perf_event *event)
/* mark unused */
event->hw.extra_reg.idx = EXTRA_REG_NONE;
-
- /* mark not used */
- event->hw.extra_reg.idx = EXTRA_REG_NONE;
event->hw.branch_reg.idx = EXTRA_REG_NONE;
return x86_pmu.hw_config(event);
@@ -1186,8 +1183,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1222,7 +1217,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
* event overflow
*/
handled++;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
if (!x86_perf_event_set_period(event))
continue;
@@ -1501,6 +1496,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
if (!cpuc->shared_regs)
goto error;
}
+ cpuc->is_fake = 1;
return cpuc;
error:
free_fake_cpuc(cpuc);
@@ -1761,6 +1757,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}
+static inline int
+valid_user_frame(const void __user *fp, unsigned long size)
+{
+ return (__range_not_ok(fp, size, TASK_SIZE) == 0);
+}
+
#ifdef CONFIG_COMPAT
#include <asm/compat.h>
@@ -1785,7 +1787,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
if (bytes != sizeof(frame))
break;
- if (fp < compat_ptr(regs->sp))
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
@@ -1831,7 +1833,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
if (bytes != sizeof(frame))
break;
- if ((unsigned long)fp < regs->sp)
+ if (!valid_user_frame(fp, sizeof(frame)))
break;
perf_callchain_store(entry, frame.return_address);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6638aaf5449..7241e2fc3c1 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -117,6 +117,7 @@ struct cpu_hw_events {
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
unsigned int group_flag;
+ int is_fake;
/*
* Intel DebugStore bits
@@ -364,6 +365,7 @@ struct x86_pmu {
int pebs_record_size;
void (*drain_pebs)(struct pt_regs *regs);
struct event_constraint *pebs_constraints;
+ void (*pebs_aliases)(struct perf_event *event);
/*
* Intel LBR
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 9edc786aef8..11a4eb9131d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -134,8 +134,13 @@ static u64 amd_pmu_event_map(int hw_event)
static int amd_pmu_hw_config(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
+ int ret;
+ /* pass precise event sampling to ibs: */
+ if (event->attr.precise_ip && get_ibs_caps())
+ return -ENOENT;
+
+ ret = x86_pmu_hw_config(event);
if (ret)
return ret;
@@ -205,10 +210,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
* when we come here
*/
for (i = 0; i < x86_pmu.num_counters; i++) {
- if (nb->owners[i] == event) {
- cmpxchg(nb->owners+i, event, NULL);
+ if (cmpxchg(nb->owners + i, event, NULL) == event)
break;
- }
}
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 3b8a2d30d14..da9bcdcd985 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -9,6 +9,7 @@
#include <linux/perf_event.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/ptrace.h>
#include <asm/apic.h>
@@ -16,36 +17,591 @@ static u32 ibs_caps;
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-static struct pmu perf_ibs;
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+
+#include <asm/nmi.h>
+
+#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
+#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
+
+enum ibs_states {
+ IBS_ENABLED = 0,
+ IBS_STARTED = 1,
+ IBS_STOPPING = 2,
+
+ IBS_MAX_STATES,
+};
+
+struct cpu_perf_ibs {
+ struct perf_event *event;
+ unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
+};
+
+struct perf_ibs {
+ struct pmu pmu;
+ unsigned int msr;
+ u64 config_mask;
+ u64 cnt_mask;
+ u64 enable_mask;
+ u64 valid_mask;
+ u64 max_period;
+ unsigned long offset_mask[1];
+ int offset_max;
+ struct cpu_perf_ibs __percpu *pcpu;
+ u64 (*get_count)(u64 config);
+};
+
+struct perf_ibs_data {
+ u32 size;
+ union {
+ u32 data[0]; /* data buffer starts here */
+ u32 caps;
+ };
+ u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
+};
+
+static int
+perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
+{
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int overflow = 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ if (unlikely(left < (s64)min)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ overflow = 1;
+ }
+
+ /*
+ * If the hw period that triggers the sw overflow is too short
+ * we might hit the irq handler. This biases the results.
+ * Thus we shorten the next-to-last period and set the last
+ * period to the max period.
+ */
+ if (left > max) {
+ left -= max;
+ if (left > max)
+ left = max;
+ else if (left < min)
+ left = min;
+ }
+
+ *hw_period = (u64)left;
+
+ return overflow;
+}
+
+static int
+perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int shift = 64 - width;
+ u64 prev_raw_count;
+ u64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous event value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic event atomically:
+ */
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ return 0;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
+
+ return 1;
+}
+
+static struct perf_ibs perf_ibs_fetch;
+static struct perf_ibs perf_ibs_op;
+
+static struct perf_ibs *get_ibs_pmu(int type)
+{
+ if (perf_ibs_fetch.pmu.type == type)
+ return &perf_ibs_fetch;
+ if (perf_ibs_op.pmu.type == type)
+ return &perf_ibs_op;
+ return NULL;
+}
+
+/*
+ * Use IBS for precise event sampling:
+ *
+ * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
+ * perf record -a -e r076:p ... # same as -e cpu-cycles:p
+ * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
+ *
+ * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
+ * MSRC001_1033) is used to select either cycle or micro-ops counting
+ * mode.
+ *
+ * The rip of IBS samples has skid 0. Thus, IBS supports precise
+ * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
+ * rip is invalid when IBS was not able to record the rip correctly.
+ * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
+ *
+ */
+static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
+{
+ switch (event->attr.precise_ip) {
+ case 0:
+ return -ENOENT;
+ case 1:
+ case 2:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ switch (event->attr.type) {
+ case PERF_TYPE_HARDWARE:
+ switch (event->attr.config) {
+ case PERF_COUNT_HW_CPU_CYCLES:
+ *config = 0;
+ return 0;
+ }
+ break;
+ case PERF_TYPE_RAW:
+ switch (event->attr.config) {
+ case 0x0076:
+ *config = 0;
+ return 0;
+ case 0x00C1:
+ *config = IBS_OP_CNT_CTL;
+ return 0;
+ }
+ break;
+ default:
+ return -ENOENT;
+ }
+
+ return -EOPNOTSUPP;
+}
static int perf_ibs_init(struct perf_event *event)
{
- if (perf_ibs.type != event->attr.type)
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs;
+ u64 max_cnt, config;
+ int ret;
+
+ perf_ibs = get_ibs_pmu(event->attr.type);
+ if (perf_ibs) {
+ config = event->attr.config;
+ } else {
+ perf_ibs = &perf_ibs_op;
+ ret = perf_ibs_precise_event(event, &config);
+ if (ret)
+ return ret;
+ }
+
+ if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
+
+ if (config & ~perf_ibs->config_mask)
+ return -EINVAL;
+
+ if (hwc->sample_period) {
+ if (config & perf_ibs->cnt_mask)
+ /* raw max_cnt may not be set */
+ return -EINVAL;
+ if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
+ /*
+ * lower 4 bits can not be set in ibs max cnt,
+ * but allowing it in case we adjust the
+ * sample period to set a frequency.
+ */
+ return -EINVAL;
+ hwc->sample_period &= ~0x0FULL;
+ if (!hwc->sample_period)
+ hwc->sample_period = 0x10;
+ } else {
+ max_cnt = config & perf_ibs->cnt_mask;
+ config &= ~perf_ibs->cnt_mask;
+ event->attr.sample_period = max_cnt << 4;
+ hwc->sample_period = event->attr.sample_period;
+ }
+
+ if (!hwc->sample_period)
+ return -EINVAL;
+
+ /*
+ * If we modify hwc->sample_period, we also need to update
+ * hwc->last_period and hwc->period_left.
+ */
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ hwc->config_base = perf_ibs->msr;
+ hwc->config = config;
+
return 0;
}
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 *period)
+{
+ int overflow;
+
+ /* ignore lower 4 bits in min count: */
+ overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+ local64_set(&hwc->prev_count, 0);
+
+ return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+ return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+ u64 count = 0;
+
+ if (config & IBS_OP_VAL)
+ count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+ if (ibs_caps & IBS_CAPS_RDWROPCNT)
+ count += (config & IBS_OP_CUR_CNT) >> 32;
+
+ return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+ u64 *config)
+{
+ u64 count = perf_ibs->get_count(*config);
+
+ /*
+ * Set width to 64 since we do not overflow on max width but
+ * instead on max count. In perf_ibs_set_period() we clear
+ * prev count manually on overflow.
+ */
+ while (!perf_event_try_update(event, count, 64)) {
+ rdmsrl(event->hw.config_base, *config);
+ count = perf_ibs->get_count(*config);
+ }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ config &= ~perf_ibs->cnt_mask;
+ wrmsrl(hwc->config_base, config);
+ config &= ~perf_ibs->enable_mask;
+ wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always needs to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
+ * perf_ibs_start()/perf_ibs_stop() and instead always do it.
+ */
+static void perf_ibs_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 period;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ perf_ibs_set_period(perf_ibs, hwc, &period);
+ set_bit(IBS_STARTED, pcpu->state);
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+}
+
+static void perf_ibs_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ u64 config;
+ int stopping;
+
+ stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+
+ if (!stopping && (hwc->state & PERF_HES_UPTODATE))
+ return;
+
+ rdmsrl(hwc->config_base, config);
+
+ if (stopping) {
+ set_bit(IBS_STOPPING, pcpu->state);
+ perf_ibs_disable_event(perf_ibs, hwc, config);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /*
+ * Clear valid bit to not count rollovers on update, rollovers
+ * are only updated in the irq handler.
+ */
+ config &= ~perf_ibs->valid_mask;
+
+ perf_ibs_event_update(perf_ibs, event, &config);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
static int perf_ibs_add(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (test_and_set_bit(IBS_ENABLED, pcpu->state))
+ return -ENOSPC;
+
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ pcpu->event = event;
+
+ if (flags & PERF_EF_START)
+ perf_ibs_start(event, PERF_EF_RELOAD);
+
return 0;
}
static void perf_ibs_del(struct perf_event *event, int flags)
{
+ struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+
+ if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
+ return;
+
+ perf_ibs_stop(event, PERF_EF_UPDATE);
+
+ pcpu->event = NULL;
+
+ perf_event_update_userpage(event);
}
-static struct pmu perf_ibs = {
- .event_init= perf_ibs_init,
- .add= perf_ibs_add,
- .del= perf_ibs_del,
+static void perf_ibs_read(struct perf_event *event) { }
+
+static struct perf_ibs perf_ibs_fetch = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSFETCHCTL,
+ .config_mask = IBS_FETCH_CONFIG_MASK,
+ .cnt_mask = IBS_FETCH_MAX_CNT,
+ .enable_mask = IBS_FETCH_ENABLE,
+ .valid_mask = IBS_FETCH_VAL,
+ .max_period = IBS_FETCH_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
+ .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
+
+ .get_count = get_ibs_fetch_count,
};
+static struct perf_ibs perf_ibs_op = {
+ .pmu = {
+ .task_ctx_nr = perf_invalid_context,
+
+ .event_init = perf_ibs_init,
+ .add = perf_ibs_add,
+ .del = perf_ibs_del,
+ .start = perf_ibs_start,
+ .stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSOPCTL,
+ .config_mask = IBS_OP_CONFIG_MASK,
+ .cnt_mask = IBS_OP_MAX_CNT,
+ .enable_mask = IBS_OP_ENABLE,
+ .valid_mask = IBS_OP_VAL,
+ .max_period = IBS_OP_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
+ .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+
+ .get_count = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ struct perf_event *event = pcpu->event;
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ struct perf_ibs_data ibs_data;
+ int offset, size, check_rip, offset_max, throttle = 0;
+ unsigned int msr;
+ u64 *buf, *config, period;
+
+ if (!test_bit(IBS_STARTED, pcpu->state)) {
+ /*
+ * Catch spurious interrupts after stopping IBS: After
+ * disabling IBS there could be still incomming NMIs
+ * with samples that even have the valid bit cleared.
+ * Mark all this NMIs as handled.
+ */
+ return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+ }
+
+ msr = hwc->config_base;
+ buf = ibs_data.regs;
+ rdmsrl(msr, *buf);
+ if (!(*buf++ & perf_ibs->valid_mask))
+ return 0;
+
+ config = &ibs_data.regs[0];
+ perf_ibs_event_update(perf_ibs, event, config);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+ goto out; /* no sw counter overflow */
+
+ ibs_data.caps = ibs_caps;
+ size = 1;
+ offset = 1;
+ check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+ if (event->attr.sample_type & PERF_SAMPLE_RAW)
+ offset_max = perf_ibs->offset_max;
+ else if (check_rip)
+ offset_max = 2;
+ else
+ offset_max = 1;
+ do {
+ rdmsrl(msr + offset, *buf++);
+ size++;
+ offset = find_next_bit(perf_ibs->offset_mask,
+ perf_ibs->offset_max,
+ offset + 1);
+ } while (offset < offset_max);
+ ibs_data.size = sizeof(u64) * size;
+
+ regs = *iregs;
+ if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+ } else {
+ instruction_pointer_set(&regs, ibs_data.regs[1]);
+ regs.flags |= PERF_EFLAGS_EXACT;
+ }
+
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.size = sizeof(u32) + ibs_data.size;
+ raw.data = ibs_data.data;
+ data.raw = &raw;
+ }
+
+ throttle = perf_event_overflow(event, &data, &regs);
+out:
+ if (throttle)
+ perf_ibs_disable_event(perf_ibs, hwc, *config);
+ else
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+
+ return 1;
+}
+
+static int __kprobes
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ int handled = 0;
+
+ handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+ handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+ struct cpu_perf_ibs __percpu *pcpu;
+ int ret;
+
+ pcpu = alloc_percpu(struct cpu_perf_ibs);
+ if (!pcpu)
+ return -ENOMEM;
+
+ perf_ibs->pcpu = pcpu;
+
+ ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+ if (ret) {
+ perf_ibs->pcpu = NULL;
+ free_percpu(pcpu);
+ }
+
+ return ret;
+}
+
static __init int perf_event_ibs_init(void)
{
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
- perf_pmu_register(&perf_ibs, "ibs", -1);
+ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+ if (ibs_caps & IBS_CAPS_OPCNT)
+ perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+ perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+ register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 26b3e2fef10..187c294bc65 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1027,8 +1027,6 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
u64 status;
int handled;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
/*
@@ -1082,7 +1080,7 @@ again:
if (!intel_pmu_save_and_restart(event))
continue;
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
if (has_branch_stack(event))
data.br_stack = &cpuc->lbr_stack;
@@ -1121,27 +1119,33 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
-static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+static int intel_alt_er(int idx)
{
if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
- return false;
+ return idx;
- if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
- event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01bb;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
- event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
- } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+ if (idx == EXTRA_REG_RSP_0)
+ return EXTRA_REG_RSP_1;
+
+ if (idx == EXTRA_REG_RSP_1)
+ return EXTRA_REG_RSP_0;
+
+ return idx;
+}
+
+static void intel_fixup_er(struct perf_event *event, int idx)
+{
+ event->hw.extra_reg.idx = idx;
+
+ if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
event->hw.config |= 0x01b7;
- event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+ } else if (idx == EXTRA_REG_RSP_1) {
+ event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+ event->hw.config |= 0x01bb;
+ event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
-
- if (event->hw.extra_reg.idx == orig_idx)
- return false;
-
- return true;
}
/*
@@ -1159,14 +1163,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
struct event_constraint *c = &emptyconstraint;
struct er_account *era;
unsigned long flags;
- int orig_idx = reg->idx;
+ int idx = reg->idx;
- /* already allocated shared msr */
- if (reg->alloc)
+ /*
+ * reg->alloc can be set due to existing state, so for fake cpuc we
+ * need to ignore this, otherwise we might fail to allocate proper fake
+ * state for this extra reg constraint. Also see the comment below.
+ */
+ if (reg->alloc && !cpuc->is_fake)
return NULL; /* call x86_get_event_constraint() */
again:
- era = &cpuc->shared_regs->regs[reg->idx];
+ era = &cpuc->shared_regs->regs[idx];
/*
* we use spin_lock_irqsave() to avoid lockdep issues when
* passing a fake cpuc
@@ -1175,6 +1183,29 @@ again:
if (!atomic_read(&era->ref) || era->config == reg->config) {
+ /*
+ * If its a fake cpuc -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ *
+ * Not doing the ER fixup will only result in era->reg being
+ * wrong, but since we won't actually try and program hardware
+ * this isn't a problem either.
+ */
+ if (!cpuc->is_fake) {
+ if (idx != reg->idx)
+ intel_fixup_er(event, idx);
+
+ /*
+ * x86_schedule_events() can call get_event_constraints()
+ * multiple times on events in the case of incremental
+ * scheduling(). reg->alloc ensures we only do the ER
+ * allocation once.
+ */
+ reg->alloc = 1;
+ }
+
/* lock in msr value */
era->config = reg->config;
era->reg = reg->reg;
@@ -1182,17 +1213,17 @@ again:
/* one more user */
atomic_inc(&era->ref);
- /* no need to reallocate during incremental event scheduling */
- reg->alloc = 1;
-
/*
* need to call x86_get_event_constraint()
* to check if associated event has constraints
*/
c = NULL;
- } else if (intel_try_alt_er(event, orig_idx)) {
- raw_spin_unlock_irqrestore(&era->lock, flags);
- goto again;
+ } else {
+ idx = intel_alt_er(idx);
+ if (idx != reg->idx) {
+ raw_spin_unlock_irqrestore(&era->lock, flags);
+ goto again;
+ }
}
raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1206,11 +1237,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
struct er_account *era;
/*
- * only put constraint if extra reg was actually
- * allocated. Also takes care of event which do
- * not use an extra shared reg
+ * Only put constraint if extra reg was actually allocated. Also takes
+ * care of event which do not use an extra shared reg.
+ *
+ * Also, if this is a fake cpuc we shouldn't touch any event state
+ * (reg->alloc) and we don't care about leaving inconsistent cpuc state
+ * either since it'll be thrown out.
*/
- if (!reg->alloc)
+ if (!reg->alloc || cpuc->is_fake)
return;
era = &cpuc->shared_regs->regs[reg->idx];
@@ -1302,15 +1336,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
intel_put_shared_regs_event_constraints(cpuc, event);
}
-static int intel_pmu_hw_config(struct perf_event *event)
+static void intel_pebs_aliases_core2(struct perf_event *event)
{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.precise_ip &&
- (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
/*
* Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
* (0x003c) so that we can use it with PEBS.
@@ -1331,10 +1359,48 @@ static int intel_pmu_hw_config(struct perf_event *event)
*/
u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+}
+
+static void intel_pebs_aliases_snb(struct perf_event *event)
+{
+ if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use UOPS_RETIRED.ALL
+ * (0x01c2), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * UOPS_RETIRED.ALL counts the number of cycles that retires
+ * CNTMASK micro-ops. By setting CNTMASK to a value (16)
+ * larger than the maximum number of micro-ops that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less micro-ops, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
event->hw.config = alt_config;
}
+}
+
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.precise_ip && x86_pmu.pebs_aliases)
+ x86_pmu.pebs_aliases(event);
if (intel_pmu_needs_lbr_smpl(event)) {
ret = intel_pmu_setup_lbr_filter(event);
@@ -1609,6 +1675,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
+ .pebs_aliases = intel_pebs_aliases_core2,
.format_attrs = intel_arch3_formats_attr,
@@ -1842,8 +1909,9 @@ __init int intel_pmu_init(void)
break;
case 42: /* SandyBridge */
- x86_add_quirk(intel_sandybridge_quirk);
case 45: /* SandyBridge, "Romely-EP" */
+ x86_add_quirk(intel_sandybridge_quirk);
+ case 58: /* IvyBridge */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -1851,6 +1919,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_snb_event_constraints;
x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 7f64df19e7d..35e2192df9f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -316,8 +316,7 @@ int intel_pmu_drain_bts_buffer(void)
ds->bts_index = ds->bts_buffer_base;
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
regs.ip = 0;
/*
@@ -401,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
@@ -564,8 +556,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
if (!intel_pmu_save_and_restart(event))
return;
- perf_sample_data_init(&data, 0);
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
/*
* We use the interrupt regs as a base because the PEBS record
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a2dfacfd710..47124a73dd7 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1005,8 +1005,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- perf_sample_data_init(&data, 0);
-
cpuc = &__get_cpu_var(cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1034,10 +1032,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
handled += overflow;
/* event overflow for sure */
- data.period = event->hw.last_period;
+ perf_sample_data_init(&data, 0, hwc->last_period);
if (!x86_perf_event_set_period(event))
continue;
+
+
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index addf9e82a7f..ee8e9abc859 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
const struct cpuid_bit *cb;
static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
+ { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1b81839b6c8..571246d81ed 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -271,7 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
return 1;
- show_registers(regs);
+ show_regs(regs);
#ifdef CONFIG_X86_32
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -311,16 +311,33 @@ void die(const char *str, struct pt_regs *regs, long err)
static int __init kstack_setup(char *s)
{
+ ssize_t ret;
+ unsigned long val;
+
if (!s)
return -EINVAL;
- kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+ kstack_depth_to_print = val;
return 0;
}
early_param("kstack", kstack_setup);
static int __init code_bytes_setup(char *s)
{
- code_bytes = simple_strtoul(s, NULL, 0);
+ ssize_t ret;
+ unsigned long val;
+
+ if (!s)
+ return -EINVAL;
+
+ ret = kstrtoul(s, 0, &val);
+ if (ret)
+ return ret;
+
+ code_bytes = val;
if (code_bytes > 8192)
code_bytes = 8192;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 88ec9129271..e0b1d783daa 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -82,7 +82,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 17107bd6e1f..791b76122aa 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -245,7 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
-void show_registers(struct pt_regs *regs)
+void show_regs(struct pt_regs *regs)
{
int i;
unsigned long sp;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976e..41857970517 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
int x = e820x->nr_map;
if (x >= ARRAY_SIZE(e820x->map)) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n",
+ (unsigned long long) start,
+ (unsigned long long) (start + size - 1));
return;
}
@@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type)
switch (type) {
case E820_RAM:
case E820_RESERVED_KERN:
- printk(KERN_CONT "(usable)");
+ printk(KERN_CONT "usable");
break;
case E820_RESERVED:
- printk(KERN_CONT "(reserved)");
+ printk(KERN_CONT "reserved");
break;
case E820_ACPI:
- printk(KERN_CONT "(ACPI data)");
+ printk(KERN_CONT "ACPI data");
break;
case E820_NVS:
- printk(KERN_CONT "(ACPI NVS)");
+ printk(KERN_CONT "ACPI NVS");
break;
case E820_UNUSABLE:
- printk(KERN_CONT "(unusable)");
+ printk(KERN_CONT "unusable");
break;
default:
printk(KERN_CONT "type %u", type);
@@ -158,10 +160,10 @@ void __init e820_print_map(char *who)
int i;
for (i = 0; i < e820.nr_map; i++) {
- printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+ printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
(unsigned long long) e820.map[i].addr,
(unsigned long long)
- (e820.map[i].addr + e820.map[i].size));
+ (e820.map[i].addr + e820.map[i].size - 1));
e820_print_type(e820.map[i].type);
printk(KERN_CONT "\n");
}
@@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+ (unsigned long long) start, (unsigned long long) (end - 1));
e820_print_type(old_type);
printk(KERN_CONT " ==> ");
e820_print_type(new_type);
@@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
size = ULLONG_MAX - start;
end = start + size;
- printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
- (unsigned long long) start,
- (unsigned long long) end);
+ printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ",
+ (unsigned long long) start, (unsigned long long) (end - 1));
if (checktype)
e820_print_type(old_type);
printk(KERN_CONT "\n");
@@ -567,7 +567,7 @@ void __init update_e820(void)
if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
return;
e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
+ printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
@@ -637,8 +637,8 @@ __init void e820_setup_gap(void)
if (!found) {
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
printk(KERN_ERR
- "PCI: Warning: Cannot find a gap in the 32bit address range\n"
- "PCI: Unassigned devices with 32bit resource registers may break!\n");
+ "e820: cannot find a gap in the 32bit address range\n"
+ "e820: PCI devices with unassigned 32bit BARs may break!\n");
}
#endif
@@ -648,8 +648,8 @@ __init void e820_setup_gap(void)
pci_mem_start = gapstart;
printk(KERN_INFO
- "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
- pci_mem_start, gapstart, gapsize);
+ "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+ gapstart, gapstart + gapsize - 1);
}
/**
@@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- printk(KERN_INFO "extended physical RAM map:\n");
+ printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
}
@@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
if (addr) {
e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+ printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n");
update_e820_saved();
}
@@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
if (last_pfn > max_arch_pfn)
last_pfn = max_arch_pfn;
- printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+ printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
last_pfn, max_arch_pfn);
return last_pfn;
}
@@ -888,7 +888,7 @@ void __init finish_e820_parsing(void)
early_panic("Invalid user supplied memory map");
e820.nr_map = nr;
- printk(KERN_INFO "user-defined physical RAM map:\n");
+ printk(KERN_INFO "e820: user-defined physical RAM map:\n");
e820_print_map("user");
}
}
@@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void)
end = MAX_RESOURCE_SIZE;
if (start >= end)
continue;
- printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
- start, end);
+ printk(KERN_DEBUG
+ "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n",
+ start, end);
reserve_region_with_split(&iomem_resource, start, end,
"RAM buffer");
}
@@ -1047,7 +1048,7 @@ void __init setup_memory_map(void)
who = x86_init.resources.memory_setup();
memcpy(&e820_saved, &e820, sizeof(struct e820map));
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7b784f4ef1e..623f2883747 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -56,6 +56,7 @@
#include <asm/irq_vectors.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+#include <asm/asm.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -151,10 +152,8 @@
.pushsection .fixup, "ax"
99: movl $0, (%esp)
jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
.popsection
+ _ASM_EXTABLE(98b,99b)
.endm
.macro PTGS_TO_GS
@@ -164,10 +163,8 @@
.pushsection .fixup, "ax"
99: movl $0, PT_GS(%esp)
jmp 98b
-.section __ex_table, "a"
- .align 4
- .long 98b, 99b
.popsection
+ _ASM_EXTABLE(98b,99b)
.endm
.macro GS_TO_REG reg
@@ -249,12 +246,10 @@
jmp 2b
6: movl $0, (%esp)
jmp 3b
-.section __ex_table, "a"
- .align 4
- .long 1b, 4b
- .long 2b, 5b
- .long 3b, 6b
.popsection
+ _ASM_EXTABLE(1b,4b)
+ _ASM_EXTABLE(2b,5b)
+ _ASM_EXTABLE(3b,6b)
POP_GS_EX
.endm
@@ -321,7 +316,6 @@ ret_from_exception:
preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
-resume_userspace_sig:
#ifdef CONFIG_VM86
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
@@ -415,10 +409,7 @@ sysenter_past_esp:
jae syscall_fault
1: movl (%ebp),%ebp
movl %ebp,PT_EBP(%esp)
-.section __ex_table,"a"
- .align 4
- .long 1b,syscall_fault
-.previous
+ _ASM_EXTABLE(1b,syscall_fault)
GET_THREAD_INFO(%ebp)
@@ -485,10 +476,8 @@ sysexit_audit:
.pushsection .fixup,"ax"
2: movl $0,PT_FS(%esp)
jmp 1b
-.section __ex_table,"a"
- .align 4
- .long 1b,2b
.popsection
+ _ASM_EXTABLE(1b,2b)
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
@@ -543,10 +532,7 @@ ENTRY(iret_exc)
pushl $do_iret_error
jmp error_code
.previous
-.section __ex_table,"a"
- .align 4
- .long irq_return,iret_exc
-.previous
+ _ASM_EXTABLE(irq_return,iret_exc)
CFI_RESTORE_STATE
ldt_ss:
@@ -628,9 +614,13 @@ work_notifysig: # deal with pending signals and
# vm86-space
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace_sig
+ jmp resume_userspace
ALIGN
work_notifysig_v86:
@@ -643,9 +633,13 @@ work_notifysig_v86:
#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace_sig
+ jmp resume_userspace
END(work_pending)
# perform syscall exit tracing
@@ -901,10 +895,7 @@ END(device_not_available)
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iret
-.section __ex_table,"a"
- .align 4
- .long native_iret, iret_exc
-.previous
+ _ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
ENTRY(native_irq_enable_sysexit)
@@ -1093,13 +1084,10 @@ ENTRY(xen_failsafe_callback)
movl %eax,16(%esp)
jmp 4b
.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,6b
- .long 2b,7b
- .long 3b,8b
- .long 4b,9b
-.previous
+ _ASM_EXTABLE(1b,6b)
+ _ASM_EXTABLE(2b,7b)
+ _ASM_EXTABLE(3b,8b)
+ _ASM_EXTABLE(4b,9b)
ENDPROC(xen_failsafe_callback)
BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cdc79b5cfcd..7d65133b51b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
#include <asm/paravirt.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
+#include <asm/asm.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -190,6 +191,44 @@ ENDPROC(native_usergs_sysret64)
.endm
/*
+ * When dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. In this time
+ * if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_OFF
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+ call debug_stack_set_zero
+ TRACE_IRQS_ON
+ call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+#endif
+
+/*
* C code is not supposed to know about undefined top of stack. Every time
* a C function with an pt_regs argument is called from the SYSCALL based
* fast path FIXUP_TOP_OF_STACK is needed.
@@ -900,18 +939,12 @@ restore_args:
irq_return:
INTERRUPT_RETURN
-
- .section __ex_table, "a"
- .quad irq_return, bad_iret
- .previous
+ _ASM_EXTABLE(irq_return, bad_iret)
#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
iretq
-
- .section __ex_table,"a"
- .quad native_iret, bad_iret
- .previous
+ _ASM_EXTABLE(native_iret, bad_iret)
#endif
.section .fixup,"ax"
@@ -1103,7 +1136,7 @@ ENTRY(\sym)
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
call save_paranoid
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
@@ -1181,10 +1214,7 @@ gs_change:
CFI_ENDPROC
END(native_load_gs_index)
- .section __ex_table,"a"
- .align 8
- .quad gs_change,bad_gs
- .previous
+ _ASM_EXTABLE(gs_change,bad_gs)
.section .fixup,"ax"
/* running with kernelgs */
bad_gs:
@@ -1401,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
+ TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
testl $3,CS(%rsp)
@@ -1412,7 +1442,7 @@ paranoid_swapgs:
RESTORE_ALL 8
jmp irq_return
paranoid_restore:
- TRACE_IRQS_IRETQ 0
+ TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
jmp irq_return
paranoid_userspace:
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c9a281f272f..c3a7cb4bf6e 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,40 +24,21 @@
#include <trace/syscall.h>
#include <asm/cacheflush.h>
+#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
-#include <asm/nmi.h>
-
#ifdef CONFIG_DYNAMIC_FTRACE
-/*
- * modifying_code is set to notify NMIs that they need to use
- * memory barriers when entering or exiting. But we don't want
- * to burden NMIs with unnecessary memory barriers when code
- * modification is not being done (which is most of the time).
- *
- * A mutex is already held when ftrace_arch_code_modify_prepare
- * and post_process are called. No locks need to be taken here.
- *
- * Stop machine will make sure currently running NMIs are done
- * and new NMIs will see the updated variable before we need
- * to worry about NMIs doing memory barriers.
- */
-static int modifying_code __read_mostly;
-static DEFINE_PER_CPU(int, save_modifying_code);
-
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
set_all_modules_text_rw();
- modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
- modifying_code = 0;
set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
@@ -90,134 +71,6 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
return calc.code;
}
-/*
- * Modifying code must take extra care. On an SMP machine, if
- * the code being modified is also being executed on another CPU
- * that CPU will have undefined results and possibly take a GPF.
- * We use kstop_machine to stop other CPUS from exectuing code.
- * But this does not stop NMIs from happening. We still need
- * to protect against that. We separate out the modification of
- * the code to take care of this.
- *
- * Two buffers are added: An IP buffer and a "code" buffer.
- *
- * 1) Put the instruction pointer into the IP buffer
- * and the new code into the "code" buffer.
- * 2) Wait for any running NMIs to finish and set a flag that says
- * we are modifying code, it is done in an atomic operation.
- * 3) Write the code
- * 4) clear the flag.
- * 5) Wait for any running NMIs to finish.
- *
- * If an NMI is executed, the first thing it does is to call
- * "ftrace_nmi_enter". This will check if the flag is set to write
- * and if it is, it will write what is in the IP and "code" buffers.
- *
- * The trick is, it does not matter if everyone is writing the same
- * content to the code location. Also, if a CPU is executing code
- * it is OK to write to that code location if the contents being written
- * are the same as what exists.
- */
-
-#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
-static atomic_t nmi_running = ATOMIC_INIT(0);
-static int mod_code_status; /* holds return value of text write */
-static void *mod_code_ip; /* holds the IP to write to */
-static const void *mod_code_newcode; /* holds the text to write to the IP */
-
-static unsigned nmi_wait_count;
-static atomic_t nmi_update_count = ATOMIC_INIT(0);
-
-int ftrace_arch_read_dyn_info(char *buf, int size)
-{
- int r;
-
- r = snprintf(buf, size, "%u %u",
- nmi_wait_count,
- atomic_read(&nmi_update_count));
- return r;
-}
-
-static void clear_mod_flag(void)
-{
- int old = atomic_read(&nmi_running);
-
- for (;;) {
- int new = old & ~MOD_CODE_WRITE_FLAG;
-
- if (old == new)
- break;
-
- old = atomic_cmpxchg(&nmi_running, old, new);
- }
-}
-
-static void ftrace_mod_code(void)
-{
- /*
- * Yes, more than one CPU process can be writing to mod_code_status.
- * (and the code itself)
- * But if one were to fail, then they all should, and if one were
- * to succeed, then they all should.
- */
- mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
- MCOUNT_INSN_SIZE);
-
- /* if we fail, then kill any new writers */
- if (mod_code_status)
- clear_mod_flag();
-}
-
-void ftrace_nmi_enter(void)
-{
- __this_cpu_write(save_modifying_code, modifying_code);
-
- if (!__this_cpu_read(save_modifying_code))
- return;
-
- if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
- smp_rmb();
- ftrace_mod_code();
- atomic_inc(&nmi_update_count);
- }
- /* Must have previous changes seen before executions */
- smp_mb();
-}
-
-void ftrace_nmi_exit(void)
-{
- if (!__this_cpu_read(save_modifying_code))
- return;
-
- /* Finish all executions before clearing nmi_running */
- smp_mb();
- atomic_dec(&nmi_running);
-}
-
-static void wait_for_nmi_and_set_mod_flag(void)
-{
- if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
- return;
-
- do {
- cpu_relax();
- } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
-
- nmi_wait_count++;
-}
-
-static void wait_for_nmi(void)
-{
- if (!atomic_read(&nmi_running))
- return;
-
- do {
- cpu_relax();
- } while (atomic_read(&nmi_running));
-
- nmi_wait_count++;
-}
-
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -238,26 +91,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
- mod_code_ip = (void *)ip;
- mod_code_newcode = new_code;
-
- /* The buffers need to be visible before we let NMIs write them */
- smp_mb();
-
- wait_for_nmi_and_set_mod_flag();
-
- /* Make sure all running NMIs have finished before we write the code */
- smp_mb();
-
- ftrace_mod_code();
-
- /* Make sure the write happens before clearing the bit */
- smp_mb();
-
- clear_mod_flag();
- wait_for_nmi();
-
- return mod_code_status;
+ return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
}
static const unsigned char *ftrace_nop_replace(void)
@@ -266,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void)
}
static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code)
{
unsigned char replaced[MCOUNT_INSN_SIZE];
@@ -307,7 +141,20 @@ int ftrace_make_nop(struct module *mod,
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
- return ftrace_modify_code(rec->ip, old, new);
+ /*
+ * On boot up, and when modules are loaded, the MCOUNT_ADDR
+ * is converted to a nop, and will never become MCOUNT_ADDR
+ * again. This code is either running before SMP (on boot up)
+ * or before the code will ever be executed (module load).
+ * We do not want to use the breakpoint version in this case,
+ * just modify the code directly.
+ */
+ if (addr == MCOUNT_ADDR)
+ return ftrace_modify_code_direct(rec->ip, old, new);
+
+ /* Normal cases use add_brk_on_nop */
+ WARN_ONCE(1, "invalid use of ftrace_make_nop");
+ return -EINVAL;
}
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -318,9 +165,47 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
- return ftrace_modify_code(rec->ip, old, new);
+ /* Should only be called when module is loaded */
+ return ftrace_modify_code_direct(rec->ip, old, new);
}
+/*
+ * The modifying_ftrace_code is used to tell the breakpoint
+ * handler to call ftrace_int3_handler(). If it fails to
+ * call this handler for a breakpoint added by ftrace, then
+ * the kernel may crash.
+ *
+ * As atomic_writes on x86 do not need a barrier, we do not
+ * need to add smp_mb()s for this to work. It is also considered
+ * that we can not read the modifying_ftrace_code before
+ * executing the breakpoint. That would be quite remarkable if
+ * it could do that. Here's the flow that is required:
+ *
+ * CPU-0 CPU-1
+ *
+ * atomic_inc(mfc);
+ * write int3s
+ * <trap-int3> // implicit (r)mb
+ * if (atomic_read(mfc))
+ * call ftrace_int3_handler()
+ *
+ * Then when we are finished:
+ *
+ * atomic_dec(mfc);
+ *
+ * If we hit a breakpoint that was not set by ftrace, it does not
+ * matter if ftrace_int3_handler() is called or not. It will
+ * simply be ignored. But it is crucial that a ftrace nop/caller
+ * breakpoint is handled. No other user should ever place a
+ * breakpoint on an ftrace nop/caller location. It must only
+ * be done by this code.
+ */
+atomic_t modifying_ftrace_code __read_mostly;
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code);
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
@@ -329,9 +214,376 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
new = ftrace_call_replace(ip, (unsigned long)func);
+
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
ret = ftrace_modify_code(ip, old, new);
+ atomic_dec(&modifying_ftrace_code);
+
+ return ret;
+}
+
+/*
+ * A breakpoint was added to the code address we are about to
+ * modify, and this is the handle that will just skip over it.
+ * We are either changing a nop into a trace call, or a trace
+ * call to a nop. While the change is taking place, we treat
+ * it just like it was a nop.
+ */
+int ftrace_int3_handler(struct pt_regs *regs)
+{
+ if (WARN_ON_ONCE(!regs))
+ return 0;
+
+ if (!ftrace_location(regs->ip - 1))
+ return 0;
+
+ regs->ip += MCOUNT_INSN_SIZE - 1;
+
+ return 1;
+}
+
+static int ftrace_write(unsigned long ip, const char *val, int size)
+{
+ /*
+ * On x86_64, kernel text mappings are mapped read-only with
+ * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
+ * of the kernel text mapping to modify the kernel text.
+ *
+ * For 32bit kernels, these mappings are same and we can use
+ * kernel identity mapping to modify code.
+ */
+ if (within(ip, (unsigned long)_text, (unsigned long)_etext))
+ ip = (unsigned long)__va(__pa(ip));
+
+ return probe_kernel_write((void *)ip, val, size);
+}
+
+static int add_break(unsigned long ip, const char *old)
+{
+ unsigned char replaced[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+
+ if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
+ return -EINVAL;
+
+ if (ftrace_write(ip, &brk, 1))
+ return -EPERM;
+
+ return 0;
+}
+
+static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned const char *old;
+ unsigned long ip = rec->ip;
+
+ old = ftrace_call_replace(ip, addr);
+
+ return add_break(rec->ip, old);
+}
+
+
+static int add_brk_on_nop(struct dyn_ftrace *rec)
+{
+ unsigned const char *old;
+
+ old = ftrace_nop_replace();
+
+ return add_break(rec->ip, old);
+}
+
+static int add_breakpoints(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_brk_on_nop(rec);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_brk_on_call(rec, ftrace_addr);
+ }
+ return 0;
+}
+
+/*
+ * On error, we need to remove breakpoints. This needs to
+ * be done caefully. If the address does not currently have a
+ * breakpoint, we know we are done. Otherwise, we look at the
+ * remaining 4 bytes of the instruction. If it matches a nop
+ * we replace the breakpoint with the nop. Otherwise we replace
+ * it with the call instruction.
+ */
+static int remove_breakpoint(struct dyn_ftrace *rec)
+{
+ unsigned char ins[MCOUNT_INSN_SIZE];
+ unsigned char brk = BREAKPOINT_INSTRUCTION;
+ const unsigned char *nop;
+ unsigned long ftrace_addr;
+ unsigned long ip = rec->ip;
+
+ /* If we fail the read, just give up */
+ if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
+ return -EFAULT;
+
+ /* If this does not have a breakpoint, we are done */
+ if (ins[0] != brk)
+ return -1;
+
+ nop = ftrace_nop_replace();
+
+ /*
+ * If the last 4 bytes of the instruction do not match
+ * a nop, then we assume that this is a call to ftrace_addr.
+ */
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
+ /*
+ * For extra paranoidism, we check if the breakpoint is on
+ * a call that would actually jump to the ftrace_addr.
+ * If not, don't touch the breakpoint, we make just create
+ * a disaster.
+ */
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
+ return -EINVAL;
+ }
+
+ return probe_kernel_write((void *)ip, &nop[0], 1);
+}
+
+static int add_update_code(unsigned long ip, unsigned const char *new)
+{
+ /* skip breakpoint */
+ ip++;
+ new++;
+ if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
+ return -EPERM;
+ return 0;
+}
+
+static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+ return add_update_code(ip, new);
+}
+
+static int add_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+ return add_update_code(ip, new);
+}
+
+static int add_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_test_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return add_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return add_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_call_replace(ip, addr);
+
+ if (ftrace_write(ip, new, 1))
+ return -EPERM;
+
+ return 0;
+}
+
+static int finish_update_nop(struct dyn_ftrace *rec)
+{
+ unsigned long ip = rec->ip;
+ unsigned const char *new;
+
+ new = ftrace_nop_replace();
+
+ if (ftrace_write(ip, new, 1))
+ return -EPERM;
+ return 0;
+}
+
+static int finish_update(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ret = ftrace_update_record(rec, enable);
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
+ /* converting nop to call */
+ return finish_update_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ /* converting a call to a nop */
+ return finish_update_nop(rec);
+ }
+
+ return 0;
+}
+
+static void do_sync_core(void *data)
+{
+ sync_core();
+}
+
+static void run_sync(void)
+{
+ int enable_irqs = irqs_disabled();
+
+ /* We may be called with interrupts disbled (on bootup). */
+ if (enable_irqs)
+ local_irq_enable();
+ on_each_cpu(do_sync_core, NULL, 1);
+ if (enable_irqs)
+ local_irq_disable();
+}
+
+void ftrace_replace_code(int enable)
+{
+ struct ftrace_rec_iter *iter;
+ struct dyn_ftrace *rec;
+ const char *report = "adding breakpoints";
+ int count = 0;
+ int ret;
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_breakpoints(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ count++;
+ }
+
+ run_sync();
+
+ report = "updating code";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = add_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ report = "removing breakpoints";
+
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+
+ ret = finish_update(rec, enable);
+ if (ret)
+ goto remove_breakpoints;
+ }
+
+ run_sync();
+
+ return;
+
+ remove_breakpoints:
+ ftrace_bug(ret, rec ? rec->ip : 0);
+ printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
+ for_ftrace_rec_iter(iter) {
+ rec = ftrace_rec_iter_record(iter);
+ remove_breakpoint(rec);
+ }
+}
+
+static int
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+ unsigned const char *new_code)
+{
+ int ret;
+
+ ret = add_break(ip, old_code);
+ if (ret)
+ goto out;
+
+ run_sync();
+
+ ret = add_update_code(ip, new_code);
+ if (ret)
+ goto fail_update;
+
+ run_sync();
+
+ ret = ftrace_write(ip, new_code, 1);
+ if (ret) {
+ ret = -EPERM;
+ goto out;
+ }
+ run_sync();
+ out:
return ret;
+
+ fail_update:
+ probe_kernel_write((void *)ip, &old_code[0], 1);
+ goto out;
+}
+
+void arch_ftrace_update_code(int command)
+{
+ /* See comment above by declaration of modifying_ftrace_code */
+ atomic_inc(&modifying_ftrace_code);
+
+ ftrace_modify_all_code(command);
+
+ atomic_dec(&modifying_ftrace_code);
}
int __init ftrace_dyn_arch_init(void *data)
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 51ff18616d5..c18f59d1010 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -14,7 +14,6 @@
#include <asm/sections.h>
#include <asm/e820.h>
#include <asm/page.h>
-#include <asm/trampoline.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 3a3b779f41d..037df57a99a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,7 +24,6 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/e820.h>
-#include <asm/trampoline.h>
#include <asm/bios_ebda.h>
static void __init zap_identity_mappings(void)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ce0be7cd085..d42ab17b739 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -21,6 +21,7 @@
#include <asm/msr-index.h>
#include <asm/cpufeature.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -273,10 +274,7 @@ num_subarch_entries = (. - subarch_entries) / 4
* If cpu hotplug is not supported then this code can go in init section
* which will be freed later
*/
-
__CPUINIT
-
-#ifdef CONFIG_SMP
ENTRY(startup_32_smp)
cld
movl $(__BOOT_DS),%eax
@@ -287,7 +285,7 @@ ENTRY(startup_32_smp)
movl pa(stack_start),%ecx
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
-#endif /* CONFIG_SMP */
+
default_entry:
/*
@@ -363,28 +361,23 @@ default_entry:
pushl $0
popfl
-#ifdef CONFIG_SMP
- cmpb $0, ready
- jnz checkCPUtype
-#endif /* CONFIG_SMP */
-
/*
* start system 32-bit setup. We need to re-do some of the things done
* in 16-bit mode for the "real" operations.
*/
- call setup_idt
-
-checkCPUtype:
-
- movl $-1,X86_CPUID # -1 for no CPUID initially
-
+ movl setup_once_ref,%eax
+ andl %eax,%eax
+ jz 1f # Did we do this already?
+ call *%eax
+1:
+
/* check if it is 486 or 386. */
/*
* XXX - this does a lot of unnecessary setup. Alignment checks don't
* apply at our cpl of 0 and the stack ought to be aligned already, and
* we don't need to preserve eflags.
*/
-
+ movl $-1,X86_CPUID # -1 for no CPUID initially
movb $3,X86 # at least 386
pushfl # push EFLAGS
popl %eax # get EFLAGS
@@ -450,21 +443,6 @@ is386: movl $2,%ecx # set MP
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
-#ifdef CONFIG_CC_STACKPROTECTOR
- /*
- * The linker can't handle this by relocation. Manually set
- * base address in stack canary segment descriptor.
- */
- cmpb $0,ready
- jne 1f
- movl $gdt_page,%eax
- movl $stack_canary,%ecx
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
- movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-1:
-#endif
movl $(__KERNEL_STACK_CANARY),%eax
movl %eax,%gs
@@ -473,7 +451,6 @@ is386: movl $2,%ecx # set MP
cld # gcc2 wants the direction flag cleared at all times
pushl $0 # fake return address for unwinder
- movb $1, ready
jmp *(initial_code)
/*
@@ -495,81 +472,122 @@ check_x87:
.byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
ret
+
+#include "verify_cpu.S"
+
/*
- * setup_idt
+ * setup_once
*
- * sets up a idt with 256 entries pointing to
- * ignore_int, interrupt gates. It doesn't actually load
- * idt - that can be done only after paging has been enabled
- * and the kernel moved to PAGE_OFFSET. Interrupts
- * are enabled elsewhere, when we can be relatively
- * sure everything is ok.
+ * The setup work we only want to run on the BSP.
*
* Warning: %esi is live across this function.
*/
-setup_idt:
- lea ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
+__INIT
+setup_once:
+ /*
+ * Set up a idt with 256 entries pointing to ignore_int,
+ * interrupt gates. It doesn't actually load idt - that needs
+ * to be done on each CPU. Interrupts are enabled elsewhere,
+ * when we can be relatively sure everything is ok.
+ */
- lea idt_table,%edi
- mov $256,%ecx
-rp_sidt:
+ movl $idt_table,%edi
+ movl $early_idt_handlers,%eax
+ movl $NUM_EXCEPTION_VECTORS,%ecx
+1:
movl %eax,(%edi)
- movl %edx,4(%edi)
+ movl %eax,4(%edi)
+ /* interrupt gate, dpl=0, present */
+ movl $(0x8E000000 + __KERNEL_CS),2(%edi)
+ addl $9,%eax
addl $8,%edi
- dec %ecx
- jne rp_sidt
+ loop 1b
-.macro set_early_handler handler,trapno
- lea \handler,%edx
+ movl $256 - NUM_EXCEPTION_VECTORS,%ecx
+ movl $ignore_int,%edx
movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax
+ movw %dx,%ax /* selector = 0x0010 = cs */
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
- lea idt_table,%edi
- movl %eax,8*\trapno(%edi)
- movl %edx,8*\trapno+4(%edi)
-.endm
+2:
+ movl %eax,(%edi)
+ movl %edx,4(%edi)
+ addl $8,%edi
+ loop 2b
- set_early_handler handler=early_divide_err,trapno=0
- set_early_handler handler=early_illegal_opcode,trapno=6
- set_early_handler handler=early_protection_fault,trapno=13
- set_early_handler handler=early_page_fault,trapno=14
+#ifdef CONFIG_CC_STACKPROTECTOR
+ /*
+ * Configure the stack canary. The linker can't handle this by
+ * relocation. Manually set base address in stack canary
+ * segment descriptor.
+ */
+ movl $gdt_page,%eax
+ movl $stack_canary,%ecx
+ movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
+ shrl $16, %ecx
+ movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
+ movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
+#endif
+ andl $0,setup_once_ref /* Once is enough, thanks */
ret
-early_divide_err:
- xor %edx,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
+ENTRY(early_idt_handlers)
+ # 36(%esp) %eflags
+ # 32(%esp) %cs
+ # 28(%esp) %eip
+ # 24(%rsp) error code
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ .if (EXCEPTION_ERRCODE_MASK >> i) & 1
+ ASM_NOP2
+ .else
+ pushl $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushl $i # 20(%esp) Vector number
+ jmp early_idt_handler
+ i = i + 1
+ .endr
+ENDPROC(early_idt_handlers)
+
+ /* This is global to keep gas from relaxing the jumps */
+ENTRY(early_idt_handler)
+ cld
+ cmpl $2,%ss:early_recursion_flag
+ je hlt_loop
+ incl %ss:early_recursion_flag
-early_illegal_opcode:
- movl $6,%edx
- pushl $0 /* fake errcode */
- jmp early_fault
+ push %eax # 16(%esp)
+ push %ecx # 12(%esp)
+ push %edx # 8(%esp)
+ push %ds # 4(%esp)
+ push %es # 0(%esp)
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
-early_protection_fault:
- movl $13,%edx
- jmp early_fault
+ cmpl $(__KERNEL_CS),32(%esp)
+ jne 10f
-early_page_fault:
- movl $14,%edx
- jmp early_fault
+ leal 28(%esp),%eax # Pointer to %eip
+ call early_fixup_exception
+ andl %eax,%eax
+ jnz ex_entry /* found an exception entry */
-early_fault:
- cld
+10:
#ifdef CONFIG_PRINTK
- pusha
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- cmpl $2,early_recursion_flag
- je hlt_loop
- incl early_recursion_flag
+ xorl %eax,%eax
+ movw %ax,2(%esp) /* clean up the segment values on some cpus */
+ movw %ax,6(%esp)
+ movw %ax,34(%esp)
+ leal 40(%esp),%eax
+ pushl %eax /* %esp before the exception */
+ pushl %ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
movl %cr2,%eax
pushl %eax
- pushl %edx /* trapno */
+ pushl (20+6*4)(%esp) /* trapno */
pushl $fault_msg
call printk
#endif
@@ -578,6 +596,17 @@ hlt_loop:
hlt
jmp hlt_loop
+ex_entry:
+ pop %es
+ pop %ds
+ pop %edx
+ pop %ecx
+ pop %eax
+ addl $8,%esp /* drop vector number and error code */
+ decl %ss:early_recursion_flag
+ iret
+ENDPROC(early_idt_handler)
+
/* This is the default interrupt "handler" :-) */
ALIGN
ignore_int:
@@ -611,13 +640,18 @@ ignore_int:
popl %eax
#endif
iret
+ENDPROC(ignore_int)
+__INITDATA
+ .align 4
+early_recursion_flag:
+ .long 0
-#include "verify_cpu.S"
-
- __REFDATA
-.align 4
+__REFDATA
+ .align 4
ENTRY(initial_code)
.long i386_start_kernel
+ENTRY(setup_once_ref)
+ .long setup_once
/*
* BSS section
@@ -670,22 +704,19 @@ ENTRY(initial_page_table)
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
-early_recursion_flag:
- .long 0
-
-ready: .byte 0
-
+__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
fault_msg:
/* fault info: */
.ascii "BUG: Int %d: CR2 %p\n"
-/* pusha regs: */
- .ascii " EDI %p ESI %p EBP %p ESP %p\n"
- .ascii " EBX %p EDX %p ECX %p EAX %p\n"
+/* regs pushed in early_idt_handler: */
+ .ascii " EDI %p ESI %p EBP %p EBX %p\n"
+ .ascii " ESP %p ES %p DS %p\n"
+ .ascii " EDX %p ECX %p EAX %p\n"
/* fault frame: */
- .ascii " err %p EIP %p CS %p flg %p\n"
+ .ascii " vec %p err %p EIP %p CS %p flg %p\n"
.ascii "Stack: %p %p %p %p %p %p %p %p\n"
.ascii " %p %p %p %p %p %p %p %p\n"
.asciz " %p %p %p %p %p %p %p %p\n"
@@ -699,6 +730,7 @@ fault_msg:
* segment size, and 32-bit linear address value:
*/
+ .data
.globl boot_gdt_descr
.globl idt_descr
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 40f4eb3766d..94bf9cc2c7e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,12 +19,15 @@
#include <asm/cache.h>
#include <asm/processor-flags.h>
#include <asm/percpu.h>
+#include <asm/nops.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
+#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
#else
-#define GET_CR2_INTO_RCX movq %cr2, %rcx
+#define GET_CR2_INTO(reg) movq %cr2, reg
+#define INTERRUPT_RETURN iretq
#endif
/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -136,10 +139,6 @@ ident_complete:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
- /* Fixup trampoline */
- addq %rbp, trampoline_level4_pgt + 0(%rip)
- addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
-
/* Due to ENTRY(), sometimes the empty space gets filled with
* zeros. Better take a jmp than relying on empty space being
* filled with 0x90 (nop)
@@ -270,36 +269,56 @@ bad_address:
jmp bad_address
.section ".init.text","ax"
-#ifdef CONFIG_EARLY_PRINTK
.globl early_idt_handlers
early_idt_handlers:
+ # 104(%rsp) %rflags
+ # 96(%rsp) %cs
+ # 88(%rsp) %rip
+ # 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- movl $i, %esi
+ .if (EXCEPTION_ERRCODE_MASK >> i) & 1
+ ASM_NOP2
+ .else
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .endif
+ pushq $i # 72(%rsp) Vector number
jmp early_idt_handler
i = i + 1
.endr
-#endif
ENTRY(early_idt_handler)
-#ifdef CONFIG_EARLY_PRINTK
+ cld
+
cmpl $2,early_recursion_flag(%rip)
jz 1f
incl early_recursion_flag(%rip)
- GET_CR2_INTO_RCX
- movq %rcx,%r9
- xorl %r8d,%r8d # zero for error code
- movl %esi,%ecx # get vector number
- # Test %ecx against mask of vectors that push error code.
- cmpl $31,%ecx
- ja 0f
- movl $1,%eax
- salq %cl,%rax
- testl $0x27d00,%eax
- je 0f
- popq %r8 # get error code
-0: movq 0(%rsp),%rcx # get ip
- movq 8(%rsp),%rdx # get cs
+
+ pushq %rax # 64(%rsp)
+ pushq %rcx # 56(%rsp)
+ pushq %rdx # 48(%rsp)
+ pushq %rsi # 40(%rsp)
+ pushq %rdi # 32(%rsp)
+ pushq %r8 # 24(%rsp)
+ pushq %r9 # 16(%rsp)
+ pushq %r10 # 8(%rsp)
+ pushq %r11 # 0(%rsp)
+
+ cmpl $__KERNEL_CS,96(%rsp)
+ jne 10f
+
+ leaq 88(%rsp),%rdi # Pointer to %rip
+ call early_fixup_exception
+ andl %eax,%eax
+ jnz 20f # Found an exception entry
+
+10:
+#ifdef CONFIG_EARLY_PRINTK
+ GET_CR2_INTO(%r9) # can clobber any volatile register if pv
+ movl 80(%rsp),%r8d # error code
+ movl 72(%rsp),%esi # vector number
+ movl 96(%rsp),%edx # %cs
+ movq 88(%rsp),%rcx # %rip
xorl %eax,%eax
leaq early_idt_msg(%rip),%rdi
call early_printk
@@ -308,17 +327,32 @@ ENTRY(early_idt_handler)
call dump_stack
#ifdef CONFIG_KALLSYMS
leaq early_idt_ripmsg(%rip),%rdi
- movq 0(%rsp),%rsi # get rip again
+ movq 40(%rsp),%rsi # %rip again
call __print_symbol
#endif
#endif /* EARLY_PRINTK */
1: hlt
jmp 1b
-#ifdef CONFIG_EARLY_PRINTK
+20: # Exception table entry found
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rdi
+ popq %rsi
+ popq %rdx
+ popq %rcx
+ popq %rax
+ addq $16,%rsp # drop vector number and error code
+ decl early_recursion_flag(%rip)
+ INTERRUPT_RETURN
+
+ .balign 4
early_recursion_flag:
.long 0
+#ifdef CONFIG_EARLY_PRINTK
early_idt_msg:
.asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
early_idt_ripmsg:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad0de0c2714..1460a5df92f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -94,13 +94,18 @@ static int hpet_verbose;
static int __init hpet_setup(char *str)
{
- if (str) {
+ while (str) {
+ char *next = strchr(str, ',');
+
+ if (next)
+ *next++ = 0;
if (!strncmp("disable", str, 7))
boot_hpet_disable = 1;
if (!strncmp("force", str, 5))
hpet_force_user = 1;
if (!strncmp("verbose", str, 7))
hpet_verbose = 1;
+ str = next;
}
return 1;
}
@@ -319,8 +324,6 @@ static void hpet_set_mode(enum clock_event_mode mode,
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned int) delta;
cfg = hpet_readl(HPET_Tn_CFG(timer));
- /* Make sure we use edge triggered interrupts */
- cfg &= ~HPET_TN_LEVEL;
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
HPET_TN_SETVAL | HPET_TN_32BIT;
hpet_writel(cfg, HPET_Tn_CFG(timer));
@@ -787,15 +790,16 @@ static int hpet_clocksource_register(void)
return 0;
}
+static u32 *hpet_boot_cfg;
+
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
int __init hpet_enable(void)
{
- unsigned long hpet_period;
- unsigned int id;
+ u32 hpet_period, cfg, id;
u64 freq;
- int i;
+ unsigned int i, last;
if (!is_hpet_capable())
return 0;
@@ -847,15 +851,45 @@ int __init hpet_enable(void)
id = hpet_readl(HPET_ID);
hpet_print_config();
+ last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+
#ifdef CONFIG_HPET_EMULATE_RTC
/*
* The legacy routing mode needs at least two channels, tick timer
* and the rtc emulation channel.
*/
- if (!(id & HPET_ID_NUMBER))
+ if (!last)
goto out_nohpet;
#endif
+ cfg = hpet_readl(HPET_CFG);
+ hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg),
+ GFP_KERNEL);
+ if (hpet_boot_cfg)
+ *hpet_boot_cfg = cfg;
+ else
+ pr_warn("HPET initial state will not be saved\n");
+ cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
+ hpet_writel(cfg, HPET_CFG);
+ if (cfg)
+ pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
+ cfg);
+
+ for (i = 0; i <= last; ++i) {
+ cfg = hpet_readl(HPET_Tn_CFG(i));
+ if (hpet_boot_cfg)
+ hpet_boot_cfg[i + 1] = cfg;
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
+ hpet_writel(cfg, HPET_Tn_CFG(i));
+ cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
+ | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
+ | HPET_TN_FSB | HPET_TN_FSB_CAP);
+ if (cfg)
+ pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
+ cfg, i);
+ }
+ hpet_print_config();
+
if (hpet_clocksource_register())
goto out_nohpet;
@@ -923,14 +957,28 @@ fs_initcall(hpet_late_init);
void hpet_disable(void)
{
if (is_hpet_capable() && hpet_virt_address) {
- unsigned int cfg = hpet_readl(HPET_CFG);
+ unsigned int cfg = hpet_readl(HPET_CFG), id, last;
- if (hpet_legacy_int_enabled) {
+ if (hpet_boot_cfg)
+ cfg = *hpet_boot_cfg;
+ else if (hpet_legacy_int_enabled) {
cfg &= ~HPET_CFG_LEGACY;
hpet_legacy_int_enabled = 0;
}
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
+
+ if (!hpet_boot_cfg)
+ return;
+
+ id = hpet_readl(HPET_ID);
+ last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+
+ for (id = 0; id <= last; ++id)
+ hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
+
+ if (*hpet_boot_cfg & HPET_CFG_ENABLE)
+ hpet_writel(*hpet_boot_cfg, HPET_CFG);
}
}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 2d6e6498c17..f250431fb50 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -88,7 +88,7 @@ void kernel_fpu_begin(void)
__thread_clear_has_fpu(me);
/* We do 'stts()' in kernel_fpu_end() */
} else {
- percpu_write(fpu_owner_task, NULL);
+ this_cpu_write(fpu_owner_task, NULL);
clts();
}
}
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
deleted file mode 100644
index 43e9ccf4494..00000000000
--- a/arch/x86/kernel/init_task.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-#include <linux/mqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-
-static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
-static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
-
-/*
- * Initial thread structure.
- *
- * We need to make sure that this is THREAD_SIZE aligned due to the
- * way process stacks are handled. This is done by having a special
- * "init_task" linker map entry..
- */
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
-
-/*
- * Initial task structure.
- *
- * All other task structs will be allocated on slabs in fork.c
- */
-struct task_struct init_task = INIT_TASK(init_task);
-EXPORT_SYMBOL(init_task);
-
-/*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
- * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data..cacheline_aligned
- * section. Since TSS's are completely CPU-local, we want them
- * on exact cacheline boundaries, to eliminate cacheline ping-pong.
- */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 58b7f27cb3e..344faf8d0d6 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu)
return;
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
@@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu)
per_cpu(hardirq_ctx, cpu) = irqctx;
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
- THREAD_FLAGS,
- THREAD_ORDER));
+ THREADINFO_GFP,
+ THREAD_SIZE_ORDER));
memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8bfb6146f75..3f61904365c 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags)
/**
* kgdb_arch_handle_exception - Handle architecture specific GDB packets.
- * @vector: The error vector of the exception that happened.
+ * @e_vector: The error vector of the exception that happened.
* @signo: The signal number of the exception that happened.
* @err_code: The error code of the exception that happened.
- * @remcom_in_buffer: The buffer of the packet we have read.
- * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
- * @regs: The &struct pt_regs of the current process.
+ * @remcomInBuffer: The buffer of the packet we have read.
+ * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @linux_regs: The &struct pt_regs of the current process.
*
* This function MUST handle the 'c' and 's' command packets,
* as well packets to set / remove a hardware breakpoint, if used.
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e213fc8408d..e2f751efb7b 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1037,9 +1037,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
"current sp %p does not match saved sp %p\n",
stack_addr(regs), kcb->jprobe_saved_sp);
printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
- show_registers(saved_regs);
+ show_regs(saved_regs);
printk(KERN_ERR "Current registers\n");
- show_registers(regs);
+ show_regs(regs);
BUG();
}
*regs = kcb->jprobe_saved_regs;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f8492da65bf..f1b42b3a186 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,7 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <linux/percpu.h>
+#include <linux/hardirq.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
@@ -114,6 +115,20 @@ static void kvm_get_preset_lpj(void)
preset_lpj = lpj;
}
+bool kvm_check_and_clear_guest_paused(void)
+{
+ bool ret = false;
+ struct pvclock_vcpu_time_info *src;
+
+ src = &__get_cpu_var(hv_clock);
+ if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
+ __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+ ret = true;
+ }
+
+ return ret;
+}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
deleted file mode 100644
index 7eb1e2b9782..00000000000
--- a/arch/x86/kernel/mca_32.c
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Written by Martin Kolinek, February 1996
- *
- * Changes:
- *
- * Chris Beauregard July 28th, 1996
- * - Fixed up integrated SCSI detection
- *
- * Chris Beauregard August 3rd, 1996
- * - Made mca_info local
- * - Made integrated registers accessible through standard function calls
- * - Added name field
- * - More sanity checking
- *
- * Chris Beauregard August 9th, 1996
- * - Rewrote /proc/mca
- *
- * Chris Beauregard January 7th, 1997
- * - Added basic NMI-processing
- * - Added more information to mca_info structure
- *
- * David Weinehall October 12th, 1998
- * - Made a lot of cleaning up in the source
- * - Added use of save_flags / restore_flags
- * - Added the 'driver_loaded' flag in MCA_adapter
- * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter
- *
- * David Weinehall March 24th, 1999
- * - Fixed the output of 'Driver Installed' in /proc/mca/pos
- * - Made the Integrated Video & SCSI show up even if they have id 0000
- *
- * Alexander Viro November 9th, 1999
- * - Switched to regular procfs methods
- *
- * Alfred Arnold & David Weinehall August 23rd, 2000
- * - Added support for Planar POS-registers
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/mca.h>
-#include <linux/kprobes.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <linux/proc_fs.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/ioport.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-
-static unsigned char which_scsi;
-
-int MCA_bus;
-EXPORT_SYMBOL(MCA_bus);
-
-/*
- * Motherboard register spinlock. Untested on SMP at the moment, but
- * are there any MCA SMP boxes?
- *
- * Yes - Alan
- */
-static DEFINE_SPINLOCK(mca_lock);
-
-/* Build the status info for the adapter */
-
-static void mca_configure_adapter_status(struct mca_device *mca_dev)
-{
- mca_dev->status = MCA_ADAPTER_NONE;
-
- mca_dev->pos_id = mca_dev->pos[0]
- + (mca_dev->pos[1] << 8);
-
- if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
-
- /*
- * id = 0x0000 usually indicates hardware failure,
- * however, ZP Gu (zpg@castle.net> reports that his 9556
- * has 0x0000 as id and everything still works. There
- * also seem to be an adapter with id = 0x0000; the
- * NCR Parallel Bus Memory Card. Until this is confirmed,
- * however, this code will stay.
- */
-
- mca_dev->status = MCA_ADAPTER_ERROR;
-
- return;
- } else if (mca_dev->pos_id != 0xffff) {
-
- /*
- * 0xffff usually indicates that there's no adapter,
- * however, some integrated adapters may have 0xffff as
- * their id and still be valid. Examples are on-board
- * VGA of the 55sx, the integrated SCSI of the 56 & 57,
- * and possibly also the 95 ULTIMEDIA.
- */
-
- mca_dev->status = MCA_ADAPTER_NORMAL;
- }
-
- if ((mca_dev->pos_id == 0xffff ||
- mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
- int j;
-
- for (j = 2; j < 8; j++) {
- if (mca_dev->pos[j] != 0xff) {
- mca_dev->status = MCA_ADAPTER_NORMAL;
- break;
- }
- }
- }
-
- if (!(mca_dev->pos[2] & MCA_ENABLED)) {
-
- /* enabled bit is in POS 2 */
-
- mca_dev->status = MCA_ADAPTER_DISABLED;
- }
-} /* mca_configure_adapter_status */
-
-/*--------------------------------------------------------------------*/
-
-static struct resource mca_standard_resources[] = {
- { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" },
- { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" },
- { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" },
- { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" },
- { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" },
- { .start = 0x96, .end = 0x97, .name = "POS (MCA)" },
- { .start = 0x100, .end = 0x107, .name = "POS (MCA)" }
-};
-
-#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
-
-/*
- * mca_read_and_store_pos - read the POS registers into a memory buffer
- * @pos: a char pointer to 8 bytes, contains the POS register value on
- * successful return
- *
- * Returns 1 if a card actually exists (i.e. the pos isn't
- * all 0xff) or 0 otherwise
- */
-static int mca_read_and_store_pos(unsigned char *pos)
-{
- int j;
- int found = 0;
-
- for (j = 0; j < 8; j++) {
- pos[j] = inb_p(MCA_POS_REG(j));
- if (pos[j] != 0xff) {
- /* 0xff all across means no device. 0x00 means
- * something's broken, but a device is
- * probably there. However, if you get 0x00
- * from a motherboard register it won't matter
- * what we find. For the record, on the
- * 57SLC, the integrated SCSI adapter has
- * 0xffff for the adapter ID, but nonzero for
- * other registers. */
-
- found = 1;
- }
- }
- return found;
-}
-
-static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
-{
- unsigned char byte;
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return 0;
-
- spin_lock_irqsave(&mca_lock, flags);
- if (mca_dev->pos_register) {
- /* Disable adapter setup, enable motherboard setup */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
-
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
- } else {
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read the appropriate register */
-
- outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG);
- byte = inb_p(MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
- }
- spin_unlock_irqrestore(&mca_lock, flags);
-
- mca_dev->pos[reg] = byte;
-
- return byte;
-}
-
-static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
- unsigned char byte)
-{
- unsigned long flags;
-
- if (reg < 0 || reg >= 8)
- return;
-
- spin_lock_irqsave(&mca_lock, flags);
-
- /* Make sure motherboard setup is off */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /* Read in the appropriate register */
-
- outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG);
- outb_p(byte, MCA_POS_REG(reg));
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- spin_unlock_irqrestore(&mca_lock, flags);
-
- /* Update the global register list, while we have the byte */
-
- mca_dev->pos[reg] = byte;
-
-}
-
-/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
-{
- return irq;
-}
-
-static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
-{
- return port;
-}
-
-static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
-{
- return mem;
-}
-
-
-static int __init mca_init(void)
-{
- unsigned int i, j;
- struct mca_device *mca_dev;
- unsigned char pos[8];
- short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
- struct mca_bus *bus;
-
- /*
- * WARNING: Be careful when making changes here. Putting an adapter
- * and the motherboard simultaneously into setup mode may result in
- * damage to chips (according to The Indispensable PC Hardware Book
- * by Hans-Peter Messmer). Also, we disable system interrupts (so
- * that we are not disturbed in the middle of this).
- */
-
- /* Make sure the MCA bus is present */
-
- if (mca_system_init()) {
- printk(KERN_ERR "MCA bus system initialisation failed\n");
- return -ENODEV;
- }
-
- if (!MCA_bus)
- return -ENODEV;
-
- printk(KERN_INFO "Micro Channel bus detected.\n");
-
- /* All MCA systems have at least a primary bus */
- bus = mca_attach_bus(MCA_PRIMARY_BUS);
- if (!bus)
- goto out_nomem;
- bus->default_dma_mask = 0xffffffffLL;
- bus->f.mca_write_pos = mca_pc_write_pos;
- bus->f.mca_read_pos = mca_pc_read_pos;
- bus->f.mca_transform_irq = mca_dummy_transform_irq;
- bus->f.mca_transform_ioport = mca_dummy_transform_ioport;
- bus->f.mca_transform_memory = mca_dummy_transform_memory;
-
- /* get the motherboard device */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
- if (unlikely(!mca_dev))
- goto out_nomem;
-
- /*
- * We do not expect many MCA interrupts during initialization,
- * but let us be safe:
- */
- spin_lock_irq(&mca_lock);
-
- /* Make sure adapter setup is off */
-
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Read motherboard POS registers */
-
- mca_dev->pos_register = 0x7f;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for a motherboard */
- mca_dev->pos_id = MCA_MOTHERBOARD_POS;
- mca_dev->slot = MCA_MOTHERBOARD;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- /* Put motherboard into video setup mode, read integrated video
- * POS registers, and turn motherboard setup off.
- */
-
- mca_dev->pos_register = 0xdf;
- outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG);
- mca_dev->name[0] = 0;
- mca_read_and_store_pos(mca_dev->pos);
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for the integrated video */
- mca_dev->pos_id = MCA_INTEGVIDEO_POS;
- mca_dev->slot = MCA_INTEGVIDEO;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
-
- /*
- * Put motherboard into scsi setup mode, read integrated scsi
- * POS registers, and turn motherboard setup off.
- *
- * It seems there are two possible SCSI registers. Martin says that
- * for the 56,57, 0xf7 is the one, but fails on the 76.
- * Alfredo (apena@vnet.ibm.com) says
- * 0xfd works on his machine. We'll try both of them. I figure it's
- * a good bet that only one could be valid at a time. This could
- * screw up though if one is used for something else on the other
- * machine.
- */
-
- for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
- outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
- if (mca_read_and_store_pos(pos))
- break;
- }
- if (which_scsi) {
- /* found a scsi card */
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_configure_adapter_status(mca_dev);
- /* fake POS and slot for integrated SCSI controller */
- mca_dev->pos_id = MCA_INTEGSCSI_POS;
- mca_dev->slot = MCA_INTEGSCSI;
- mca_dev->pos_register = which_scsi;
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
-
- /* Turn off motherboard setup */
-
- outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
-
- /*
- * Now loop over MCA slots: put each adapter into setup mode, and
- * read its POS registers. Then put adapter setup off.
- */
-
- for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
- outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
- if (!mca_read_and_store_pos(pos))
- continue;
-
- mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if (unlikely(!mca_dev))
- goto out_unlock_nomem;
-
- for (j = 0; j < 8; j++)
- mca_dev->pos[j] = pos[j];
-
- mca_dev->driver_loaded = 0;
- mca_dev->slot = i;
- mca_dev->pos_register = 0;
- mca_configure_adapter_status(mca_dev);
- mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- }
- outb_p(0, MCA_ADAPTER_SETUP_REG);
-
- /* Enable interrupts and return memory start */
- spin_unlock_irq(&mca_lock);
-
- for (i = 0; i < MCA_STANDARD_RESOURCES; i++)
- request_resource(&ioport_resource, mca_standard_resources + i);
-
- mca_do_proc_init();
-
- return 0;
-
- out_unlock_nomem:
- spin_unlock_irq(&mca_lock);
- out_nomem:
- printk(KERN_EMERG "Failed memory allocation in MCA setup!\n");
- return -ENOMEM;
-}
-
-subsys_initcall(mca_init);
-
-/*--------------------------------------------------------------------*/
-
-static __kprobes void
-mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
-{
- int slot = mca_dev->slot;
-
- if (slot == MCA_INTEGSCSI) {
- printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_INTEGVIDEO) {
- printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
- mca_dev->name);
- } else if (slot == MCA_MOTHERBOARD) {
- printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
- mca_dev->name);
- }
-
- /* More info available in POS 6 and 7? */
-
- if (check_flag) {
- unsigned char pos6, pos7;
-
- pos6 = mca_device_read_pos(mca_dev, 6);
- pos7 = mca_device_read_pos(mca_dev, 7);
-
- printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7);
- }
-
-} /* mca_handle_nmi_slot */
-
-/*--------------------------------------------------------------------*/
-
-static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
-{
- struct mca_device *mca_dev = to_mca_device(dev);
- unsigned char pos5;
-
- pos5 = mca_device_read_pos(mca_dev, 5);
-
- if (!(pos5 & 0x80)) {
- /*
- * Bit 7 of POS 5 is reset when this adapter has a hardware
- * error. Bit 7 it reset if there's error information
- * available in POS 6 and 7.
- */
- mca_handle_nmi_device(mca_dev, !(pos5 & 0x40));
- return 1;
- }
- return 0;
-}
-
-void __kprobes mca_handle_nmi(void)
-{
- /*
- * First try - scan the various adapters and see if a specific
- * adapter was responsible for the error.
- */
- bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
-}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index c9bda6d6035..24b852b61be 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -298,20 +298,31 @@ static ssize_t reload_store(struct device *dev,
const char *buf, size_t size)
{
unsigned long val;
- int cpu = dev->id;
- int ret = 0;
- char *end;
+ int cpu;
+ ssize_t ret = 0, tmp_ret;
- val = simple_strtoul(buf, &end, 0);
- if (end == buf)
+ /* allow reload only from the BSP */
+ if (boot_cpu_data.cpu_index != dev->id)
return -EINVAL;
- if (val == 1) {
- get_online_cpus();
- if (cpu_online(cpu))
- ret = reload_for_cpu(cpu);
- put_online_cpus();
+ ret = kstrtoul(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return size;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ tmp_ret = reload_for_cpu(cpu);
+ if (tmp_ret != 0)
+ pr_warn("Error reloading microcode on CPU %d\n", cpu);
+
+ /* save retval of the first encountered reload error */
+ if (!ret)
+ ret = tmp_ret;
}
+ put_online_cpus();
if (!ret)
ret = size;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ca470e4c92d..d2b56489d70 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,7 +27,6 @@
#include <asm/proto.h>
#include <asm/bios_ebda.h>
#include <asm/e820.h>
-#include <asm/trampoline.h>
#include <asm/setup.h>
#include <asm/smp.h>
@@ -97,7 +96,7 @@ static void __init MP_bus_info(struct mpc_bus *m)
set_bit(m->busid, mp_bus_not_pci);
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -105,12 +104,10 @@ static void __init MP_bus_info(struct mpc_bus *m)
x86_init.mpparse.mpc_oem_pci_bus(m);
clear_bit(m->busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+#ifdef CONFIG_EISA
mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
- mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
#endif
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -368,9 +365,6 @@ static void __init construct_ioapic_table(int mpc_default_type)
case 3:
memcpy(bus.bustype, "EISA ", 6);
break;
- case 4:
- case 7:
- memcpy(bus.bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -573,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
struct mpf_intel *mpf;
unsigned long mem;
- apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
- bp, length);
+ apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
+ base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -589,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#endif
mpf_found = mpf;
- printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
- mpf, (u64)virt_to_phys(mpf));
+ printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+ (unsigned long long) virt_to_phys(mpf),
+ (unsigned long long) virt_to_phys(mpf) +
+ sizeof(*mpf) - 1, mpf);
mem = virt_to_phys(mpf);
memblock_reserve(mem, sizeof(*mpf));
@@ -623,7 +619,7 @@ void __init default_find_smp_config(void)
return;
/*
* If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
+ * configuration is in an EISA bus machine with an
* extended bios data area.
*
* there is a real-mode segmented pointer pointing to the
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 32856fa4384..a0b2f84457b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -19,8 +19,6 @@
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/mca.h>
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -31,14 +29,6 @@
#include <asm/nmi.h>
#include <asm/x86_init.h>
-#define NMI_MAX_NAMELEN 16
-struct nmiaction {
- struct list_head list;
- nmi_handler_t handler;
- unsigned int flags;
- char *name;
-};
-
struct nmi_desc {
spinlock_t lock;
struct list_head head;
@@ -54,6 +44,14 @@ static struct nmi_desc nmi_desc[NMI_MAX] =
.lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
.head = LIST_HEAD_INIT(nmi_desc[1].head),
},
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[2].head),
+ },
+ {
+ .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
+ .head = LIST_HEAD_INIT(nmi_desc[3].head),
+ },
};
@@ -84,7 +82,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
-static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -107,11 +105,14 @@ static int notrace __kprobes nmi_handle(unsigned int type, struct pt_regs *regs,
return handled;
}
-static int __setup_nmi(unsigned int type, struct nmiaction *action)
+int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
struct nmi_desc *desc = nmi_to_desc(type);
unsigned long flags;
+ if (!action->handler)
+ return -EINVAL;
+
spin_lock_irqsave(&desc->lock, flags);
/*
@@ -120,6 +121,8 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
* to manage expectations
*/
WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
+ WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
/*
* some handlers need to be executed first otherwise a fake
@@ -133,8 +136,9 @@ static int __setup_nmi(unsigned int type, struct nmiaction *action)
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
+EXPORT_SYMBOL(__register_nmi_handler);
-static struct nmiaction *__free_nmi(unsigned int type, const char *name)
+void unregister_nmi_handler(unsigned int type, const char *name)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *n;
@@ -157,61 +161,16 @@ static struct nmiaction *__free_nmi(unsigned int type, const char *name)
spin_unlock_irqrestore(&desc->lock, flags);
synchronize_rcu();
- return (n);
}
-
-int register_nmi_handler(unsigned int type, nmi_handler_t handler,
- unsigned long nmiflags, const char *devname)
-{
- struct nmiaction *action;
- int retval = -ENOMEM;
-
- if (!handler)
- return -EINVAL;
-
- action = kzalloc(sizeof(struct nmiaction), GFP_KERNEL);
- if (!action)
- goto fail_action;
-
- action->handler = handler;
- action->flags = nmiflags;
- action->name = kstrndup(devname, NMI_MAX_NAMELEN, GFP_KERNEL);
- if (!action->name)
- goto fail_action_name;
-
- retval = __setup_nmi(type, action);
-
- if (retval)
- goto fail_setup_nmi;
-
- return retval;
-
-fail_setup_nmi:
- kfree(action->name);
-fail_action_name:
- kfree(action);
-fail_action:
-
- return retval;
-}
-EXPORT_SYMBOL_GPL(register_nmi_handler);
-
-void unregister_nmi_handler(unsigned int type, const char *name)
-{
- struct nmiaction *a;
-
- a = __free_nmi(type, name);
- if (a) {
- kfree(a->name);
- kfree(a);
- }
-}
-
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
-static notrace __kprobes void
+static __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_SERR, regs, false))
+ return;
+
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
@@ -236,15 +195,19 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
-static notrace __kprobes void
+static __kprobes void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
+ /* check to see if anyone registered against these types of errors */
+ if (nmi_handle(NMI_IO_CHECK, regs, false))
+ return;
+
pr_emerg(
"NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
reason, smp_processor_id());
- show_registers(regs);
+ show_regs(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
@@ -263,7 +226,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
outb(reason, NMI_REASON_PORT);
}
-static notrace __kprobes void
+static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -282,16 +245,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
__this_cpu_add(nmi_stats.unknown, 1);
-#ifdef CONFIG_MCA
- /*
- * Might actually be able to figure out what the guilty party
- * is:
- */
- if (MCA_bus) {
- mca_handle_nmi();
- return;
- }
-#endif
pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
reason, smp_processor_id());
@@ -305,7 +258,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
+static __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 2c39dcd510f..149b8d9c6ad 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -13,6 +13,7 @@
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/init.h>
+#include <linux/percpu.h>
#include <asm/apic.h>
#include <asm/nmi.h>
@@ -41,7 +42,7 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
static void __init init_nmi_testsuite(void)
{
/* trap all the unknown NMIs we may generate */
- register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+ register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
}
static void __init cleanup_nmi_testsuite(void)
@@ -63,7 +64,7 @@ static void __init test_nmi_ipi(struct cpumask *mask)
{
unsigned long timeout;
- if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback,
NMI_FLAG_FIRST, "nmi_selftest")) {
nmi_fail = FAILURE;
return;
@@ -117,15 +118,15 @@ static void __init dotest(void (*testcase_fn)(void), int expected)
unexpected_testcase_failures++;
if (nmi_fail == FAILURE)
- printk("FAILED |");
+ printk(KERN_CONT "FAILED |");
else if (nmi_fail == TIMEOUT)
- printk("TIMEOUT|");
+ printk(KERN_CONT "TIMEOUT|");
else
- printk("ERROR |");
+ printk(KERN_CONT "ERROR |");
dump_stack();
} else {
testcase_successes++;
- printk(" ok |");
+ printk(KERN_CONT " ok |");
}
testcase_total++;
@@ -150,10 +151,10 @@ void __init nmi_selftest(void)
print_testname("remote IPI");
dotest(remote_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
print_testname("local IPI");
dotest(local_ipi, SUCCESS);
- printk("\n");
+ printk(KERN_CONT "\n");
cleanup_nmi_testsuite();
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index ab137605e69..9ce885996fd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -241,16 +241,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- percpu_write(paravirt_lazy_mode, mode);
+ this_cpu_write(paravirt_lazy_mode, mode);
}
static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
+ BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);
- percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
+ this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
@@ -267,7 +267,7 @@ void paravirt_start_context_switch(struct task_struct *prev)
{
BUG_ON(preemptible());
- if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
arch_leave_lazy_mmu_mode();
set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
}
@@ -289,7 +289,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
if (in_interrupt())
return PARAVIRT_LAZY_NONE;
- return percpu_read(paravirt_lazy_mode);
+ return this_cpu_read(paravirt_lazy_mode);
}
void arch_flush_lazy_mmu_mode(void)
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index d0b2fb9ccbb..b72838bae64 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1480,8 +1480,9 @@ cleanup:
static int __init calgary_parse_options(char *p)
{
unsigned int bridge;
+ unsigned long val;
size_t len;
- char* endp;
+ ssize_t ret;
while (*p) {
if (!strncmp(p, "64k", 3))
@@ -1512,10 +1513,11 @@ static int __init calgary_parse_options(char *p)
++p;
if (*p == '\0')
break;
- bridge = simple_strtoul(p, &endp, 0);
- if (p == endp)
+ ret = kstrtoul(p, 0, &val);
+ if (ret)
break;
+ bridge = val;
if (bridge < MAX_PHB_BUS_NUM) {
printk(KERN_INFO "Calgary: disabling "
"translation for PHB %#x\n", bridge);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 3003250ac51..c0f420f76cd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -101,13 +101,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
{
unsigned long dma_mask;
struct page *page;
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
dma_addr_t addr;
dma_mask = dma_alloc_coherent_mask(dev, flag);
flag |= __GFP_ZERO;
again:
- page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
+ page = NULL;
+ if (!(flag & GFP_ATOMIC))
+ page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ if (!page)
+ page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
if (!page)
return NULL;
@@ -127,6 +132,16 @@ again:
return page_address(page);
}
+void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t dma_addr, struct dma_attrs *attrs)
+{
+ unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct page *page = virt_to_page(vaddr);
+
+ if (!dma_release_from_contiguous(dev, page, count))
+ free_pages((unsigned long)vaddr, get_order(size));
+}
+
/*
* See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
* parameter documentation.
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index f96050685b4..871be4a84c7 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
return nents;
}
-static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
- dma_addr_t dma_addr, struct dma_attrs *attrs)
-{
- free_pages((unsigned long)vaddr, get_order(size));
-}
-
static void nommu_sync_single_for_device(struct device *dev,
dma_addr_t addr, size_t size,
enum dma_data_direction dir)
@@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
struct dma_map_ops nommu_dma_ops = {
.alloc = dma_generic_alloc_coherent,
- .free = nommu_free_coherent,
+ .free = dma_generic_free_coherent,
.map_sg = nommu_map_sg,
.map_page = nommu_map_page,
.sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1d92a5ab6e8..735279e54e5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -27,6 +27,15 @@
#include <asm/debugreg.h>
#include <asm/nmi.h>
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -47,10 +56,16 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
+/*
+ * this gets called so that we can store lazy state into memory and copy the
+ * current task into the new thread.
+ */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
int ret;
+ unlazy_fpu(src);
+
*dst = *src;
if (fpu_allocated(&src->thread.fpu)) {
memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
@@ -67,10 +82,9 @@ void free_thread_xstate(struct task_struct *tsk)
fpu_free(&tsk->thread.fpu);
}
-void free_thread_info(struct thread_info *ti)
+void arch_release_task_struct(struct task_struct *tsk)
{
- free_thread_xstate(ti->task);
- free_pages((unsigned long)ti, THREAD_ORDER);
+ free_thread_xstate(tsk);
}
void arch_task_cache_init(void)
@@ -81,6 +95,16 @@ void arch_task_cache_init(void)
SLAB_PANIC | SLAB_NOTRACK, NULL);
}
+static inline void drop_fpu(struct task_struct *tsk)
+{
+ /*
+ * Forget coprocessor state..
+ */
+ tsk->fpu_counter = 0;
+ clear_fpu(tsk);
+ clear_used_math();
+}
+
/*
* Free current thread data structures etc..
*/
@@ -103,12 +127,8 @@ void exit_thread(void)
put_cpu();
kfree(bp);
}
-}
-void show_regs(struct pt_regs *regs)
-{
- show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
+ drop_fpu(me);
}
void show_regs_common(void)
@@ -143,12 +163,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- /*
- * Forget coprocessor state..
- */
- tsk->fpu_counter = 0;
- clear_fpu(tsk);
- clear_used_math();
+ drop_fpu(tsk);
}
static void hard_disable_TSC(void)
@@ -377,7 +392,7 @@ static inline void play_dead(void)
#ifdef CONFIG_X86_64
void enter_idle(void)
{
- percpu_write(is_idle, 1);
+ this_cpu_write(is_idle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
@@ -516,26 +531,6 @@ void stop_this_cpu(void *dummy)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-/*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
- *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
- */
-void cpu_idle_wait(void)
-{
- smp_mb();
- /* kick all the CPUs so that they exit out of pm_idle */
- smp_call_function(do_nothing, NULL, 1);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
@@ -594,9 +589,17 @@ int mwait_usable(const struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
+ /* Use mwait if idle=mwait boot option is given */
if (boot_option_idle_override == IDLE_FORCE_MWAIT)
return 1;
+ /*
+ * Any idle= boot option other than idle=mwait means that we must not
+ * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
+ */
+ if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+ return 0;
+
if (c->cpuid_level < MWAIT_INFO)
return 0;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ae6847303e2..516fa186121 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -126,15 +126,6 @@ void release_thread(struct task_struct *dead_task)
release_vm86_irqs(dead_task);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
@@ -302,7 +293,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
switch_fpu_finish(next_p, fpu);
- percpu_write(current_task, next_p);
+ this_cpu_write(current_task, next_p);
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 43d8b48b23e..61cdf7fdf09 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -145,15 +145,6 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
return get_desc_base(&t->thread.tls_array[tls]);
}
-/*
- * This gets called before we allocate a new thread and copy
- * the current task into it.
- */
-void prepare_to_copy(struct task_struct *tsk)
-{
- unlazy_fpu(tsk);
-}
-
int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long unused,
struct task_struct *p, struct pt_regs *regs)
@@ -237,7 +228,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
current->thread.usersp = new_sp;
regs->ip = new_ip;
regs->sp = new_sp;
- percpu_write(old_rsp, new_sp);
+ this_cpu_write(old_rsp, new_sp);
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
@@ -359,11 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = percpu_read(old_rsp);
- percpu_write(old_rsp, next->usersp);
- percpu_write(current_task, next_p);
+ prev->usersp = this_cpu_read(old_rsp);
+ this_cpu_write(old_rsp, next->usersp);
+ this_cpu_write(current_task, next_p);
- percpu_write(kernel_stack,
+ this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 4609190683f..c4c6a5c2bf0 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1415,12 +1415,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
#endif
}
-/*
- * This is declared in linux/regset.h and defined in machine-dependent
- * code. We put the export here to ensure no machine forgets it.
- */
-EXPORT_SYMBOL_GPL(task_user_regset_view);
-
static void fill_sigtrap_info(struct task_struct *tsk,
struct pt_regs *regs,
int error_code, int si_code,
@@ -1480,7 +1474,11 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->flags |= X86_EFLAGS_TF;
/* do the secure computing check first */
- secure_computing(regs->orig_ax);
+ if (secure_computing(regs->orig_ax)) {
+ /* seccomp failures shouldn't expose any additional code. */
+ ret = -1L;
+ goto out;
+ }
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
@@ -1505,6 +1503,7 @@ long syscall_trace_enter(struct pt_regs *regs)
regs->dx, regs->r10);
#endif
+out:
return ret ?: regs->orig_ax;
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d840e69a853..5de92f1abd7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -24,6 +24,7 @@
#ifdef CONFIG_X86_32
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
+# include <asm/realmode.h>
#else
# include <asm/x86_init.h>
#endif
@@ -39,7 +40,8 @@ static int reboot_mode;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
-/* This variable is used privately to keep track of whether or not
+/*
+ * This variable is used privately to keep track of whether or not
* reboot_type is still set to its default value (i.e., reboot= hasn't
* been set on the command line). This is needed so that we can
* suppress DMI scanning for reboot quirks. Without it, it's
@@ -51,7 +53,8 @@ static int reboot_default = 1;
static int reboot_cpu = -1;
#endif
-/* This is set if we need to go through the 'emergency' path.
+/*
+ * This is set if we need to go through the 'emergency' path.
* When machine_emergency_restart() is called, we may be on
* an inconsistent state and won't be able to do a clean cleanup
*/
@@ -60,22 +63,24 @@ static int reboot_emergency;
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
- warm Don't set the cold reboot flag
- cold Set the cold reboot flag
- bios Reboot by jumping through the BIOS (only for X86_32)
- smp Reboot by executing reset on BSP or other CPU (only for X86_32)
- triple Force a triple fault (init)
- kbd Use the keyboard controller. cold reset (default)
- acpi Use the RESET_REG in the FADT
- efi Use efi reset_system runtime service
- pci Use the so-called "PCI reset register", CF9
- force Avoid anything that could hang.
+/*
+ * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
+ * warm Don't set the cold reboot flag
+ * cold Set the cold reboot flag
+ * bios Reboot by jumping through the BIOS (only for X86_32)
+ * smp Reboot by executing reset on BSP or other CPU (only for X86_32)
+ * triple Force a triple fault (init)
+ * kbd Use the keyboard controller. cold reset (default)
+ * acpi Use the RESET_REG in the FADT
+ * efi Use efi reset_system runtime service
+ * pci Use the so-called "PCI reset register", CF9
+ * force Avoid anything that could hang.
*/
static int __init reboot_setup(char *str)
{
for (;;) {
- /* Having anything passed on the command line via
+ /*
+ * Having anything passed on the command line via
* reboot= will cause us to disable DMI checking
* below.
*/
@@ -98,9 +103,11 @@ static int __init reboot_setup(char *str)
if (isdigit(*(str+2)))
reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
}
- /* we will leave sorting out the final value
- when we are ready to reboot, since we might not
- have detected BSP APIC ID or smp_num_cpu */
+ /*
+ * We will leave sorting out the final value
+ * when we are ready to reboot, since we might not
+ * have detected BSP APIC ID or smp_num_cpu
+ */
break;
#endif /* CONFIG_SMP */
@@ -150,6 +157,62 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
return 0;
}
+void machine_real_restart(unsigned int type)
+{
+ void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int))
+ real_mode_header->machine_real_restart_asm;
+
+ local_irq_disable();
+
+ /*
+ * Write zero to CMOS register number 0x0f, which the BIOS POST
+ * routine will recognize as telling it to do a proper reboot. (Well
+ * that's what this book in front of me says -- it may only apply to
+ * the Phoenix BIOS though, it's not clear). At the same time,
+ * disable NMIs by setting the top bit in the CMOS address register,
+ * as we're about to do peculiar things to the CPU. I'm not sure if
+ * `outb_p' is needed instead of just `outb'. Use it to be on the
+ * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
+ */
+ spin_lock(&rtc_lock);
+ CMOS_WRITE(0x00, 0x8f);
+ spin_unlock(&rtc_lock);
+
+ /*
+ * Switch back to the initial page table.
+ */
+ load_cr3(initial_page_table);
+
+ /*
+ * Write 0x1234 to absolute memory location 0x472. The BIOS reads
+ * this on booting to tell it to "Bypass memory test (also warm
+ * boot)". This seems like a fairly standard thing that gets set by
+ * REBOOT.COM programs, and the previous reset routine did this
+ * too. */
+ *((unsigned short *)0x472) = reboot_mode;
+
+ /* Jump to the identity-mapped low memory code */
+ restart_lowmem(type);
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(machine_real_restart);
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+/*
+ * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
+ */
+static int __init set_pci_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_CF9) {
+ reboot_type = BOOT_CF9;
+ printk(KERN_INFO "%s series board detected. "
+ "Selecting PCI-method for reboots.\n", d->ident);
+ }
+ return 0;
+}
+
static int __init set_kbd_reboot(const struct dmi_system_id *d)
{
if (reboot_type != BOOT_KBD) {
@@ -159,7 +222,12 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
return 0;
}
+/*
+ * This is a single dmi_table handling all reboot quirks. Note that
+ * REBOOT_BIOS is only available for 32bit
+ */
static struct dmi_system_id __initdata reboot_dmi_table[] = {
+#ifdef CONFIG_X86_32
{ /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
.ident = "Dell E520",
@@ -184,7 +252,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -192,7 +260,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -201,7 +269,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 745",
.matches = {
@@ -210,7 +278,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 330",
.matches = {
@@ -219,7 +287,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 360",
.matches = {
@@ -228,7 +296,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
- { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
.callback = set_bios_reboot,
.ident = "Dell OptiPlex 760",
.matches = {
@@ -301,7 +369,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
},
},
- { /* Handle problems with rebooting on ASUS P4S800 */
+ { /* Handle problems with rebooting on ASUS P4S800 */
.callback = set_bios_reboot,
.ident = "ASUS P4S800",
.matches = {
@@ -309,7 +377,9 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
},
},
- { /* Handle reboot issue on Acer Aspire one */
+#endif /* CONFIG_X86_32 */
+
+ { /* Handle reboot issue on Acer Aspire one */
.callback = set_kbd_reboot,
.ident = "Acer Aspire One A110",
.matches = {
@@ -317,96 +387,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
},
},
- { }
-};
-
-static int __init reboot_init(void)
-{
- /* Only do the DMI check if reboot_type hasn't been overridden
- * on the command line
- */
- if (reboot_default) {
- dmi_check_system(reboot_dmi_table);
- }
- return 0;
-}
-core_initcall(reboot_init);
-
-extern const unsigned char machine_real_restart_asm[];
-extern const u64 machine_real_restart_gdt[3];
-
-void machine_real_restart(unsigned int type)
-{
- void *restart_va;
- unsigned long restart_pa;
- void (*restart_lowmem)(unsigned int);
- u64 *lowmem_gdt;
-
- local_irq_disable();
-
- /* Write zero to CMOS register number 0x0f, which the BIOS POST
- routine will recognize as telling it to do a proper reboot. (Well
- that's what this book in front of me says -- it may only apply to
- the Phoenix BIOS though, it's not clear). At the same time,
- disable NMIs by setting the top bit in the CMOS address register,
- as we're about to do peculiar things to the CPU. I'm not sure if
- `outb_p' is needed instead of just `outb'. Use it to be on the
- safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
- */
- spin_lock(&rtc_lock);
- CMOS_WRITE(0x00, 0x8f);
- spin_unlock(&rtc_lock);
-
- /*
- * Switch back to the initial page table.
- */
- load_cr3(initial_page_table);
-
- /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
- this on booting to tell it to "Bypass memory test (also warm
- boot)". This seems like a fairly standard thing that gets set by
- REBOOT.COM programs, and the previous reset routine did this
- too. */
- *((unsigned short *)0x472) = reboot_mode;
-
- /* Patch the GDT in the low memory trampoline */
- lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
-
- restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
- restart_pa = virt_to_phys(restart_va);
- restart_lowmem = (void (*)(unsigned int))restart_pa;
-
- /* GDT[0]: GDT self-pointer */
- lowmem_gdt[0] =
- (u64)(sizeof(machine_real_restart_gdt) - 1) +
- ((u64)virt_to_phys(lowmem_gdt) << 16);
- /* GDT[1]: 64K real mode code segment */
- lowmem_gdt[1] =
- GDT_ENTRY(0x009b, restart_pa, 0xffff);
-
- /* Jump to the identity-mapped low memory code */
- restart_lowmem(type);
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(machine_real_restart);
-#endif
-
-#endif /* CONFIG_X86_32 */
-
-/*
- * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
- */
-static int __init set_pci_reboot(const struct dmi_system_id *d)
-{
- if (reboot_type != BOOT_CF9) {
- reboot_type = BOOT_CF9;
- printk(KERN_INFO "%s series board detected. "
- "Selecting PCI-method for reboots.\n", d->ident);
- }
- return 0;
-}
-
-static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
{ /* Handle problems with rebooting on Apple MacBook5 */
.callback = set_pci_reboot,
.ident = "Apple MacBook5",
@@ -471,20 +451,28 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
},
},
+ { /* Handle problems with rebooting on the Precision M6600. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ },
+ },
{ }
};
-static int __init pci_reboot_init(void)
+static int __init reboot_init(void)
{
- /* Only do the DMI check if reboot_type hasn't been overridden
+ /*
+ * Only do the DMI check if reboot_type hasn't been overridden
* on the command line
*/
- if (reboot_default) {
- dmi_check_system(pci_reboot_dmi_table);
- }
+ if (reboot_default)
+ dmi_check_system(reboot_dmi_table);
return 0;
}
-core_initcall(pci_reboot_init);
+core_initcall(reboot_init);
static inline void kb_wait(void)
{
@@ -502,14 +490,14 @@ static void vmxoff_nmi(int cpu, struct pt_regs *regs)
cpu_emergency_vmxoff();
}
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization
- */
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
static void emergency_vmx_disable_all(void)
{
/* Just make sure we won't change CPUs while doing this */
local_irq_disable();
- /* We need to disable VMX on all CPUs before rebooting, otherwise
+ /*
+ * We need to disable VMX on all CPUs before rebooting, otherwise
* we risk hanging up the machine, because the CPU ignore INIT
* signals when VMX is enabled.
*
@@ -528,8 +516,7 @@ static void emergency_vmx_disable_all(void)
* is still enabling VMX.
*/
if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU.
- */
+ /* Disable VMX on this CPU. */
cpu_vmxoff();
/* Halt and disable VMX on the other CPUs */
@@ -574,12 +561,12 @@ static void native_machine_emergency_restart(void)
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
case BOOT_KBD:
- mach_reboot_fixups(); /* for board specific fixups */
+ mach_reboot_fixups(); /* For board specific fixups */
for (i = 0; i < 10; i++) {
kb_wait();
udelay(50);
- outb(0xfe, 0x64); /* pulse reset low */
+ outb(0xfe, 0x64); /* Pulse reset low */
udelay(50);
}
if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
@@ -621,7 +608,7 @@ static void native_machine_emergency_restart(void)
case BOOT_CF9:
port_cf9_safe = true;
- /* fall through */
+ /* Fall through */
case BOOT_CF9_COND:
if (port_cf9_safe) {
@@ -659,9 +646,12 @@ void native_machine_shutdown(void)
/* Make certain I only run on the appropriate processor */
set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
- /* O.K Now that I'm on the appropriate processor,
- * stop all of the others.
+ /*
+ * O.K Now that I'm on the appropriate processor, stop all of the
+ * others. Also disable the local irq to not receive the per-cpu
+ * timer interrupt which may trigger scheduler's load balance.
*/
+ local_irq_disable();
stop_other_cpus();
#endif
@@ -697,12 +687,11 @@ static void native_machine_restart(char *__unused)
static void native_machine_halt(void)
{
- /* stop other cpus and apics */
+ /* Stop other cpus and apics */
machine_shutdown();
tboot_shutdown(TB_SHUTDOWN_HALT);
- /* stop this cpu */
stop_this_cpu(NULL);
}
@@ -713,7 +702,7 @@ static void native_machine_power_off(void)
machine_shutdown();
pm_power_off();
}
- /* a fallback in case there is no PM info available */
+ /* A fallback in case there is no PM info available */
tboot_shutdown(TB_SHUTDOWN_HALT);
}
@@ -775,7 +764,8 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu = raw_smp_processor_id();
- /* Don't do anything if this handler is invoked on crashing cpu.
+ /*
+ * Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
@@ -799,7 +789,8 @@ static void smp_send_nmi_allbutself(void)
apic->send_IPI_allbutself(NMI_VECTOR);
}
-/* Halt all other CPUs, calling the specified function on each of them
+/*
+ * Halt all other CPUs, calling the specified function on each of them
*
* This function can be used to halt all other CPUs on crash
* or emergency reboot time. The function passed as parameter
@@ -810,7 +801,7 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
unsigned long msecs;
local_irq_disable();
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
+ /* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
shootdown_callback = callback;
@@ -819,8 +810,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* Would it be better to replace the trap vector here? */
if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback,
NMI_FLAG_FIRST, "crash"))
- return; /* return what? */
- /* Ensure the new callback function is set before sending
+ return; /* Return what? */
+ /*
+ * Ensure the new callback function is set before sending
* out the NMI
*/
wmb();
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
deleted file mode 100644
index 1d5c46df0d7..00000000000
--- a/arch/x86/kernel/reboot_32.S
+++ /dev/null
@@ -1,135 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-/*
- * The following code and data reboots the machine by switching to real
- * mode and jumping to the BIOS reset entry point, as if the CPU has
- * really been reset. The previous version asked the keyboard
- * controller to pulse the CPU reset line, which is more thorough, but
- * doesn't work with at least one type of 486 motherboard. It is easy
- * to stop this code working; hence the copious comments.
- *
- * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
- */
- .section ".x86_trampoline","a"
- .balign 16
- .code32
-ENTRY(machine_real_restart_asm)
-r_base = .
- /* Get our own relocated address */
- call 1f
-1: popl %ebx
- subl $(1b - r_base), %ebx
-
- /* Compute the equivalent real-mode segment */
- movl %ebx, %ecx
- shrl $4, %ecx
-
- /* Patch post-real-mode segment jump */
- movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
- movw %ax, (101f - r_base)(%ebx)
- movw %cx, (102f - r_base)(%ebx)
-
- /* Set up the IDT for real mode. */
- lidtl (machine_real_restart_idt - r_base)(%ebx)
-
- /*
- * Set up a GDT from which we can load segment descriptors for real
- * mode. The GDT is not used in real mode; it is just needed here to
- * prepare the descriptors.
- */
- lgdtl (machine_real_restart_gdt - r_base)(%ebx)
-
- /*
- * Load the data segment registers with 16-bit compatible values
- */
- movl $16, %ecx
- movl %ecx, %ds
- movl %ecx, %es
- movl %ecx, %fs
- movl %ecx, %gs
- movl %ecx, %ss
- ljmpl $8, $1f - r_base
-
-/*
- * This is 16-bit protected mode code to disable paging and the cache,
- * switch to real mode and jump to the BIOS reset code.
- *
- * The instruction that switches to real mode by writing to CR0 must be
- * followed immediately by a far jump instruction, which set CS to a
- * valid value for real mode, and flushes the prefetch queue to avoid
- * running instructions that have already been decoded in protected
- * mode.
- *
- * Clears all the flags except ET, especially PG (paging), PE
- * (protected-mode enable) and TS (task switch for coprocessor state
- * save). Flushes the TLB after paging has been disabled. Sets CD and
- * NW, to disable the cache on a 486, and invalidates the cache. This
- * is more like the state of a 486 after reset. I don't know if
- * something else should be done for other chips.
- *
- * More could be done here to set up the registers as if a CPU reset had
- * occurred; hopefully real BIOSs don't assume much. This is not the
- * actual BIOS entry point, anyway (that is at 0xfffffff0).
- *
- * Most of this work is probably excessive, but it is what is tested.
- */
- .code16
-1:
- xorl %ecx, %ecx
- movl %cr0, %eax
- andl $0x00000011, %eax
- orl $0x60000000, %eax
- movl %eax, %cr0
- movl %ecx, %cr3
- movl %cr0, %edx
- andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
- jz 2f
- wbinvd
-2:
- andb $0x10, %al
- movl %eax, %cr0
- .byte 0xea /* ljmpw */
-101: .word 0 /* Offset */
-102: .word 0 /* Segment */
-
-bios:
- ljmpw $0xf000, $0xfff0
-
-apm:
- movw $0x1000, %ax
- movw %ax, %ss
- movw $0xf000, %sp
- movw $0x5307, %ax
- movw $0x0001, %bx
- movw $0x0003, %cx
- int $0x15
-
-END(machine_real_restart_asm)
-
- .balign 16
- /* These must match <asm/reboot.h */
-dispatch_table:
- .word bios - r_base
- .word apm - r_base
-END(dispatch_table)
-
- .balign 16
-machine_real_restart_idt:
- .word 0xffff /* Length - real mode default value */
- .long 0 /* Base - real mode default value */
-END(machine_real_restart_idt)
-
- .balign 16
-ENTRY(machine_real_restart_gdt)
- .quad 0 /* Self-pointer, filled in by PM code */
- .quad 0 /* 16-bit code segment, filled in by PM code */
- /*
- * 16-bit data segment with the selector value 16 = 0x10 and
- * base value 0x100; since this is consistent with real mode
- * semantics we don't have to reload the segments once CR0.PE = 0.
- */
- .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
-END(machine_real_restart_gdt)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1a290156205..16be6dc14db 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -34,7 +34,6 @@
#include <linux/memblock.h>
#include <linux/seq_file.h>
#include <linux/console.h>
-#include <linux/mca.h>
#include <linux/root_dev.h>
#include <linux/highmem.h>
#include <linux/module.h>
@@ -50,6 +49,7 @@
#include <asm/pci-direct.h>
#include <linux/init_ohci1394_dma.h>
#include <linux/kvm_para.h>
+#include <linux/dma-contiguous.h>
#include <linux/errno.h>
#include <linux/kernel.h>
@@ -73,7 +73,7 @@
#include <asm/mtrr.h>
#include <asm/apic.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
#include <asm/setup.h>
@@ -179,12 +179,6 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
/* common cpu data for all cpus */
struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
EXPORT_SYMBOL(boot_cpu_data);
-static void set_mca_bus(int x)
-{
-#ifdef CONFIG_MCA
- MCA_bus = x;
-#endif
-}
unsigned int def_to_bigsmp;
@@ -340,8 +334,8 @@ static void __init relocate_initrd(void)
memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
- printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
- ramdisk_here, ramdisk_here + ramdisk_size);
+ printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
q = (char *)initrd_start;
@@ -372,8 +366,8 @@ static void __init relocate_initrd(void)
/* high pages is not converted by early_res_to_bootmem */
ramdisk_image = boot_params.hdr.ramdisk_image;
ramdisk_size = boot_params.hdr.ramdisk_size;
- printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
- " %08llx - %08llx\n",
+ printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
+ " [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
ramdisk_here, ramdisk_here + ramdisk_size - 1);
}
@@ -393,14 +387,13 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
- printk(KERN_ERR "initrd too large to handle, "
- "disabling initrd\n");
- return;
+ panic("initrd too large to handle, "
+ "disabling initrd (%lld needed, %lld available)\n",
+ ramdisk_size, end_of_lowmem>>1);
}
- printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
- ramdisk_end);
+ printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
+ ramdisk_end - 1);
if (ramdisk_end <= end_of_lowmem) {
@@ -717,7 +710,6 @@ void __init setup_arch(char **cmdline_p)
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
if (boot_params.sys_desc_table.length != 0) {
- set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
machine_id = boot_params.sys_desc_table.table[0];
machine_submodel_id = boot_params.sys_desc_table.table[1];
BIOS_revision = boot_params.sys_desc_table.table[2];
@@ -914,10 +906,10 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
#endif
- printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
- max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
+ (max_pfn_mapped<<PAGE_SHIFT) - 1);
- setup_trampolines();
+ setup_real_mode();
init_gbpages();
@@ -934,6 +926,7 @@ void __init setup_arch(char **cmdline_p)
}
#endif
memblock.current_limit = get_max_mapped();
+ dma_contiguous_reserve(0);
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -975,6 +968,8 @@ void __init setup_arch(char **cmdline_p)
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
mmu_cr4_features = read_cr4();
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
}
#ifdef CONFIG_X86_32
@@ -1012,7 +1007,8 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
init_apic_mappings();
- ioapic_and_gsi_init();
+ if (x86_io_apic_ops.init)
+ x86_io_apic_ops.init();
kvm_guest_init();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 041af2fd088..21af737053a 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -479,18 +479,8 @@ asmlinkage int
sys_sigsuspend(int history0, int history1, old_sigset_t mask)
{
sigset_t blocked;
-
- current->saved_sigmask = current->blocked;
-
- mask &= _BLOCKABLE;
siginitset(&blocked, mask);
- set_current_blocked(&blocked);
-
- current->state = TASK_INTERRUPTIBLE;
- schedule();
-
- set_restore_sigmask();
- return -ERESTARTNOHAND;
+ return sigsuspend(&blocked);
}
asmlinkage int
@@ -565,7 +555,6 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
sizeof(frame->extramask))))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->sc, &ax))
@@ -591,7 +580,6 @@ long sys_rt_sigreturn(struct pt_regs *regs)
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
@@ -657,42 +645,28 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
struct pt_regs *regs)
{
int usig = signr_convert(sig);
- sigset_t *set = &current->blocked;
- int ret;
-
- if (current_thread_info()->status & TS_RESTORE_SIGMASK)
- set = &current->saved_sigmask;
+ sigset_t *set = sigmask_to_save();
/* Set up the stack frame */
if (is_ia32) {
if (ka->sa.sa_flags & SA_SIGINFO)
- ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+ return ia32_setup_rt_frame(usig, ka, info, set, regs);
else
- ret = ia32_setup_frame(usig, ka, set, regs);
+ return ia32_setup_frame(usig, ka, set, regs);
#ifdef CONFIG_X86_X32_ABI
} else if (is_x32) {
- ret = x32_setup_rt_frame(usig, ka, info,
+ return x32_setup_rt_frame(usig, ka, info,
(compat_sigset_t *)set, regs);
#endif
} else {
- ret = __setup_rt_frame(sig, ka, info, set, regs);
+ return __setup_rt_frame(sig, ka, info, set, regs);
}
-
- if (ret) {
- force_sigsegv(sig, current);
- return -EFAULT;
- }
-
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- return ret;
}
-static int
+static void
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
struct pt_regs *regs)
{
- int ret;
-
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -723,10 +697,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
regs->flags &= ~X86_EFLAGS_TF;
- ret = setup_rt_frame(sig, ka, info, regs);
-
- if (ret)
- return ret;
+ if (setup_rt_frame(sig, ka, info, regs) < 0) {
+ force_sigsegv(sig, current);
+ return;
+ }
/*
* Clear the direction flag as per the ABI for function entry.
@@ -741,12 +715,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
regs->flags &= ~X86_EFLAGS_TF;
- block_sigmask(ka, sig);
-
- tracehook_signal_handler(sig, info, ka, regs,
- test_thread_flag(TIF_SINGLESTEP));
-
- return 0;
+ signal_delivered(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
}
#ifdef CONFIG_X86_32
@@ -767,16 +737,6 @@ static void do_signal(struct pt_regs *regs)
siginfo_t info;
int signr;
- /*
- * We want the common case to go fast, which is why we may in certain
- * cases get here from kernel mode. Just return without doing anything
- * if so.
- * X86_32: vm86 regs switched out by assembly code before reaching
- * here, so testing against kernel CS suffices.
- */
- if (!user_mode(regs))
- return;
-
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
/* Whee! Actually deliver the signal. */
@@ -806,10 +766,7 @@ static void do_signal(struct pt_regs *regs)
* If there's no signal to deliver, we just put the saved sigmask
* back.
*/
- if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
- current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
- set_current_blocked(&current->saved_sigmask);
- }
+ restore_saved_sigmask();
}
/*
@@ -837,8 +794,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
- if (current->replacement_session_keyring)
- key_replace_session_keyring();
}
if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
@@ -946,7 +901,6 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- sigdelsetmask(&set, ~_BLOCKABLE);
set_current_blocked(&set);
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 66c74f481ca..48d2b7ded42 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -109,6 +109,9 @@
* about nothing of note with C stepping upwards.
*/
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool smp_no_nmi_ipi = false;
+
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
@@ -149,8 +152,6 @@ void native_send_call_func_ipi(const struct cpumask *mask)
free_cpumask_var(allbutself);
}
-static atomic_t stopping_cpu = ATOMIC_INIT(-1);
-
static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
{
/* We are registered on stopping cpu too, avoid spurious NMI */
@@ -162,7 +163,19 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-static void native_nmi_stop_other_cpus(int wait)
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+asmlinkage void smp_reboot_interrupt(void)
+{
+ ack_APIC_irq();
+ irq_enter();
+ stop_this_cpu(NULL);
+ irq_exit();
+}
+
+static void native_stop_other_cpus(int wait)
{
unsigned long flags;
unsigned long timeout;
@@ -174,20 +187,25 @@ static void native_nmi_stop_other_cpus(int wait)
* Use an own vector here because smp_call_function
* does lots of things not suitable in a panic situation.
*/
+
+ /*
+ * We start by using the REBOOT_VECTOR irq.
+ * The irq is treated as a sync point to allow critical
+ * regions of code on other cpus to release their spin locks
+ * and re-enable irqs. Jumping straight to an NMI might
+ * accidentally cause deadlocks with further shutdown/panic
+ * code. By syncing, we give the cpus up to one second to
+ * finish their work before we force them off with the NMI.
+ */
if (num_online_cpus() > 1) {
/* did someone beat us here? */
if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
return;
- if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
- NMI_FLAG_FIRST, "smp_stop"))
- /* Note: we ignore failures here */
- return;
-
- /* sync above data before sending NMI */
+ /* sync above data before sending IRQ */
wmb();
- apic->send_IPI_allbutself(NMI_VECTOR);
+ apic->send_IPI_allbutself(REBOOT_VECTOR);
/*
* Don't wait longer than a second if the caller
@@ -197,63 +215,37 @@ static void native_nmi_stop_other_cpus(int wait)
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+ if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
+ if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop"))
+ /* Note: we ignore failures here */
+ /* Hope the REBOOT_IRQ is good enough */
+ goto finish;
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-asmlinkage void smp_reboot_interrupt(void)
-{
- ack_APIC_irq();
- irq_enter();
- stop_this_cpu(NULL);
- irq_exit();
-}
-
-static void native_irq_stop_other_cpus(int wait)
-{
- unsigned long flags;
- unsigned long timeout;
+ /* sync above data before sending IRQ */
+ wmb();
- if (reboot_force)
- return;
+ pr_emerg("Shutting down cpus with NMI\n");
- /*
- * Use an own vector here because smp_call_function
- * does lots of things not suitable in a panic situation.
- * On most systems we could also use an NMI here,
- * but there are a few systems around where NMI
- * is problematic so stay with an non NMI for now
- * (this implies we cannot stop CPUs spinning with irq off
- * currently)
- */
- if (num_online_cpus() > 1) {
- apic->send_IPI_allbutself(REBOOT_VECTOR);
+ apic->send_IPI_allbutself(NMI_VECTOR);
/*
- * Don't wait longer than a second if the caller
+ * Don't wait longer than a 10 ms if the caller
* didn't ask us to wait.
*/
- timeout = USEC_PER_SEC;
+ timeout = USEC_PER_MSEC * 10;
while (num_online_cpus() > 1 && (wait || timeout--))
udelay(1);
}
+finish:
local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
}
-static void native_smp_disable_nmi_ipi(void)
-{
- smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
-}
-
/*
* Reschedule call back.
*/
@@ -287,8 +279,8 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
static int __init nonmi_ipi_setup(char *str)
{
- native_smp_disable_nmi_ipi();
- return 1;
+ smp_no_nmi_ipi = true;
+ return 1;
}
__setup("nonmi_ipi", nonmi_ipi_setup);
@@ -298,7 +290,7 @@ struct smp_ops smp_ops = {
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .stop_other_cpus = native_nmi_stop_other_cpus,
+ .stop_other_cpus = native_stop_other_cpus,
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6e1e406038c..7bd8a082365 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -57,7 +57,7 @@
#include <asm/nmi.h>
#include <asm/irq.h>
#include <asm/idle.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
#include <asm/pgtable.h>
@@ -73,23 +73,13 @@
#include <asm/smpboot_hooks.h>
#include <asm/i8259.h>
+#include <asm/realmode.h>
+
/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
-/* Store all idle threads, this can be reused instead of creating
-* a new thread. Also avoids complicated thread destroy functionality
-* for idle threads.
-*/
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
-
-/*
* We need this for trampoline_base protection from concurrent accesses when
* off- and onlining cores wildly.
*/
@@ -97,20 +87,16 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
void cpu_hotplug_driver_lock(void)
{
- mutex_lock(&x86_cpu_hotplug_driver_mutex);
+ mutex_lock(&x86_cpu_hotplug_driver_mutex);
}
void cpu_hotplug_driver_unlock(void)
{
- mutex_unlock(&x86_cpu_hotplug_driver_mutex);
+ mutex_unlock(&x86_cpu_hotplug_driver_mutex);
}
ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#else
-static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x) (idle_thread_array[(x)])
-#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
#endif
/* Number of siblings per CPU package */
@@ -315,59 +301,102 @@ void __cpuinit smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
-static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
+static bool __cpuinit
+topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+ "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
+ "[node: %d != %d]. Ignoring dependency.\n",
+ cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
+}
+
+#define link_mask(_m, c1, c2) \
+do { \
+ cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
+ cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
+} while (0)
+
+static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (c->phys_proc_id == o->phys_proc_id &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
+ c->compute_unit_id == o->compute_unit_id)
+ return topology_sane(c, o, "smt");
+
+ } else if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_core_id == o->cpu_core_id) {
+ return topology_sane(c, o, "smt");
+ }
+
+ return false;
+}
+
+static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
- cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
- cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
+ return topology_sane(c, o, "llc");
+
+ return false;
}
+static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id) {
+ if (cpu_has(c, X86_FEATURE_AMD_DCM))
+ return true;
+
+ return topology_sane(c, o, "mc");
+ }
+ return false;
+}
void __cpuinit set_cpu_sibling_map(int cpu)
{
- int i;
+ bool has_mc = boot_cpu_data.x86_max_cores > 1;
+ bool has_smt = smp_num_siblings > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cpuinfo_x86 *o;
+ int i;
cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
- if (smp_num_siblings > 1) {
- for_each_cpu(i, cpu_sibling_setup_mask) {
- struct cpuinfo_x86 *o = &cpu_data(i);
-
- if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
- if (c->phys_proc_id == o->phys_proc_id &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
- c->compute_unit_id == o->compute_unit_id)
- link_thread_siblings(cpu, i);
- } else if (c->phys_proc_id == o->phys_proc_id &&
- c->cpu_core_id == o->cpu_core_id) {
- link_thread_siblings(cpu, i);
- }
- }
- } else {
+ if (!has_smt && !has_mc) {
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ cpumask_set_cpu(cpu, cpu_core_mask(cpu));
+ c->booted_cores = 1;
+ return;
}
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+ for_each_cpu(i, cpu_sibling_setup_mask) {
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_smt && match_smt(c, o)))
+ link_mask(sibling, cpu, i);
+
+ if ((i == cpu) || (has_mc && match_llc(c, o)))
+ link_mask(llc_shared, cpu, i);
- if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
- cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
- c->booted_cores = 1;
- return;
}
+ /*
+ * This needs a separate iteration over the cpus because we rely on all
+ * cpu_sibling_mask links to be set-up.
+ */
for_each_cpu(i, cpu_sibling_setup_mask) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
- cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- cpumask_set_cpu(cpu, cpu_core_mask(i));
+ o = &cpu_data(i);
+
+ if ((i == cpu) || (has_mc && match_mc(c, o))) {
+ link_mask(core, cpu, i);
+
/*
* Does this new cpu bringup a new core?
*/
@@ -393,16 +422,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
/* maps the cpu to the sched domain representing multi-core */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if ((sched_mc_power_savings || sched_smt_power_savings) &&
- !(cpu_has(c, X86_FEATURE_AMD_DCM)))
- return cpu_core_mask(cpu);
- else
- return cpu_llc_shared_mask(cpu);
+ return cpu_llc_shared_mask(cpu);
}
static void impress_friends(void)
@@ -618,22 +638,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-struct create_idle {
- struct work_struct work;
- struct task_struct *idle;
- struct completion done;
- int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
- struct create_idle *c_idle =
- container_of(work, struct create_idle, work);
-
- c_idle->idle = fork_idle(c_idle->cpu);
- complete(&c_idle->done);
-}
-
/* reduce the number of lines printed when booting a large cpu count system */
static void __cpuinit announce_cpu(int cpu, int apicid)
{
@@ -660,61 +664,35 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
* Returns zero if CPU booted OK, else error code from
* ->wakeup_secondary_cpu.
*/
-static int __cpuinit do_boot_cpu(int apicid, int cpu)
+static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
{
+ volatile u32 *trampoline_status =
+ (volatile u32 *) __va(real_mode_header->trampoline_status);
+ /* start_ip had better be page-aligned! */
+ unsigned long start_ip = real_mode_header->trampoline_start;
+
unsigned long boot_error = 0;
- unsigned long start_ip;
int timeout;
- struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
- };
-
- INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
alternatives_smp_switch(1);
- c_idle.idle = get_idle_for_cpu(cpu);
+ idle->thread.sp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + task_stack_page(idle))) - 1);
+ per_cpu(current_task, cpu) = idle;
- /*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
- */
- if (c_idle.idle) {
- c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
- init_idle(c_idle.idle, cpu);
- goto do_rest;
- }
-
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
-
- if (IS_ERR(c_idle.idle)) {
- printk("failed fork for CPU %d\n", cpu);
- destroy_work_on_stack(&c_idle.work);
- return PTR_ERR(c_idle.idle);
- }
-
- set_idle_for_cpu(cpu, c_idle.idle);
-do_rest:
- per_cpu(current_task, cpu) = c_idle.idle;
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(c_idle.idle) -
+ (unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
- stack_start = c_idle.idle->thread.sp;
-
- /* start_ip had better be page-aligned! */
- start_ip = trampoline_address();
+ stack_start = idle->thread.sp;
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -778,8 +756,7 @@ do_rest:
pr_debug("CPU%d: has booted.\n", cpu);
} else {
boot_error = 1;
- if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
- == 0xA5A5A5A5)
+ if (*trampoline_status == 0xA5A5A5A5)
/* trampoline started but...? */
pr_err("CPU%d: Stuck ??\n", cpu);
else
@@ -805,7 +782,7 @@ do_rest:
}
/* mark "stuck" area as not stuck */
- *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
+ *trampoline_status = 0;
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
/*
@@ -813,12 +790,10 @@ do_rest:
*/
smpboot_restore_warm_reset_vector();
}
-
- destroy_work_on_stack(&c_idle.work);
return boot_error;
}
-int __cpuinit native_cpu_up(unsigned int cpu)
+int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int apicid = apic->cpu_present_to_apicid(cpu);
unsigned long flags;
@@ -851,7 +826,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- err = do_boot_cpu(apicid, cpu);
+ err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 6410744ac5c..f84fe00fad4 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -32,7 +32,7 @@
#include <linux/mm.h>
#include <linux/tboot.h>
-#include <asm/trampoline.h>
+#include <asm/realmode.h>
#include <asm/processor.h>
#include <asm/bootparam.h>
#include <asm/pgtable.h>
@@ -44,7 +44,7 @@
#include <asm/e820.h>
#include <asm/io.h>
-#include "acpi/realmode/wakeup.h"
+#include "../realmode/rm/wakeup.h"
/* Global pointer to shared data; NULL means no measured launch. */
struct tboot *tboot __read_mostly;
@@ -201,7 +201,8 @@ static int tboot_setup_sleep(void)
add_mac_region(e820.map[i].addr, e820.map[i].size);
}
- tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
+ tboot->acpi_sinfo.kernel_s3_resume_vector =
+ real_mode_header->wakeup_start;
return 0;
}
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index c29e235792a..b79133abda4 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
+#include <asm/asm.h>
int rodata_test(void)
{
@@ -42,14 +43,7 @@ int rodata_test(void)
".section .fixup,\"ax\"\n"
"2: jmp 1b\n"
".previous\n"
- ".section __ex_table,\"a\"\n"
- " .align 16\n"
-#ifdef CONFIG_X86_32
- " .long 0b,2b\n"
-#else
- " .quad 0b,2b\n"
-#endif
- ".previous"
+ _ASM_EXTABLE(0b,2b)
: [rslt] "=r" (result)
: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
);
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index c6eba2b4267..24d3c91e981 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -14,7 +14,6 @@
#include <linux/i8253.h>
#include <linux/time.h>
#include <linux/export.h>
-#include <linux/mca.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
@@ -58,11 +57,6 @@ EXPORT_SYMBOL(profile_pc);
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
global_clock_event->event_handler(global_clock_event);
-
- /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
- if (MCA_bus)
- outb_p(inb_p(0x61)| 0x80, 0x61);
-
return IRQ_HANDLED;
}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
deleted file mode 100644
index a73b61055ad..00000000000
--- a/arch/x86/kernel/trampoline.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <linux/io.h>
-#include <linux/memblock.h>
-
-#include <asm/trampoline.h>
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
-
-unsigned char *x86_trampoline_base;
-
-void __init setup_trampolines(void)
-{
- phys_addr_t mem;
- size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
- /* Has to be in very low memory so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (!mem)
- panic("Cannot allocate trampoline\n");
-
- x86_trampoline_base = __va(mem);
- memblock_reserve(mem, size);
-
- printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
- x86_trampoline_base, (unsigned long long)mem, size);
-
- memcpy(x86_trampoline_base, x86_trampoline_start, size);
-}
-
-/*
- * setup_trampolines() gets called very early, to guarantee the
- * availability of low memory. This is before the proper kernel page
- * tables are set up, so we cannot set page permissions in that
- * function. Thus, we use an arch_initcall instead.
- */
-static int __init configure_trampolines(void)
-{
- size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
-
- set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
- return 0;
-}
-arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
deleted file mode 100644
index 451c0a7ef7f..00000000000
--- a/arch/x86/kernel/trampoline_32.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- *
- * This is only used for booting secondary CPUs in SMP machine
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * We jump into arch/x86/kernel/head_32.S.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * If you work on this file, check the object module with
- * objdump --reloc to make sure there are no relocation
- * entries except for:
- *
- * TYPE VALUE
- * R_386_32 startup_32_smp
- * R_386_32 boot_gdt
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/segment.h>
-#include <asm/page_types.h>
-
-#ifdef CONFIG_SMP
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .code16
-
-ENTRY(trampoline_data)
-r_base = .
- wbinvd # Needed for NUMA-Q should be harmless for others
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
-
- cli # We should be safe anyway
-
- movl $0xA5A5A5A5, trampoline_status - r_base
- # write marker for master knows we're running
-
- /* GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl boot_idt_descr - r_base # load idt with 0, 0
- lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate
-
- xor %ax, %ax
- inc %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
- # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S
- ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET)
-
- # These need to be in the same 64K segment as the above;
- # hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt_descr:
- .word __BOOT_DS + 7 # gdt limit
- .long boot_gdt - __PAGE_OFFSET # gdt base
-
-boot_idt_descr:
- .word 0 # idt limit = 0
- .long 0 # idt base = 0L
-
-ENTRY(trampoline_status)
- .long 0
-
-.globl trampoline_end
-trampoline_end:
-
-#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
deleted file mode 100644
index 09ff51799e9..00000000000
--- a/arch/x86/kernel/trampoline_64.S
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- *
- * Trampoline.S Derived from Setup.S by Linus Torvalds
- *
- * 4 Jan 1997 Michael Chastain: changed to gnu as.
- * 15 Sept 2005 Eric Biederman: 64bit PIC support
- *
- * Entry: CS:IP point to the start of our code, we are
- * in real mode with no stack, but the rest of the
- * trampoline page to make our stack and everything else
- * is a mystery.
- *
- * On entry to trampoline_data, the processor is in real mode
- * with 16-bit addressing and 16-bit data. CS has some value
- * and IP is zero. Thus, data addresses need to be absolute
- * (no relocation) and are taken with regard to r_base.
- *
- * With the addition of trampoline_level4_pgt this code can
- * now enter a 64bit kernel that lives at arbitrary 64bit
- * physical addresses.
- *
- * If you work on this file, check the object module with objdump
- * --full-contents --reloc to make sure there are no relocation
- * entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/pgtable_types.h>
-#include <asm/page_types.h>
-#include <asm/msr.h>
-#include <asm/segment.h>
-#include <asm/processor-flags.h>
-
- .section ".x86_trampoline","a"
- .balign PAGE_SIZE
- .code16
-
-ENTRY(trampoline_data)
-r_base = .
- cli # We should be safe anyway
- wbinvd
- mov %cs, %ax # Code and data in the same place
- mov %ax, %ds
- mov %ax, %es
- mov %ax, %ss
-
-
- movl $0xA5A5A5A5, trampoline_status - r_base
- # write marker for master knows we're running
-
- # Setup stack
- movw $(trampoline_stack_end - r_base), %sp
-
- call verify_cpu # Verify the cpu supports long mode
- testl %eax, %eax # Check for return code
- jnz no_longmode
-
- mov %cs, %ax
- movzx %ax, %esi # Find the 32bit trampoline location
- shll $4, %esi
-
- # Fixup the absolute vectors
- leal (startup_32 - r_base)(%esi), %eax
- movl %eax, startup_32_vector - r_base
- leal (startup_64 - r_base)(%esi), %eax
- movl %eax, startup_64_vector - r_base
- leal (tgdt - r_base)(%esi), %eax
- movl %eax, (tgdt + 2 - r_base)
-
- /*
- * GDT tables in non default location kernel can be beyond 16MB and
- * lgdt will not be able to load the address as in real mode default
- * operand size is 16bit. Use lgdtl instead to force operand size
- * to 32 bit.
- */
-
- lidtl tidt - r_base # load idt with 0, 0
- lgdtl tgdt - r_base # load gdt with whatever is appropriate
-
- mov $X86_CR0_PE, %ax # protected mode (PE) bit
- lmsw %ax # into protected mode
-
- # flush prefetch and jump to startup_32
- ljmpl *(startup_32_vector - r_base)
-
- .code32
- .balign 4
-startup_32:
- movl $__KERNEL_DS, %eax # Initialize the %ds segment register
- movl %eax, %ds
-
- movl $X86_CR4_PAE, %eax
- movl %eax, %cr4 # Enable PAE mode
-
- # Setup trampoline 4 level pagetables
- leal (trampoline_level4_pgt - r_base)(%esi), %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- movl $(1 << _EFER_LME), %eax # Enable Long Mode
- xorl %edx, %edx
- wrmsr
-
- # Enable paging and in turn activate Long Mode
- # Enable protected mode
- movl $(X86_CR0_PG | X86_CR0_PE), %eax
- movl %eax, %cr0
-
- /*
- * At this point we're in long mode but in 32bit compatibility mode
- * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
- * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
- * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
- */
- ljmp *(startup_64_vector - r_base)(%esi)
-
- .code64
- .balign 4
-startup_64:
- # Now jump into the kernel using virtual addresses
- movq $secondary_startup_64, %rax
- jmp *%rax
-
- .code16
-no_longmode:
- hlt
- jmp no_longmode
-#include "verify_cpu.S"
-
- .balign 4
- # Careful these need to be in the same 64K segment as the above;
-tidt:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-
- # Duplicate the global descriptor table
- # so the kernel can live anywhere
- .balign 4
-tgdt:
- .short tgdt_end - tgdt # gdt limit
- .long tgdt - r_base
- .short 0
- .quad 0x00cf9b000000ffff # __KERNEL32_CS
- .quad 0x00af9b000000ffff # __KERNEL_CS
- .quad 0x00cf93000000ffff # __KERNEL_DS
-tgdt_end:
-
- .balign 4
-startup_32_vector:
- .long startup_32 - r_base
- .word __KERNEL32_CS, 0
-
- .balign 4
-startup_64_vector:
- .long startup_64 - r_base
- .word __KERNEL_CS, 0
-
- .balign 4
-ENTRY(trampoline_status)
- .long 0
-
-trampoline_stack:
- .org 0x1000
-trampoline_stack_end:
-ENTRY(trampoline_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 510,8,0
- .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(trampoline_end)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ff9281f1602..05b31d92f69 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,10 +37,6 @@
#include <linux/eisa.h>
#endif
-#ifdef CONFIG_MCA
-#include <linux/mca.h>
-#endif
-
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -50,6 +46,7 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
+#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/i387.h>
@@ -303,8 +300,17 @@ gp_in_kernel:
}
/* May run on IST stack. */
-dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
{
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /*
+ * ftrace must be first, everything else may cause a recursive crash.
+ * See note by declaration of modifying_ftrace_code in ftrace.c
+ */
+ if (unlikely(atomic_read(&modifying_ftrace_code)) &&
+ ftrace_int3_handler(regs))
+ return;
+#endif
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 36fd42091fa..dc4e910a7d9 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -409,10 +409,9 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,
* arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
* @mm: the probed address space.
* @arch_uprobe: the probepoint information.
- * @addr: virtual address at which to install the probepoint
* Return 0 on success or a -ve number on error.
*/
-int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm)
{
int ret;
struct insn insn;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0f703f10901..22a1530146a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -197,18 +197,6 @@ SECTIONS
INIT_DATA_SECTION(16)
- /*
- * Code and data for a variety of lowlevel trampolines, to be
- * copied into base memory (< 1 MiB) during initialization.
- * Since it is copied early, the main copy can be discarded
- * afterwards.
- */
- .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
- x86_trampoline_start = .;
- *(.x86_trampoline)
- x86_trampoline_end = .;
- }
-
.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
__x86_cpu_dev_start = .;
*(.x86_cpu_dev.init)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a1d804bcd48..8eeb55a551b 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
+#include <linux/smp.h>
#include <asm/apic.h>
#include <asm/pci-direct.h>
@@ -22,6 +23,8 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
+#define TOPOLOGY_REGISTER_OFFSET 0x10
+
#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*
* Interrupt control on vSMPowered systems:
@@ -149,12 +152,49 @@ int is_vsmp_box(void)
return 0;
}
#endif
+
+static void __init vsmp_cap_cpus(void)
+{
+#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
+ void __iomem *address;
+ unsigned int cfg, topology, node_shift, maxcpus;
+
+ /*
+ * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the
+ * ones present in the first board, unless explicitly overridden by
+ * setup_max_cpus
+ */
+ if (setup_max_cpus != NR_CPUS)
+ return;
+
+ /* Read the vSMP Foundation topology register */
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
+ if (WARN_ON(!address))
+ return;
+
+ topology = readl(address);
+ node_shift = (topology >> 16) & 0x7;
+ if (!node_shift)
+ /* The value 0 should be decoded as 8 */
+ node_shift = 8;
+ maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
+
+ pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
+ maxcpus);
+ setup_max_cpus = maxcpus;
+ early_iounmap(address, 4);
+#endif
+}
+
void __init vsmp_init(void)
{
detect_vsmp_box();
if (!is_vsmp_box())
return;
+ vsmp_cap_cpus();
+
set_vsmp_pv_ops();
return;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7515cf0e180..5db36caf428 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -139,6 +139,19 @@ static int addr_to_vsyscall_nr(unsigned long addr)
return nr;
}
+#ifdef CONFIG_SECCOMP
+static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
+{
+ if (!seccomp_mode(&tsk->seccomp))
+ return 0;
+ task_pt_regs(tsk)->orig_ax = syscall_nr;
+ task_pt_regs(tsk)->ax = syscall_nr;
+ return __secure_computing(syscall_nr);
+}
+#else
+#define vsyscall_seccomp(_tsk, _nr) 0
+#endif
+
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
/*
@@ -174,6 +187,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
int vsyscall_nr;
int prev_sig_on_uaccess_error;
long ret;
+ int skip;
/*
* No point in checking CS -- the only way to get here is a user mode
@@ -205,9 +219,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
}
tsk = current;
- if (seccomp_mode(&tsk->seccomp))
- do_exit(SIGKILL);
-
/*
* With a real vsyscall, page faults cause SIGSEGV. We want to
* preserve that behavior to make writing exploits harder.
@@ -222,8 +233,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
* address 0".
*/
ret = -EFAULT;
+ skip = 0;
switch (vsyscall_nr) {
case 0:
+ skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
+ if (skip)
+ break;
+
if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
!write_ok_or_segv(regs->si, sizeof(struct timezone)))
break;
@@ -234,6 +250,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
break;
case 1:
+ skip = vsyscall_seccomp(tsk, __NR_time);
+ if (skip)
+ break;
+
if (!write_ok_or_segv(regs->di, sizeof(time_t)))
break;
@@ -241,6 +261,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
break;
case 2:
+ skip = vsyscall_seccomp(tsk, __NR_getcpu);
+ if (skip)
+ break;
+
if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
!write_ok_or_segv(regs->si, sizeof(unsigned)))
break;
@@ -253,6 +277,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+ if (skip) {
+ if ((long)regs->ax <= 0L) /* seccomp errno emulation */
+ goto do_ret;
+ goto done; /* seccomp trace/trap */
+ }
+
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
@@ -271,10 +301,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
regs->ax = ret;
+do_ret:
/* Emulate a ret instruction. */
regs->ip = caller;
regs->sp += 8;
-
+done:
return true;
sigsegv:
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 9cf71d0b2d3..35c5e543f55 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,6 +18,7 @@
#include <asm/e820.h>
#include <asm/time.h>
#include <asm/irq.h>
+#include <asm/io_apic.h>
#include <asm/pat.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
@@ -119,3 +120,10 @@ struct x86_msi_ops x86_msi = {
.teardown_msi_irqs = default_teardown_msi_irqs,
.restore_msi_irqs = default_restore_msi_irqs,
};
+
+struct x86_io_apic_ops x86_io_apic_ops = {
+ .init = native_io_apic_init_mappings,
+ .read = native_io_apic_read,
+ .write = native_io_apic_write,
+ .modify = native_io_apic_modify,
+};
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index e62728e30b0..bd18149b2b0 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -48,8 +48,6 @@ void __sanitize_i387_state(struct task_struct *tsk)
if (!fx)
return;
- BUG_ON(__thread_has_fpu(tsk));
-
xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
/*