diff options
author | Anton Arapov <anton@redhat.com> | 2012-04-16 10:05:28 +0200 |
---|---|---|
committer | Anton Arapov <anton@redhat.com> | 2012-04-16 10:05:28 +0200 |
commit | b4b6116a13633898cf868f2f103c96a90c4c20f8 (patch) | |
tree | 93d1b7e2cfcdf473d8d4ff3ad141fa864f8491f6 /arch/ia64/kernel | |
parent | edd4be777c953e5faafc80d091d3084b4343f5d3 (diff) | |
download | kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.gz kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.tar.xz kernel-uprobes-b4b6116a13633898cf868f2f103c96a90c4c20f8.zip |
fedora kernel: d9aad82f3319f3cfd1aebc01234254ef0c37ad84v3.3.2-1
Signed-off-by: Anton Arapov <anton@redhat.com>
Diffstat (limited to 'arch/ia64/kernel')
88 files changed, 46525 insertions, 0 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile new file mode 100644 index 00000000000..d959c84904b --- /dev/null +++ b/arch/ia64/kernel/Makefile @@ -0,0 +1,114 @@ +# +# Makefile for the linux kernel. +# + +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif + +extra-y := head.o init_task.o vmlinux.lds + +obj-y := entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ + irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o \ + salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ + unwind.o mca.o mca_asm.o topology.o dma-mapping.o + +obj-$(CONFIG_ACPI) += acpi.o acpi-ext.o +obj-$(CONFIG_IA64_BRL_EMU) += brl_emu.o + +obj-$(CONFIG_IA64_PALINFO) += palinfo.o +obj-$(CONFIG_IOSAPIC) += iosapic.o +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_SMP) += smp.o smpboot.o +obj-$(CONFIG_NUMA) += numa.o +obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o +obj-$(CONFIG_IA64_CYCLONE) += cyclone.o +obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o +obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o +obj-$(CONFIG_AUDIT) += audit.o +obj-$(CONFIG_PCI_MSI) += msi_ia64.o +mca_recovery-y += mca_drv.o mca_drv_asm.o +obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o + +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o \ + paravirt_patch.o + +obj-$(CONFIG_IA64_ESI) += esi.o +ifneq ($(CONFIG_IA64_ESI),) +obj-y += esi_stub.o # must be in kernel proper +endif +obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o +obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o + +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + +# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. +CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 + +# The gate DSO image is built using a special linker script. +include $(srctree)/arch/ia64/kernel/Makefile.gate +# tell compiled for native +CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE + +# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config +define sed-y + "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}" +endef +quiet_cmd_nr_irqs = GEN $@ +define cmd_nr_irqs + (set -e; \ + echo "#ifndef __ASM_NR_IRQS_H__"; \ + echo "#define __ASM_NR_IRQS_H__"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by Kbuild"; \ + echo " *"; \ + echo " */"; \ + echo ""; \ + sed -ne $(sed-y) $<; \ + echo ""; \ + echo "#endif" ) > $@ +endef + +# We use internal kbuild rules to avoid the "is up to date" message from make +arch/$(SRCARCH)/kernel/nr-irqs.s: arch/$(SRCARCH)/kernel/nr-irqs.c + $(Q)mkdir -p $(dir $@) + $(call if_changed_dep,cc_s_c) + +include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s + $(Q)mkdir -p $(dir $@) + $(call cmd,nr_irqs) + +# +# native ivt.S, entry.S and fsys.S +# +ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o +define paravirtualized_native +AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE +AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK +extra-y += pvchk-$(1) +endef +$(foreach obj,$(ASM_PARAVIRT_OBJS),$(eval $(call paravirtualized_native,$(obj)))) + +# +# Checker for paravirtualizations of privileged operations. +# +quiet_cmd_pv_check_sed = PVCHK $@ +define cmd_pv_check_sed + sed -f $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed $< > $@ +endef + +$(obj)/pvchk-sed-%.s: $(src)/%.S $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed FORCE + $(call if_changed_dep,as_s_S) +$(obj)/pvchk-%.s: $(obj)/pvchk-sed-%.s FORCE + $(call if_changed,pv_check_sed) +$(obj)/pvchk-%.o: $(obj)/pvchk-%.s FORCE + $(call if_changed,as_o_S) +.PRECIOUS: $(obj)/pvchk-sed-%.s $(obj)/pvchk-%.s $(obj)/pvchk-%.o diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate new file mode 100644 index 00000000000..ceeffc50976 --- /dev/null +++ b/arch/ia64/kernel/Makefile.gate @@ -0,0 +1,27 @@ +# The gate DSO image is built using a special linker script. + +targets += gate.so gate-syms.o + +extra-y += gate.so gate-syms.o gate.lds gate.o + +CPPFLAGS_gate.lds := -P -C -U$(ARCH) + +quiet_cmd_gate = GATE $@ + cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@ + +GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +$(obj)/built-in.o: $(obj)/gate-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o + +GATECFLAGS_gate-syms.o = -r +$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE + $(call if_changed,gate) + +# gate-data.o contains the gate DSO image as data in section .data..gate. +# We must build gate.so before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/gate-data.o: $(obj)/gate.so diff --git a/arch/ia64/kernel/acpi-ext.c b/arch/ia64/kernel/acpi-ext.c new file mode 100644 index 00000000000..8b9318d311a --- /dev/null +++ b/arch/ia64/kernel/acpi-ext.c @@ -0,0 +1,104 @@ +/* + * (c) Copyright 2003, 2006 Hewlett-Packard Development Company, L.P. + * Alex Williamson <alex.williamson@hp.com> + * Bjorn Helgaas <bjorn.helgaas@hp.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/acpi.h> + +#include <asm/acpi-ext.h> + +/* + * Device CSRs that do not appear in PCI config space should be described + * via ACPI. This would normally be done with Address Space Descriptors + * marked as "consumer-only," but old versions of Windows and Linux ignore + * the producer/consumer flag, so HP invented a vendor-defined resource to + * describe the location and size of CSR space. + */ + +struct acpi_vendor_uuid hp_ccsr_uuid = { + .subtype = 2, + .data = { 0xf9, 0xad, 0xe9, 0x69, 0x4f, 0x92, 0x5f, 0xab, 0xf6, 0x4a, + 0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad }, +}; + +static acpi_status hp_ccsr_locate(acpi_handle obj, u64 *base, u64 *length) +{ + acpi_status status; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_resource *resource; + struct acpi_resource_vendor_typed *vendor; + + status = acpi_get_vendor_resource(obj, METHOD_NAME__CRS, &hp_ccsr_uuid, + &buffer); + + resource = buffer.pointer; + vendor = &resource->data.vendor_typed; + + if (ACPI_FAILURE(status) || vendor->byte_length < 16) { + status = AE_NOT_FOUND; + goto exit; + } + + memcpy(base, vendor->byte_data, sizeof(*base)); + memcpy(length, vendor->byte_data + 8, sizeof(*length)); + + exit: + kfree(buffer.pointer); + return status; +} + +struct csr_space { + u64 base; + u64 length; +}; + +static acpi_status find_csr_space(struct acpi_resource *resource, void *data) +{ + struct csr_space *space = data; + struct acpi_resource_address64 addr; + acpi_status status; + + status = acpi_resource_to_address64(resource, &addr); + if (ACPI_SUCCESS(status) && + addr.resource_type == ACPI_MEMORY_RANGE && + addr.address_length && + addr.producer_consumer == ACPI_CONSUMER) { + space->base = addr.minimum; + space->length = addr.address_length; + return AE_CTRL_TERMINATE; + } + return AE_OK; /* keep looking */ +} + +static acpi_status hp_crs_locate(acpi_handle obj, u64 *base, u64 *length) +{ + struct csr_space space = { 0, 0 }; + + acpi_walk_resources(obj, METHOD_NAME__CRS, find_csr_space, &space); + if (!space.length) + return AE_NOT_FOUND; + + *base = space.base; + *length = space.length; + return AE_OK; +} + +acpi_status hp_acpi_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length) +{ + acpi_status status; + + status = hp_ccsr_locate(obj, csr_base, csr_length); + if (ACPI_SUCCESS(status)) + return status; + + return hp_crs_locate(obj, csr_base, csr_length); +} +EXPORT_SYMBOL(hp_acpi_csr_space); diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c new file mode 100644 index 00000000000..5207035dc06 --- /dev/null +++ b/arch/ia64/kernel/acpi.c @@ -0,0 +1,1048 @@ +/* + * acpi.c - Architecture-Specific Low-Level ACPI Support + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> + * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 2000 Intel Corp. + * Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com> + * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> + * Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com> + * Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com> + * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de> + * Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/irq.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/mmzone.h> +#include <linux/nodemask.h> +#include <linux/slab.h> +#include <acpi/processor.h> +#include <asm/io.h> +#include <asm/iosapic.h> +#include <asm/machvec.h> +#include <asm/page.h> +#include <asm/system.h> +#include <asm/numa.h> +#include <asm/sal.h> +#include <asm/cyclone.h> +#include <asm/xen/hypervisor.h> + +#define BAD_MADT_ENTRY(entry, end) ( \ + (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ + ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) + +#define PREFIX "ACPI: " + +u32 acpi_rsdt_forced; +unsigned int acpi_cpei_override; +unsigned int acpi_cpei_phys_cpuid; + +unsigned long acpi_wakeup_address = 0; + +#ifdef CONFIG_IA64_GENERIC +static unsigned long __init acpi_find_rsdp(void) +{ + unsigned long rsdp_phys = 0; + + if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) + rsdp_phys = efi.acpi20; + else if (efi.acpi != EFI_INVALID_TABLE_ADDR) + printk(KERN_WARNING PREFIX + "v1.0/r0.71 tables no longer supported\n"); + return rsdp_phys; +} + +const char __init * +acpi_get_sysname(void) +{ + unsigned long rsdp_phys; + struct acpi_table_rsdp *rsdp; + struct acpi_table_xsdt *xsdt; + struct acpi_table_header *hdr; +#ifdef CONFIG_INTEL_IOMMU + u64 i, nentries; +#endif + + rsdp_phys = acpi_find_rsdp(); + if (!rsdp_phys) { + printk(KERN_ERR + "ACPI 2.0 RSDP not found, default to \"dig\"\n"); + return "dig"; + } + + rsdp = (struct acpi_table_rsdp *)__va(rsdp_phys); + if (strncmp(rsdp->signature, ACPI_SIG_RSDP, sizeof(ACPI_SIG_RSDP) - 1)) { + printk(KERN_ERR + "ACPI 2.0 RSDP signature incorrect, default to \"dig\"\n"); + return "dig"; + } + + xsdt = (struct acpi_table_xsdt *)__va(rsdp->xsdt_physical_address); + hdr = &xsdt->header; + if (strncmp(hdr->signature, ACPI_SIG_XSDT, sizeof(ACPI_SIG_XSDT) - 1)) { + printk(KERN_ERR + "ACPI 2.0 XSDT signature incorrect, default to \"dig\"\n"); + return "dig"; + } + + if (!strcmp(hdr->oem_id, "HP")) { + return "hpzx1"; + } else if (!strcmp(hdr->oem_id, "SGI")) { + if (!strcmp(hdr->oem_table_id + 4, "UV")) + return "uv"; + else + return "sn2"; + } else if (xen_pv_domain() && !strcmp(hdr->oem_id, "XEN")) { + return "xen"; + } + +#ifdef CONFIG_INTEL_IOMMU + /* Look for Intel IOMMU */ + nentries = (hdr->length - sizeof(*hdr)) / + sizeof(xsdt->table_offset_entry[0]); + for (i = 0; i < nentries; i++) { + hdr = __va(xsdt->table_offset_entry[i]); + if (strncmp(hdr->signature, ACPI_SIG_DMAR, + sizeof(ACPI_SIG_DMAR) - 1) == 0) + return "dig_vtd"; + } +#endif + + return "dig"; +} +#endif /* CONFIG_IA64_GENERIC */ + +#define ACPI_MAX_PLATFORM_INTERRUPTS 256 + +/* Array to record platform interrupt vectors for generic interrupt routing. */ +int platform_intr_list[ACPI_MAX_PLATFORM_INTERRUPTS] = { + [0 ... ACPI_MAX_PLATFORM_INTERRUPTS - 1] = -1 +}; + +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_IOSAPIC; + +/* + * Interrupt routing API for device drivers. Provides interrupt vector for + * a generic platform event. Currently only CPEI is implemented. + */ +int acpi_request_vector(u32 int_type) +{ + int vector = -1; + + if (int_type < ACPI_MAX_PLATFORM_INTERRUPTS) { + /* corrected platform error interrupt */ + vector = platform_intr_list[int_type]; + } else + printk(KERN_ERR + "acpi_request_vector(): invalid interrupt type\n"); + return vector; +} + +char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size) +{ + return __va(phys_addr); +} + +void __init __acpi_unmap_table(char *map, unsigned long size) +{ +} + +/* -------------------------------------------------------------------------- + Boot-time Table Parsing + -------------------------------------------------------------------------- */ + +static int available_cpus __initdata; +struct acpi_table_madt *acpi_madt __initdata; +static u8 has_8259; + +static int __init +acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, + const unsigned long end) +{ + struct acpi_madt_local_apic_override *lapic; + + lapic = (struct acpi_madt_local_apic_override *)header; + + if (BAD_MADT_ENTRY(lapic, end)) + return -EINVAL; + + if (lapic->address) { + iounmap(ipi_base_addr); + ipi_base_addr = ioremap(lapic->address, 0); + } + return 0; +} + +static int __init +acpi_parse_lsapic(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_local_sapic *lsapic; + + lsapic = (struct acpi_madt_local_sapic *)header; + + /*Skip BAD_MADT_ENTRY check, as lsapic size could vary */ + + if (lsapic->lapic_flags & ACPI_MADT_ENABLED) { +#ifdef CONFIG_SMP + smp_boot_data.cpu_phys_id[available_cpus] = + (lsapic->id << 8) | lsapic->eid; +#endif + ++available_cpus; + } + + total_cpus++; + return 0; +} + +static int __init +acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_local_apic_nmi *lacpi_nmi; + + lacpi_nmi = (struct acpi_madt_local_apic_nmi *)header; + + if (BAD_MADT_ENTRY(lacpi_nmi, end)) + return -EINVAL; + + /* TBD: Support lapic_nmi entries */ + return 0; +} + +static int __init +acpi_parse_iosapic(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_io_sapic *iosapic; + + iosapic = (struct acpi_madt_io_sapic *)header; + + if (BAD_MADT_ENTRY(iosapic, end)) + return -EINVAL; + + return iosapic_init(iosapic->address, iosapic->global_irq_base); +} + +static unsigned int __initdata acpi_madt_rev; + +static int __init +acpi_parse_plat_int_src(struct acpi_subtable_header * header, + const unsigned long end) +{ + struct acpi_madt_interrupt_source *plintsrc; + int vector; + + plintsrc = (struct acpi_madt_interrupt_source *)header; + + if (BAD_MADT_ENTRY(plintsrc, end)) + return -EINVAL; + + /* + * Get vector assignment for this interrupt, set attributes, + * and program the IOSAPIC routing table. + */ + vector = iosapic_register_platform_intr(plintsrc->type, + plintsrc->global_irq, + plintsrc->io_sapic_vector, + plintsrc->eid, + plintsrc->id, + ((plintsrc->inti_flags & ACPI_MADT_POLARITY_MASK) == + ACPI_MADT_POLARITY_ACTIVE_HIGH) ? + IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, + ((plintsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) == + ACPI_MADT_TRIGGER_EDGE) ? + IOSAPIC_EDGE : IOSAPIC_LEVEL); + + platform_intr_list[plintsrc->type] = vector; + if (acpi_madt_rev > 1) { + acpi_cpei_override = plintsrc->flags & ACPI_MADT_CPEI_OVERRIDE; + } + + /* + * Save the physical id, so we can check when its being removed + */ + acpi_cpei_phys_cpuid = ((plintsrc->id << 8) | (plintsrc->eid)) & 0xffff; + + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +unsigned int can_cpei_retarget(void) +{ + extern int cpe_vector; + extern unsigned int force_cpei_retarget; + + /* + * Only if CPEI is supported and the override flag + * is present, otherwise return that its re-targettable + * if we are in polling mode. + */ + if (cpe_vector > 0) { + if (acpi_cpei_override || force_cpei_retarget) + return 1; + else + return 0; + } + return 1; +} + +unsigned int is_cpu_cpei_target(unsigned int cpu) +{ + unsigned int logical_id; + + logical_id = cpu_logical_id(acpi_cpei_phys_cpuid); + + if (logical_id == cpu) + return 1; + else + return 0; +} + +void set_cpei_target_cpu(unsigned int cpu) +{ + acpi_cpei_phys_cpuid = cpu_physical_id(cpu); +} +#endif + +unsigned int get_cpei_target_cpu(void) +{ + return acpi_cpei_phys_cpuid; +} + +static int __init +acpi_parse_int_src_ovr(struct acpi_subtable_header * header, + const unsigned long end) +{ + struct acpi_madt_interrupt_override *p; + + p = (struct acpi_madt_interrupt_override *)header; + + if (BAD_MADT_ENTRY(p, end)) + return -EINVAL; + + iosapic_override_isa_irq(p->source_irq, p->global_irq, + ((p->inti_flags & ACPI_MADT_POLARITY_MASK) == + ACPI_MADT_POLARITY_ACTIVE_HIGH) ? + IOSAPIC_POL_HIGH : IOSAPIC_POL_LOW, + ((p->inti_flags & ACPI_MADT_TRIGGER_MASK) == + ACPI_MADT_TRIGGER_EDGE) ? + IOSAPIC_EDGE : IOSAPIC_LEVEL); + return 0; +} + +static int __init +acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_nmi_source *nmi_src; + + nmi_src = (struct acpi_madt_nmi_source *)header; + + if (BAD_MADT_ENTRY(nmi_src, end)) + return -EINVAL; + + /* TBD: Support nimsrc entries */ + return 0; +} + +static void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERMOW", 6))) { + + /* + * Unfortunately ITC_DRIFT is not yet part of the + * official SAL spec, so the ITC_DRIFT bit is not + * set by the BIOS on this hardware. + */ + sal_platform_features |= IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT; + + cyclone_setup(); + } +} + +static int __init acpi_parse_madt(struct acpi_table_header *table) +{ + if (!table) + return -EINVAL; + + acpi_madt = (struct acpi_table_madt *)table; + + acpi_madt_rev = acpi_madt->header.revision; + + /* remember the value for reference after free_initmem() */ +#ifdef CONFIG_ITANIUM + has_8259 = 1; /* Firmware on old Itanium systems is broken */ +#else + has_8259 = acpi_madt->flags & ACPI_MADT_PCAT_COMPAT; +#endif + iosapic_system_init(has_8259); + + /* Get base address of IPI Message Block */ + + if (acpi_madt->address) + ipi_base_addr = ioremap(acpi_madt->address, 0); + + printk(KERN_INFO PREFIX "Local APIC address %p\n", ipi_base_addr); + + acpi_madt_oem_check(acpi_madt->header.oem_id, + acpi_madt->header.oem_table_id); + + return 0; +} + +#ifdef CONFIG_ACPI_NUMA + +#undef SLIT_DEBUG + +#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32) + +static int __initdata srat_num_cpus; /* number of cpus */ +static u32 __devinitdata pxm_flag[PXM_FLAG_LEN]; +#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag)) +#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag)) +static struct acpi_table_slit __initdata *slit_table; +cpumask_t early_cpu_possible_map = CPU_MASK_NONE; + +static int __init +get_processor_proximity_domain(struct acpi_srat_cpu_affinity *pa) +{ + int pxm; + + pxm = pa->proximity_domain_lo; + if (ia64_platform_is("sn2") || acpi_srat_revision >= 2) + pxm += pa->proximity_domain_hi[0] << 8; + return pxm; +} + +static int __init +get_memory_proximity_domain(struct acpi_srat_mem_affinity *ma) +{ + int pxm; + + pxm = ma->proximity_domain; + if (!ia64_platform_is("sn2") && acpi_srat_revision <= 1) + pxm &= 0xff; + + return pxm; +} + +/* + * ACPI 2.0 SLIT (System Locality Information Table) + * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf + */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + u32 len; + + len = sizeof(struct acpi_table_header) + 8 + + slit->locality_count * slit->locality_count; + if (slit->header.length != len) { + printk(KERN_ERR + "ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n", + len, slit->header.length); + return; + } + slit_table = slit; +} + +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm; + + if (!(pa->flags & ACPI_SRAT_CPU_ENABLED)) + return; + + if (srat_num_cpus >= ARRAY_SIZE(node_cpuid)) { + printk_once(KERN_WARNING + "node_cpuid[%ld] is too small, may not be able to use all cpus\n", + ARRAY_SIZE(node_cpuid)); + return; + } + pxm = get_processor_proximity_domain(pa); + + /* record this node in proximity bitmap */ + pxm_bit_set(pxm); + + node_cpuid[srat_num_cpus].phys_id = + (pa->apic_id << 8) | (pa->local_sapic_eid); + /* nid should be overridden as logical node id later */ + node_cpuid[srat_num_cpus].nid = pxm; + cpu_set(srat_num_cpus, early_cpu_possible_map); + srat_num_cpus++; +} + +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + unsigned long paddr, size; + int pxm; + struct node_memblk_s *p, *q, *pend; + + pxm = get_memory_proximity_domain(ma); + + /* fill node memory chunk structure */ + paddr = ma->base_address; + size = ma->length; + + /* Ignore disabled entries */ + if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) + return; + + /* record this node in proximity bitmap */ + pxm_bit_set(pxm); + + /* Insertion sort based on base address */ + pend = &node_memblk[num_node_memblks]; + for (p = &node_memblk[0]; p < pend; p++) { + if (paddr < p->start_paddr) + break; + } + if (p < pend) { + for (q = pend - 1; q >= p; q--) + *(q + 1) = *q; + } + p->start_paddr = paddr; + p->size = size; + p->nid = pxm; + num_node_memblks++; +} + +void __init acpi_numa_arch_fixup(void) +{ + int i, j, node_from, node_to; + + /* If there's no SRAT, fix the phys_id and mark node 0 online */ + if (srat_num_cpus == 0) { + node_set_online(0); + node_cpuid[0].phys_id = hard_smp_processor_id(); + return; + } + + /* + * MCD - This can probably be dropped now. No need for pxm ID to node ID + * mapping with sparse node numbering iff MAX_PXM_DOMAINS <= MAX_NUMNODES. + */ + nodes_clear(node_online_map); + for (i = 0; i < MAX_PXM_DOMAINS; i++) { + if (pxm_bit_test(i)) { + int nid = acpi_map_pxm_to_node(i); + node_set_online(nid); + } + } + + /* set logical node id in memory chunk structure */ + for (i = 0; i < num_node_memblks; i++) + node_memblk[i].nid = pxm_to_node(node_memblk[i].nid); + + /* assign memory bank numbers for each chunk on each node */ + for_each_online_node(i) { + int bank; + + bank = 0; + for (j = 0; j < num_node_memblks; j++) + if (node_memblk[j].nid == i) + node_memblk[j].bank = bank++; + } + + /* set logical node id in cpu structure */ + for_each_possible_early_cpu(i) + node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid); + + printk(KERN_INFO "Number of logical nodes in system = %d\n", + num_online_nodes()); + printk(KERN_INFO "Number of memory chunks in system = %d\n", + num_node_memblks); + + if (!slit_table) { + for (i = 0; i < MAX_NUMNODES; i++) + for (j = 0; j < MAX_NUMNODES; j++) + node_distance(i, j) = i == j ? LOCAL_DISTANCE : + REMOTE_DISTANCE; + return; + } + + memset(numa_slit, -1, sizeof(numa_slit)); + for (i = 0; i < slit_table->locality_count; i++) { + if (!pxm_bit_test(i)) + continue; + node_from = pxm_to_node(i); + for (j = 0; j < slit_table->locality_count; j++) { + if (!pxm_bit_test(j)) + continue; + node_to = pxm_to_node(j); + node_distance(node_from, node_to) = + slit_table->entry[i * slit_table->locality_count + j]; + } + } + +#ifdef SLIT_DEBUG + printk("ACPI 2.0 SLIT locality table:\n"); + for_each_online_node(i) { + for_each_online_node(j) + printk("%03d ", node_distance(i, j)); + printk("\n"); + } +#endif +} +#endif /* CONFIG_ACPI_NUMA */ + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity) +{ + if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) + return gsi; + + if (has_8259 && gsi < 16) + return isa_irq_to_vector(gsi); + + return iosapic_register_intr(gsi, + (polarity == + ACPI_ACTIVE_HIGH) ? IOSAPIC_POL_HIGH : + IOSAPIC_POL_LOW, + (triggering == + ACPI_EDGE_SENSITIVE) ? IOSAPIC_EDGE : + IOSAPIC_LEVEL); +} + +void acpi_unregister_gsi(u32 gsi) +{ + if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM) + return; + + if (has_8259 && gsi < 16) + return; + + iosapic_unregister_intr(gsi); +} + +static int __init acpi_parse_fadt(struct acpi_table_header *table) +{ + struct acpi_table_header *fadt_header; + struct acpi_table_fadt *fadt; + + if (!table) + return -EINVAL; + + fadt_header = (struct acpi_table_header *)table; + if (fadt_header->revision != 3) + return -ENODEV; /* Only deal with ACPI 2.0 FADT */ + + fadt = (struct acpi_table_fadt *)fadt_header; + + acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, + ACPI_ACTIVE_LOW); + return 0; +} + +int __init early_acpi_boot_init(void) +{ + int ret; + + /* + * do a partial walk of MADT to determine how many CPUs + * we have including offline CPUs + */ + if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + printk(KERN_ERR PREFIX "Can't find MADT\n"); + return 0; + } + + ret = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, + acpi_parse_lsapic, NR_CPUS); + if (ret < 1) + printk(KERN_ERR PREFIX + "Error parsing MADT - no LAPIC entries\n"); + +#ifdef CONFIG_SMP + if (available_cpus == 0) { + printk(KERN_INFO "ACPI: Found 0 CPUS; assuming 1\n"); + printk(KERN_INFO "CPU 0 (0x%04x)", hard_smp_processor_id()); + smp_boot_data.cpu_phys_id[available_cpus] = + hard_smp_processor_id(); + available_cpus = 1; /* We've got at least one of these, no? */ + } + smp_boot_data.cpu_count = available_cpus; +#endif + /* Make boot-up look pretty */ + printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, + total_cpus); + + return 0; +} + +int __init acpi_boot_init(void) +{ + + /* + * MADT + * ---- + * Parse the Multiple APIC Description Table (MADT), if exists. + * Note that this table provides platform SMP configuration + * information -- the successor to MPS tables. + */ + + if (acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + printk(KERN_ERR PREFIX "Can't find MADT\n"); + goto skip_madt; + } + + /* Local APIC */ + + if (acpi_table_parse_madt + (ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, acpi_parse_lapic_addr_ovr, 0) < 0) + printk(KERN_ERR PREFIX + "Error parsing LAPIC address override entry\n"); + + if (acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0) + < 0) + printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + + /* I/O APIC */ + + if (acpi_table_parse_madt + (ACPI_MADT_TYPE_IO_SAPIC, acpi_parse_iosapic, NR_IOSAPICS) < 1) { + if (!ia64_platform_is("sn2")) + printk(KERN_ERR PREFIX + "Error parsing MADT - no IOSAPIC entries\n"); + } + + /* System-Level Interrupt Routing */ + + if (acpi_table_parse_madt + (ACPI_MADT_TYPE_INTERRUPT_SOURCE, acpi_parse_plat_int_src, + ACPI_MAX_PLATFORM_INTERRUPTS) < 0) + printk(KERN_ERR PREFIX + "Error parsing platform interrupt source entry\n"); + + if (acpi_table_parse_madt + (ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, 0) < 0) + printk(KERN_ERR PREFIX + "Error parsing interrupt source overrides entry\n"); + + if (acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, 0) < 0) + printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + skip_madt: + + /* + * FADT says whether a legacy keyboard controller is present. + * The FADT also contains an SCI_INT line, by which the system + * gets interrupts such as power and sleep buttons. If it's not + * on a Legacy interrupt, it needs to be setup. + */ + if (acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt)) + printk(KERN_ERR PREFIX "Can't find FADT\n"); + +#ifdef CONFIG_ACPI_NUMA +#ifdef CONFIG_SMP + if (srat_num_cpus == 0) { + int cpu, i = 1; + for (cpu = 0; cpu < smp_boot_data.cpu_count; cpu++) + if (smp_boot_data.cpu_phys_id[cpu] != + hard_smp_processor_id()) + node_cpuid[i++].phys_id = + smp_boot_data.cpu_phys_id[cpu]; + } +#endif + build_cpu_to_node_map(); +#endif + return 0; +} + +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) +{ + int tmp; + + if (has_8259 && gsi < 16) + *irq = isa_irq_to_vector(gsi); + else { + tmp = gsi_to_irq(gsi); + if (tmp == -1) + return -1; + *irq = tmp; + } + return 0; +} + +int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) +{ + if (isa_irq >= 16) + return -1; + *gsi = isa_irq; + return 0; +} + +/* + * ACPI based hotplug CPU support + */ +#ifdef CONFIG_ACPI_HOTPLUG_CPU +static __cpuinit +int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int pxm_id; + int nid; + + pxm_id = acpi_get_pxm(handle); + /* + * We don't have cpu-only-node hotadd. But if the system equips + * SRAT table, pxm is already found and node is ready. + * So, just pxm_to_nid(pxm) is OK. + * This code here is for the system which doesn't have full SRAT + * table for possible cpus. + */ + nid = acpi_map_pxm_to_node(pxm_id); + node_cpuid[cpu].phys_id = physid; + node_cpuid[cpu].nid = nid; +#endif + return (0); +} + +int additional_cpus __initdata = -1; + +static __init int setup_additional_cpus(char *s) +{ + if (s) + additional_cpus = simple_strtol(s, NULL, 0); + + return 0; +} + +early_param("additional_cpus", setup_additional_cpus); + +/* + * cpu_possible_map should be static, it cannot change as CPUs + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_map on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - The user can overwrite it with additional_cpus=NUM + * - Otherwise don't reserve additional CPUs. + */ +__init void prefill_possible_map(void) +{ + int i; + int possible, disabled_cpus; + + disabled_cpus = total_cpus - available_cpus; + + if (additional_cpus == -1) { + if (disabled_cpus > 0) + additional_cpus = disabled_cpus; + else + additional_cpus = 0; + } + + possible = available_cpus + additional_cpus; + + if (possible > nr_cpu_ids) + possible = nr_cpu_ids; + + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + possible, max((possible - available_cpus), 0)); + + for (i = 0; i < possible; i++) + set_cpu_possible(i, true); +} + +static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_madt_local_sapic *lsapic; + cpumask_t tmp_map; + int cpu, physid; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER) + { + kfree(buffer.pointer); + return -EINVAL; + } + + lsapic = (struct acpi_madt_local_sapic *)obj->buffer.pointer; + + if ((lsapic->header.type != ACPI_MADT_TYPE_LOCAL_SAPIC) || + (!(lsapic->lapic_flags & ACPI_MADT_ENABLED))) { + kfree(buffer.pointer); + return -EINVAL; + } + + physid = ((lsapic->id << 8) | (lsapic->eid)); + + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + cpumask_complement(&tmp_map, cpu_present_mask); + cpu = cpumask_first(&tmp_map); + if (cpu >= nr_cpu_ids) + return -EINVAL; + + acpi_map_cpu2node(handle, cpu, physid); + + cpu_set(cpu, cpu_present_map); + ia64_cpu_to_sapicid[cpu] = physid; + + acpi_processor_set_pdc(handle); + + *pcpu = cpu; + return (0); +} + +/* wrapper to silence section mismatch warning */ +int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu) +{ + return _acpi_map_lsapic(handle, pcpu); +} +EXPORT_SYMBOL(acpi_map_lsapic); + +int acpi_unmap_lsapic(int cpu) +{ + ia64_cpu_to_sapicid[cpu] = -1; + cpu_clear(cpu, cpu_present_map); + +#ifdef CONFIG_ACPI_NUMA + /* NUMA specific cleanup's */ +#endif + + return (0); +} + +EXPORT_SYMBOL(acpi_unmap_lsapic); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + +#ifdef CONFIG_ACPI_NUMA +static acpi_status __devinit +acpi_map_iosapic(acpi_handle handle, u32 depth, void *context, void **ret) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_madt_io_sapic *iosapic; + unsigned int gsi_base; + int pxm, node; + + /* Only care about objects w/ a method that returns the MADT */ + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return AE_OK; + + if (!buffer.length || !buffer.pointer) + return AE_OK; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*iosapic)) { + kfree(buffer.pointer); + return AE_OK; + } + + iosapic = (struct acpi_madt_io_sapic *)obj->buffer.pointer; + + if (iosapic->header.type != ACPI_MADT_TYPE_IO_SAPIC) { + kfree(buffer.pointer); + return AE_OK; + } + + gsi_base = iosapic->global_irq_base; + + kfree(buffer.pointer); + + /* + * OK, it's an IOSAPIC MADT entry, look for a _PXM value to tell + * us which node to associate this with. + */ + pxm = acpi_get_pxm(handle); + if (pxm < 0) + return AE_OK; + + node = pxm_to_node(pxm); + + if (node >= MAX_NUMNODES || !node_online(node) || + cpumask_empty(cpumask_of_node(node))) + return AE_OK; + + /* We know a gsi to node mapping! */ + map_iosapic_to_node(gsi_base, node); + return AE_OK; +} + +static int __init +acpi_map_iosapics (void) +{ + acpi_get_devices(NULL, acpi_map_iosapic, NULL, NULL); + return 0; +} + +fs_initcall(acpi_map_iosapics); +#endif /* CONFIG_ACPI_NUMA */ + +int __ref acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + int err; + + if ((err = iosapic_init(phys_addr, gsi_base))) + return err; + +#ifdef CONFIG_ACPI_NUMA + acpi_map_iosapic(handle, 0, NULL, NULL); +#endif /* CONFIG_ACPI_NUMA */ + + return 0; +} + +EXPORT_SYMBOL(acpi_register_ioapic); + +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) +{ + return iosapic_remove(gsi_base); +} + +EXPORT_SYMBOL(acpi_unregister_ioapic); + +/* + * acpi_suspend_lowlevel() - save kernel state and suspend. + * + * TBD when when IA64 starts to support suspend... + */ +int acpi_suspend_lowlevel(void) { return 0; } diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c new file mode 100644 index 00000000000..af565016904 --- /dev/null +++ b/arch/ia64/kernel/asm-offsets.c @@ -0,0 +1,322 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#define ASM_OFFSETS_C 1 + +#include <linux/sched.h> +#include <linux/pid.h> +#include <linux/clocksource.h> +#include <linux/kbuild.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/siginfo.h> +#include <asm/sigcontext.h> +#include <asm/mca.h> + +#include <asm/xen/interface.h> +#include <asm/xen/hypervisor.h> + +#include "../kernel/sigframe.h" +#include "../kernel/fsyscall_gtod_data.h" + +void foo(void) +{ + DEFINE(IA64_TASK_SIZE, sizeof (struct task_struct)); + DEFINE(IA64_THREAD_INFO_SIZE, sizeof (struct thread_info)); + DEFINE(IA64_PT_REGS_SIZE, sizeof (struct pt_regs)); + DEFINE(IA64_SWITCH_STACK_SIZE, sizeof (struct switch_stack)); + DEFINE(IA64_SIGINFO_SIZE, sizeof (struct siginfo)); + DEFINE(IA64_CPU_SIZE, sizeof (struct cpuinfo_ia64)); + DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe)); + DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info)); + + BUILD_BUG_ON(sizeof(struct upid) != 32); + DEFINE(IA64_UPID_SHIFT, 5); + + BLANK(); + + DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); + DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp)); + DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave)); + DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime)); + DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime)); +#endif + + BLANK(); + + DEFINE(IA64_TASK_BLOCKED_OFFSET,offsetof (struct task_struct, blocked)); + DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); + DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_TGIDLINK_OFFSET, offsetof (struct task_struct, pids[PIDTYPE_PID].pid)); + DEFINE(IA64_PID_LEVEL_OFFSET, offsetof (struct pid, level)); + DEFINE(IA64_PID_UPID_OFFSET, offsetof (struct pid, numbers[0])); + DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); + DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); + DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); + DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); + DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); + DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); + DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); + DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); + + BLANK(); + + DEFINE(IA64_SIGHAND_SIGLOCK_OFFSET,offsetof (struct sighand_struct, siglock)); + + BLANK(); + + DEFINE(IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,offsetof (struct signal_struct, + group_stop_count)); + DEFINE(IA64_SIGNAL_SHARED_PENDING_OFFSET,offsetof (struct signal_struct, shared_pending)); + + BLANK(); + + DEFINE(IA64_PT_REGS_B6_OFFSET, offsetof (struct pt_regs, b6)); + DEFINE(IA64_PT_REGS_B7_OFFSET, offsetof (struct pt_regs, b7)); + DEFINE(IA64_PT_REGS_AR_CSD_OFFSET, offsetof (struct pt_regs, ar_csd)); + DEFINE(IA64_PT_REGS_AR_SSD_OFFSET, offsetof (struct pt_regs, ar_ssd)); + DEFINE(IA64_PT_REGS_R8_OFFSET, offsetof (struct pt_regs, r8)); + DEFINE(IA64_PT_REGS_R9_OFFSET, offsetof (struct pt_regs, r9)); + DEFINE(IA64_PT_REGS_R10_OFFSET, offsetof (struct pt_regs, r10)); + DEFINE(IA64_PT_REGS_R11_OFFSET, offsetof (struct pt_regs, r11)); + DEFINE(IA64_PT_REGS_CR_IPSR_OFFSET, offsetof (struct pt_regs, cr_ipsr)); + DEFINE(IA64_PT_REGS_CR_IIP_OFFSET, offsetof (struct pt_regs, cr_iip)); + DEFINE(IA64_PT_REGS_CR_IFS_OFFSET, offsetof (struct pt_regs, cr_ifs)); + DEFINE(IA64_PT_REGS_AR_UNAT_OFFSET, offsetof (struct pt_regs, ar_unat)); + DEFINE(IA64_PT_REGS_AR_PFS_OFFSET, offsetof (struct pt_regs, ar_pfs)); + DEFINE(IA64_PT_REGS_AR_RSC_OFFSET, offsetof (struct pt_regs, ar_rsc)); + DEFINE(IA64_PT_REGS_AR_RNAT_OFFSET, offsetof (struct pt_regs, ar_rnat)); + + DEFINE(IA64_PT_REGS_AR_BSPSTORE_OFFSET, offsetof (struct pt_regs, ar_bspstore)); + DEFINE(IA64_PT_REGS_PR_OFFSET, offsetof (struct pt_regs, pr)); + DEFINE(IA64_PT_REGS_B0_OFFSET, offsetof (struct pt_regs, b0)); + DEFINE(IA64_PT_REGS_LOADRS_OFFSET, offsetof (struct pt_regs, loadrs)); + DEFINE(IA64_PT_REGS_R1_OFFSET, offsetof (struct pt_regs, r1)); + DEFINE(IA64_PT_REGS_R12_OFFSET, offsetof (struct pt_regs, r12)); + DEFINE(IA64_PT_REGS_R13_OFFSET, offsetof (struct pt_regs, r13)); + DEFINE(IA64_PT_REGS_AR_FPSR_OFFSET, offsetof (struct pt_regs, ar_fpsr)); + DEFINE(IA64_PT_REGS_R15_OFFSET, offsetof (struct pt_regs, r15)); + DEFINE(IA64_PT_REGS_R14_OFFSET, offsetof (struct pt_regs, r14)); + DEFINE(IA64_PT_REGS_R2_OFFSET, offsetof (struct pt_regs, r2)); + DEFINE(IA64_PT_REGS_R3_OFFSET, offsetof (struct pt_regs, r3)); + DEFINE(IA64_PT_REGS_R16_OFFSET, offsetof (struct pt_regs, r16)); + DEFINE(IA64_PT_REGS_R17_OFFSET, offsetof (struct pt_regs, r17)); + DEFINE(IA64_PT_REGS_R18_OFFSET, offsetof (struct pt_regs, r18)); + DEFINE(IA64_PT_REGS_R19_OFFSET, offsetof (struct pt_regs, r19)); + DEFINE(IA64_PT_REGS_R20_OFFSET, offsetof (struct pt_regs, r20)); + DEFINE(IA64_PT_REGS_R21_OFFSET, offsetof (struct pt_regs, r21)); + DEFINE(IA64_PT_REGS_R22_OFFSET, offsetof (struct pt_regs, r22)); + DEFINE(IA64_PT_REGS_R23_OFFSET, offsetof (struct pt_regs, r23)); + DEFINE(IA64_PT_REGS_R24_OFFSET, offsetof (struct pt_regs, r24)); + DEFINE(IA64_PT_REGS_R25_OFFSET, offsetof (struct pt_regs, r25)); + DEFINE(IA64_PT_REGS_R26_OFFSET, offsetof (struct pt_regs, r26)); + DEFINE(IA64_PT_REGS_R27_OFFSET, offsetof (struct pt_regs, r27)); + DEFINE(IA64_PT_REGS_R28_OFFSET, offsetof (struct pt_regs, r28)); + DEFINE(IA64_PT_REGS_R29_OFFSET, offsetof (struct pt_regs, r29)); + DEFINE(IA64_PT_REGS_R30_OFFSET, offsetof (struct pt_regs, r30)); + DEFINE(IA64_PT_REGS_R31_OFFSET, offsetof (struct pt_regs, r31)); + DEFINE(IA64_PT_REGS_AR_CCV_OFFSET, offsetof (struct pt_regs, ar_ccv)); + DEFINE(IA64_PT_REGS_F6_OFFSET, offsetof (struct pt_regs, f6)); + DEFINE(IA64_PT_REGS_F7_OFFSET, offsetof (struct pt_regs, f7)); + DEFINE(IA64_PT_REGS_F8_OFFSET, offsetof (struct pt_regs, f8)); + DEFINE(IA64_PT_REGS_F9_OFFSET, offsetof (struct pt_regs, f9)); + DEFINE(IA64_PT_REGS_F10_OFFSET, offsetof (struct pt_regs, f10)); + DEFINE(IA64_PT_REGS_F11_OFFSET, offsetof (struct pt_regs, f11)); + + BLANK(); + + DEFINE(IA64_SWITCH_STACK_CALLER_UNAT_OFFSET, offsetof (struct switch_stack, caller_unat)); + DEFINE(IA64_SWITCH_STACK_AR_FPSR_OFFSET, offsetof (struct switch_stack, ar_fpsr)); + DEFINE(IA64_SWITCH_STACK_F2_OFFSET, offsetof (struct switch_stack, f2)); + DEFINE(IA64_SWITCH_STACK_F3_OFFSET, offsetof (struct switch_stack, f3)); + DEFINE(IA64_SWITCH_STACK_F4_OFFSET, offsetof (struct switch_stack, f4)); + DEFINE(IA64_SWITCH_STACK_F5_OFFSET, offsetof (struct switch_stack, f5)); + DEFINE(IA64_SWITCH_STACK_F12_OFFSET, offsetof (struct switch_stack, f12)); + DEFINE(IA64_SWITCH_STACK_F13_OFFSET, offsetof (struct switch_stack, f13)); + DEFINE(IA64_SWITCH_STACK_F14_OFFSET, offsetof (struct switch_stack, f14)); + DEFINE(IA64_SWITCH_STACK_F15_OFFSET, offsetof (struct switch_stack, f15)); + DEFINE(IA64_SWITCH_STACK_F16_OFFSET, offsetof (struct switch_stack, f16)); + DEFINE(IA64_SWITCH_STACK_F17_OFFSET, offsetof (struct switch_stack, f17)); + DEFINE(IA64_SWITCH_STACK_F18_OFFSET, offsetof (struct switch_stack, f18)); + DEFINE(IA64_SWITCH_STACK_F19_OFFSET, offsetof (struct switch_stack, f19)); + DEFINE(IA64_SWITCH_STACK_F20_OFFSET, offsetof (struct switch_stack, f20)); + DEFINE(IA64_SWITCH_STACK_F21_OFFSET, offsetof (struct switch_stack, f21)); + DEFINE(IA64_SWITCH_STACK_F22_OFFSET, offsetof (struct switch_stack, f22)); + DEFINE(IA64_SWITCH_STACK_F23_OFFSET, offsetof (struct switch_stack, f23)); + DEFINE(IA64_SWITCH_STACK_F24_OFFSET, offsetof (struct switch_stack, f24)); + DEFINE(IA64_SWITCH_STACK_F25_OFFSET, offsetof (struct switch_stack, f25)); + DEFINE(IA64_SWITCH_STACK_F26_OFFSET, offsetof (struct switch_stack, f26)); + DEFINE(IA64_SWITCH_STACK_F27_OFFSET, offsetof (struct switch_stack, f27)); + DEFINE(IA64_SWITCH_STACK_F28_OFFSET, offsetof (struct switch_stack, f28)); + DEFINE(IA64_SWITCH_STACK_F29_OFFSET, offsetof (struct switch_stack, f29)); + DEFINE(IA64_SWITCH_STACK_F30_OFFSET, offsetof (struct switch_stack, f30)); + DEFINE(IA64_SWITCH_STACK_F31_OFFSET, offsetof (struct switch_stack, f31)); + DEFINE(IA64_SWITCH_STACK_R4_OFFSET, offsetof (struct switch_stack, r4)); + DEFINE(IA64_SWITCH_STACK_R5_OFFSET, offsetof (struct switch_stack, r5)); + DEFINE(IA64_SWITCH_STACK_R6_OFFSET, offsetof (struct switch_stack, r6)); + DEFINE(IA64_SWITCH_STACK_R7_OFFSET, offsetof (struct switch_stack, r7)); + DEFINE(IA64_SWITCH_STACK_B0_OFFSET, offsetof (struct switch_stack, b0)); + DEFINE(IA64_SWITCH_STACK_B1_OFFSET, offsetof (struct switch_stack, b1)); + DEFINE(IA64_SWITCH_STACK_B2_OFFSET, offsetof (struct switch_stack, b2)); + DEFINE(IA64_SWITCH_STACK_B3_OFFSET, offsetof (struct switch_stack, b3)); + DEFINE(IA64_SWITCH_STACK_B4_OFFSET, offsetof (struct switch_stack, b4)); + DEFINE(IA64_SWITCH_STACK_B5_OFFSET, offsetof (struct switch_stack, b5)); + DEFINE(IA64_SWITCH_STACK_AR_PFS_OFFSET, offsetof (struct switch_stack, ar_pfs)); + DEFINE(IA64_SWITCH_STACK_AR_LC_OFFSET, offsetof (struct switch_stack, ar_lc)); + DEFINE(IA64_SWITCH_STACK_AR_UNAT_OFFSET, offsetof (struct switch_stack, ar_unat)); + DEFINE(IA64_SWITCH_STACK_AR_RNAT_OFFSET, offsetof (struct switch_stack, ar_rnat)); + DEFINE(IA64_SWITCH_STACK_AR_BSPSTORE_OFFSET, offsetof (struct switch_stack, ar_bspstore)); + DEFINE(IA64_SWITCH_STACK_PR_OFFSET, offsetof (struct switch_stack, pr)); + + BLANK(); + + DEFINE(IA64_SIGCONTEXT_IP_OFFSET, offsetof (struct sigcontext, sc_ip)); + DEFINE(IA64_SIGCONTEXT_AR_BSP_OFFSET, offsetof (struct sigcontext, sc_ar_bsp)); + DEFINE(IA64_SIGCONTEXT_AR_FPSR_OFFSET, offsetof (struct sigcontext, sc_ar_fpsr)); + DEFINE(IA64_SIGCONTEXT_AR_RNAT_OFFSET, offsetof (struct sigcontext, sc_ar_rnat)); + DEFINE(IA64_SIGCONTEXT_AR_UNAT_OFFSET, offsetof (struct sigcontext, sc_ar_unat)); + DEFINE(IA64_SIGCONTEXT_B0_OFFSET, offsetof (struct sigcontext, sc_br[0])); + DEFINE(IA64_SIGCONTEXT_CFM_OFFSET, offsetof (struct sigcontext, sc_cfm)); + DEFINE(IA64_SIGCONTEXT_FLAGS_OFFSET, offsetof (struct sigcontext, sc_flags)); + DEFINE(IA64_SIGCONTEXT_FR6_OFFSET, offsetof (struct sigcontext, sc_fr[6])); + DEFINE(IA64_SIGCONTEXT_PR_OFFSET, offsetof (struct sigcontext, sc_pr)); + DEFINE(IA64_SIGCONTEXT_R12_OFFSET, offsetof (struct sigcontext, sc_gr[12])); + DEFINE(IA64_SIGCONTEXT_RBS_BASE_OFFSET,offsetof (struct sigcontext, sc_rbs_base)); + DEFINE(IA64_SIGCONTEXT_LOADRS_OFFSET, offsetof (struct sigcontext, sc_loadrs)); + + BLANK(); + + DEFINE(IA64_SIGPENDING_SIGNAL_OFFSET, offsetof (struct sigpending, signal)); + + BLANK(); + + DEFINE(IA64_SIGFRAME_ARG0_OFFSET, offsetof (struct sigframe, arg0)); + DEFINE(IA64_SIGFRAME_ARG1_OFFSET, offsetof (struct sigframe, arg1)); + DEFINE(IA64_SIGFRAME_ARG2_OFFSET, offsetof (struct sigframe, arg2)); + DEFINE(IA64_SIGFRAME_HANDLER_OFFSET, offsetof (struct sigframe, handler)); + DEFINE(IA64_SIGFRAME_SIGCONTEXT_OFFSET, offsetof (struct sigframe, sc)); + BLANK(); + /* for assembly files which can't include sched.h: */ + DEFINE(IA64_CLONE_VFORK, CLONE_VFORK); + DEFINE(IA64_CLONE_VM, CLONE_VM); + + BLANK(); + DEFINE(IA64_CPUINFO_NSEC_PER_CYC_OFFSET, + offsetof (struct cpuinfo_ia64, nsec_per_cyc)); + DEFINE(IA64_CPUINFO_PTCE_BASE_OFFSET, + offsetof (struct cpuinfo_ia64, ptce_base)); + DEFINE(IA64_CPUINFO_PTCE_COUNT_OFFSET, + offsetof (struct cpuinfo_ia64, ptce_count)); + DEFINE(IA64_CPUINFO_PTCE_STRIDE_OFFSET, + offsetof (struct cpuinfo_ia64, ptce_stride)); + BLANK(); + DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, + offsetof (struct timespec, tv_nsec)); + + DEFINE(CLONE_SETTLS_BIT, 19); +#if CLONE_SETTLS != (1<<19) +# error "CLONE_SETTLS_BIT incorrect, please fix" +#endif + + BLANK(); + DEFINE(IA64_MCA_CPU_MCA_STACK_OFFSET, + offsetof (struct ia64_mca_cpu, mca_stack)); + DEFINE(IA64_MCA_CPU_INIT_STACK_OFFSET, + offsetof (struct ia64_mca_cpu, init_stack)); + BLANK(); + DEFINE(IA64_SAL_OS_STATE_OS_GP_OFFSET, + offsetof (struct ia64_sal_os_state, os_gp)); + DEFINE(IA64_SAL_OS_STATE_PROC_STATE_PARAM_OFFSET, + offsetof (struct ia64_sal_os_state, proc_state_param)); + DEFINE(IA64_SAL_OS_STATE_SAL_RA_OFFSET, + offsetof (struct ia64_sal_os_state, sal_ra)); + DEFINE(IA64_SAL_OS_STATE_SAL_GP_OFFSET, + offsetof (struct ia64_sal_os_state, sal_gp)); + DEFINE(IA64_SAL_OS_STATE_PAL_MIN_STATE_OFFSET, + offsetof (struct ia64_sal_os_state, pal_min_state)); + DEFINE(IA64_SAL_OS_STATE_OS_STATUS_OFFSET, + offsetof (struct ia64_sal_os_state, os_status)); + DEFINE(IA64_SAL_OS_STATE_CONTEXT_OFFSET, + offsetof (struct ia64_sal_os_state, context)); + DEFINE(IA64_SAL_OS_STATE_SIZE, + sizeof (struct ia64_sal_os_state)); + BLANK(); + + DEFINE(IA64_PMSA_GR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_gr)); + DEFINE(IA64_PMSA_BANK1_GR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_bank1_gr)); + DEFINE(IA64_PMSA_PR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_pr)); + DEFINE(IA64_PMSA_BR0_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_br0)); + DEFINE(IA64_PMSA_RSC_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_rsc)); + DEFINE(IA64_PMSA_IIP_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_iip)); + DEFINE(IA64_PMSA_IPSR_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_ipsr)); + DEFINE(IA64_PMSA_IFS_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_ifs)); + DEFINE(IA64_PMSA_XIP_OFFSET, + offsetof (struct pal_min_state_area_s, pmsa_xip)); + BLANK(); + + /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ + DEFINE(IA64_GTOD_LOCK_OFFSET, + offsetof (struct fsyscall_gtod_data_t, lock)); + DEFINE(IA64_GTOD_WALL_TIME_OFFSET, + offsetof (struct fsyscall_gtod_data_t, wall_time)); + DEFINE(IA64_GTOD_MONO_TIME_OFFSET, + offsetof (struct fsyscall_gtod_data_t, monotonic_time)); + DEFINE(IA64_CLKSRC_MASK_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_mask)); + DEFINE(IA64_CLKSRC_MULT_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_mult)); + DEFINE(IA64_CLKSRC_SHIFT_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_shift)); + DEFINE(IA64_CLKSRC_MMIO_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_fsys_mmio)); + DEFINE(IA64_CLKSRC_CYCLE_LAST_OFFSET, + offsetof (struct fsyscall_gtod_data_t, clk_cycle_last)); + DEFINE(IA64_ITC_JITTER_OFFSET, + offsetof (struct itc_jitter_data_t, itc_jitter)); + DEFINE(IA64_ITC_LASTCYCLE_OFFSET, + offsetof (struct itc_jitter_data_t, itc_lastcycle)); + +#ifdef CONFIG_XEN + BLANK(); + + DEFINE(XEN_NATIVE_ASM, XEN_NATIVE); + DEFINE(XEN_PV_DOMAIN_ASM, XEN_PV_DOMAIN); + +#define DEFINE_MAPPED_REG_OFS(sym, field) \ + DEFINE(sym, (XMAPPEDREGS_OFS + offsetof(struct mapped_regs, field))) + + DEFINE_MAPPED_REG_OFS(XSI_PSR_I_ADDR_OFS, interrupt_mask_addr); + DEFINE_MAPPED_REG_OFS(XSI_IPSR_OFS, ipsr); + DEFINE_MAPPED_REG_OFS(XSI_IIP_OFS, iip); + DEFINE_MAPPED_REG_OFS(XSI_IFS_OFS, ifs); + DEFINE_MAPPED_REG_OFS(XSI_PRECOVER_IFS_OFS, precover_ifs); + DEFINE_MAPPED_REG_OFS(XSI_ISR_OFS, isr); + DEFINE_MAPPED_REG_OFS(XSI_IFA_OFS, ifa); + DEFINE_MAPPED_REG_OFS(XSI_IIPA_OFS, iipa); + DEFINE_MAPPED_REG_OFS(XSI_IIM_OFS, iim); + DEFINE_MAPPED_REG_OFS(XSI_IHA_OFS, iha); + DEFINE_MAPPED_REG_OFS(XSI_ITIR_OFS, itir); + DEFINE_MAPPED_REG_OFS(XSI_PSR_IC_OFS, interrupt_collection_enabled); + DEFINE_MAPPED_REG_OFS(XSI_BANKNUM_OFS, banknum); + DEFINE_MAPPED_REG_OFS(XSI_BANK0_R16_OFS, bank0_regs[0]); + DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]); + DEFINE_MAPPED_REG_OFS(XSI_B0NATS_OFS, vbnat); + DEFINE_MAPPED_REG_OFS(XSI_B1NATS_OFS, vnat); + DEFINE_MAPPED_REG_OFS(XSI_ITC_OFFSET_OFS, itc_offset); + DEFINE_MAPPED_REG_OFS(XSI_ITC_LAST_OFS, itc_last); +#endif /* CONFIG_XEN */ +} diff --git a/arch/ia64/kernel/audit.c b/arch/ia64/kernel/audit.c new file mode 100644 index 00000000000..96a9d18ff4c --- /dev/null +++ b/arch/ia64/kernel/audit.c @@ -0,0 +1,60 @@ +#include <linux/init.h> +#include <linux/types.h> +#include <linux/audit.h> +#include <asm/unistd.h> + +static unsigned dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +static unsigned read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +static unsigned write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +static unsigned chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +static unsigned signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int audit_classify_arch(int arch) +{ + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_execve: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c new file mode 100644 index 00000000000..0b286ca164f --- /dev/null +++ b/arch/ia64/kernel/brl_emu.c @@ -0,0 +1,234 @@ +/* + * Emulation of the "brl" instruction for IA64 processors that + * don't support it in hardware. + * Author: Stephan Zeisset, Intel Corp. <Stephan.Zeisset@intel.com> + * + * 02/22/02 D. Mosberger Clear si_flgs, si_isr, and si_imm to avoid + * leaking kernel bits. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <asm/uaccess.h> +#include <asm/processor.h> + +extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5; + +struct illegal_op_return { + unsigned long fkt, arg1, arg2, arg3; +}; + +/* + * The unimplemented bits of a virtual address must be set + * to the value of the most significant implemented bit. + * unimpl_va_mask includes all unimplemented bits and + * the most significant implemented bit, so the result + * of an and operation with the mask must be all 0's + * or all 1's for the address to be valid. + */ +#define unimplemented_virtual_address(va) ( \ + ((va) & local_cpu_data->unimpl_va_mask) != 0 && \ + ((va) & local_cpu_data->unimpl_va_mask) != local_cpu_data->unimpl_va_mask \ +) + +/* + * The unimplemented bits of a physical address must be 0. + * unimpl_pa_mask includes all unimplemented bits, so the result + * of an and operation with the mask must be all 0's for the + * address to be valid. + */ +#define unimplemented_physical_address(pa) ( \ + ((pa) & local_cpu_data->unimpl_pa_mask) != 0 \ +) + +/* + * Handle an illegal operation fault that was caused by an + * unimplemented "brl" instruction. + * If we are not successful (e.g because the illegal operation + * wasn't caused by a "brl" after all), we return -1. + * If we are successful, we return either 0 or the address + * of a "fixup" function for manipulating preserved register + * state. + */ + +struct illegal_op_return +ia64_emulate_brl (struct pt_regs *regs, unsigned long ar_ec) +{ + unsigned long bundle[2]; + unsigned long opcode, btype, qp, offset, cpl; + unsigned long next_ip; + struct siginfo siginfo; + struct illegal_op_return rv; + long tmp_taken, unimplemented_address; + + rv.fkt = (unsigned long) -1; + + /* + * Decode the instruction bundle. + */ + + if (copy_from_user(bundle, (void *) (regs->cr_iip), sizeof(bundle))) + return rv; + + next_ip = (unsigned long) regs->cr_iip + 16; + + /* "brl" must be in slot 2. */ + if (ia64_psr(regs)->ri != 1) return rv; + + /* Must be "mlx" template */ + if ((bundle[0] & 0x1e) != 0x4) return rv; + + opcode = (bundle[1] >> 60); + btype = ((bundle[1] >> 29) & 0x7); + qp = ((bundle[1] >> 23) & 0x3f); + offset = ((bundle[1] & 0x0800000000000000L) << 4) + | ((bundle[1] & 0x00fffff000000000L) >> 32) + | ((bundle[1] & 0x00000000007fffffL) << 40) + | ((bundle[0] & 0xffff000000000000L) >> 24); + + tmp_taken = regs->pr & (1L << qp); + + switch(opcode) { + + case 0xC: + /* + * Long Branch. + */ + if (btype != 0) return rv; + rv.fkt = 0; + if (!(tmp_taken)) { + /* + * Qualifying predicate is 0. + * Skip instruction. + */ + regs->cr_iip = next_ip; + ia64_psr(regs)->ri = 0; + return rv; + } + break; + + case 0xD: + /* + * Long Call. + */ + rv.fkt = 0; + if (!(tmp_taken)) { + /* + * Qualifying predicate is 0. + * Skip instruction. + */ + regs->cr_iip = next_ip; + ia64_psr(regs)->ri = 0; + return rv; + } + + /* + * BR[btype] = IP+16 + */ + switch(btype) { + case 0: + regs->b0 = next_ip; + break; + case 1: + rv.fkt = (unsigned long) &ia64_set_b1; + break; + case 2: + rv.fkt = (unsigned long) &ia64_set_b2; + break; + case 3: + rv.fkt = (unsigned long) &ia64_set_b3; + break; + case 4: + rv.fkt = (unsigned long) &ia64_set_b4; + break; + case 5: + rv.fkt = (unsigned long) &ia64_set_b5; + break; + case 6: + regs->b6 = next_ip; + break; + case 7: + regs->b7 = next_ip; + break; + } + rv.arg1 = next_ip; + + /* + * AR[PFS].pfm = CFM + * AR[PFS].pec = AR[EC] + * AR[PFS].ppl = PSR.cpl + */ + cpl = ia64_psr(regs)->cpl; + regs->ar_pfs = ((regs->cr_ifs & 0x3fffffffff) + | (ar_ec << 52) | (cpl << 62)); + + /* + * CFM.sof -= CFM.sol + * CFM.sol = 0 + * CFM.sor = 0 + * CFM.rrb.gr = 0 + * CFM.rrb.fr = 0 + * CFM.rrb.pr = 0 + */ + regs->cr_ifs = ((regs->cr_ifs & 0xffffffc00000007f) + - ((regs->cr_ifs >> 7) & 0x7f)); + + break; + + default: + /* + * Unknown opcode. + */ + return rv; + + } + + regs->cr_iip += offset; + ia64_psr(regs)->ri = 0; + + if (ia64_psr(regs)->it == 0) + unimplemented_address = unimplemented_physical_address(regs->cr_iip); + else + unimplemented_address = unimplemented_virtual_address(regs->cr_iip); + + if (unimplemented_address) { + /* + * The target address contains unimplemented bits. + */ + printk(KERN_DEBUG "Woah! Unimplemented Instruction Address Trap!\n"); + siginfo.si_signo = SIGILL; + siginfo.si_errno = 0; + siginfo.si_flags = 0; + siginfo.si_isr = 0; + siginfo.si_imm = 0; + siginfo.si_code = ILL_BADIADDR; + force_sig_info(SIGILL, &siginfo, current); + } else if (ia64_psr(regs)->tb) { + /* + * Branch Tracing is enabled. + * Force a taken branch signal. + */ + siginfo.si_signo = SIGTRAP; + siginfo.si_errno = 0; + siginfo.si_code = TRAP_BRANCH; + siginfo.si_flags = 0; + siginfo.si_isr = 0; + siginfo.si_addr = 0; + siginfo.si_imm = 0; + force_sig_info(SIGTRAP, &siginfo, current); + } else if (ia64_psr(regs)->ss) { + /* + * Single Step is enabled. + * Force a trace signal. + */ + siginfo.si_signo = SIGTRAP; + siginfo.si_errno = 0; + siginfo.si_code = TRAP_TRACE; + siginfo.si_flags = 0; + siginfo.si_isr = 0; + siginfo.si_addr = 0; + siginfo.si_imm = 0; + force_sig_info(SIGTRAP, &siginfo, current); + } + return rv; +} diff --git a/arch/ia64/kernel/cpufreq/Kconfig b/arch/ia64/kernel/cpufreq/Kconfig new file mode 100644 index 00000000000..2d9d5279b98 --- /dev/null +++ b/arch/ia64/kernel/cpufreq/Kconfig @@ -0,0 +1,29 @@ + +# +# CPU Frequency scaling +# + +menu "CPU Frequency scaling" + +source "drivers/cpufreq/Kconfig" + +if CPU_FREQ + +comment "CPUFreq processor drivers" + +config IA64_ACPI_CPUFREQ + tristate "ACPI Processor P-States driver" + select CPU_FREQ_TABLE + depends on ACPI_PROCESSOR + help + This driver adds a CPUFreq driver which utilizes the ACPI + Processor Performance States. + + For details, take a look at <file:Documentation/cpu-freq/>. + + If in doubt, say N. + +endif # CPU_FREQ + +endmenu + diff --git a/arch/ia64/kernel/cpufreq/Makefile b/arch/ia64/kernel/cpufreq/Makefile new file mode 100644 index 00000000000..4838f2a57c7 --- /dev/null +++ b/arch/ia64/kernel/cpufreq/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_IA64_ACPI_CPUFREQ) += acpi-cpufreq.o + diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c new file mode 100644 index 00000000000..f09b174244d --- /dev/null +++ b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c @@ -0,0 +1,437 @@ +/* + * arch/ia64/kernel/cpufreq/acpi-cpufreq.c + * This file provides the ACPI based P-state support. This + * module works with generic cpufreq infrastructure. Most of + * the code is based on i386 version + * (arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c) + * + * Copyright (C) 2005 Intel Corp + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <asm/io.h> +#include <asm/uaccess.h> +#include <asm/pal.h> + +#include <linux/acpi.h> +#include <acpi/processor.h> + +MODULE_AUTHOR("Venkatesh Pallipadi"); +MODULE_DESCRIPTION("ACPI Processor P-States Driver"); +MODULE_LICENSE("GPL"); + + +struct cpufreq_acpi_io { + struct acpi_processor_performance acpi_data; + struct cpufreq_frequency_table *freq_table; + unsigned int resume; +}; + +static struct cpufreq_acpi_io *acpi_io_data[NR_CPUS]; + +static struct cpufreq_driver acpi_cpufreq_driver; + + +static int +processor_set_pstate ( + u32 value) +{ + s64 retval; + + pr_debug("processor_set_pstate\n"); + + retval = ia64_pal_set_pstate((u64)value); + + if (retval) { + pr_debug("Failed to set freq to 0x%x, with error 0x%lx\n", + value, retval); + return -ENODEV; + } + return (int)retval; +} + + +static int +processor_get_pstate ( + u32 *value) +{ + u64 pstate_index = 0; + s64 retval; + + pr_debug("processor_get_pstate\n"); + + retval = ia64_pal_get_pstate(&pstate_index, + PAL_GET_PSTATE_TYPE_INSTANT); + *value = (u32) pstate_index; + + if (retval) + pr_debug("Failed to get current freq with " + "error 0x%lx, idx 0x%x\n", retval, *value); + + return (int)retval; +} + + +/* To be used only after data->acpi_data is initialized */ +static unsigned +extract_clock ( + struct cpufreq_acpi_io *data, + unsigned value, + unsigned int cpu) +{ + unsigned long i; + + pr_debug("extract_clock\n"); + + for (i = 0; i < data->acpi_data.state_count; i++) { + if (value == data->acpi_data.states[i].status) + return data->acpi_data.states[i].core_frequency; + } + return data->acpi_data.states[i-1].core_frequency; +} + + +static unsigned int +processor_get_freq ( + struct cpufreq_acpi_io *data, + unsigned int cpu) +{ + int ret = 0; + u32 value = 0; + cpumask_t saved_mask; + unsigned long clock_freq; + + pr_debug("processor_get_freq\n"); + + saved_mask = current->cpus_allowed; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + if (smp_processor_id() != cpu) + goto migrate_end; + + /* processor_get_pstate gets the instantaneous frequency */ + ret = processor_get_pstate(&value); + + if (ret) { + set_cpus_allowed_ptr(current, &saved_mask); + printk(KERN_WARNING "get performance failed with error %d\n", + ret); + ret = 0; + goto migrate_end; + } + clock_freq = extract_clock(data, value, cpu); + ret = (clock_freq*1000); + +migrate_end: + set_cpus_allowed_ptr(current, &saved_mask); + return ret; +} + + +static int +processor_set_freq ( + struct cpufreq_acpi_io *data, + unsigned int cpu, + int state) +{ + int ret = 0; + u32 value = 0; + struct cpufreq_freqs cpufreq_freqs; + cpumask_t saved_mask; + int retval; + + pr_debug("processor_set_freq\n"); + + saved_mask = current->cpus_allowed; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + if (smp_processor_id() != cpu) { + retval = -EAGAIN; + goto migrate_end; + } + + if (state == data->acpi_data.state) { + if (unlikely(data->resume)) { + pr_debug("Called after resume, resetting to P%d\n", state); + data->resume = 0; + } else { + pr_debug("Already at target state (P%d)\n", state); + retval = 0; + goto migrate_end; + } + } + + pr_debug("Transitioning from P%d to P%d\n", + data->acpi_data.state, state); + + /* cpufreq frequency struct */ + cpufreq_freqs.cpu = cpu; + cpufreq_freqs.old = data->freq_table[data->acpi_data.state].frequency; + cpufreq_freqs.new = data->freq_table[state].frequency; + + /* notify cpufreq */ + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); + + /* + * First we write the target state's 'control' value to the + * control_register. + */ + + value = (u32) data->acpi_data.states[state].control; + + pr_debug("Transitioning to state: 0x%08x\n", value); + + ret = processor_set_pstate(value); + if (ret) { + unsigned int tmp = cpufreq_freqs.new; + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + cpufreq_freqs.new = cpufreq_freqs.old; + cpufreq_freqs.old = tmp; + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + printk(KERN_WARNING "Transition failed with error %d\n", ret); + retval = -ENODEV; + goto migrate_end; + } + + cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + + data->acpi_data.state = state; + + retval = 0; + +migrate_end: + set_cpus_allowed_ptr(current, &saved_mask); + return (retval); +} + + +static unsigned int +acpi_cpufreq_get ( + unsigned int cpu) +{ + struct cpufreq_acpi_io *data = acpi_io_data[cpu]; + + pr_debug("acpi_cpufreq_get\n"); + + return processor_get_freq(data, cpu); +} + + +static int +acpi_cpufreq_target ( + struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; + unsigned int next_state = 0; + unsigned int result = 0; + + pr_debug("acpi_cpufreq_setpolicy\n"); + + result = cpufreq_frequency_table_target(policy, + data->freq_table, target_freq, relation, &next_state); + if (result) + return (result); + + result = processor_set_freq(data, policy->cpu, next_state); + + return (result); +} + + +static int +acpi_cpufreq_verify ( + struct cpufreq_policy *policy) +{ + unsigned int result = 0; + struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; + + pr_debug("acpi_cpufreq_verify\n"); + + result = cpufreq_frequency_table_verify(policy, + data->freq_table); + + return (result); +} + + +static int +acpi_cpufreq_cpu_init ( + struct cpufreq_policy *policy) +{ + unsigned int i; + unsigned int cpu = policy->cpu; + struct cpufreq_acpi_io *data; + unsigned int result = 0; + + pr_debug("acpi_cpufreq_cpu_init\n"); + + data = kzalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL); + if (!data) + return (-ENOMEM); + + acpi_io_data[cpu] = data; + + result = acpi_processor_register_performance(&data->acpi_data, cpu); + + if (result) + goto err_free; + + /* capability check */ + if (data->acpi_data.state_count <= 1) { + pr_debug("No P-States\n"); + result = -ENODEV; + goto err_unreg; + } + + if ((data->acpi_data.control_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data.status_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE)) { + pr_debug("Unsupported address space [%d, %d]\n", + (u32) (data->acpi_data.control_register.space_id), + (u32) (data->acpi_data.status_register.space_id)); + result = -ENODEV; + goto err_unreg; + } + + /* alloc freq_table */ + data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * + (data->acpi_data.state_count + 1), + GFP_KERNEL); + if (!data->freq_table) { + result = -ENOMEM; + goto err_unreg; + } + + /* detect transition latency */ + policy->cpuinfo.transition_latency = 0; + for (i=0; i<data->acpi_data.state_count; i++) { + if ((data->acpi_data.states[i].transition_latency * 1000) > + policy->cpuinfo.transition_latency) { + policy->cpuinfo.transition_latency = + data->acpi_data.states[i].transition_latency * 1000; + } + } + policy->cur = processor_get_freq(data, policy->cpu); + + /* table init */ + for (i = 0; i <= data->acpi_data.state_count; i++) + { + data->freq_table[i].index = i; + if (i < data->acpi_data.state_count) { + data->freq_table[i].frequency = + data->acpi_data.states[i].core_frequency * 1000; + } else { + data->freq_table[i].frequency = CPUFREQ_TABLE_END; + } + } + + result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); + if (result) { + goto err_freqfree; + } + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + printk(KERN_INFO "acpi-cpufreq: CPU%u - ACPI performance management " + "activated.\n", cpu); + + for (i = 0; i < data->acpi_data.state_count; i++) + pr_debug(" %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n", + (i == data->acpi_data.state?'*':' '), i, + (u32) data->acpi_data.states[i].core_frequency, + (u32) data->acpi_data.states[i].power, + (u32) data->acpi_data.states[i].transition_latency, + (u32) data->acpi_data.states[i].bus_master_latency, + (u32) data->acpi_data.states[i].status, + (u32) data->acpi_data.states[i].control); + + cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); + + /* the first call to ->target() should result in us actually + * writing something to the appropriate registers. */ + data->resume = 1; + + return (result); + + err_freqfree: + kfree(data->freq_table); + err_unreg: + acpi_processor_unregister_performance(&data->acpi_data, cpu); + err_free: + kfree(data); + acpi_io_data[cpu] = NULL; + + return (result); +} + + +static int +acpi_cpufreq_cpu_exit ( + struct cpufreq_policy *policy) +{ + struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; + + pr_debug("acpi_cpufreq_cpu_exit\n"); + + if (data) { + cpufreq_frequency_table_put_attr(policy->cpu); + acpi_io_data[policy->cpu] = NULL; + acpi_processor_unregister_performance(&data->acpi_data, + policy->cpu); + kfree(data); + } + + return (0); +} + + +static struct freq_attr* acpi_cpufreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + + +static struct cpufreq_driver acpi_cpufreq_driver = { + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .get = acpi_cpufreq_get, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, +}; + + +static int __init +acpi_cpufreq_init (void) +{ + pr_debug("acpi_cpufreq_init\n"); + + return cpufreq_register_driver(&acpi_cpufreq_driver); +} + + +static void __exit +acpi_cpufreq_exit (void) +{ + pr_debug("acpi_cpufreq_exit\n"); + + cpufreq_unregister_driver(&acpi_cpufreq_driver); + return; +} + + +late_initcall(acpi_cpufreq_init); +module_exit(acpi_cpufreq_exit); + diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c new file mode 100644 index 00000000000..b942f4032d7 --- /dev/null +++ b/arch/ia64/kernel/crash.c @@ -0,0 +1,286 @@ +/* + * arch/ia64/kernel/crash.c + * + * Architecture specific (ia64) functions for kexec based crash dumps. + * + * Created by: Khalid Aziz <khalid.aziz@hp.com> + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2005 Intel Corp Zou Nan hai <nanhai.zou@intel.com> + * + */ +#include <linux/smp.h> +#include <linux/delay.h> +#include <linux/crash_dump.h> +#include <linux/bootmem.h> +#include <linux/kexec.h> +#include <linux/elfcore.h> +#include <linux/sysctl.h> +#include <linux/init.h> +#include <linux/kdebug.h> + +#include <asm/mca.h> + +int kdump_status[NR_CPUS]; +static atomic_t kdump_cpu_frozen; +atomic_t kdump_in_progress; +static int kdump_freeze_monarch; +static int kdump_on_init = 1; +static int kdump_on_fatal_mca = 1; + +static inline Elf64_Word +*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += (sizeof(*note) + 3)/4; + memcpy(buf, name, note->n_namesz); + buf += (note->n_namesz + 3)/4; + memcpy(buf, data, data_len); + buf += (data_len + 3)/4; + return buf; +} + +static void +final_note(void *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +extern void ia64_dump_cpu_regs(void *); + +static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); + +void +crash_save_this_cpu(void) +{ + void *buf; + unsigned long cfm, sof, sol; + + int cpu = smp_processor_id(); + struct elf_prstatus *prstatus = &per_cpu(elf_prstatus, cpu); + + elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg); + memset(prstatus, 0, sizeof(*prstatus)); + prstatus->pr_pid = current->pid; + + ia64_dump_cpu_regs(dst); + cfm = dst[43]; + sol = (cfm >> 7) & 0x7f; + sof = cfm & 0x7f; + dst[46] = (unsigned long)ia64_rse_skip_regs((unsigned long *)dst[46], + sof - sol); + + buf = (u64 *) per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, prstatus, + sizeof(*prstatus)); + final_note(buf); +} + +#ifdef CONFIG_SMP +static int +kdump_wait_cpu_freeze(void) +{ + int cpu_num = num_online_cpus() - 1; + int timeout = 1000; + while(timeout-- > 0) { + if (atomic_read(&kdump_cpu_frozen) == cpu_num) + return 0; + udelay(1000); + } + return 1; +} +#endif + +void +machine_crash_shutdown(struct pt_regs *pt) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + kexec_disable_iosapic(); +#ifdef CONFIG_SMP + /* + * If kdump_on_init is set and an INIT is asserted here, kdump will + * be started again via INIT monarch. + */ + local_irq_disable(); + ia64_set_psr_mc(); /* mask MCA/INIT */ + if (atomic_inc_return(&kdump_in_progress) != 1) + unw_init_running(kdump_cpu_freeze, NULL); + + /* + * Now this cpu is ready for kdump. + * Stop all others by IPI or INIT. They could receive INIT from + * outside and might be INIT monarch, but only thing they have to + * do is falling into kdump_cpu_freeze(). + * + * If an INIT is asserted here: + * - All receivers might be slaves, since some of cpus could already + * be frozen and INIT might be masked on monarch. In this case, + * all slaves will be frozen soon since kdump_in_progress will let + * them into DIE_INIT_SLAVE_LEAVE. + * - One might be a monarch, but INIT rendezvous will fail since + * at least this cpu already have INIT masked so it never join + * to the rendezvous. In this case, all slaves and monarch will + * be frozen soon with no wait since the INIT rendezvous is skipped + * by kdump_in_progress. + */ + kdump_smp_send_stop(); + /* not all cpu response to IPI, send INIT to freeze them */ + if (kdump_wait_cpu_freeze()) { + kdump_smp_send_init(); + /* wait again, don't go ahead if possible */ + kdump_wait_cpu_freeze(); + } +#endif +} + +static void +machine_kdump_on_init(void) +{ + crash_save_vmcoreinfo(); + local_irq_disable(); + kexec_disable_iosapic(); + machine_kexec(ia64_kimage); +} + +void +kdump_cpu_freeze(struct unw_frame_info *info, void *arg) +{ + int cpuid; + + local_irq_disable(); + cpuid = smp_processor_id(); + crash_save_this_cpu(); + current->thread.ksp = (__u64)info->sw - 16; + + ia64_set_psr_mc(); /* mask MCA/INIT and stop reentrance */ + + atomic_inc(&kdump_cpu_frozen); + kdump_status[cpuid] = 1; + mb(); + for (;;) + cpu_relax(); +} + +static int +kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) +{ + struct ia64_mca_notify_die *nd; + struct die_args *args = data; + + if (atomic_read(&kdump_in_progress)) { + switch (val) { + case DIE_INIT_MONARCH_LEAVE: + if (!kdump_freeze_monarch) + break; + /* fall through */ + case DIE_INIT_SLAVE_LEAVE: + case DIE_INIT_MONARCH_ENTER: + case DIE_MCA_RENDZVOUS_LEAVE: + unw_init_running(kdump_cpu_freeze, NULL); + break; + } + } + + if (!kdump_on_init && !kdump_on_fatal_mca) + return NOTIFY_DONE; + + if (!ia64_kimage) { + if (val == DIE_INIT_MONARCH_LEAVE) + ia64_mca_printk(KERN_NOTICE + "%s: kdump not configured\n", + __func__); + return NOTIFY_DONE; + } + + if (val != DIE_INIT_MONARCH_LEAVE && + val != DIE_INIT_MONARCH_PROCESS && + val != DIE_MCA_MONARCH_LEAVE) + return NOTIFY_DONE; + + nd = (struct ia64_mca_notify_die *)args->err; + + switch (val) { + case DIE_INIT_MONARCH_PROCESS: + /* Reason code 1 means machine check rendezvous*/ + if (kdump_on_init && (nd->sos->rv_rc != 1)) { + if (atomic_inc_return(&kdump_in_progress) != 1) + kdump_freeze_monarch = 1; + } + break; + case DIE_INIT_MONARCH_LEAVE: + /* Reason code 1 means machine check rendezvous*/ + if (kdump_on_init && (nd->sos->rv_rc != 1)) + machine_kdump_on_init(); + break; + case DIE_MCA_MONARCH_LEAVE: + /* *(nd->data) indicate if MCA is recoverable */ + if (kdump_on_fatal_mca && !(*(nd->data))) { + if (atomic_inc_return(&kdump_in_progress) == 1) + machine_kdump_on_init(); + /* We got fatal MCA while kdump!? No way!! */ + } + break; + } + return NOTIFY_DONE; +} + +#ifdef CONFIG_SYSCTL +static ctl_table kdump_ctl_table[] = { + { + .procname = "kdump_on_init", + .data = &kdump_on_init, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "kdump_on_fatal_mca", + .data = &kdump_on_fatal_mca, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static ctl_table sys_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kdump_ctl_table, + }, + { } +}; +#endif + +static int +machine_crash_setup(void) +{ + /* be notified before default_monarch_init_process */ + static struct notifier_block kdump_init_notifier_nb = { + .notifier_call = kdump_init_notifier, + .priority = 1, + }; + int ret; + if((ret = register_die_notifier(&kdump_init_notifier_nb)) != 0) + return ret; +#ifdef CONFIG_SYSCTL + register_sysctl_table(sys_table); +#endif + return 0; +} + +__initcall(machine_crash_setup); + diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c new file mode 100644 index 00000000000..c8c9298666f --- /dev/null +++ b/arch/ia64/kernel/crash_dump.c @@ -0,0 +1,50 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Simon Horman <horms@verge.net.au> + * Original code moved from kernel/crash.c + * Original code comment copied from the i386 version of this file + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/uaccess.h> + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + * + * Calling copy_to_user() in atomic context is not desirable. Hence first + * copying the data to a pre-allocated kernel page and then copying to user + * space in non-atomic context. + */ +ssize_t +copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + vaddr = __va(pfn<<PAGE_SHIFT); + if (userbuf) { + if (copy_to_user(buf, (vaddr + offset), csize)) { + return -EFAULT; + } + } else + memcpy(buf, (vaddr + offset), csize); + return csize; +} + diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c new file mode 100644 index 00000000000..4826ff957a3 --- /dev/null +++ b/arch/ia64/kernel/cyclone.c @@ -0,0 +1,124 @@ +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/time.h> +#include <linux/errno.h> +#include <linux/timex.h> +#include <linux/clocksource.h> +#include <asm/io.h> + +/* IBM Summit (EXA) Cyclone counter code*/ +#define CYCLONE_CBAR_ADDR 0xFEB00CD0 +#define CYCLONE_PMCC_OFFSET 0x51A0 +#define CYCLONE_MPMC_OFFSET 0x51D0 +#define CYCLONE_MPCS_OFFSET 0x51A8 +#define CYCLONE_TIMER_FREQ 100000000 + +int use_cyclone; +void __init cyclone_setup(void) +{ + use_cyclone = 1; +} + +static void __iomem *cyclone_mc; + +static cycle_t read_cyclone(struct clocksource *cs) +{ + return (cycle_t)readq((void __iomem *)cyclone_mc); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 300, + .read = read_cyclone, + .mask = (1LL << 40) - 1, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +int __init init_cyclone_clock(void) +{ + u64 __iomem *reg; + u64 base; /* saved cyclone base address */ + u64 offset; /* offset from pageaddr to cyclone_timer register */ + int i; + u32 __iomem *cyclone_timer; /* Cyclone MPMC0 register */ + + if (!use_cyclone) + return 0; + + printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); + + /* find base address */ + offset = (CYCLONE_CBAR_ADDR); + reg = ioremap_nocache(offset, sizeof(u64)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR" + " register.\n"); + use_cyclone = 0; + return -ENODEV; + } + base = readq(reg); + iounmap(reg); + if(!base){ + printk(KERN_ERR "Summit chipset: Could not find valid CBAR" + " value.\n"); + use_cyclone = 0; + return -ENODEV; + } + + /* setup PMCC */ + offset = (base + CYCLONE_PMCC_OFFSET); + reg = ioremap_nocache(offset, sizeof(u64)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid PMCC" + " register.\n"); + use_cyclone = 0; + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* setup MPCS */ + offset = (base + CYCLONE_MPCS_OFFSET); + reg = ioremap_nocache(offset, sizeof(u64)); + if(!reg){ + printk(KERN_ERR "Summit chipset: Could not find valid MPCS" + " register.\n"); + use_cyclone = 0; + return -ENODEV; + } + writel(0x00000001,reg); + iounmap(reg); + + /* map in cyclone_timer */ + offset = (base + CYCLONE_MPMC_OFFSET); + cyclone_timer = ioremap_nocache(offset, sizeof(u32)); + if(!cyclone_timer){ + printk(KERN_ERR "Summit chipset: Could not find valid MPMC" + " register.\n"); + use_cyclone = 0; + return -ENODEV; + } + + /*quick test to make sure its ticking*/ + for(i=0; i<3; i++){ + u32 old = readl(cyclone_timer); + int stall = 100; + while(stall--) barrier(); + if(readl(cyclone_timer) == old){ + printk(KERN_ERR "Summit chipset: Counter not counting!" + " DISABLED\n"); + iounmap(cyclone_timer); + cyclone_timer = NULL; + use_cyclone = 0; + return -ENODEV; + } + } + /* initialize last tick */ + cyclone_mc = cyclone_timer; + clocksource_cyclone.archdata.fsys_mmio = cyclone_timer; + clocksource_register_hz(&clocksource_cyclone, CYCLONE_TIMER_FREQ); + + return 0; +} + +__initcall(init_cyclone_clock); diff --git a/arch/ia64/kernel/dma-mapping.c b/arch/ia64/kernel/dma-mapping.c new file mode 100644 index 00000000000..7f791623820 --- /dev/null +++ b/arch/ia64/kernel/dma-mapping.c @@ -0,0 +1,24 @@ +#include <linux/dma-mapping.h> +#include <linux/export.h> + +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly; + +struct dma_map_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); + +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) + +static int __init dma_init(void) +{ + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + + return 0; +} +fs_initcall(dma_init); + +struct dma_map_ops *dma_get_ops(struct device *dev) +{ + return dma_ops; +} +EXPORT_SYMBOL(dma_get_ops); diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c new file mode 100644 index 00000000000..c38d22e5e90 --- /dev/null +++ b/arch/ia64/kernel/efi.c @@ -0,0 +1,1369 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 0.9 + * April 30, 1999 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2003 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * (c) Copyright 2006 Hewlett-Packard Development Company, L.P. + * Bjorn Helgaas <bjorn.helgaas@hp.com> + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: <goutham.rao@intel.com> + * Skip non-WB memory and ignore empty memory ranges. + */ +#include <linux/module.h> +#include <linux/bootmem.h> +#include <linux/crash_dump.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/efi.h> +#include <linux/kexec.h> +#include <linux/mm.h> + +#include <asm/io.h> +#include <asm/kregs.h> +#include <asm/meminit.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/mca.h> +#include <asm/tlbflush.h> + +#define EFI_DEBUG 0 + +extern efi_status_t efi_call_phys (void *, ...); + +struct efi efi; +EXPORT_SYMBOL(efi); +static efi_runtime_services_t *runtime; +static u64 mem_limit = ~0UL, max_addr = ~0UL, min_addr = 0UL; + +#define efi_call_virt(f, args...) (*(f))(args) + +#define STUB_GET_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_time (efi_time_t *tm, efi_time_cap_t *tc) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_time_cap_t *atc = NULL; \ + efi_status_t ret; \ + \ + if (tc) \ + atc = adjust_arg(tc); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), \ + adjust_arg(tm), atc); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_time (efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_set_time_t *) __va(runtime->set_time), \ + adjust_arg(tm)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_WAKEUP_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_wakeup_time (efi_bool_t *enabled, efi_bool_t *pending, \ + efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_get_wakeup_time_t *) __va(runtime->get_wakeup_time), \ + adjust_arg(enabled), adjust_arg(pending), adjust_arg(tm)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_WAKEUP_TIME(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_wakeup_time (efi_bool_t enabled, efi_time_t *tm) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_time_t *atm = NULL; \ + efi_status_t ret; \ + \ + if (tm) \ + atm = adjust_arg(tm); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_set_wakeup_time_t *) __va(runtime->set_wakeup_time), \ + enabled, atm); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_variable (efi_char16_t *name, efi_guid_t *vendor, u32 *attr, \ + unsigned long *data_size, void *data) \ +{ \ + struct ia64_fpreg fr[6]; \ + u32 *aattr = NULL; \ + efi_status_t ret; \ + \ + if (attr) \ + aattr = adjust_arg(attr); \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_get_variable_t *) __va(runtime->get_variable), \ + adjust_arg(name), adjust_arg(vendor), aattr, \ + adjust_arg(data_size), adjust_arg(data)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_NEXT_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_next_variable (unsigned long *name_size, efi_char16_t *name, \ + efi_guid_t *vendor) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_get_next_variable_t *) __va(runtime->get_next_variable), \ + adjust_arg(name_size), adjust_arg(name), adjust_arg(vendor)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_SET_VARIABLE(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_set_variable (efi_char16_t *name, efi_guid_t *vendor, \ + u32 attr, unsigned long data_size, \ + void *data) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix( \ + (efi_set_variable_t *) __va(runtime->set_variable), \ + adjust_arg(name), adjust_arg(vendor), attr, data_size, \ + adjust_arg(data)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_GET_NEXT_HIGH_MONO_COUNT(prefix, adjust_arg) \ +static efi_status_t \ +prefix##_get_next_high_mono_count (u32 *count) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_status_t ret; \ + \ + ia64_save_scratch_fpregs(fr); \ + ret = efi_call_##prefix((efi_get_next_high_mono_count_t *) \ + __va(runtime->get_next_high_mono_count), \ + adjust_arg(count)); \ + ia64_load_scratch_fpregs(fr); \ + return ret; \ +} + +#define STUB_RESET_SYSTEM(prefix, adjust_arg) \ +static void \ +prefix##_reset_system (int reset_type, efi_status_t status, \ + unsigned long data_size, efi_char16_t *data) \ +{ \ + struct ia64_fpreg fr[6]; \ + efi_char16_t *adata = NULL; \ + \ + if (data) \ + adata = adjust_arg(data); \ + \ + ia64_save_scratch_fpregs(fr); \ + efi_call_##prefix( \ + (efi_reset_system_t *) __va(runtime->reset_system), \ + reset_type, status, data_size, adata); \ + /* should not return, but just in case... */ \ + ia64_load_scratch_fpregs(fr); \ +} + +#define phys_ptr(arg) ((__typeof__(arg)) ia64_tpa(arg)) + +STUB_GET_TIME(phys, phys_ptr) +STUB_SET_TIME(phys, phys_ptr) +STUB_GET_WAKEUP_TIME(phys, phys_ptr) +STUB_SET_WAKEUP_TIME(phys, phys_ptr) +STUB_GET_VARIABLE(phys, phys_ptr) +STUB_GET_NEXT_VARIABLE(phys, phys_ptr) +STUB_SET_VARIABLE(phys, phys_ptr) +STUB_GET_NEXT_HIGH_MONO_COUNT(phys, phys_ptr) +STUB_RESET_SYSTEM(phys, phys_ptr) + +#define id(arg) arg + +STUB_GET_TIME(virt, id) +STUB_SET_TIME(virt, id) +STUB_GET_WAKEUP_TIME(virt, id) +STUB_SET_WAKEUP_TIME(virt, id) +STUB_GET_VARIABLE(virt, id) +STUB_GET_NEXT_VARIABLE(virt, id) +STUB_SET_VARIABLE(virt, id) +STUB_GET_NEXT_HIGH_MONO_COUNT(virt, id) +STUB_RESET_SYSTEM(virt, id) + +void +efi_gettimeofday (struct timespec *ts) +{ + efi_time_t tm; + + if ((*efi.get_time)(&tm, NULL) != EFI_SUCCESS) { + memset(ts, 0, sizeof(*ts)); + return; + } + + ts->tv_sec = mktime(tm.year, tm.month, tm.day, + tm.hour, tm.minute, tm.second); + ts->tv_nsec = tm.nanosecond; +} + +static int +is_memory_available (efi_memory_desc_t *md) +{ + if (!(md->attribute & EFI_MEMORY_WB)) + return 0; + + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + return 1; + } + return 0; +} + +typedef struct kern_memdesc { + u64 attribute; + u64 start; + u64 num_pages; +} kern_memdesc_t; + +static kern_memdesc_t *kern_memmap; + +#define efi_md_size(md) (md->num_pages << EFI_PAGE_SHIFT) + +static inline u64 +kmd_end(kern_memdesc_t *kmd) +{ + return (kmd->start + (kmd->num_pages << EFI_PAGE_SHIFT)); +} + +static inline u64 +efi_md_end(efi_memory_desc_t *md) +{ + return (md->phys_addr + efi_md_size(md)); +} + +static inline int +efi_wb(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_WB); +} + +static inline int +efi_uc(efi_memory_desc_t *md) +{ + return (md->attribute & EFI_MEMORY_UC); +} + +static void +walk (efi_freemem_callback_t callback, void *arg, u64 attr) +{ + kern_memdesc_t *k; + u64 start, end, voff; + + voff = (attr == EFI_MEMORY_WB) ? PAGE_OFFSET : __IA64_UNCACHED_OFFSET; + for (k = kern_memmap; k->start != ~0UL; k++) { + if (k->attribute != attr) + continue; + start = PAGE_ALIGN(k->start); + end = (k->start + (k->num_pages << EFI_PAGE_SHIFT)) & PAGE_MASK; + if (start < end) + if ((*callback)(start + voff, end + voff, arg) < 0) + return; + } +} + +/* + * Walk the EFI memory map and call CALLBACK once for each EFI memory + * descriptor that has memory that is available for OS use. + */ +void +efi_memmap_walk (efi_freemem_callback_t callback, void *arg) +{ + walk(callback, arg, EFI_MEMORY_WB); +} + +/* + * Walk the EFI memory map and call CALLBACK once for each EFI memory + * descriptor that has memory that is available for uncached allocator. + */ +void +efi_memmap_walk_uc (efi_freemem_callback_t callback, void *arg) +{ + walk(callback, arg, EFI_MEMORY_UC); +} + +/* + * Look for the PAL_CODE region reported by EFI and map it using an + * ITR to enable safe PAL calls in virtual mode. See IA-64 Processor + * Abstraction Layer chapter 11 in ADAG + */ +void * +efi_get_pal_addr (void) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + int pal_code_count = 0; + u64 vaddr, mask; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->type != EFI_PAL_CODE) + continue; + + if (++pal_code_count > 1) { + printk(KERN_ERR "Too many EFI Pal Code memory ranges, " + "dropped @ %llx\n", md->phys_addr); + continue; + } + /* + * The only ITLB entry in region 7 that is used is the one + * installed by __start(). That entry covers a 64MB range. + */ + mask = ~((1 << KERNEL_TR_PAGE_SHIFT) - 1); + vaddr = PAGE_OFFSET + md->phys_addr; + + /* + * We must check that the PAL mapping won't overlap with the + * kernel mapping. + * + * PAL code is guaranteed to be aligned on a power of 2 between + * 4k and 256KB and that only one ITR is needed to map it. This + * implies that the PAL code is always aligned on its size, + * i.e., the closest matching page size supported by the TLB. + * Therefore PAL code is guaranteed never to cross a 64MB unless + * it is bigger than 64MB (very unlikely!). So for now the + * following test is enough to determine whether or not we need + * a dedicated ITR for the PAL code. + */ + if ((vaddr & mask) == (KERNEL_START & mask)) { + printk(KERN_INFO "%s: no need to install ITR for PAL code\n", + __func__); + continue; + } + + if (efi_md_size(md) > IA64_GRANULE_SIZE) + panic("Whoa! PAL code size bigger than a granule!"); + +#if EFI_DEBUG + mask = ~((1 << IA64_GRANULE_SHIFT) - 1); + + printk(KERN_INFO "CPU %d: mapping PAL code " + "[0x%lx-0x%lx) into [0x%lx-0x%lx)\n", + smp_processor_id(), md->phys_addr, + md->phys_addr + efi_md_size(md), + vaddr & mask, (vaddr & mask) + IA64_GRANULE_SIZE); +#endif + return __va(md->phys_addr); + } + printk(KERN_WARNING "%s: no PAL-code memory-descriptor found\n", + __func__); + return NULL; +} + + +static u8 __init palo_checksum(u8 *buffer, u32 length) +{ + u8 sum = 0; + u8 *end = buffer + length; + + while (buffer < end) + sum = (u8) (sum + *(buffer++)); + + return sum; +} + +/* + * Parse and handle PALO table which is published at: + * http://www.dig64.org/home/DIG64_PALO_R1_0.pdf + */ +static void __init handle_palo(unsigned long palo_phys) +{ + struct palo_table *palo = __va(palo_phys); + u8 checksum; + + if (strncmp(palo->signature, PALO_SIG, sizeof(PALO_SIG) - 1)) { + printk(KERN_INFO "PALO signature incorrect.\n"); + return; + } + + checksum = palo_checksum((u8 *)palo, palo->length); + if (checksum) { + printk(KERN_INFO "PALO checksum incorrect.\n"); + return; + } + + setup_ptcg_sem(palo->max_tlb_purges, NPTCG_FROM_PALO); +} + +void +efi_map_pal_code (void) +{ + void *pal_vaddr = efi_get_pal_addr (); + u64 psr; + + if (!pal_vaddr) + return; + + /* + * Cannot write to CRx with PSR.ic=1 + */ + psr = ia64_clear_ic(); + ia64_itr(0x1, IA64_TR_PALCODE, + GRANULEROUNDDOWN((unsigned long) pal_vaddr), + pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)), + IA64_GRANULE_SHIFT); + paravirt_dv_serialize_data(); + ia64_set_psr(psr); /* restore psr */ +} + +void __init +efi_init (void) +{ + void *efi_map_start, *efi_map_end; + efi_config_table_t *config_tables; + efi_char16_t *c16; + u64 efi_desc_size; + char *cp, vendor[100] = "unknown"; + int i; + unsigned long palo_phys; + + /* + * It's too early to be able to use the standard kernel command line + * support... + */ + for (cp = boot_command_line; *cp; ) { + if (memcmp(cp, "mem=", 4) == 0) { + mem_limit = memparse(cp + 4, &cp); + } else if (memcmp(cp, "max_addr=", 9) == 0) { + max_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp)); + } else if (memcmp(cp, "min_addr=", 9) == 0) { + min_addr = GRANULEROUNDDOWN(memparse(cp + 9, &cp)); + } else { + while (*cp != ' ' && *cp) + ++cp; + while (*cp == ' ') + ++cp; + } + } + if (min_addr != 0UL) + printk(KERN_INFO "Ignoring memory below %lluMB\n", + min_addr >> 20); + if (max_addr != ~0UL) + printk(KERN_INFO "Ignoring memory above %lluMB\n", + max_addr >> 20); + + efi.systab = __va(ia64_boot_param->efi_systab); + + /* + * Verify the EFI Table + */ + if (efi.systab == NULL) + panic("Whoa! Can't find EFI system table.\n"); + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + panic("Whoa! EFI system table signature incorrect\n"); + if ((efi.systab->hdr.revision >> 16) == 0) + printk(KERN_WARNING "Warning: EFI system table version " + "%d.%02d, expected 1.00 or greater\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + config_tables = __va(efi.systab->tables); + + /* Show what we know for posterity */ + c16 = __va(efi.systab->fw_vendor); + if (c16) { + for (i = 0;i < (int) sizeof(vendor) - 1 && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } + + printk(KERN_INFO "EFI v%u.%.02u by %s:", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + efi.mps = EFI_INVALID_TABLE_ADDR; + efi.acpi = EFI_INVALID_TABLE_ADDR; + efi.acpi20 = EFI_INVALID_TABLE_ADDR; + efi.smbios = EFI_INVALID_TABLE_ADDR; + efi.sal_systab = EFI_INVALID_TABLE_ADDR; + efi.boot_info = EFI_INVALID_TABLE_ADDR; + efi.hcdp = EFI_INVALID_TABLE_ADDR; + efi.uga = EFI_INVALID_TABLE_ADDR; + + palo_phys = EFI_INVALID_TABLE_ADDR; + + for (i = 0; i < (int) efi.systab->nr_tables; i++) { + if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { + efi.mps = config_tables[i].table; + printk(" MPS=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { + efi.acpi20 = config_tables[i].table; + printk(" ACPI 2.0=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { + efi.acpi = config_tables[i].table; + printk(" ACPI=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { + efi.smbios = config_tables[i].table; + printk(" SMBIOS=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, SAL_SYSTEM_TABLE_GUID) == 0) { + efi.sal_systab = config_tables[i].table; + printk(" SALsystab=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { + efi.hcdp = config_tables[i].table; + printk(" HCDP=0x%lx", config_tables[i].table); + } else if (efi_guidcmp(config_tables[i].guid, + PROCESSOR_ABSTRACTION_LAYER_OVERWRITE_GUID) == 0) { + palo_phys = config_tables[i].table; + printk(" PALO=0x%lx", config_tables[i].table); + } + } + printk("\n"); + + if (palo_phys != EFI_INVALID_TABLE_ADDR) + handle_palo(palo_phys); + + runtime = __va(efi.systab->runtime); + efi.get_time = phys_get_time; + efi.set_time = phys_set_time; + efi.get_wakeup_time = phys_get_wakeup_time; + efi.set_wakeup_time = phys_set_wakeup_time; + efi.get_variable = phys_get_variable; + efi.get_next_variable = phys_get_next_variable; + efi.set_variable = phys_set_variable; + efi.get_next_high_mono_count = phys_get_next_high_mono_count; + efi.reset_system = phys_reset_system; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + +#if EFI_DEBUG + /* print EFI memory map: */ + { + efi_memory_desc_t *md; + void *p; + + for (i = 0, p = efi_map_start; p < efi_map_end; + ++i, p += efi_desc_size) + { + const char *unit; + unsigned long size; + + md = p; + size = md->num_pages << EFI_PAGE_SHIFT; + + if ((size >> 40) > 0) { + size >>= 40; + unit = "TB"; + } else if ((size >> 30) > 0) { + size >>= 30; + unit = "GB"; + } else if ((size >> 20) > 0) { + size >>= 20; + unit = "MB"; + } else { + size >>= 10; + unit = "KB"; + } + + printk("mem%02d: type=%2u, attr=0x%016lx, " + "range=[0x%016lx-0x%016lx) (%4lu%s)\n", + i, md->type, md->attribute, md->phys_addr, + md->phys_addr + efi_md_size(md), size, unit); + } + } +#endif + + efi_map_pal_code(); + efi_enter_virtual_mode(); +} + +void +efi_enter_virtual_mode (void) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + efi_status_t status; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->attribute & EFI_MEMORY_RUNTIME) { + /* + * Some descriptors have multiple bits set, so the + * order of the tests is relevant. + */ + if (md->attribute & EFI_MEMORY_WB) { + md->virt_addr = (u64) __va(md->phys_addr); + } else if (md->attribute & EFI_MEMORY_UC) { + md->virt_addr = (u64) ioremap(md->phys_addr, 0); + } else if (md->attribute & EFI_MEMORY_WC) { +#if 0 + md->virt_addr = ia64_remap(md->phys_addr, + (_PAGE_A | + _PAGE_P | + _PAGE_D | + _PAGE_MA_WC | + _PAGE_PL_0 | + _PAGE_AR_RW)); +#else + printk(KERN_INFO "EFI_MEMORY_WC mapping\n"); + md->virt_addr = (u64) ioremap(md->phys_addr, 0); +#endif + } else if (md->attribute & EFI_MEMORY_WT) { +#if 0 + md->virt_addr = ia64_remap(md->phys_addr, + (_PAGE_A | + _PAGE_P | + _PAGE_D | + _PAGE_MA_WT | + _PAGE_PL_0 | + _PAGE_AR_RW)); +#else + printk(KERN_INFO "EFI_MEMORY_WT mapping\n"); + md->virt_addr = (u64) ioremap(md->phys_addr, 0); +#endif + } + } + } + + status = efi_call_phys(__va(runtime->set_virtual_address_map), + ia64_boot_param->efi_memmap_size, + efi_desc_size, + ia64_boot_param->efi_memdesc_version, + ia64_boot_param->efi_memmap); + if (status != EFI_SUCCESS) { + printk(KERN_WARNING "warning: unable to switch EFI into " + "virtual mode (status=%lu)\n", status); + return; + } + + /* + * Now that EFI is in virtual mode, we call the EFI functions more + * efficiently: + */ + efi.get_time = virt_get_time; + efi.set_time = virt_set_time; + efi.get_wakeup_time = virt_get_wakeup_time; + efi.set_wakeup_time = virt_set_wakeup_time; + efi.get_variable = virt_get_variable; + efi.get_next_variable = virt_get_next_variable; + efi.set_variable = virt_set_variable; + efi.get_next_high_mono_count = virt_get_next_high_mono_count; + efi.reset_system = virt_reset_system; +} + +/* + * Walk the EFI memory map looking for the I/O port range. There can only be + * one entry of this type, other I/O port ranges should be described via ACPI. + */ +u64 +efi_get_iobase (void) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) { + if (md->attribute & EFI_MEMORY_UC) + return md->phys_addr; + } + } + return 0; +} + +static struct kern_memdesc * +kern_memory_descriptor (unsigned long phys_addr) +{ + struct kern_memdesc *md; + + for (md = kern_memmap; md->start != ~0UL; md++) { + if (phys_addr - md->start < (md->num_pages << EFI_PAGE_SHIFT)) + return md; + } + return NULL; +} + +static efi_memory_desc_t * +efi_memory_descriptor (unsigned long phys_addr) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (phys_addr - md->phys_addr < efi_md_size(md)) + return md; + } + return NULL; +} + +static int +efi_memmap_intersects (unsigned long phys_addr, unsigned long size) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + unsigned long end; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + end = phys_addr + size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (md->phys_addr < end && efi_md_end(md) > phys_addr) + return 1; + } + return 0; +} + +u32 +efi_mem_type (unsigned long phys_addr) +{ + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + + if (md) + return md->type; + return 0; +} + +u64 +efi_mem_attributes (unsigned long phys_addr) +{ + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + + if (md) + return md->attribute; + return 0; +} +EXPORT_SYMBOL(efi_mem_attributes); + +u64 +efi_mem_attribute (unsigned long phys_addr, unsigned long size) +{ + unsigned long end = phys_addr + size; + efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + u64 attr; + + if (!md) + return 0; + + /* + * EFI_MEMORY_RUNTIME is not a memory attribute; it just tells + * the kernel that firmware needs this region mapped. + */ + attr = md->attribute & ~EFI_MEMORY_RUNTIME; + do { + unsigned long md_end = efi_md_end(md); + + if (end <= md_end) + return attr; + + md = efi_memory_descriptor(md_end); + if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr) + return 0; + } while (md); + return 0; /* never reached */ +} + +u64 +kern_mem_attribute (unsigned long phys_addr, unsigned long size) +{ + unsigned long end = phys_addr + size; + struct kern_memdesc *md; + u64 attr; + + /* + * This is a hack for ioremap calls before we set up kern_memmap. + * Maybe we should do efi_memmap_init() earlier instead. + */ + if (!kern_memmap) { + attr = efi_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB) + return EFI_MEMORY_WB; + return 0; + } + + md = kern_memory_descriptor(phys_addr); + if (!md) + return 0; + + attr = md->attribute; + do { + unsigned long md_end = kmd_end(md); + + if (end <= md_end) + return attr; + + md = kern_memory_descriptor(md_end); + if (!md || md->attribute != attr) + return 0; + } while (md); + return 0; /* never reached */ +} +EXPORT_SYMBOL(kern_mem_attribute); + +int +valid_phys_addr_range (unsigned long phys_addr, unsigned long size) +{ + u64 attr; + + /* + * /dev/mem reads and writes use copy_to_user(), which implicitly + * uses a granule-sized kernel identity mapping. It's really + * only safe to do this for regions in kern_memmap. For more + * details, see Documentation/ia64/aliasing.txt. + */ + attr = kern_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) + return 1; + return 0; +} + +int +valid_mmap_phys_addr_range (unsigned long pfn, unsigned long size) +{ + unsigned long phys_addr = pfn << PAGE_SHIFT; + u64 attr; + + attr = efi_mem_attribute(phys_addr, size); + + /* + * /dev/mem mmap uses normal user pages, so we don't need the entire + * granule, but the entire region we're mapping must support the same + * attribute. + */ + if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) + return 1; + + /* + * Intel firmware doesn't tell us about all the MMIO regions, so + * in general we have to allow mmap requests. But if EFI *does* + * tell us about anything inside this region, we should deny it. + * The user can always map a smaller region to avoid the overlap. + */ + if (efi_memmap_intersects(phys_addr, size)) + return 0; + + return 1; +} + +pgprot_t +phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, + pgprot_t vma_prot) +{ + unsigned long phys_addr = pfn << PAGE_SHIFT; + u64 attr; + + /* + * For /dev/mem mmap, we use user mappings, but if the region is + * in kern_memmap (and hence may be covered by a kernel mapping), + * we must use the same attribute as the kernel mapping. + */ + attr = kern_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB) + return pgprot_cacheable(vma_prot); + else if (attr & EFI_MEMORY_UC) + return pgprot_noncached(vma_prot); + + /* + * Some chipsets don't support UC access to memory. If + * WB is supported, we prefer that. + */ + if (efi_mem_attribute(phys_addr, size) & EFI_MEMORY_WB) + return pgprot_cacheable(vma_prot); + + return pgprot_noncached(vma_prot); +} + +int __init +efi_uart_console_only(void) +{ + efi_status_t status; + char *s, name[] = "ConOut"; + efi_guid_t guid = EFI_GLOBAL_VARIABLE_GUID; + efi_char16_t *utf16, name_utf16[32]; + unsigned char data[1024]; + unsigned long size = sizeof(data); + struct efi_generic_dev_path *hdr, *end_addr; + int uart = 0; + + /* Convert to UTF-16 */ + utf16 = name_utf16; + s = name; + while (*s) + *utf16++ = *s++ & 0x7f; + *utf16 = 0; + + status = efi.get_variable(name_utf16, &guid, NULL, &size, data); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "No EFI %s variable?\n", name); + return 0; + } + + hdr = (struct efi_generic_dev_path *) data; + end_addr = (struct efi_generic_dev_path *) ((u8 *) data + size); + while (hdr < end_addr) { + if (hdr->type == EFI_DEV_MSG && + hdr->sub_type == EFI_DEV_MSG_UART) + uart = 1; + else if (hdr->type == EFI_DEV_END_PATH || + hdr->type == EFI_DEV_END_PATH2) { + if (!uart) + return 0; + if (hdr->sub_type == EFI_DEV_END_ENTIRE) + return 1; + uart = 0; + } + hdr = (struct efi_generic_dev_path *)((u8 *) hdr + hdr->length); + } + printk(KERN_ERR "Malformed %s value\n", name); + return 0; +} + +/* + * Look for the first granule aligned memory descriptor memory + * that is big enough to hold EFI memory map. Make sure this + * descriptor is atleast granule sized so it does not get trimmed + */ +struct kern_memdesc * +find_memmap_space (void) +{ + u64 contig_low=0, contig_high=0; + u64 as = 0, ae; + void *efi_map_start, *efi_map_end, *p, *q; + efi_memory_desc_t *md, *pmd = NULL, *check_md; + u64 space_needed, efi_desc_size; + unsigned long total_mem = 0; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + /* + * Worst case: we need 3 kernel descriptors for each efi descriptor + * (if every entry has a WB part in the middle, and UC head and tail), + * plus one for the end marker. + */ + space_needed = sizeof(kern_memdesc_t) * + (3 * (ia64_boot_param->efi_memmap_size/efi_desc_size) + 1); + + for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) { + md = p; + if (!efi_wb(md)) { + continue; + } + if (pmd == NULL || !efi_wb(pmd) || + efi_md_end(pmd) != md->phys_addr) { + contig_low = GRANULEROUNDUP(md->phys_addr); + contig_high = efi_md_end(md); + for (q = p + efi_desc_size; q < efi_map_end; + q += efi_desc_size) { + check_md = q; + if (!efi_wb(check_md)) + break; + if (contig_high != check_md->phys_addr) + break; + contig_high = efi_md_end(check_md); + } + contig_high = GRANULEROUNDDOWN(contig_high); + } + if (!is_memory_available(md) || md->type == EFI_LOADER_DATA) + continue; + + /* Round ends inward to granule boundaries */ + as = max(contig_low, md->phys_addr); + ae = min(contig_high, efi_md_end(md)); + + /* keep within max_addr= and min_addr= command line arg */ + as = max(as, min_addr); + ae = min(ae, max_addr); + if (ae <= as) + continue; + + /* avoid going over mem= command line arg */ + if (total_mem + (ae - as) > mem_limit) + ae -= total_mem + (ae - as) - mem_limit; + + if (ae <= as) + continue; + + if (ae - as > space_needed) + break; + } + if (p >= efi_map_end) + panic("Can't allocate space for kernel memory descriptors"); + + return __va(as); +} + +/* + * Walk the EFI memory map and gather all memory available for kernel + * to use. We can allocate partial granules only if the unavailable + * parts exist, and are WB. + */ +unsigned long +efi_memmap_init(u64 *s, u64 *e) +{ + struct kern_memdesc *k, *prev = NULL; + u64 contig_low=0, contig_high=0; + u64 as, ae, lim; + void *efi_map_start, *efi_map_end, *p, *q; + efi_memory_desc_t *md, *pmd = NULL, *check_md; + u64 efi_desc_size; + unsigned long total_mem = 0; + + k = kern_memmap = find_memmap_space(); + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; pmd = md, p += efi_desc_size) { + md = p; + if (!efi_wb(md)) { + if (efi_uc(md) && + (md->type == EFI_CONVENTIONAL_MEMORY || + md->type == EFI_BOOT_SERVICES_DATA)) { + k->attribute = EFI_MEMORY_UC; + k->start = md->phys_addr; + k->num_pages = md->num_pages; + k++; + } + continue; + } + if (pmd == NULL || !efi_wb(pmd) || + efi_md_end(pmd) != md->phys_addr) { + contig_low = GRANULEROUNDUP(md->phys_addr); + contig_high = efi_md_end(md); + for (q = p + efi_desc_size; q < efi_map_end; + q += efi_desc_size) { + check_md = q; + if (!efi_wb(check_md)) + break; + if (contig_high != check_md->phys_addr) + break; + contig_high = efi_md_end(check_md); + } + contig_high = GRANULEROUNDDOWN(contig_high); + } + if (!is_memory_available(md)) + continue; + +#ifdef CONFIG_CRASH_DUMP + /* saved_max_pfn should ignore max_addr= command line arg */ + if (saved_max_pfn < (efi_md_end(md) >> PAGE_SHIFT)) + saved_max_pfn = (efi_md_end(md) >> PAGE_SHIFT); +#endif + /* + * Round ends inward to granule boundaries + * Give trimmings to uncached allocator + */ + if (md->phys_addr < contig_low) { + lim = min(efi_md_end(md), contig_low); + if (efi_uc(md)) { + if (k > kern_memmap && + (k-1)->attribute == EFI_MEMORY_UC && + kmd_end(k-1) == md->phys_addr) { + (k-1)->num_pages += + (lim - md->phys_addr) + >> EFI_PAGE_SHIFT; + } else { + k->attribute = EFI_MEMORY_UC; + k->start = md->phys_addr; + k->num_pages = (lim - md->phys_addr) + >> EFI_PAGE_SHIFT; + k++; + } + } + as = contig_low; + } else + as = md->phys_addr; + + if (efi_md_end(md) > contig_high) { + lim = max(md->phys_addr, contig_high); + if (efi_uc(md)) { + if (lim == md->phys_addr && k > kern_memmap && + (k-1)->attribute == EFI_MEMORY_UC && + kmd_end(k-1) == md->phys_addr) { + (k-1)->num_pages += md->num_pages; + } else { + k->attribute = EFI_MEMORY_UC; + k->start = lim; + k->num_pages = (efi_md_end(md) - lim) + >> EFI_PAGE_SHIFT; + k++; + } + } + ae = contig_high; + } else + ae = efi_md_end(md); + + /* keep within max_addr= and min_addr= command line arg */ + as = max(as, min_addr); + ae = min(ae, max_addr); + if (ae <= as) + continue; + + /* avoid going over mem= command line arg */ + if (total_mem + (ae - as) > mem_limit) + ae -= total_mem + (ae - as) - mem_limit; + + if (ae <= as) + continue; + if (prev && kmd_end(prev) == md->phys_addr) { + prev->num_pages += (ae - as) >> EFI_PAGE_SHIFT; + total_mem += ae - as; + continue; + } + k->attribute = EFI_MEMORY_WB; + k->start = as; + k->num_pages = (ae - as) >> EFI_PAGE_SHIFT; + total_mem += ae - as; + prev = k++; + } + k->start = ~0L; /* end-marker */ + + /* reserve the memory we are using for kern_memmap */ + *s = (u64)kern_memmap; + *e = (u64)++k; + + return total_mem; +} + +void +efi_initialize_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) +{ + struct resource *res; + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + char *name; + unsigned long flags; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + res = NULL; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (md->num_pages == 0) /* should not happen */ + continue; + + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + switch (md->type) { + + case EFI_MEMORY_MAPPED_IO: + case EFI_MEMORY_MAPPED_IO_PORT_SPACE: + continue; + + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_CONVENTIONAL_MEMORY: + if (md->attribute & EFI_MEMORY_WP) { + name = "System ROM"; + flags |= IORESOURCE_READONLY; + } else if (md->attribute == EFI_MEMORY_UC) + name = "Uncached RAM"; + else + name = "System RAM"; + break; + + case EFI_ACPI_MEMORY_NVS: + name = "ACPI Non-volatile Storage"; + break; + + case EFI_UNUSABLE_MEMORY: + name = "reserved"; + flags |= IORESOURCE_DISABLED; + break; + + case EFI_RESERVED_TYPE: + case EFI_RUNTIME_SERVICES_CODE: + case EFI_RUNTIME_SERVICES_DATA: + case EFI_ACPI_RECLAIM_MEMORY: + default: + name = "reserved"; + break; + } + + if ((res = kzalloc(sizeof(struct resource), + GFP_KERNEL)) == NULL) { + printk(KERN_ERR + "failed to allocate resource for iomem\n"); + return; + } + + res->name = name; + res->start = md->phys_addr; + res->end = md->phys_addr + efi_md_size(md) - 1; + res->flags = flags; + + if (insert_resource(&iomem_resource, res) < 0) + kfree(res); + else { + /* + * We don't know which region contains + * kernel data so we try it repeatedly and + * let the resource manager test it. + */ + insert_resource(res, code_resource); + insert_resource(res, data_resource); + insert_resource(res, bss_resource); +#ifdef CONFIG_KEXEC + insert_resource(res, &efi_memmap_res); + insert_resource(res, &boot_param_res); + if (crashk_res.end > crashk_res.start) + insert_resource(res, &crashk_res); +#endif + } + } +} + +#ifdef CONFIG_KEXEC +/* find a block of memory aligned to 64M exclude reserved regions + rsvd_regions are sorted + */ +unsigned long __init +kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n) +{ + int i; + u64 start, end; + u64 alignment = 1UL << _PAGE_SIZE_64M; + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (!efi_wb(md)) + continue; + start = ALIGN(md->phys_addr, alignment); + end = efi_md_end(md); + for (i = 0; i < n; i++) { + if (__pa(r[i].start) >= start && __pa(r[i].end) < end) { + if (__pa(r[i].start) > start + size) + return start; + start = ALIGN(__pa(r[i].end), alignment); + if (i < n-1 && + __pa(r[i+1].start) < start + size) + continue; + else + break; + } + } + if (end > start + size) + return start; + } + + printk(KERN_WARNING + "Cannot reserve 0x%lx byte of memory for crashdump\n", size); + return ~0UL; +} +#endif + +#ifdef CONFIG_CRASH_DUMP +/* locate the size find a the descriptor at a certain address */ +unsigned long __init +vmcore_find_descriptor_size (unsigned long address) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + unsigned long ret = 0; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + if (efi_wb(md) && md->type == EFI_LOADER_DATA + && md->phys_addr == address) { + ret = efi_md_size(md); + break; + } + } + + if (ret == 0) + printk(KERN_WARNING "Cannot locate EFI vmcore descriptor\n"); + + return ret; +} +#endif diff --git a/arch/ia64/kernel/efi_stub.S b/arch/ia64/kernel/efi_stub.S new file mode 100644 index 00000000000..a56e161d751 --- /dev/null +++ b/arch/ia64/kernel/efi_stub.S @@ -0,0 +1,86 @@ +/* + * EFI call stub. + * + * Copyright (C) 1999-2001 Hewlett-Packard Co + * David Mosberger <davidm@hpl.hp.com> + * + * This stub allows us to make EFI calls in physical mode with interrupts + * turned off. We need this because we can't call SetVirtualMap() until + * the kernel has booted far enough to allow allocation of struct vma_struct + * entries (which we would need to map stuff with memory attributes other + * than uncached or writeback...). Since the GetTime() service gets called + * earlier than that, we need to be able to make physical mode EFI calls from + * the kernel. + */ + +/* + * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System + * Abstraction Layer Specification", revision 2.6e). Note that + * psr.dfl and psr.dfh MUST be cleared, despite what this manual says. + * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call + * (the br.ia instruction fails unless psr.dfl and psr.dfh are + * cleared). Fortunately, SAL promises not to touch the floating + * point regs, so at least we don't have to save f2-f127. + */ +#define PSR_BITS_TO_CLEAR \ + (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | \ + IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ + IA64_PSR_DFL | IA64_PSR_DFH) + +#define PSR_BITS_TO_SET \ + (IA64_PSR_BN) + +#include <asm/processor.h> +#include <asm/asmmacro.h> + +/* + * Inputs: + * in0 = address of function descriptor of EFI routine to call + * in1..in7 = arguments to routine + * + * Outputs: + * r8 = EFI_STATUS returned by called function + */ + +GLOBAL_ENTRY(efi_call_phys) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc loc1=ar.pfs,8,7,7,0 + ld8 r2=[in0],8 // load EFI function's entry point + mov loc0=rp + .body + ;; + mov loc2=gp // save global pointer + mov loc4=ar.rsc // save RSE configuration + mov ar.rsc=0 // put RSE in enforced lazy, LE mode + ;; + ld8 gp=[in0] // load EFI function's global pointer + movl r16=PSR_BITS_TO_CLEAR + mov loc3=psr // save processor status word + movl r17=PSR_BITS_TO_SET + ;; + or loc3=loc3,r17 + mov b6=r2 + ;; + andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared + br.call.sptk.many rp=ia64_switch_mode_phys +.ret0: mov out4=in5 + mov out0=in1 + mov out1=in2 + mov out2=in3 + mov out3=in4 + mov out5=in6 + mov out6=in7 + mov loc5=r19 + mov loc6=r20 + br.call.sptk.many rp=b6 // call the EFI function +.ret1: mov ar.rsc=0 // put RSE in enforced lazy, LE mode + mov r16=loc3 + mov r19=loc5 + mov r20=loc6 + br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode +.ret2: mov ar.rsc=loc4 // restore RSE configuration + mov ar.pfs=loc1 + mov rp=loc0 + mov gp=loc2 + br.ret.sptk.many rp +END(efi_call_phys) diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c new file mode 100644 index 00000000000..bac1639bc32 --- /dev/null +++ b/arch/ia64/kernel/elfcore.c @@ -0,0 +1,80 @@ +#include <linux/elf.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf64_Half elf_core_extra_phdrs(void) +{ + return GATE_EHDR->e_phnum; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + Elf64_Off ofs = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + struct elf_phdr phdr = gate_phdrs[i]; + + if (phdr.p_type == PT_LOAD) { + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); + phdr.p_filesz = phdr.p_memsz; + if (ofs == 0) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset = ofs; + } + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + void *addr = (void *)gate_phdrs[i].p_vaddr; + size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz); + + *size += memsz; + if (*size > limit || !dump_write(file, addr, memsz)) + return 0; + break; + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + size_t size = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + size += PAGE_ALIGN(gate_phdrs[i].p_memsz); + break; + } + } + return size; +} diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S new file mode 100644 index 00000000000..1ccbe12a4d8 --- /dev/null +++ b/arch/ia64/kernel/entry.S @@ -0,0 +1,1785 @@ +/* + * arch/ia64/kernel/entry.S + * + * Kernel entry points. + * + * Copyright (C) 1998-2003, 2005 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 1999, 2002-2003 + * Asit Mallick <Asit.K.Mallick@intel.com> + * Don Dugger <Don.Dugger@intel.com> + * Suresh Siddha <suresh.b.siddha@intel.com> + * Fenghua Yu <fenghua.yu@intel.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + */ +/* + * ia64_switch_to now places correct virtual mapping in in TR2 for + * kernel stack. This allows us to handle interrupts without changing + * to physical mode. + * + * Jonathan Nicklin <nicklin@missioncriticallinux.com> + * Patrick O'Rourke <orourke@missioncriticallinux.com> + * 11/07/2000 + */ +/* + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * pv_ops. + */ +/* + * Global (preserved) predicate usage on syscall entry/exit path: + * + * pKStk: See entry.h. + * pUStk: See entry.h. + * pSys: See entry.h. + * pNonSys: !pSys + */ + + +#include <asm/asmmacro.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/kregs.h> +#include <asm/asm-offsets.h> +#include <asm/pgtable.h> +#include <asm/percpu.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/unistd.h> +#include <asm/ftrace.h> + +#include "minstate.h" + +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE + /* + * execve() is special because in case of success, we need to + * setup a null register window frame. + */ +ENTRY(ia64_execve) + /* + * Allocate 8 input registers since ptrace() may clobber them + */ + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc loc1=ar.pfs,8,2,4,0 + mov loc0=rp + .body + mov out0=in0 // filename + ;; // stop bit between alloc and call + mov out1=in1 // argv + mov out2=in2 // envp + add out3=16,sp // regs + br.call.sptk.many rp=sys_execve +.ret0: + cmp4.ge p6,p7=r8,r0 + mov ar.pfs=loc1 // restore ar.pfs + sxt4 r8=r8 // return 64-bit result + ;; + stf.spill [sp]=f0 +(p6) cmp.ne pKStk,pUStk=r0,r0 // a successful execve() lands us in user-mode... + mov rp=loc0 +(p6) mov ar.pfs=r0 // clear ar.pfs on success +(p7) br.ret.sptk.many rp + + /* + * In theory, we'd have to zap this state only to prevent leaking of + * security sensitive state (e.g., if current->mm->dumpable is zero). However, + * this executes in less than 20 cycles even on Itanium, so it's not worth + * optimizing for...). + */ + mov ar.unat=0; mov ar.lc=0 + mov r4=0; mov f2=f0; mov b1=r0 + mov r5=0; mov f3=f0; mov b2=r0 + mov r6=0; mov f4=f0; mov b3=r0 + mov r7=0; mov f5=f0; mov b4=r0 + ldf.fill f12=[sp]; mov f13=f0; mov b5=r0 + ldf.fill f14=[sp]; ldf.fill f15=[sp]; mov f16=f0 + ldf.fill f17=[sp]; ldf.fill f18=[sp]; mov f19=f0 + ldf.fill f20=[sp]; ldf.fill f21=[sp]; mov f22=f0 + ldf.fill f23=[sp]; ldf.fill f24=[sp]; mov f25=f0 + ldf.fill f26=[sp]; ldf.fill f27=[sp]; mov f28=f0 + ldf.fill f29=[sp]; ldf.fill f30=[sp]; mov f31=f0 + br.ret.sptk.many rp +END(ia64_execve) + +/* + * sys_clone2(u64 flags, u64 ustack_base, u64 ustack_size, u64 parent_tidptr, u64 child_tidptr, + * u64 tls) + */ +GLOBAL_ENTRY(sys_clone2) + /* + * Allocate 8 input registers since ptrace() may clobber them + */ + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc r16=ar.pfs,8,2,6,0 + DO_SAVE_SWITCH_STACK + adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp + mov loc0=rp + mov loc1=r16 // save ar.pfs across do_fork + .body + mov out1=in1 + mov out3=in2 + tbit.nz p6,p0=in0,CLONE_SETTLS_BIT + mov out4=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + ;; +(p6) st8 [r2]=in5 // store TLS in r16 for copy_thread() + mov out5=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID + adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out0=in0 // out0 = clone_flags + br.call.sptk.many rp=do_fork +.ret1: .restore sp + adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack + mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(sys_clone2) + +/* + * sys_clone(u64 flags, u64 ustack_base, u64 parent_tidptr, u64 child_tidptr, u64 tls) + * Deprecated. Use sys_clone2() instead. + */ +GLOBAL_ENTRY(sys_clone) + /* + * Allocate 8 input registers since ptrace() may clobber them + */ + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc r16=ar.pfs,8,2,6,0 + DO_SAVE_SWITCH_STACK + adds r2=PT(R16)+IA64_SWITCH_STACK_SIZE+16,sp + mov loc0=rp + mov loc1=r16 // save ar.pfs across do_fork + .body + mov out1=in1 + mov out3=16 // stacksize (compensates for 16-byte scratch area) + tbit.nz p6,p0=in0,CLONE_SETTLS_BIT + mov out4=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + ;; +(p6) st8 [r2]=in4 // store TLS in r13 (tp) + mov out5=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID + adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out0=in0 // out0 = clone_flags + br.call.sptk.many rp=do_fork +.ret2: .restore sp + adds sp=IA64_SWITCH_STACK_SIZE,sp // pop the switch stack + mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(sys_clone) +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ + +/* + * prev_task <- ia64_switch_to(struct task_struct *next) + * With Ingo's new scheduler, interrupts are disabled when this routine gets + * called. The code starting at .map relies on this. The rest of the code + * doesn't care about the interrupt masking status. + */ +GLOBAL_ENTRY(__paravirt_switch_to) + .prologue + alloc r16=ar.pfs,1,0,0,0 + DO_SAVE_SWITCH_STACK + .body + + adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13 + movl r25=init_task + mov r27=IA64_KR(CURRENT_STACK) + adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0 + dep r20=0,in0,61,3 // physical address of "next" + ;; + st8 [r22]=sp // save kernel stack pointer of old task + shr.u r26=r20,IA64_GRANULE_SHIFT + cmp.eq p7,p6=r25,in0 + ;; + /* + * If we've already mapped this task's page, we can skip doing it again. + */ +(p6) cmp.eq p7,p6=r26,r27 +(p6) br.cond.dpnt .map + ;; +.done: + ld8 sp=[r21] // load kernel stack pointer of new task + MOV_TO_KR(CURRENT, in0, r8, r9) // update "current" application register + mov r8=r13 // return pointer to previously running task + mov r13=in0 // set "current" pointer + ;; + DO_LOAD_SWITCH_STACK + +#ifdef CONFIG_SMP + sync.i // ensure "fc"s done by this CPU are visible on other CPUs +#endif + br.ret.sptk.many rp // boogie on out in new context + +.map: + RSM_PSR_IC(r25) // interrupts (psr.i) are already disabled here + movl r25=PAGE_KERNEL + ;; + srlz.d + or r23=r25,r20 // construct PA | page properties + mov r25=IA64_GRANULE_SHIFT<<2 + ;; + MOV_TO_ITIR(p0, r25, r8) + MOV_TO_IFA(in0, r8) // VA of next task... + ;; + mov r25=IA64_TR_CURRENT_STACK + MOV_TO_KR(CURRENT_STACK, r26, r8, r9) // remember last page we mapped... + ;; + itr.d dtr[r25]=r23 // wire in new mapping... + SSM_PSR_IC_AND_SRLZ_D(r8, r9) // reenable the psr.ic bit + br.cond.sptk .done +END(__paravirt_switch_to) + +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE +/* + * Note that interrupts are enabled during save_switch_stack and load_switch_stack. This + * means that we may get an interrupt with "sp" pointing to the new kernel stack while + * ar.bspstore is still pointing to the old kernel backing store area. Since ar.rsc, + * ar.rnat, ar.bsp, and ar.bspstore are all preserved by interrupts, this is not a + * problem. Also, we don't need to specify unwind information for preserved registers + * that are not modified in save_switch_stack as the right unwind information is already + * specified at the call-site of save_switch_stack. + */ + +/* + * save_switch_stack: + * - r16 holds ar.pfs + * - b7 holds address to return to + * - rp (b0) holds return address to save + */ +GLOBAL_ENTRY(save_switch_stack) + .prologue + .altrp b7 + flushrs // flush dirty regs to backing store (must be first in insn group) + .save @priunat,r17 + mov r17=ar.unat // preserve caller's + .body +#ifdef CONFIG_ITANIUM + adds r2=16+128,sp + adds r3=16+64,sp + adds r14=SW(R4)+16,sp + ;; + st8.spill [r14]=r4,16 // spill r4 + lfetch.fault.excl.nt1 [r3],128 + ;; + lfetch.fault.excl.nt1 [r2],128 + lfetch.fault.excl.nt1 [r3],128 + ;; + lfetch.fault.excl [r2] + lfetch.fault.excl [r3] + adds r15=SW(R5)+16,sp +#else + add r2=16+3*128,sp + add r3=16,sp + add r14=SW(R4)+16,sp + ;; + st8.spill [r14]=r4,SW(R6)-SW(R4) // spill r4 and prefetch offset 0x1c0 + lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x010 + ;; + lfetch.fault.excl.nt1 [r3],128 // prefetch offset 0x090 + lfetch.fault.excl.nt1 [r2],128 // prefetch offset 0x190 + ;; + lfetch.fault.excl.nt1 [r3] // prefetch offset 0x110 + lfetch.fault.excl.nt1 [r2] // prefetch offset 0x210 + adds r15=SW(R5)+16,sp +#endif + ;; + st8.spill [r15]=r5,SW(R7)-SW(R5) // spill r5 + mov.m ar.rsc=0 // put RSE in mode: enforced lazy, little endian, pl 0 + add r2=SW(F2)+16,sp // r2 = &sw->f2 + ;; + st8.spill [r14]=r6,SW(B0)-SW(R6) // spill r6 + mov.m r18=ar.fpsr // preserve fpsr + add r3=SW(F3)+16,sp // r3 = &sw->f3 + ;; + stf.spill [r2]=f2,32 + mov.m r19=ar.rnat + mov r21=b0 + + stf.spill [r3]=f3,32 + st8.spill [r15]=r7,SW(B2)-SW(R7) // spill r7 + mov r22=b1 + ;; + // since we're done with the spills, read and save ar.unat: + mov.m r29=ar.unat + mov.m r20=ar.bspstore + mov r23=b2 + stf.spill [r2]=f4,32 + stf.spill [r3]=f5,32 + mov r24=b3 + ;; + st8 [r14]=r21,SW(B1)-SW(B0) // save b0 + st8 [r15]=r23,SW(B3)-SW(B2) // save b2 + mov r25=b4 + mov r26=b5 + ;; + st8 [r14]=r22,SW(B4)-SW(B1) // save b1 + st8 [r15]=r24,SW(AR_PFS)-SW(B3) // save b3 + mov r21=ar.lc // I-unit + stf.spill [r2]=f12,32 + stf.spill [r3]=f13,32 + ;; + st8 [r14]=r25,SW(B5)-SW(B4) // save b4 + st8 [r15]=r16,SW(AR_LC)-SW(AR_PFS) // save ar.pfs + stf.spill [r2]=f14,32 + stf.spill [r3]=f15,32 + ;; + st8 [r14]=r26 // save b5 + st8 [r15]=r21 // save ar.lc + stf.spill [r2]=f16,32 + stf.spill [r3]=f17,32 + ;; + stf.spill [r2]=f18,32 + stf.spill [r3]=f19,32 + ;; + stf.spill [r2]=f20,32 + stf.spill [r3]=f21,32 + ;; + stf.spill [r2]=f22,32 + stf.spill [r3]=f23,32 + ;; + stf.spill [r2]=f24,32 + stf.spill [r3]=f25,32 + ;; + stf.spill [r2]=f26,32 + stf.spill [r3]=f27,32 + ;; + stf.spill [r2]=f28,32 + stf.spill [r3]=f29,32 + ;; + stf.spill [r2]=f30,SW(AR_UNAT)-SW(F30) + stf.spill [r3]=f31,SW(PR)-SW(F31) + add r14=SW(CALLER_UNAT)+16,sp + ;; + st8 [r2]=r29,SW(AR_RNAT)-SW(AR_UNAT) // save ar.unat + st8 [r14]=r17,SW(AR_FPSR)-SW(CALLER_UNAT) // save caller_unat + mov r21=pr + ;; + st8 [r2]=r19,SW(AR_BSPSTORE)-SW(AR_RNAT) // save ar.rnat + st8 [r3]=r21 // save predicate registers + ;; + st8 [r2]=r20 // save ar.bspstore + st8 [r14]=r18 // save fpsr + mov ar.rsc=3 // put RSE back into eager mode, pl 0 + br.cond.sptk.many b7 +END(save_switch_stack) + +/* + * load_switch_stack: + * - "invala" MUST be done at call site (normally in DO_LOAD_SWITCH_STACK) + * - b7 holds address to return to + * - must not touch r8-r11 + */ +GLOBAL_ENTRY(load_switch_stack) + .prologue + .altrp b7 + + .body + lfetch.fault.nt1 [sp] + adds r2=SW(AR_BSPSTORE)+16,sp + adds r3=SW(AR_UNAT)+16,sp + mov ar.rsc=0 // put RSE into enforced lazy mode + adds r14=SW(CALLER_UNAT)+16,sp + adds r15=SW(AR_FPSR)+16,sp + ;; + ld8 r27=[r2],(SW(B0)-SW(AR_BSPSTORE)) // bspstore + ld8 r29=[r3],(SW(B1)-SW(AR_UNAT)) // unat + ;; + ld8 r21=[r2],16 // restore b0 + ld8 r22=[r3],16 // restore b1 + ;; + ld8 r23=[r2],16 // restore b2 + ld8 r24=[r3],16 // restore b3 + ;; + ld8 r25=[r2],16 // restore b4 + ld8 r26=[r3],16 // restore b5 + ;; + ld8 r16=[r2],(SW(PR)-SW(AR_PFS)) // ar.pfs + ld8 r17=[r3],(SW(AR_RNAT)-SW(AR_LC)) // ar.lc + ;; + ld8 r28=[r2] // restore pr + ld8 r30=[r3] // restore rnat + ;; + ld8 r18=[r14],16 // restore caller's unat + ld8 r19=[r15],24 // restore fpsr + ;; + ldf.fill f2=[r14],32 + ldf.fill f3=[r15],32 + ;; + ldf.fill f4=[r14],32 + ldf.fill f5=[r15],32 + ;; + ldf.fill f12=[r14],32 + ldf.fill f13=[r15],32 + ;; + ldf.fill f14=[r14],32 + ldf.fill f15=[r15],32 + ;; + ldf.fill f16=[r14],32 + ldf.fill f17=[r15],32 + ;; + ldf.fill f18=[r14],32 + ldf.fill f19=[r15],32 + mov b0=r21 + ;; + ldf.fill f20=[r14],32 + ldf.fill f21=[r15],32 + mov b1=r22 + ;; + ldf.fill f22=[r14],32 + ldf.fill f23=[r15],32 + mov b2=r23 + ;; + mov ar.bspstore=r27 + mov ar.unat=r29 // establish unat holding the NaT bits for r4-r7 + mov b3=r24 + ;; + ldf.fill f24=[r14],32 + ldf.fill f25=[r15],32 + mov b4=r25 + ;; + ldf.fill f26=[r14],32 + ldf.fill f27=[r15],32 + mov b5=r26 + ;; + ldf.fill f28=[r14],32 + ldf.fill f29=[r15],32 + mov ar.pfs=r16 + ;; + ldf.fill f30=[r14],32 + ldf.fill f31=[r15],24 + mov ar.lc=r17 + ;; + ld8.fill r4=[r14],16 + ld8.fill r5=[r15],16 + mov pr=r28,-1 + ;; + ld8.fill r6=[r14],16 + ld8.fill r7=[r15],16 + + mov ar.unat=r18 // restore caller's unat + mov ar.rnat=r30 // must restore after bspstore but before rsc! + mov ar.fpsr=r19 // restore fpsr + mov ar.rsc=3 // put RSE back into eager mode, pl 0 + br.cond.sptk.many b7 +END(load_switch_stack) + +GLOBAL_ENTRY(prefetch_stack) + add r14 = -IA64_SWITCH_STACK_SIZE, sp + add r15 = IA64_TASK_THREAD_KSP_OFFSET, in0 + ;; + ld8 r16 = [r15] // load next's stack pointer + lfetch.fault.excl [r14], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault.excl [r14], 128 + lfetch.fault [r16], 128 + ;; + lfetch.fault [r16], 128 + br.ret.sptk.many rp +END(prefetch_stack) + +GLOBAL_ENTRY(kernel_execve) + rum psr.ac + mov r15=__NR_execve // put syscall number in place + break __BREAK_SYSCALL + br.ret.sptk.many rp +END(kernel_execve) + +GLOBAL_ENTRY(clone) + mov r15=__NR_clone // put syscall number in place + break __BREAK_SYSCALL + br.ret.sptk.many rp +END(clone) + + /* + * Invoke a system call, but do some tracing before and after the call. + * We MUST preserve the current register frame throughout this routine + * because some system calls (such as ia64_execve) directly + * manipulate ar.pfs. + */ +GLOBAL_ENTRY(ia64_trace_syscall) + PT_REGS_UNWIND_INFO(0) + /* + * We need to preserve the scratch registers f6-f11 in case the system + * call is sigreturn. + */ + adds r16=PT(F6)+16,sp + adds r17=PT(F7)+16,sp + ;; + stf.spill [r16]=f6,32 + stf.spill [r17]=f7,32 + ;; + stf.spill [r16]=f8,32 + stf.spill [r17]=f9,32 + ;; + stf.spill [r16]=f10 + stf.spill [r17]=f11 + br.call.sptk.many rp=syscall_trace_enter // give parent a chance to catch syscall args + cmp.lt p6,p0=r8,r0 // check tracehook + adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 + adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10 + mov r10=0 +(p6) br.cond.sptk strace_error // syscall failed -> + adds r16=PT(F6)+16,sp + adds r17=PT(F7)+16,sp + ;; + ldf.fill f6=[r16],32 + ldf.fill f7=[r17],32 + ;; + ldf.fill f8=[r16],32 + ldf.fill f9=[r17],32 + ;; + ldf.fill f10=[r16] + ldf.fill f11=[r17] + // the syscall number may have changed, so re-load it and re-calculate the + // syscall entry-point: + adds r15=PT(R15)+16,sp // r15 = &pt_regs.r15 (syscall #) + ;; + ld8 r15=[r15] + mov r3=NR_syscalls - 1 + ;; + adds r15=-1024,r15 + movl r16=sys_call_table + ;; + shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024) + cmp.leu p6,p7=r15,r3 + ;; +(p6) ld8 r20=[r20] // load address of syscall entry point +(p7) movl r20=sys_ni_syscall + ;; + mov b6=r20 + br.call.sptk.many rp=b6 // do the syscall +.strace_check_retval: + cmp.lt p6,p0=r8,r0 // syscall failed? + adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 + adds r3=PT(R10)+16,sp // r3 = &pt_regs.r10 + mov r10=0 +(p6) br.cond.sptk strace_error // syscall failed -> + ;; // avoid RAW on r10 +.strace_save_retval: +.mem.offset 0,0; st8.spill [r2]=r8 // store return value in slot for r8 +.mem.offset 8,0; st8.spill [r3]=r10 // clear error indication in slot for r10 + br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value +.ret3: +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk +(pUStk) rsm psr.i // disable interrupts + br.cond.sptk ia64_work_pending_syscall_end + +strace_error: + ld8 r3=[r2] // load pt_regs.r8 + sub r9=0,r8 // negate return value to get errno value + ;; + cmp.ne p6,p0=r3,r0 // is pt_regs.r8!=0? + adds r3=16,r2 // r3=&pt_regs.r10 + ;; +(p6) mov r10=-1 +(p6) mov r8=r9 + br.cond.sptk .strace_save_retval +END(ia64_trace_syscall) + + /* + * When traced and returning from sigreturn, we invoke syscall_trace but then + * go straight to ia64_leave_kernel rather than ia64_leave_syscall. + */ +GLOBAL_ENTRY(ia64_strace_leave_kernel) + PT_REGS_UNWIND_INFO(0) +{ /* + * Some versions of gas generate bad unwind info if the first instruction of a + * procedure doesn't go into the first slot of a bundle. This is a workaround. + */ + nop.m 0 + nop.i 0 + br.call.sptk.many rp=syscall_trace_leave // give parent a chance to catch return value +} +.ret4: br.cond.sptk ia64_leave_kernel +END(ia64_strace_leave_kernel) + +GLOBAL_ENTRY(ia64_ret_from_clone) + PT_REGS_UNWIND_INFO(0) +{ /* + * Some versions of gas generate bad unwind info if the first instruction of a + * procedure doesn't go into the first slot of a bundle. This is a workaround. + */ + nop.m 0 + nop.i 0 + /* + * We need to call schedule_tail() to complete the scheduling process. + * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the + * address of the previously executing task. + */ + br.call.sptk.many rp=ia64_invoke_schedule_tail +} +.ret8: + adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; + ld4 r2=[r2] + ;; + mov r8=0 + and r2=_TIF_SYSCALL_TRACEAUDIT,r2 + ;; + cmp.ne p6,p0=r2,r0 +(p6) br.cond.spnt .strace_check_retval + ;; // added stop bits to prevent r8 dependency +END(ia64_ret_from_clone) + // fall through +GLOBAL_ENTRY(ia64_ret_from_syscall) + PT_REGS_UNWIND_INFO(0) + cmp.ge p6,p7=r8,r0 // syscall executed successfully? + adds r2=PT(R8)+16,sp // r2 = &pt_regs.r8 + mov r10=r0 // clear error indication in r10 +(p7) br.cond.spnt handle_syscall_error // handle potential syscall failure +#ifdef CONFIG_PARAVIRT + ;; + br.cond.sptk.few ia64_leave_syscall + ;; +#endif /* CONFIG_PARAVIRT */ +END(ia64_ret_from_syscall) +#ifndef CONFIG_PARAVIRT + // fall through +#endif +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ + +/* + * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't + * need to switch to bank 0 and doesn't restore the scratch registers. + * To avoid leaking kernel bits, the scratch registers are set to + * the following known-to-be-safe values: + * + * r1: restored (global pointer) + * r2: cleared + * r3: 1 (when returning to user-level) + * r8-r11: restored (syscall return value(s)) + * r12: restored (user-level stack pointer) + * r13: restored (user-level thread pointer) + * r14: set to __kernel_syscall_via_epc + * r15: restored (syscall #) + * r16-r17: cleared + * r18: user-level b6 + * r19: cleared + * r20: user-level ar.fpsr + * r21: user-level b0 + * r22: cleared + * r23: user-level ar.bspstore + * r24: user-level ar.rnat + * r25: user-level ar.unat + * r26: user-level ar.pfs + * r27: user-level ar.rsc + * r28: user-level ip + * r29: user-level psr + * r30: user-level cfm + * r31: user-level pr + * f6-f11: cleared + * pr: restored (user-level pr) + * b0: restored (user-level rp) + * b6: restored + * b7: set to __kernel_syscall_via_epc + * ar.unat: restored (user-level ar.unat) + * ar.pfs: restored (user-level ar.pfs) + * ar.rsc: restored (user-level ar.rsc) + * ar.rnat: restored (user-level ar.rnat) + * ar.bspstore: restored (user-level ar.bspstore) + * ar.fpsr: restored (user-level ar.fpsr) + * ar.ccv: cleared + * ar.csd: cleared + * ar.ssd: cleared + */ +GLOBAL_ENTRY(__paravirt_leave_syscall) + PT_REGS_UNWIND_INFO(0) + /* + * work.need_resched etc. mustn't get changed by this CPU before it returns to + * user- or fsys-mode, hence we disable interrupts early on. + * + * p6 controls whether current_thread_info()->flags needs to be check for + * extra work. We always check for extra work when returning to user-level. + * With CONFIG_PREEMPT, we also check for extra work when the preempt_count + * is 0. After extra work processing has been completed, execution + * resumes at ia64_work_processed_syscall with p6 set to 1 if the extra-work-check + * needs to be redone. + */ +#ifdef CONFIG_PREEMPT + RSM_PSR_I(p0, r2, r18) // disable interrupts + cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 + ;; + .pred.rel.mutex pUStk,pKStk +(pKStk) ld4 r21=[r20] // r21 <- preempt_count +(pUStk) mov r21=0 // r21 <- 0 + ;; + cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0) +#else /* !CONFIG_PREEMPT */ + RSM_PSR_I(pUStk, r2, r18) + cmp.eq pLvSys,p0=r0,r0 // pLvSys=1: leave from syscall +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk +#endif +.global __paravirt_work_processed_syscall; +__paravirt_work_processed_syscall: +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + adds r2=PT(LOADRS)+16,r12 + MOV_FROM_ITC(pUStk, p9, r22, r19) // fetch time at leave + adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; +(p6) ld4 r31=[r18] // load current_thread_info()->flags + ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" + adds r3=PT(AR_BSPSTORE)+16,r12 // deferred + ;; +#else + adds r2=PT(LOADRS)+16,r12 + adds r3=PT(AR_BSPSTORE)+16,r12 + adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; +(p6) ld4 r31=[r18] // load current_thread_info()->flags + ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" + nop.i 0 + ;; +#endif + mov r16=ar.bsp // M2 get existing backing store pointer + ld8 r18=[r2],PT(R9)-PT(B6) // load b6 +(p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? + ;; + ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage) +(p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending? +(p6) br.cond.spnt .work_pending_syscall + ;; + // start restoring the state saved on the kernel stack (struct pt_regs): + ld8 r9=[r2],PT(CR_IPSR)-PT(R9) + ld8 r11=[r3],PT(CR_IIP)-PT(R11) +(pNonSys) break 0 // bug check: we shouldn't be here if pNonSys is TRUE! + ;; + invala // M0|1 invalidate ALAT + RSM_PSR_I_IC(r28, r29, r30) // M2 turn off interrupts and interruption collection + cmp.eq p9,p0=r0,r0 // A set p9 to indicate that we should restore cr.ifs + + ld8 r29=[r2],16 // M0|1 load cr.ipsr + ld8 r28=[r3],16 // M0|1 load cr.iip +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) add r14=TI_AC_LEAVE+IA64_TASK_SIZE,r13 + ;; + ld8 r30=[r2],16 // M0|1 load cr.ifs + ld8 r25=[r3],16 // M0|1 load ar.unat +(pUStk) add r15=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 + ;; +#else + mov r22=r0 // A clear r22 + ;; + ld8 r30=[r2],16 // M0|1 load cr.ifs + ld8 r25=[r3],16 // M0|1 load ar.unat +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 + ;; +#endif + ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs + MOV_FROM_PSR(pKStk, r22, r21) // M2 read PSR now that interrupts are disabled + nop 0 + ;; + ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0 + ld8 r27=[r3],PT(PR)-PT(AR_RSC) // M0|1 load ar.rsc + mov f6=f0 // F clear f6 + ;; + ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // M0|1 load ar.rnat (may be garbage) + ld8 r31=[r3],PT(R1)-PT(PR) // M0|1 load predicates + mov f7=f0 // F clear f7 + ;; + ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // M0|1 load ar.fpsr + ld8.fill r1=[r3],16 // M0|1 load r1 +(pUStk) mov r17=1 // A + ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) st1 [r15]=r17 // M2|3 +#else +(pUStk) st1 [r14]=r17 // M2|3 +#endif + ld8.fill r13=[r3],16 // M0|1 + mov f8=f0 // F clear f8 + ;; + ld8.fill r12=[r2] // M0|1 restore r12 (sp) + ld8.fill r15=[r3] // M0|1 restore r15 + mov b6=r18 // I0 restore b6 + + LOAD_PHYS_STACK_REG_SIZE(r17) + mov f9=f0 // F clear f9 +(pKStk) br.cond.dpnt.many skip_rbs_switch // B + + srlz.d // M0 ensure interruption collection is off (for cover) + shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition + COVER // B add current frame into dirty partition & set cr.ifs + ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + mov r19=ar.bsp // M2 get new backing store pointer + st8 [r14]=r22 // M save time at leave + mov f10=f0 // F clear f10 + + mov r22=r0 // A clear r22 + movl r14=__kernel_syscall_via_epc // X + ;; +#else + mov r19=ar.bsp // M2 get new backing store pointer + mov f10=f0 // F clear f10 + + nop.m 0 + movl r14=__kernel_syscall_via_epc // X + ;; +#endif + mov.m ar.csd=r0 // M2 clear ar.csd + mov.m ar.ccv=r0 // M2 clear ar.ccv + mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc) + + mov.m ar.ssd=r0 // M2 clear ar.ssd + mov f11=f0 // F clear f11 + br.cond.sptk.many rbs_switch // B +END(__paravirt_leave_syscall) + +GLOBAL_ENTRY(__paravirt_leave_kernel) + PT_REGS_UNWIND_INFO(0) + /* + * work.need_resched etc. mustn't get changed by this CPU before it returns to + * user- or fsys-mode, hence we disable interrupts early on. + * + * p6 controls whether current_thread_info()->flags needs to be check for + * extra work. We always check for extra work when returning to user-level. + * With CONFIG_PREEMPT, we also check for extra work when the preempt_count + * is 0. After extra work processing has been completed, execution + * resumes at .work_processed_syscall with p6 set to 1 if the extra-work-check + * needs to be redone. + */ +#ifdef CONFIG_PREEMPT + RSM_PSR_I(p0, r17, r31) // disable interrupts + cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 + ;; + .pred.rel.mutex pUStk,pKStk +(pKStk) ld4 r21=[r20] // r21 <- preempt_count +(pUStk) mov r21=0 // r21 <- 0 + ;; + cmp.eq p6,p0=r21,r0 // p6 <- pUStk || (preempt_count == 0) +#else + RSM_PSR_I(pUStk, r17, r31) + cmp.eq p0,pLvSys=r0,r0 // pLvSys=0: leave from kernel +(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk +#endif +.work_processed_kernel: + adds r17=TI_FLAGS+IA64_TASK_SIZE,r13 + ;; +(p6) ld4 r31=[r17] // load current_thread_info()->flags + adds r21=PT(PR)+16,r12 + ;; + + lfetch [r21],PT(CR_IPSR)-PT(PR) + adds r2=PT(B6)+16,r12 + adds r3=PT(R16)+16,r12 + ;; + lfetch [r21] + ld8 r28=[r2],8 // load b6 + adds r29=PT(R24)+16,r12 + + ld8.fill r16=[r3],PT(AR_CSD)-PT(R16) + adds r30=PT(AR_CCV)+16,r12 +(p6) and r19=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? + ;; + ld8.fill r24=[r29] + ld8 r15=[r30] // load ar.ccv +(p6) cmp4.ne.unc p6,p0=r19, r0 // any special work pending? + ;; + ld8 r29=[r2],16 // load b7 + ld8 r30=[r3],16 // load ar.csd +(p6) br.cond.spnt .work_pending + ;; + ld8 r31=[r2],16 // load ar.ssd + ld8.fill r8=[r3],16 + ;; + ld8.fill r9=[r2],16 + ld8.fill r10=[r3],PT(R17)-PT(R10) + ;; + ld8.fill r11=[r2],PT(R18)-PT(R11) + ld8.fill r17=[r3],16 + ;; + ld8.fill r18=[r2],16 + ld8.fill r19=[r3],16 + ;; + ld8.fill r20=[r2],16 + ld8.fill r21=[r3],16 + mov ar.csd=r30 + mov ar.ssd=r31 + ;; + RSM_PSR_I_IC(r23, r22, r25) // initiate turning off of interrupt and interruption collection + invala // invalidate ALAT + ;; + ld8.fill r22=[r2],24 + ld8.fill r23=[r3],24 + mov b6=r28 + ;; + ld8.fill r25=[r2],16 + ld8.fill r26=[r3],16 + mov b7=r29 + ;; + ld8.fill r27=[r2],16 + ld8.fill r28=[r3],16 + ;; + ld8.fill r29=[r2],16 + ld8.fill r30=[r3],24 + ;; + ld8.fill r31=[r2],PT(F9)-PT(R31) + adds r3=PT(F10)-PT(F6),r3 + ;; + ldf.fill f9=[r2],PT(F6)-PT(F9) + ldf.fill f10=[r3],PT(F8)-PT(F10) + ;; + ldf.fill f6=[r2],PT(F7)-PT(F6) + ;; + ldf.fill f7=[r2],PT(F11)-PT(F7) + ldf.fill f8=[r3],32 + ;; + srlz.d // ensure that inter. collection is off (VHPT is don't care, since text is pinned) + mov ar.ccv=r15 + ;; + ldf.fill f11=[r2] + BSW_0(r2, r3, r15) // switch back to bank 0 (no stop bit required beforehand...) + ;; +(pUStk) mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency) + adds r16=PT(CR_IPSR)+16,r12 + adds r17=PT(CR_IIP)+16,r12 + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + .pred.rel.mutex pUStk,pKStk + MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled + MOV_FROM_ITC(pUStk, p9, r22, r29) // M fetch time at leave + nop.i 0 + ;; +#else + MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled + nop.i 0 + nop.i 0 + ;; +#endif + ld8 r29=[r16],16 // load cr.ipsr + ld8 r28=[r17],16 // load cr.iip + ;; + ld8 r30=[r16],16 // load cr.ifs + ld8 r25=[r17],16 // load ar.unat + ;; + ld8 r26=[r16],16 // load ar.pfs + ld8 r27=[r17],16 // load ar.rsc + cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs + ;; + ld8 r24=[r16],16 // load ar.rnat (may be garbage) + ld8 r23=[r17],16 // load ar.bspstore (may be garbage) + ;; + ld8 r31=[r16],16 // load predicates + ld8 r21=[r17],16 // load b0 + ;; + ld8 r19=[r16],16 // load ar.rsc value for "loadrs" + ld8.fill r1=[r17],16 // load r1 + ;; + ld8.fill r12=[r16],16 + ld8.fill r13=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) adds r3=TI_AC_LEAVE+IA64_TASK_SIZE,r18 +#else +(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 +#endif + ;; + ld8 r20=[r16],16 // ar.fpsr + ld8.fill r15=[r17],16 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +(pUStk) adds r18=IA64_TASK_THREAD_ON_USTACK_OFFSET,r18 // deferred +#endif + ;; + ld8.fill r14=[r16],16 + ld8.fill r2=[r17] +(pUStk) mov r17=1 + ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + // mmi_ : ld8 st1 shr;; mmi_ : st8 st1 shr;; + // mib : mov add br -> mib : ld8 add br + // bbb_ : br nop cover;; mbb_ : mov br cover;; + // + // no one require bsp in r16 if (pKStk) branch is selected. +(pUStk) st8 [r3]=r22 // save time at leave +(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack + shr.u r18=r19,16 // get byte size of existing "dirty" partition + ;; + ld8.fill r3=[r16] // deferred + LOAD_PHYS_STACK_REG_SIZE(r17) +(pKStk) br.cond.dpnt skip_rbs_switch + mov r16=ar.bsp // get existing backing store pointer +#else + ld8.fill r3=[r16] +(pUStk) st1 [r18]=r17 // restore current->thread.on_ustack + shr.u r18=r19,16 // get byte size of existing "dirty" partition + ;; + mov r16=ar.bsp // get existing backing store pointer + LOAD_PHYS_STACK_REG_SIZE(r17) +(pKStk) br.cond.dpnt skip_rbs_switch +#endif + + /* + * Restore user backing store. + * + * NOTE: alloc, loadrs, and cover can't be predicated. + */ +(pNonSys) br.cond.dpnt dont_preserve_current_frame + COVER // add current frame into dirty partition and set cr.ifs + ;; + mov r19=ar.bsp // get new backing store pointer +rbs_switch: + sub r16=r16,r18 // krbs = old bsp - size of dirty partition + cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs + ;; + sub r19=r19,r16 // calculate total byte size of dirty partition + add r18=64,r18 // don't force in0-in7 into memory... + ;; + shl r19=r19,16 // shift size of dirty partition into loadrs position + ;; +dont_preserve_current_frame: + /* + * To prevent leaking bits between the kernel and user-space, + * we must clear the stacked registers in the "invalid" partition here. + * Not pretty, but at least it's fast (3.34 registers/cycle on Itanium, + * 5 registers/cycle on McKinley). + */ +# define pRecurse p6 +# define pReturn p7 +#ifdef CONFIG_ITANIUM +# define Nregs 10 +#else +# define Nregs 14 +#endif + alloc loc0=ar.pfs,2,Nregs-2,2,0 + shr.u loc1=r18,9 // RNaTslots <= floor(dirtySize / (64*8)) + sub r17=r17,r18 // r17 = (physStackedSize + 8) - dirtySize + ;; + mov ar.rsc=r19 // load ar.rsc to be used for "loadrs" + shladd in0=loc1,3,r17 + mov in1=0 + ;; + TEXT_ALIGN(32) +rse_clear_invalid: +#ifdef CONFIG_ITANIUM + // cycle 0 + { .mii + alloc loc0=ar.pfs,2,Nregs-2,2,0 + cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse + add out0=-Nregs*8,in0 +}{ .mfb + add out1=1,in1 // increment recursion count + nop.f 0 + nop.b 0 // can't do br.call here because of alloc (WAW on CFM) + ;; +}{ .mfi // cycle 1 + mov loc1=0 + nop.f 0 + mov loc2=0 +}{ .mib + mov loc3=0 + mov loc4=0 +(pRecurse) br.call.sptk.many b0=rse_clear_invalid + +}{ .mfi // cycle 2 + mov loc5=0 + nop.f 0 + cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret +}{ .mib + mov loc6=0 + mov loc7=0 +(pReturn) br.ret.sptk.many b0 +} +#else /* !CONFIG_ITANIUM */ + alloc loc0=ar.pfs,2,Nregs-2,2,0 + cmp.lt pRecurse,p0=Nregs*8,in0 // if more than Nregs regs left to clear, (re)curse + add out0=-Nregs*8,in0 + add out1=1,in1 // increment recursion count + mov loc1=0 + mov loc2=0 + ;; + mov loc3=0 + mov loc4=0 + mov loc5=0 + mov loc6=0 + mov loc7=0 +(pRecurse) br.call.dptk.few b0=rse_clear_invalid + ;; + mov loc8=0 + mov loc9=0 + cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret + mov loc10=0 + mov loc11=0 +(pReturn) br.ret.dptk.many b0 +#endif /* !CONFIG_ITANIUM */ +# undef pRecurse +# undef pReturn + ;; + alloc r17=ar.pfs,0,0,0,0 // drop current register frame + ;; + loadrs + ;; +skip_rbs_switch: + mov ar.unat=r25 // M2 +(pKStk) extr.u r22=r22,21,1 // I0 extract current value of psr.pp from r22 +(pLvSys)mov r19=r0 // A clear r19 for leave_syscall, no-op otherwise + ;; +(pUStk) mov ar.bspstore=r23 // M2 +(pKStk) dep r29=r22,r29,21,1 // I0 update ipsr.pp with psr.pp +(pLvSys)mov r16=r0 // A clear r16 for leave_syscall, no-op otherwise + ;; + MOV_TO_IPSR(p0, r29, r25) // M2 + mov ar.pfs=r26 // I0 +(pLvSys)mov r17=r0 // A clear r17 for leave_syscall, no-op otherwise + + MOV_TO_IFS(p9, r30, r25)// M2 + mov b0=r21 // I0 +(pLvSys)mov r18=r0 // A clear r18 for leave_syscall, no-op otherwise + + mov ar.fpsr=r20 // M2 + MOV_TO_IIP(r28, r25) // M2 + nop 0 + ;; +(pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode + nop 0 +(pLvSys)mov r2=r0 + + mov ar.rsc=r27 // M2 + mov pr=r31,-1 // I0 + RFI // B + + /* + * On entry: + * r20 = ¤t->thread_info->pre_count (if CONFIG_PREEMPT) + * r31 = current->thread_info->flags + * On exit: + * p6 = TRUE if work-pending-check needs to be redone + * + * Interrupts are disabled on entry, reenabled depend on work, and + * disabled on exit. + */ +.work_pending_syscall: + add r2=-8,r2 + add r3=-8,r3 + ;; + st8 [r2]=r8 + st8 [r3]=r10 +.work_pending: + tbit.z p6,p0=r31,TIF_NEED_RESCHED // is resched not needed? +(p6) br.cond.sptk.few .notify +#ifdef CONFIG_PREEMPT +(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 + ;; +(pKStk) st4 [r20]=r21 +#endif + SSM_PSR_I(p0, p6, r2) // enable interrupts + br.call.spnt.many rp=schedule +.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 (re-check) + RSM_PSR_I(p0, r2, r20) // disable interrupts + ;; +#ifdef CONFIG_PREEMPT +(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 + ;; +(pKStk) st4 [r20]=r0 // preempt_count() <- 0 +#endif +(pLvSys)br.cond.sptk.few __paravirt_pending_syscall_end + br.cond.sptk.many .work_processed_kernel + +.notify: +(pUStk) br.call.spnt.many rp=notify_resume_user +.ret10: cmp.ne p6,p0=r0,r0 // p6 <- 0 (don't re-check) +(pLvSys)br.cond.sptk.few __paravirt_pending_syscall_end + br.cond.sptk.many .work_processed_kernel + +.global __paravirt_pending_syscall_end; +__paravirt_pending_syscall_end: + adds r2=PT(R8)+16,r12 + adds r3=PT(R10)+16,r12 + ;; + ld8 r8=[r2] + ld8 r10=[r3] + br.cond.sptk.many __paravirt_work_processed_syscall_target +END(__paravirt_leave_kernel) + +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE +ENTRY(handle_syscall_error) + /* + * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could + * lead us to mistake a negative return value as a failed syscall. Those syscall + * must deposit a non-zero value in pt_regs.r8 to indicate an error. If + * pt_regs.r8 is zero, we assume that the call completed successfully. + */ + PT_REGS_UNWIND_INFO(0) + ld8 r3=[r2] // load pt_regs.r8 + ;; + cmp.eq p6,p7=r3,r0 // is pt_regs.r8==0? + ;; +(p7) mov r10=-1 +(p7) sub r8=0,r8 // negate return value to get errno + br.cond.sptk ia64_leave_syscall +END(handle_syscall_error) + + /* + * Invoke schedule_tail(task) while preserving in0-in7, which may be needed + * in case a system call gets restarted. + */ +GLOBAL_ENTRY(ia64_invoke_schedule_tail) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc loc1=ar.pfs,8,2,1,0 + mov loc0=rp + mov out0=r8 // Address of previous task + ;; + br.call.sptk.many rp=schedule_tail +.ret11: mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(ia64_invoke_schedule_tail) + + /* + * Setup stack and call do_notify_resume_user(), keeping interrupts + * disabled. + * + * Note that pSys and pNonSys need to be set up by the caller. + * We declare 8 input registers so the system call args get preserved, + * in case we need to restart a system call. + */ +GLOBAL_ENTRY(notify_resume_user) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(8) + alloc loc1=ar.pfs,8,2,3,0 // preserve all eight input regs in case of syscall restart! + mov r9=ar.unat + mov loc0=rp // save return address + mov out0=0 // there is no "oldset" + adds out1=8,sp // out1=&sigscratch->ar_pfs +(pSys) mov out2=1 // out2==1 => we're in a syscall + ;; +(pNonSys) mov out2=0 // out2==0 => not a syscall + .fframe 16 + .spillsp ar.unat, 16 + st8 [sp]=r9,-16 // allocate space for ar.unat and save it + st8 [out1]=loc1,-8 // save ar.pfs, out1=&sigscratch + .body + br.call.sptk.many rp=do_notify_resume_user +.ret15: .restore sp + adds sp=16,sp // pop scratch stack space + ;; + ld8 r9=[sp] // load new unat from sigscratch->scratch_unat + mov rp=loc0 + ;; + mov ar.unat=r9 + mov ar.pfs=loc1 + br.ret.sptk.many rp +END(notify_resume_user) + +ENTRY(sys_rt_sigreturn) + PT_REGS_UNWIND_INFO(0) + /* + * Allocate 8 input registers since ptrace() may clobber them + */ + alloc r2=ar.pfs,8,0,1,0 + .prologue + PT_REGS_SAVES(16) + adds sp=-16,sp + .body + cmp.eq pNonSys,pSys=r0,r0 // sigreturn isn't a normal syscall... + ;; + /* + * leave_kernel() restores f6-f11 from pt_regs, but since the streamlined + * syscall-entry path does not save them we save them here instead. Note: we + * don't need to save any other registers that are not saved by the stream-lined + * syscall path, because restore_sigcontext() restores them. + */ + adds r16=PT(F6)+32,sp + adds r17=PT(F7)+32,sp + ;; + stf.spill [r16]=f6,32 + stf.spill [r17]=f7,32 + ;; + stf.spill [r16]=f8,32 + stf.spill [r17]=f9,32 + ;; + stf.spill [r16]=f10 + stf.spill [r17]=f11 + adds out0=16,sp // out0 = &sigscratch + br.call.sptk.many rp=ia64_rt_sigreturn +.ret19: .restore sp,0 + adds sp=16,sp + ;; + ld8 r9=[sp] // load new ar.unat + mov.sptk b7=r8,ia64_native_leave_kernel + ;; + mov ar.unat=r9 + br.many b7 +END(sys_rt_sigreturn) + +GLOBAL_ENTRY(ia64_prepare_handle_unaligned) + .prologue + /* + * r16 = fake ar.pfs, we simply need to make sure privilege is still 0 + */ + mov r16=r0 + DO_SAVE_SWITCH_STACK + br.call.sptk.many rp=ia64_handle_unaligned // stack frame setup in ivt +.ret21: .body + DO_LOAD_SWITCH_STACK + br.cond.sptk.many rp // goes to ia64_leave_kernel +END(ia64_prepare_handle_unaligned) + + // + // unw_init_running(void (*callback)(info, arg), void *arg) + // +# define EXTRA_FRAME_SIZE ((UNW_FRAME_INFO_SIZE+15)&~15) + +GLOBAL_ENTRY(unw_init_running) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) + alloc loc1=ar.pfs,2,3,3,0 + ;; + ld8 loc2=[in0],8 + mov loc0=rp + mov r16=loc1 + DO_SAVE_SWITCH_STACK + .body + + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) + .fframe IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE + SWITCH_STACK_SAVES(EXTRA_FRAME_SIZE) + adds sp=-EXTRA_FRAME_SIZE,sp + .body + ;; + adds out0=16,sp // &info + mov out1=r13 // current + adds out2=16+EXTRA_FRAME_SIZE,sp // &switch_stack + br.call.sptk.many rp=unw_init_frame_info +1: adds out0=16,sp // &info + mov b6=loc2 + mov loc2=gp // save gp across indirect function call + ;; + ld8 gp=[in0] + mov out1=in1 // arg + br.call.sptk.many rp=b6 // invoke the callback function +1: mov gp=loc2 // restore gp + + // For now, we don't allow changing registers from within + // unw_init_running; if we ever want to allow that, we'd + // have to do a load_switch_stack here: + .restore sp + adds sp=IA64_SWITCH_STACK_SIZE+EXTRA_FRAME_SIZE,sp + + mov ar.pfs=loc1 + mov rp=loc0 + br.ret.sptk.many rp +END(unw_init_running) + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +GLOBAL_ENTRY(_mcount) + br ftrace_stub +END(_mcount) + +.here: + br.ret.sptk.many b0 + +GLOBAL_ENTRY(ftrace_caller) + alloc out0 = ar.pfs, 8, 0, 4, 0 + mov out3 = r0 + ;; + mov out2 = b0 + add r3 = 0x20, r3 + mov out1 = r1; + br.call.sptk.many b0 = ftrace_patch_gp + //this might be called from module, so we must patch gp +ftrace_patch_gp: + movl gp=__gp + mov b0 = r3 + ;; +.global ftrace_call; +ftrace_call: +{ + .mlx + nop.m 0x0 + movl r3 = .here;; +} + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(ftrace_caller) + +#else +GLOBAL_ENTRY(_mcount) + movl r2 = ftrace_stub + movl r3 = ftrace_trace_function;; + ld8 r3 = [r3];; + ld8 r3 = [r3];; + cmp.eq p7,p0 = r2, r3 +(p7) br.sptk.many ftrace_stub + ;; + + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(_mcount) +#endif + +GLOBAL_ENTRY(ftrace_stub) + mov r3 = b0 + movl r2 = _mcount_ret_helper + ;; + mov b6 = r2 + mov b7 = r3 + br.ret.sptk.many b6 + +_mcount_ret_helper: + mov b0 = r42 + mov r1 = r41 + mov ar.pfs = r40 + br b7 +END(ftrace_stub) + +#endif /* CONFIG_FUNCTION_TRACER */ + + .rodata + .align 8 + .globl sys_call_table +sys_call_table: + data8 sys_ni_syscall // This must be sys_ni_syscall! See ivt.S. + data8 sys_exit // 1025 + data8 sys_read + data8 sys_write + data8 sys_open + data8 sys_close + data8 sys_creat // 1030 + data8 sys_link + data8 sys_unlink + data8 ia64_execve + data8 sys_chdir + data8 sys_fchdir // 1035 + data8 sys_utimes + data8 sys_mknod + data8 sys_chmod + data8 sys_chown + data8 sys_lseek // 1040 + data8 sys_getpid + data8 sys_getppid + data8 sys_mount + data8 sys_umount + data8 sys_setuid // 1045 + data8 sys_getuid + data8 sys_geteuid + data8 sys_ptrace + data8 sys_access + data8 sys_sync // 1050 + data8 sys_fsync + data8 sys_fdatasync + data8 sys_kill + data8 sys_rename + data8 sys_mkdir // 1055 + data8 sys_rmdir + data8 sys_dup + data8 sys_ia64_pipe + data8 sys_times + data8 ia64_brk // 1060 + data8 sys_setgid + data8 sys_getgid + data8 sys_getegid + data8 sys_acct + data8 sys_ioctl // 1065 + data8 sys_fcntl + data8 sys_umask + data8 sys_chroot + data8 sys_ustat + data8 sys_dup2 // 1070 + data8 sys_setreuid + data8 sys_setregid + data8 sys_getresuid + data8 sys_setresuid + data8 sys_getresgid // 1075 + data8 sys_setresgid + data8 sys_getgroups + data8 sys_setgroups + data8 sys_getpgid + data8 sys_setpgid // 1080 + data8 sys_setsid + data8 sys_getsid + data8 sys_sethostname + data8 sys_setrlimit + data8 sys_getrlimit // 1085 + data8 sys_getrusage + data8 sys_gettimeofday + data8 sys_settimeofday + data8 sys_select + data8 sys_poll // 1090 + data8 sys_symlink + data8 sys_readlink + data8 sys_uselib + data8 sys_swapon + data8 sys_swapoff // 1095 + data8 sys_reboot + data8 sys_truncate + data8 sys_ftruncate + data8 sys_fchmod + data8 sys_fchown // 1100 + data8 ia64_getpriority + data8 sys_setpriority + data8 sys_statfs + data8 sys_fstatfs + data8 sys_gettid // 1105 + data8 sys_semget + data8 sys_semop + data8 sys_semctl + data8 sys_msgget + data8 sys_msgsnd // 1110 + data8 sys_msgrcv + data8 sys_msgctl + data8 sys_shmget + data8 sys_shmat + data8 sys_shmdt // 1115 + data8 sys_shmctl + data8 sys_syslog + data8 sys_setitimer + data8 sys_getitimer + data8 sys_ni_syscall // 1120 /* was: ia64_oldstat */ + data8 sys_ni_syscall /* was: ia64_oldlstat */ + data8 sys_ni_syscall /* was: ia64_oldfstat */ + data8 sys_vhangup + data8 sys_lchown + data8 sys_remap_file_pages // 1125 + data8 sys_wait4 + data8 sys_sysinfo + data8 sys_clone + data8 sys_setdomainname + data8 sys_newuname // 1130 + data8 sys_adjtimex + data8 sys_ni_syscall /* was: ia64_create_module */ + data8 sys_init_module + data8 sys_delete_module + data8 sys_ni_syscall // 1135 /* was: sys_get_kernel_syms */ + data8 sys_ni_syscall /* was: sys_query_module */ + data8 sys_quotactl + data8 sys_bdflush + data8 sys_sysfs + data8 sys_personality // 1140 + data8 sys_ni_syscall // sys_afs_syscall + data8 sys_setfsuid + data8 sys_setfsgid + data8 sys_getdents + data8 sys_flock // 1145 + data8 sys_readv + data8 sys_writev + data8 sys_pread64 + data8 sys_pwrite64 + data8 sys_sysctl // 1150 + data8 sys_mmap + data8 sys_munmap + data8 sys_mlock + data8 sys_mlockall + data8 sys_mprotect // 1155 + data8 ia64_mremap + data8 sys_msync + data8 sys_munlock + data8 sys_munlockall + data8 sys_sched_getparam // 1160 + data8 sys_sched_setparam + data8 sys_sched_getscheduler + data8 sys_sched_setscheduler + data8 sys_sched_yield + data8 sys_sched_get_priority_max // 1165 + data8 sys_sched_get_priority_min + data8 sys_sched_rr_get_interval + data8 sys_nanosleep + data8 sys_ni_syscall // old nfsservctl + data8 sys_prctl // 1170 + data8 sys_getpagesize + data8 sys_mmap2 + data8 sys_pciconfig_read + data8 sys_pciconfig_write + data8 sys_perfmonctl // 1175 + data8 sys_sigaltstack + data8 sys_rt_sigaction + data8 sys_rt_sigpending + data8 sys_rt_sigprocmask + data8 sys_rt_sigqueueinfo // 1180 + data8 sys_rt_sigreturn + data8 sys_rt_sigsuspend + data8 sys_rt_sigtimedwait + data8 sys_getcwd + data8 sys_capget // 1185 + data8 sys_capset + data8 sys_sendfile64 + data8 sys_ni_syscall // sys_getpmsg (STREAMS) + data8 sys_ni_syscall // sys_putpmsg (STREAMS) + data8 sys_socket // 1190 + data8 sys_bind + data8 sys_connect + data8 sys_listen + data8 sys_accept + data8 sys_getsockname // 1195 + data8 sys_getpeername + data8 sys_socketpair + data8 sys_send + data8 sys_sendto + data8 sys_recv // 1200 + data8 sys_recvfrom + data8 sys_shutdown + data8 sys_setsockopt + data8 sys_getsockopt + data8 sys_sendmsg // 1205 + data8 sys_recvmsg + data8 sys_pivot_root + data8 sys_mincore + data8 sys_madvise + data8 sys_newstat // 1210 + data8 sys_newlstat + data8 sys_newfstat + data8 sys_clone2 + data8 sys_getdents64 + data8 sys_getunwind // 1215 + data8 sys_readahead + data8 sys_setxattr + data8 sys_lsetxattr + data8 sys_fsetxattr + data8 sys_getxattr // 1220 + data8 sys_lgetxattr + data8 sys_fgetxattr + data8 sys_listxattr + data8 sys_llistxattr + data8 sys_flistxattr // 1225 + data8 sys_removexattr + data8 sys_lremovexattr + data8 sys_fremovexattr + data8 sys_tkill + data8 sys_futex // 1230 + data8 sys_sched_setaffinity + data8 sys_sched_getaffinity + data8 sys_set_tid_address + data8 sys_fadvise64_64 + data8 sys_tgkill // 1235 + data8 sys_exit_group + data8 sys_lookup_dcookie + data8 sys_io_setup + data8 sys_io_destroy + data8 sys_io_getevents // 1240 + data8 sys_io_submit + data8 sys_io_cancel + data8 sys_epoll_create + data8 sys_epoll_ctl + data8 sys_epoll_wait // 1245 + data8 sys_restart_syscall + data8 sys_semtimedop + data8 sys_timer_create + data8 sys_timer_settime + data8 sys_timer_gettime // 1250 + data8 sys_timer_getoverrun + data8 sys_timer_delete + data8 sys_clock_settime + data8 sys_clock_gettime + data8 sys_clock_getres // 1255 + data8 sys_clock_nanosleep + data8 sys_fstatfs64 + data8 sys_statfs64 + data8 sys_mbind + data8 sys_get_mempolicy // 1260 + data8 sys_set_mempolicy + data8 sys_mq_open + data8 sys_mq_unlink + data8 sys_mq_timedsend + data8 sys_mq_timedreceive // 1265 + data8 sys_mq_notify + data8 sys_mq_getsetattr + data8 sys_kexec_load + data8 sys_ni_syscall // reserved for vserver + data8 sys_waitid // 1270 + data8 sys_add_key + data8 sys_request_key + data8 sys_keyctl + data8 sys_ioprio_set + data8 sys_ioprio_get // 1275 + data8 sys_move_pages + data8 sys_inotify_init + data8 sys_inotify_add_watch + data8 sys_inotify_rm_watch + data8 sys_migrate_pages // 1280 + data8 sys_openat + data8 sys_mkdirat + data8 sys_mknodat + data8 sys_fchownat + data8 sys_futimesat // 1285 + data8 sys_newfstatat + data8 sys_unlinkat + data8 sys_renameat + data8 sys_linkat + data8 sys_symlinkat // 1290 + data8 sys_readlinkat + data8 sys_fchmodat + data8 sys_faccessat + data8 sys_pselect6 + data8 sys_ppoll // 1295 + data8 sys_unshare + data8 sys_splice + data8 sys_set_robust_list + data8 sys_get_robust_list + data8 sys_sync_file_range // 1300 + data8 sys_tee + data8 sys_vmsplice + data8 sys_fallocate + data8 sys_getcpu + data8 sys_epoll_pwait // 1305 + data8 sys_utimensat + data8 sys_signalfd + data8 sys_ni_syscall + data8 sys_eventfd + data8 sys_timerfd_create // 1310 + data8 sys_timerfd_settime + data8 sys_timerfd_gettime + data8 sys_signalfd4 + data8 sys_eventfd2 + data8 sys_epoll_create1 // 1315 + data8 sys_dup3 + data8 sys_pipe2 + data8 sys_inotify_init1 + data8 sys_preadv + data8 sys_pwritev // 1320 + data8 sys_rt_tgsigqueueinfo + data8 sys_recvmmsg + data8 sys_fanotify_init + data8 sys_fanotify_mark + data8 sys_prlimit64 // 1325 + data8 sys_name_to_handle_at + data8 sys_open_by_handle_at + data8 sys_clock_adjtime + data8 sys_syncfs + data8 sys_setns // 1330 + data8 sys_sendmmsg + data8 sys_process_vm_readv + data8 sys_process_vm_writev + data8 sys_accept4 + + .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h new file mode 100644 index 00000000000..b83edac0296 --- /dev/null +++ b/arch/ia64/kernel/entry.h @@ -0,0 +1,82 @@ + +/* + * Preserved registers that are shared between code in ivt.S and + * entry.S. Be careful not to step on these! + */ +#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ +#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ +#define PRED_USER_STACK 3 /* returning to user-stacks? */ +#define PRED_SYSCALL 4 /* inside a system call? */ +#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */ + +#ifdef __ASSEMBLY__ +# define PASTE2(x,y) x##y +# define PASTE(x,y) PASTE2(x,y) + +# define pLvSys PASTE(p,PRED_LEAVE_SYSCALL) +# define pKStk PASTE(p,PRED_KERNEL_STACK) +# define pUStk PASTE(p,PRED_USER_STACK) +# define pSys PASTE(p,PRED_SYSCALL) +# define pNonSys PASTE(p,PRED_NON_SYSCALL) +#endif + +#define PT(f) (IA64_PT_REGS_##f##_OFFSET) +#define SW(f) (IA64_SWITCH_STACK_##f##_OFFSET) +#define SOS(f) (IA64_SAL_OS_STATE_##f##_OFFSET) + +#define PT_REGS_SAVES(off) \ + .unwabi 3, 'i'; \ + .fframe IA64_PT_REGS_SIZE+16+(off); \ + .spillsp rp, PT(CR_IIP)+16+(off); \ + .spillsp ar.pfs, PT(CR_IFS)+16+(off); \ + .spillsp ar.unat, PT(AR_UNAT)+16+(off); \ + .spillsp ar.fpsr, PT(AR_FPSR)+16+(off); \ + .spillsp pr, PT(PR)+16+(off); + +#define PT_REGS_UNWIND_INFO(off) \ + .prologue; \ + PT_REGS_SAVES(off); \ + .body + +#define SWITCH_STACK_SAVES(off) \ + .savesp ar.unat,SW(CALLER_UNAT)+16+(off); \ + .savesp ar.fpsr,SW(AR_FPSR)+16+(off); \ + .spillsp f2,SW(F2)+16+(off); .spillsp f3,SW(F3)+16+(off); \ + .spillsp f4,SW(F4)+16+(off); .spillsp f5,SW(F5)+16+(off); \ + .spillsp f16,SW(F16)+16+(off); .spillsp f17,SW(F17)+16+(off); \ + .spillsp f18,SW(F18)+16+(off); .spillsp f19,SW(F19)+16+(off); \ + .spillsp f20,SW(F20)+16+(off); .spillsp f21,SW(F21)+16+(off); \ + .spillsp f22,SW(F22)+16+(off); .spillsp f23,SW(F23)+16+(off); \ + .spillsp f24,SW(F24)+16+(off); .spillsp f25,SW(F25)+16+(off); \ + .spillsp f26,SW(F26)+16+(off); .spillsp f27,SW(F27)+16+(off); \ + .spillsp f28,SW(F28)+16+(off); .spillsp f29,SW(F29)+16+(off); \ + .spillsp f30,SW(F30)+16+(off); .spillsp f31,SW(F31)+16+(off); \ + .spillsp r4,SW(R4)+16+(off); .spillsp r5,SW(R5)+16+(off); \ + .spillsp r6,SW(R6)+16+(off); .spillsp r7,SW(R7)+16+(off); \ + .spillsp b0,SW(B0)+16+(off); .spillsp b1,SW(B1)+16+(off); \ + .spillsp b2,SW(B2)+16+(off); .spillsp b3,SW(B3)+16+(off); \ + .spillsp b4,SW(B4)+16+(off); .spillsp b5,SW(B5)+16+(off); \ + .spillsp ar.pfs,SW(AR_PFS)+16+(off); .spillsp ar.lc,SW(AR_LC)+16+(off); \ + .spillsp @priunat,SW(AR_UNAT)+16+(off); \ + .spillsp ar.rnat,SW(AR_RNAT)+16+(off); \ + .spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off); \ + .spillsp pr,SW(PR)+16+(off) + +#define DO_SAVE_SWITCH_STACK \ + movl r28=1f; \ + ;; \ + .fframe IA64_SWITCH_STACK_SIZE; \ + adds sp=-IA64_SWITCH_STACK_SIZE,sp; \ + mov.ret.sptk b7=r28,1f; \ + SWITCH_STACK_SAVES(0); \ + br.cond.sptk.many save_switch_stack; \ +1: + +#define DO_LOAD_SWITCH_STACK \ + movl r28=1f; \ + ;; \ + invala; \ + mov.ret.sptk b7=r28,1f; \ + br.cond.sptk.many load_switch_stack; \ +1: .restore sp; \ + adds sp=IA64_SWITCH_STACK_SIZE,sp diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c new file mode 100644 index 00000000000..2d67317a1ec --- /dev/null +++ b/arch/ia64/kernel/err_inject.c @@ -0,0 +1,303 @@ +/* + * err_inject.c - + * 1.) Inject errors to a processor. + * 2.) Query error injection capabilities. + * This driver along with user space code can be acting as an error + * injection tool. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Written by: Fenghua Yu <fenghua.yu@intel.com>, Intel Corporation + * Copyright (C) 2006, Intel Corp. All rights reserved. + * + */ +#include <linux/device.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#define ERR_INJ_DEBUG + +#define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte; + +#define define_one_ro(name) \ +static DEVICE_ATTR(name, 0444, show_##name, NULL) + +#define define_one_rw(name) \ +static DEVICE_ATTR(name, 0644, show_##name, store_##name) + +static u64 call_start[NR_CPUS]; +static u64 phys_addr[NR_CPUS]; +static u64 err_type_info[NR_CPUS]; +static u64 err_struct_info[NR_CPUS]; +static struct { + u64 data1; + u64 data2; + u64 data3; +} __attribute__((__aligned__(16))) err_data_buffer[NR_CPUS]; +static s64 status[NR_CPUS]; +static u64 capabilities[NR_CPUS]; +static u64 resources[NR_CPUS]; + +#define show(name) \ +static ssize_t \ +show_##name(struct device *dev, struct device_attribute *attr, \ + char *buf) \ +{ \ + u32 cpu=dev->id; \ + return sprintf(buf, "%lx\n", name[cpu]); \ +} + +#define store(name) \ +static ssize_t \ +store_##name(struct device *dev, struct device_attribute *attr, \ + const char *buf, size_t size) \ +{ \ + unsigned int cpu=dev->id; \ + name[cpu] = simple_strtoull(buf, NULL, 16); \ + return size; \ +} + +show(call_start) + +/* It's user's responsibility to call the PAL procedure on a specific + * processor. The cpu number in driver is only used for storing data. + */ +static ssize_t +store_call_start(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned int cpu=dev->id; + unsigned long call_start = simple_strtoull(buf, NULL, 16); + +#ifdef ERR_INJ_DEBUG + printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu); + printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]); + printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]); + printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n", + err_data_buffer[cpu].data1, + err_data_buffer[cpu].data2, + err_data_buffer[cpu].data3); +#endif + switch (call_start) { + case 0: /* Do nothing. */ + break; + case 1: /* Call pal_mc_error_inject in physical mode. */ + status[cpu]=ia64_pal_mc_error_inject_phys(err_type_info[cpu], + err_struct_info[cpu], + ia64_tpa(&err_data_buffer[cpu]), + &capabilities[cpu], + &resources[cpu]); + break; + case 2: /* Call pal_mc_error_inject in virtual mode. */ + status[cpu]=ia64_pal_mc_error_inject_virt(err_type_info[cpu], + err_struct_info[cpu], + ia64_tpa(&err_data_buffer[cpu]), + &capabilities[cpu], + &resources[cpu]); + break; + default: + status[cpu] = -EINVAL; + break; + } + +#ifdef ERR_INJ_DEBUG + printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]); + printk(KERN_DEBUG "capapbilities=%lx,\n", capabilities[cpu]); + printk(KERN_DEBUG "resources=%lx\n", resources[cpu]); +#endif + return size; +} + +show(err_type_info) +store(err_type_info) + +static ssize_t +show_virtual_to_phys(struct device *dev, struct device_attribute *attr, + char *buf) +{ + unsigned int cpu=dev->id; + return sprintf(buf, "%lx\n", phys_addr[cpu]); +} + +static ssize_t +store_virtual_to_phys(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned int cpu=dev->id; + u64 virt_addr=simple_strtoull(buf, NULL, 16); + int ret; + + ret = get_user_pages(current, current->mm, virt_addr, + 1, VM_READ, 0, NULL, NULL); + if (ret<=0) { +#ifdef ERR_INJ_DEBUG + printk("Virtual address %lx is not existing.\n",virt_addr); +#endif + return -EINVAL; + } + + phys_addr[cpu] = ia64_tpa(virt_addr); + return size; +} + +show(err_struct_info) +store(err_struct_info) + +static ssize_t +show_err_data_buffer(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned int cpu=dev->id; + + return sprintf(buf, "%lx, %lx, %lx\n", + err_data_buffer[cpu].data1, + err_data_buffer[cpu].data2, + err_data_buffer[cpu].data3); +} + +static ssize_t +store_err_data_buffer(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned int cpu=dev->id; + int ret; + +#ifdef ERR_INJ_DEBUG + printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n", + err_data_buffer[cpu].data1, + err_data_buffer[cpu].data2, + err_data_buffer[cpu].data3, + cpu); +#endif + ret=sscanf(buf, "%lx, %lx, %lx", + &err_data_buffer[cpu].data1, + &err_data_buffer[cpu].data2, + &err_data_buffer[cpu].data3); + if (ret!=ERR_DATA_BUFFER_SIZE) + return -EINVAL; + + return size; +} + +show(status) +show(capabilities) +show(resources) + +define_one_rw(call_start); +define_one_rw(err_type_info); +define_one_rw(err_struct_info); +define_one_rw(err_data_buffer); +define_one_rw(virtual_to_phys); +define_one_ro(status); +define_one_ro(capabilities); +define_one_ro(resources); + +static struct attribute *default_attrs[] = { + &dev_attr_call_start.attr, + &dev_attr_virtual_to_phys.attr, + &dev_attr_err_type_info.attr, + &dev_attr_err_struct_info.attr, + &dev_attr_err_data_buffer.attr, + &dev_attr_status.attr, + &dev_attr_capabilities.attr, + &dev_attr_resources.attr, + NULL +}; + +static struct attribute_group err_inject_attr_group = { + .attrs = default_attrs, + .name = "err_inject" +}; +/* Add/Remove err_inject interface for CPU device */ +static int __cpuinit err_inject_add_dev(struct device * sys_dev) +{ + return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group); +} + +static int __cpuinit err_inject_remove_dev(struct device * sys_dev) +{ + sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group); + return 0; +} +static int __cpuinit err_inject_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *sys_dev; + + sys_dev = get_cpu_device(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + err_inject_add_dev(sys_dev); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + err_inject_remove_dev(sys_dev); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata err_inject_cpu_notifier = +{ + .notifier_call = err_inject_cpu_callback, +}; + +static int __init +err_inject_init(void) +{ + int i; + +#ifdef ERR_INJ_DEBUG + printk(KERN_INFO "Enter error injection driver.\n"); +#endif + for_each_online_cpu(i) { + err_inject_cpu_callback(&err_inject_cpu_notifier, CPU_ONLINE, + (void *)(long)i); + } + + register_hotcpu_notifier(&err_inject_cpu_notifier); + + return 0; +} + +static void __exit +err_inject_exit(void) +{ + int i; + struct device *sys_dev; + +#ifdef ERR_INJ_DEBUG + printk(KERN_INFO "Exit error injection driver.\n"); +#endif + for_each_online_cpu(i) { + sys_dev = get_cpu_device(i); + sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group); + } + unregister_hotcpu_notifier(&err_inject_cpu_notifier); +} + +module_init(err_inject_init); +module_exit(err_inject_exit); + +MODULE_AUTHOR("Fenghua Yu <fenghua.yu@intel.com>"); +MODULE_DESCRIPTION("MC error injection kernel sysfs interface"); +MODULE_LICENSE("GPL"); diff --git a/arch/ia64/kernel/esi.c b/arch/ia64/kernel/esi.c new file mode 100644 index 00000000000..b091111270c --- /dev/null +++ b/arch/ia64/kernel/esi.c @@ -0,0 +1,205 @@ +/* + * Extensible SAL Interface (ESI) support routines. + * + * Copyright (C) 2006 Hewlett-Packard Co + * Alex Williamson <alex.williamson@hp.com> + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#include <asm/esi.h> +#include <asm/sal.h> + +MODULE_AUTHOR("Alex Williamson <alex.williamson@hp.com>"); +MODULE_DESCRIPTION("Extensible SAL Interface (ESI) support"); +MODULE_LICENSE("GPL"); + +#define MODULE_NAME "esi" + +#define ESI_TABLE_GUID \ + EFI_GUID(0x43EA58DC, 0xCF28, 0x4b06, 0xB3, \ + 0x91, 0xB7, 0x50, 0x59, 0x34, 0x2B, 0xD4) + +enum esi_systab_entry_type { + ESI_DESC_ENTRY_POINT = 0 +}; + +/* + * Entry type: Size: + * 0 48 + */ +#define ESI_DESC_SIZE(type) "\060"[(unsigned) (type)] + +typedef struct ia64_esi_desc_entry_point { + u8 type; + u8 reserved1[15]; + u64 esi_proc; + u64 gp; + efi_guid_t guid; +} ia64_esi_desc_entry_point_t; + +struct pdesc { + void *addr; + void *gp; +}; + +static struct ia64_sal_systab *esi_systab; + +static int __init esi_init (void) +{ + efi_config_table_t *config_tables; + struct ia64_sal_systab *systab; + unsigned long esi = 0; + char *p; + int i; + + config_tables = __va(efi.systab->tables); + + for (i = 0; i < (int) efi.systab->nr_tables; ++i) { + if (efi_guidcmp(config_tables[i].guid, ESI_TABLE_GUID) == 0) { + esi = config_tables[i].table; + break; + } + } + + if (!esi) + return -ENODEV; + + systab = __va(esi); + + if (strncmp(systab->signature, "ESIT", 4) != 0) { + printk(KERN_ERR "bad signature in ESI system table!"); + return -ENODEV; + } + + p = (char *) (systab + 1); + for (i = 0; i < systab->entry_count; i++) { + /* + * The first byte of each entry type contains the type + * descriptor. + */ + switch (*p) { + case ESI_DESC_ENTRY_POINT: + break; + default: + printk(KERN_WARNING "Unknown table type %d found in " + "ESI table, ignoring rest of table\n", *p); + return -ENODEV; + } + + p += ESI_DESC_SIZE(*p); + } + + esi_systab = systab; + return 0; +} + + +int ia64_esi_call (efi_guid_t guid, struct ia64_sal_retval *isrvp, + enum esi_proc_type proc_type, u64 func, + u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5, u64 arg6, + u64 arg7) +{ + struct ia64_fpreg fr[6]; + unsigned long flags = 0; + int i; + char *p; + + if (!esi_systab) + return -1; + + p = (char *) (esi_systab + 1); + for (i = 0; i < esi_systab->entry_count; i++) { + if (*p == ESI_DESC_ENTRY_POINT) { + ia64_esi_desc_entry_point_t *esi = (void *)p; + if (!efi_guidcmp(guid, esi->guid)) { + ia64_sal_handler esi_proc; + struct pdesc pdesc; + + pdesc.addr = __va(esi->esi_proc); + pdesc.gp = __va(esi->gp); + + esi_proc = (ia64_sal_handler) &pdesc; + + ia64_save_scratch_fpregs(fr); + if (proc_type == ESI_PROC_SERIALIZED) + spin_lock_irqsave(&sal_lock, flags); + else if (proc_type == ESI_PROC_MP_SAFE) + local_irq_save(flags); + else + preempt_disable(); + *isrvp = (*esi_proc)(func, arg1, arg2, arg3, + arg4, arg5, arg6, arg7); + if (proc_type == ESI_PROC_SERIALIZED) + spin_unlock_irqrestore(&sal_lock, + flags); + else if (proc_type == ESI_PROC_MP_SAFE) + local_irq_restore(flags); + else + preempt_enable(); + ia64_load_scratch_fpregs(fr); + return 0; + } + } + p += ESI_DESC_SIZE(*p); + } + return -1; +} +EXPORT_SYMBOL_GPL(ia64_esi_call); + +int ia64_esi_call_phys (efi_guid_t guid, struct ia64_sal_retval *isrvp, + u64 func, u64 arg1, u64 arg2, u64 arg3, u64 arg4, + u64 arg5, u64 arg6, u64 arg7) +{ + struct ia64_fpreg fr[6]; + unsigned long flags; + u64 esi_params[8]; + char *p; + int i; + + if (!esi_systab) + return -1; + + p = (char *) (esi_systab + 1); + for (i = 0; i < esi_systab->entry_count; i++) { + if (*p == ESI_DESC_ENTRY_POINT) { + ia64_esi_desc_entry_point_t *esi = (void *)p; + if (!efi_guidcmp(guid, esi->guid)) { + ia64_sal_handler esi_proc; + struct pdesc pdesc; + + pdesc.addr = (void *)esi->esi_proc; + pdesc.gp = (void *)esi->gp; + + esi_proc = (ia64_sal_handler) &pdesc; + + esi_params[0] = func; + esi_params[1] = arg1; + esi_params[2] = arg2; + esi_params[3] = arg3; + esi_params[4] = arg4; + esi_params[5] = arg5; + esi_params[6] = arg6; + esi_params[7] = arg7; + ia64_save_scratch_fpregs(fr); + spin_lock_irqsave(&sal_lock, flags); + *isrvp = esi_call_phys(esi_proc, esi_params); + spin_unlock_irqrestore(&sal_lock, flags); + ia64_load_scratch_fpregs(fr); + return 0; + } + } + p += ESI_DESC_SIZE(*p); + } + return -1; +} +EXPORT_SYMBOL_GPL(ia64_esi_call_phys); + +static void __exit esi_exit (void) +{ +} + +module_init(esi_init); +module_exit(esi_exit); /* makes module removable... */ diff --git a/arch/ia64/kernel/esi_stub.S b/arch/ia64/kernel/esi_stub.S new file mode 100644 index 00000000000..6b3d6c1f99b --- /dev/null +++ b/arch/ia64/kernel/esi_stub.S @@ -0,0 +1,96 @@ +/* + * ESI call stub. + * + * Copyright (C) 2005 Hewlett-Packard Co + * Alex Williamson <alex.williamson@hp.com> + * + * Based on EFI call stub by David Mosberger. The stub is virtually + * identical to the one for EFI phys-mode calls, except that ESI + * calls may have up to 8 arguments, so they get passed to this routine + * through memory. + * + * This stub allows us to make ESI calls in physical mode with interrupts + * turned off. ESI calls may not support calling from virtual mode. + * + * Google for "Extensible SAL specification" for a document describing the + * ESI standard. + */ + +/* + * PSR settings as per SAL spec (Chapter 8 in the "IA-64 System + * Abstraction Layer Specification", revision 2.6e). Note that + * psr.dfl and psr.dfh MUST be cleared, despite what this manual says. + * Otherwise, SAL dies whenever it's trying to do an IA-32 BIOS call + * (the br.ia instruction fails unless psr.dfl and psr.dfh are + * cleared). Fortunately, SAL promises not to touch the floating + * point regs, so at least we don't have to save f2-f127. + */ +#define PSR_BITS_TO_CLEAR \ + (IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | \ + IA64_PSR_DD | IA64_PSR_SS | IA64_PSR_RI | IA64_PSR_ED | \ + IA64_PSR_DFL | IA64_PSR_DFH) + +#define PSR_BITS_TO_SET \ + (IA64_PSR_BN) + +#include <asm/processor.h> +#include <asm/asmmacro.h> + +/* + * Inputs: + * in0 = address of function descriptor of ESI routine to call + * in1 = address of array of ESI parameters + * + * Outputs: + * r8 = result returned by called function + */ +GLOBAL_ENTRY(esi_call_phys) + .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) + alloc loc1=ar.pfs,2,7,8,0 + ld8 r2=[in0],8 // load ESI function's entry point + mov loc0=rp + .body + ;; + ld8 out0=[in1],8 // ESI params loaded from array + ;; // passing all as inputs doesn't work + ld8 out1=[in1],8 + ;; + ld8 out2=[in1],8 + ;; + ld8 out3=[in1],8 + ;; + ld8 out4=[in1],8 + ;; + ld8 out5=[in1],8 + ;; + ld8 out6=[in1],8 + ;; + ld8 out7=[in1] + mov loc2=gp // save global pointer + mov loc4=ar.rsc // save RSE configuration + mov ar.rsc=0 // put RSE in enforced lazy, LE mode + ;; + ld8 gp=[in0] // load ESI function's global pointer + movl r16=PSR_BITS_TO_CLEAR + mov loc3=psr // save processor status word + movl r17=PSR_BITS_TO_SET + ;; + or loc3=loc3,r17 + mov b6=r2 + ;; + andcm r16=loc3,r16 // get psr with IT, DT, and RT bits cleared + br.call.sptk.many rp=ia64_switch_mode_phys +.ret0: mov loc5=r19 // old ar.bsp + mov loc6=r20 // old sp + br.call.sptk.many rp=b6 // call the ESI function +.ret1: mov ar.rsc=0 // put RSE in enforced lazy, LE mode + mov r16=loc3 // save virtual mode psr + mov r19=loc5 // save virtual mode bspstore + mov r20=loc6 // save virtual mode sp + br.call.sptk.many rp=ia64_switch_mode_virt // return to virtual mode +.ret2: mov ar.rsc=loc4 // restore RSE configuration + mov ar.pfs=loc1 + mov rp=loc0 + mov gp=loc2 + br.ret.sptk.many rp +END(esi_call_phys) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S new file mode 100644 index 00000000000..331d42bda77 --- /dev/null +++ b/arch/ia64/kernel/fsys.S @@ -0,0 +1,1049 @@ +/* + * This file contains the light-weight system call handlers (fsyscall-handlers). + * + * Copyright (C) 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * 25-Sep-03 davidm Implement fsys_rt_sigprocmask(). + * 18-Feb-03 louisk Implement fsys_gettimeofday(). + * 28-Feb-03 davidm Fixed several bugs in fsys_gettimeofday(). Tuned it some more, + * probably broke it along the way... ;-) + * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make + * it capable of using memory based clocks without falling back to C code. + * 08-Feb-07 Fenghua Yu Implement fsys_getcpu. + * + */ + +#include <asm/asmmacro.h> +#include <asm/errno.h> +#include <asm/asm-offsets.h> +#include <asm/percpu.h> +#include <asm/thread_info.h> +#include <asm/sal.h> +#include <asm/signal.h> +#include <asm/system.h> +#include <asm/unistd.h> + +#include "entry.h" +#include "paravirt_inst.h" + +/* + * See Documentation/ia64/fsys.txt for details on fsyscalls. + * + * On entry to an fsyscall handler: + * r10 = 0 (i.e., defaults to "successful syscall return") + * r11 = saved ar.pfs (a user-level value) + * r15 = system call number + * r16 = "current" task pointer (in normal kernel-mode, this is in r13) + * r32-r39 = system call arguments + * b6 = return address (a user-level value) + * ar.pfs = previous frame-state (a user-level value) + * PSR.be = cleared to zero (i.e., little-endian byte order is in effect) + * all other registers may contain values passed in from user-mode + * + * On return from an fsyscall handler: + * r11 = saved ar.pfs (as passed into the fsyscall handler) + * r15 = system call number (as passed into the fsyscall handler) + * r32-r39 = system call arguments (as passed into the fsyscall handler) + * b6 = return address (as passed into the fsyscall handler) + * ar.pfs = previous frame-state (as passed into the fsyscall handler) + */ + +ENTRY(fsys_ni_syscall) + .prologue + .altrp b6 + .body + mov r8=ENOSYS + mov r10=-1 + FSYS_RETURN +END(fsys_ni_syscall) + +ENTRY(fsys_getpid) + .prologue + .altrp b6 + .body + add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 + ;; + ld8 r17=[r17] // r17 = current->group_leader + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + ld4 r9=[r9] + add r17=IA64_TASK_TGIDLINK_OFFSET,r17 + ;; + and r9=TIF_ALLWORK_MASK,r9 + ld8 r17=[r17] // r17 = current->group_leader->pids[PIDTYPE_PID].pid + ;; + add r8=IA64_PID_LEVEL_OFFSET,r17 + ;; + ld4 r8=[r8] // r8 = pid->level + add r17=IA64_PID_UPID_OFFSET,r17 // r17 = &pid->numbers[0] + ;; + shl r8=r8,IA64_UPID_SHIFT + ;; + add r17=r17,r8 // r17 = &pid->numbers[pid->level] + ;; + ld4 r8=[r17] // r8 = pid->numbers[pid->level].nr + ;; + mov r17=0 + ;; + cmp.ne p8,p0=0,r9 +(p8) br.spnt.many fsys_fallback_syscall + FSYS_RETURN +END(fsys_getpid) + +ENTRY(fsys_getppid) + .prologue + .altrp b6 + .body + add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 + ;; + ld8 r17=[r17] // r17 = current->group_leader + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + + ld4 r9=[r9] + add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = ¤t->group_leader->real_parent + ;; + and r9=TIF_ALLWORK_MASK,r9 + +1: ld8 r18=[r17] // r18 = current->group_leader->real_parent + ;; + cmp.ne p8,p0=0,r9 + add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = ¤t->group_leader->real_parent->tgid + ;; + + /* + * The .acq is needed to ensure that the read of tgid has returned its data before + * we re-check "real_parent". + */ + ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid +#ifdef CONFIG_SMP + /* + * Re-read current->group_leader->real_parent. + */ + ld8 r19=[r17] // r19 = current->group_leader->real_parent +(p8) br.spnt.many fsys_fallback_syscall + ;; + cmp.ne p6,p0=r18,r19 // did real_parent change? + mov r19=0 // i must not leak kernel bits... +(p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check + ;; + mov r17=0 // i must not leak kernel bits... + mov r18=0 // i must not leak kernel bits... +#else + mov r17=0 // i must not leak kernel bits... + mov r18=0 // i must not leak kernel bits... + mov r19=0 // i must not leak kernel bits... +#endif + FSYS_RETURN +END(fsys_getppid) + +ENTRY(fsys_set_tid_address) + .prologue + .altrp b6 + .body + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + add r17=IA64_TASK_TGIDLINK_OFFSET,r16 + ;; + ld4 r9=[r9] + tnat.z p6,p7=r32 // check argument register for being NaT + ld8 r17=[r17] // r17 = current->pids[PIDTYPE_PID].pid + ;; + and r9=TIF_ALLWORK_MASK,r9 + add r8=IA64_PID_LEVEL_OFFSET,r17 + add r18=IA64_TASK_CLEAR_CHILD_TID_OFFSET,r16 + ;; + ld4 r8=[r8] // r8 = pid->level + add r17=IA64_PID_UPID_OFFSET,r17 // r17 = &pid->numbers[0] + ;; + shl r8=r8,IA64_UPID_SHIFT + ;; + add r17=r17,r8 // r17 = &pid->numbers[pid->level] + ;; + ld4 r8=[r17] // r8 = pid->numbers[pid->level].nr + ;; + cmp.ne p8,p0=0,r9 + mov r17=-1 + ;; +(p6) st8 [r18]=r32 +(p7) st8 [r18]=r17 +(p8) br.spnt.many fsys_fallback_syscall + ;; + mov r17=0 // i must not leak kernel bits... + mov r18=0 // i must not leak kernel bits... + FSYS_RETURN +END(fsys_set_tid_address) + +#if IA64_GTOD_LOCK_OFFSET !=0 +#error fsys_gettimeofday incompatible with changes to struct fsyscall_gtod_data_t +#endif +#if IA64_ITC_JITTER_OFFSET !=0 +#error fsys_gettimeofday incompatible with changes to struct itc_jitter_data_t +#endif +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 1 +#define CLOCK_DIVIDE_BY_1000 0x4000 +#define CLOCK_ADD_MONOTONIC 0x8000 + +ENTRY(fsys_gettimeofday) + .prologue + .altrp b6 + .body + mov r31 = r32 + tnat.nz p6,p0 = r33 // guard against NaT argument +(p6) br.cond.spnt.few .fail_einval + mov r30 = CLOCK_DIVIDE_BY_1000 + ;; +.gettime: + // Register map + // Incoming r31 = pointer to address where to place result + // r30 = flags determining how time is processed + // r2,r3 = temp r4-r7 preserved + // r8 = result nanoseconds + // r9 = result seconds + // r10 = temporary storage for clock difference + // r11 = preserved: saved ar.pfs + // r12 = preserved: memory stack + // r13 = preserved: thread pointer + // r14 = address of mask / mask value + // r15 = preserved: system call number + // r16 = preserved: current task pointer + // r17 = (not used) + // r18 = (not used) + // r19 = address of itc_lastcycle + // r20 = struct fsyscall_gtod_data (= address of gtod_lock.sequence) + // r21 = address of mmio_ptr + // r22 = address of wall_time or monotonic_time + // r23 = address of shift / value + // r24 = address mult factor / cycle_last value + // r25 = itc_lastcycle value + // r26 = address clocksource cycle_last + // r27 = (not used) + // r28 = sequence number at the beginning of critcal section + // r29 = address of itc_jitter + // r30 = time processing flags / memory address + // r31 = pointer to result + // Predicates + // p6,p7 short term use + // p8 = timesource ar.itc + // p9 = timesource mmio64 + // p10 = timesource mmio32 - not used + // p11 = timesource not to be handled by asm code + // p12 = memory time source ( = p9 | p10) - not used + // p13 = do cmpxchg with itc_lastcycle + // p14 = Divide by 1000 + // p15 = Add monotonic + // + // Note that instructions are optimized for McKinley. McKinley can + // process two bundles simultaneously and therefore we continuously + // try to feed the CPU two bundles and then a stop. + + add r2 = TI_FLAGS+IA64_TASK_SIZE,r16 + tnat.nz p6,p0 = r31 // guard against Nat argument +(p6) br.cond.spnt.few .fail_einval + movl r20 = fsyscall_gtod_data // load fsyscall gettimeofday data address + ;; + ld4 r2 = [r2] // process work pending flags + movl r29 = itc_jitter_data // itc_jitter + add r22 = IA64_GTOD_WALL_TIME_OFFSET,r20 // wall_time + add r21 = IA64_CLKSRC_MMIO_OFFSET,r20 + mov pr = r30,0xc000 // Set predicates according to function + ;; + and r2 = TIF_ALLWORK_MASK,r2 + add r19 = IA64_ITC_LASTCYCLE_OFFSET,r29 +(p15) add r22 = IA64_GTOD_MONO_TIME_OFFSET,r20 // monotonic_time + ;; + add r26 = IA64_CLKSRC_CYCLE_LAST_OFFSET,r20 // clksrc_cycle_last + cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled +(p6) br.cond.spnt.many fsys_fallback_syscall + ;; + // Begin critical section +.time_redo: + ld4.acq r28 = [r20] // gtod_lock.sequence, Must take first + ;; + and r28 = ~1,r28 // And make sequence even to force retry if odd + ;; + ld8 r30 = [r21] // clocksource->mmio_ptr + add r24 = IA64_CLKSRC_MULT_OFFSET,r20 + ld4 r2 = [r29] // itc_jitter value + add r23 = IA64_CLKSRC_SHIFT_OFFSET,r20 + add r14 = IA64_CLKSRC_MASK_OFFSET,r20 + ;; + ld4 r3 = [r24] // clocksource mult value + ld8 r14 = [r14] // clocksource mask value + cmp.eq p8,p9 = 0,r30 // use cpu timer if no mmio_ptr + ;; + setf.sig f7 = r3 // Setup for mult scaling of counter +(p8) cmp.ne p13,p0 = r2,r0 // need itc_jitter compensation, set p13 + ld4 r23 = [r23] // clocksource shift value + ld8 r24 = [r26] // get clksrc_cycle_last value +(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control + ;; + .pred.rel.mutex p8,p9 + MOV_FROM_ITC(p8, p6, r2, r10) // CPU_TIMER. 36 clocks latency!!! +(p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues.. +(p13) ld8 r25 = [r19] // get itc_lastcycle value + ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec + ;; + ld8 r8 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET // tv_nsec +(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) + ;; +(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared + sub r10 = r2,r24 // current_cycle - last_cycle + ;; +(p6) sub r10 = r25,r24 // time we got was less than last_cycle +(p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg + ;; +(p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv + ;; +(p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful + ;; +(p7) sub r10 = r3,r24 // then use new last_cycle instead + ;; + and r10 = r10,r14 // Apply mask + ;; + setf.sig f8 = r10 + nop.i 123 + ;; + // fault check takes 5 cycles and we have spare time +EX(.fail_efault, probe.w.fault r31, 3) + xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) + ;; + getf.sig r2 = f8 + mf + ;; + ld4 r10 = [r20] // gtod_lock.sequence + shr.u r2 = r2,r23 // shift by factor + ;; + add r8 = r8,r2 // Add xtime.nsecs + cmp4.ne p7,p0 = r28,r10 +(p7) br.cond.dpnt.few .time_redo // sequence number changed, redo + // End critical section. + // Now r8=tv->tv_nsec and r9=tv->tv_sec + mov r10 = r0 + movl r2 = 1000000000 + add r23 = IA64_TIMESPEC_TV_NSEC_OFFSET, r31 +(p14) movl r3 = 2361183241434822607 // Prep for / 1000 hack + ;; +.time_normalize: + mov r21 = r8 + cmp.ge p6,p0 = r8,r2 +(p14) shr.u r20 = r8, 3 // We can repeat this if necessary just wasting time + ;; +(p14) setf.sig f8 = r20 +(p6) sub r8 = r8,r2 +(p6) add r9 = 1,r9 // two nops before the branch. +(p14) setf.sig f7 = r3 // Chances for repeats are 1 in 10000 for gettod +(p6) br.cond.dpnt.few .time_normalize + ;; + // Divided by 8 though shift. Now divide by 125 + // The compiler was able to do that with a multiply + // and a shift and we do the same +EX(.fail_efault, probe.w.fault r23, 3) // This also costs 5 cycles +(p14) xmpy.hu f8 = f8, f7 // xmpy has 5 cycles latency so use it + ;; +(p14) getf.sig r2 = f8 + ;; + mov r8 = r0 +(p14) shr.u r21 = r2, 4 + ;; +EX(.fail_efault, st8 [r31] = r9) +EX(.fail_efault, st8 [r23] = r21) + FSYS_RETURN +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN +END(fsys_gettimeofday) + +ENTRY(fsys_clock_gettime) + .prologue + .altrp b6 + .body + cmp4.ltu p6, p0 = CLOCK_MONOTONIC, r32 + // Fallback if this is not CLOCK_REALTIME or CLOCK_MONOTONIC +(p6) br.spnt.few fsys_fallback_syscall + mov r31 = r33 + shl r30 = r32,15 + br.many .gettime +END(fsys_clock_gettime) + +/* + * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). + */ +#if _NSIG_WORDS != 1 +# error Sorry, fsys_rt_sigprocmask() needs to be updated for _NSIG_WORDS != 1. +#endif +ENTRY(fsys_rt_sigprocmask) + .prologue + .altrp b6 + .body + + add r2=IA64_TASK_BLOCKED_OFFSET,r16 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + cmp4.ltu p6,p0=SIG_SETMASK,r32 + + cmp.ne p15,p0=r0,r34 // oset != NULL? + tnat.nz p8,p0=r34 + add r31=IA64_TASK_SIGHAND_OFFSET,r16 + ;; + ld8 r3=[r2] // read/prefetch current->blocked + ld4 r9=[r9] + tnat.nz.or p6,p0=r35 + + cmp.ne.or p6,p0=_NSIG_WORDS*8,r35 + tnat.nz.or p6,p0=r32 +(p6) br.spnt.few .fail_einval // fail with EINVAL + ;; +#ifdef CONFIG_SMP + ld8 r31=[r31] // r31 <- current->sighand +#endif + and r9=TIF_ALLWORK_MASK,r9 + tnat.nz.or p8,p0=r33 + ;; + cmp.ne p7,p0=0,r9 + cmp.eq p6,p0=r0,r33 // set == NULL? + add r31=IA64_SIGHAND_SIGLOCK_OFFSET,r31 // r31 <- current->sighand->siglock +(p8) br.spnt.few .fail_efault // fail with EFAULT +(p7) br.spnt.many fsys_fallback_syscall // got pending kernel work... +(p6) br.dpnt.many .store_mask // -> short-circuit to just reading the signal mask + + /* Argh, we actually have to do some work and _update_ the signal mask: */ + +EX(.fail_efault, probe.r.fault r33, 3) // verify user has read-access to *set +EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set + mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1)) + ;; + + RSM_PSR_I(p0, r18, r19) // mask interrupt delivery + andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP + mov r8=EINVAL // default to EINVAL + +#ifdef CONFIG_SMP + // __ticket_spin_trylock(r31) + ld4 r17=[r31] + ;; + mov.m ar.ccv=r17 + extr.u r9=r17,17,15 + adds r19=1,r17 + extr.u r18=r17,0,15 + ;; + cmp.eq p6,p7=r9,r18 + ;; +(p6) cmpxchg4.acq r9=[r31],r19,ar.ccv +(p6) dep.z r20=r19,1,15 // next serving ticket for unlock +(p7) br.cond.spnt.many .lock_contention + ;; + cmp4.eq p0,p7=r9,r17 + adds r31=2,r31 +(p7) br.cond.spnt.many .lock_contention + ld8 r3=[r2] // re-read current->blocked now that we hold the lock + ;; +#else + ld8 r3=[r2] // re-read current->blocked now that we hold the lock +#endif + add r18=IA64_TASK_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r16 + add r19=IA64_TASK_SIGNAL_OFFSET,r16 + cmp4.eq p6,p0=SIG_BLOCK,r32 + ;; + ld8 r19=[r19] // r19 <- current->signal + cmp4.eq p7,p0=SIG_UNBLOCK,r32 + cmp4.eq p8,p0=SIG_SETMASK,r32 + ;; + ld8 r18=[r18] // r18 <- current->pending.signal + .pred.rel.mutex p6,p7,p8 +(p6) or r14=r3,r14 // SIG_BLOCK +(p7) andcm r14=r3,r14 // SIG_UNBLOCK + +(p8) mov r14=r14 // SIG_SETMASK +(p6) mov r8=0 // clear error code + // recalc_sigpending() + add r17=IA64_SIGNAL_GROUP_STOP_COUNT_OFFSET,r19 + + add r19=IA64_SIGNAL_SHARED_PENDING_OFFSET+IA64_SIGPENDING_SIGNAL_OFFSET,r19 + ;; + ld4 r17=[r17] // r17 <- current->signal->group_stop_count +(p7) mov r8=0 // clear error code + + ld8 r19=[r19] // r19 <- current->signal->shared_pending + ;; + cmp4.gt p6,p7=r17,r0 // p6/p7 <- (current->signal->group_stop_count > 0)? +(p8) mov r8=0 // clear error code + + or r18=r18,r19 // r18 <- current->pending | current->signal->shared_pending + ;; + // r18 <- (current->pending | current->signal->shared_pending) & ~current->blocked: + andcm r18=r18,r14 + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 + ;; + +(p7) cmp.ne.or.andcm p6,p7=r18,r0 // p6/p7 <- signal pending + mov r19=0 // i must not leak kernel bits... +(p6) br.cond.dpnt.many .sig_pending + ;; + +1: ld4 r17=[r9] // r17 <- current->thread_info->flags + ;; + mov ar.ccv=r17 + and r18=~_TIF_SIGPENDING,r17 // r18 <- r17 & ~(1 << TIF_SIGPENDING) + ;; + + st8 [r2]=r14 // update current->blocked with new mask + cmpxchg4.acq r8=[r9],r18,ar.ccv // current->thread_info->flags <- r18 + ;; + cmp.ne p6,p0=r17,r8 // update failed? +(p6) br.cond.spnt.few 1b // yes -> retry + +#ifdef CONFIG_SMP + // __ticket_spin_unlock(r31) + st2.rel [r31]=r20 + mov r20=0 // i must not leak kernel bits... +#endif + SSM_PSR_I(p0, p9, r31) + ;; + + srlz.d // ensure psr.i is set again + mov r18=0 // i must not leak kernel bits... + +.store_mask: +EX(.fail_efault, (p15) probe.w.fault r34, 3) // verify user has write-access to *oset +EX(.fail_efault, (p15) st8 [r34]=r3) + mov r2=0 // i must not leak kernel bits... + mov r3=0 // i must not leak kernel bits... + mov r8=0 // return 0 + mov r9=0 // i must not leak kernel bits... + mov r14=0 // i must not leak kernel bits... + mov r17=0 // i must not leak kernel bits... + mov r31=0 // i must not leak kernel bits... + FSYS_RETURN + +.sig_pending: +#ifdef CONFIG_SMP + // __ticket_spin_unlock(r31) + st2.rel [r31]=r20 // release the lock +#endif + SSM_PSR_I(p0, p9, r17) + ;; + srlz.d + br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall + +#ifdef CONFIG_SMP +.lock_contention: + /* Rather than spinning here, fall back on doing a heavy-weight syscall. */ + SSM_PSR_I(p0, p9, r17) + ;; + srlz.d + br.sptk.many fsys_fallback_syscall +#endif +END(fsys_rt_sigprocmask) + +/* + * fsys_getcpu doesn't use the third parameter in this implementation. It reads + * current_thread_info()->cpu and corresponding node in cpu_to_node_map. + */ +ENTRY(fsys_getcpu) + .prologue + .altrp b6 + .body + ;; + add r2=TI_FLAGS+IA64_TASK_SIZE,r16 + tnat.nz p6,p0 = r32 // guard against NaT argument + add r3=TI_CPU+IA64_TASK_SIZE,r16 + ;; + ld4 r3=[r3] // M r3 = thread_info->cpu + ld4 r2=[r2] // M r2 = thread_info->flags +(p6) br.cond.spnt.few .fail_einval // B + ;; + tnat.nz p7,p0 = r33 // I guard against NaT argument +(p7) br.cond.spnt.few .fail_einval // B +#ifdef CONFIG_NUMA + movl r17=cpu_to_node_map + ;; +EX(.fail_efault, probe.w.fault r32, 3) // M This takes 5 cycles +EX(.fail_efault, probe.w.fault r33, 3) // M This takes 5 cycles + shladd r18=r3,1,r17 + ;; + ld2 r20=[r18] // r20 = cpu_to_node_map[cpu] + and r2 = TIF_ALLWORK_MASK,r2 + ;; + cmp.ne p8,p0=0,r2 +(p8) br.spnt.many fsys_fallback_syscall + ;; + ;; +EX(.fail_efault, st4 [r32] = r3) +EX(.fail_efault, st2 [r33] = r20) + mov r8=0 + ;; +#else +EX(.fail_efault, probe.w.fault r32, 3) // M This takes 5 cycles +EX(.fail_efault, probe.w.fault r33, 3) // M This takes 5 cycles + and r2 = TIF_ALLWORK_MASK,r2 + ;; + cmp.ne p8,p0=0,r2 +(p8) br.spnt.many fsys_fallback_syscall + ;; +EX(.fail_efault, st4 [r32] = r3) +EX(.fail_efault, st2 [r33] = r0) + mov r8=0 + ;; +#endif + FSYS_RETURN +END(fsys_getcpu) + +ENTRY(fsys_fallback_syscall) + .prologue + .altrp b6 + .body + /* + * We only get here from light-weight syscall handlers. Thus, we already + * know that r15 contains a valid syscall number. No need to re-check. + */ + adds r17=-1024,r15 + movl r14=sys_call_table + ;; + RSM_PSR_I(p0, r26, r27) + shladd r18=r17,3,r14 + ;; + ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point + MOV_FROM_PSR(p0, r29, r26) // read psr (12 cyc load latency) + mov r27=ar.rsc + mov r21=ar.fpsr + mov r26=ar.pfs +END(fsys_fallback_syscall) + /* FALL THROUGH */ +GLOBAL_ENTRY(paravirt_fsys_bubble_down) + .prologue + .altrp b6 + .body + /* + * We get here for syscalls that don't have a lightweight + * handler. For those, we need to bubble down into the kernel + * and that requires setting up a minimal pt_regs structure, + * and initializing the CPU state more or less as if an + * interruption had occurred. To make syscall-restarts work, + * we setup pt_regs such that cr_iip points to the second + * instruction in syscall_via_break. Decrementing the IP + * hence will restart the syscall via break and not + * decrementing IP will return us to the caller, as usual. + * Note that we preserve the value of psr.pp rather than + * initializing it from dcr.pp. This makes it possible to + * distinguish fsyscall execution from other privileged + * execution. + * + * On entry: + * - normal fsyscall handler register usage, except + * that we also have: + * - r18: address of syscall entry point + * - r21: ar.fpsr + * - r26: ar.pfs + * - r27: ar.rsc + * - r29: psr + * + * We used to clear some PSR bits here but that requires slow + * serialization. Fortuntely, that isn't really necessary. + * The rationale is as follows: we used to clear bits + * ~PSR_PRESERVED_BITS in PSR.L. Since + * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we + * ended up clearing PSR.{BE,AC,I,DFL,DFH,DI,DB,SI,TB}. + * However, + * + * PSR.BE : already is turned off in __kernel_syscall_via_epc() + * PSR.AC : don't care (kernel normally turns PSR.AC on) + * PSR.I : already turned off by the time paravirt_fsys_bubble_down gets + * invoked + * PSR.DFL: always 0 (kernel never turns it on) + * PSR.DFH: don't care --- kernel never touches f32-f127 on its own + * initiative + * PSR.DI : always 0 (kernel never turns it on) + * PSR.SI : always 0 (kernel never turns it on) + * PSR.DB : don't care --- kernel never enables kernel-level + * breakpoints + * PSR.TB : must be 0 already; if it wasn't zero on entry to + * __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down + * will trigger a taken branch; the taken-trap-handler then + * converts the syscall into a break-based system-call. + */ + /* + * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. + * The rest we have to synthesize. + */ +# define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) \ + | (0x1 << IA64_PSR_RI_BIT) \ + | IA64_PSR_BN | IA64_PSR_I) + + invala // M0|1 + movl r14=ia64_ret_from_syscall // X + + nop.m 0 + movl r28=__kernel_syscall_via_break // X create cr.iip + ;; + + mov r2=r16 // A get task addr to addl-addressable register + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // A + mov r31=pr // I0 save pr (2 cyc) + ;; + st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag + addl r22=IA64_RBS_OFFSET,r2 // A compute base of RBS + add r3=TI_FLAGS+IA64_TASK_SIZE,r2 // A + ;; + ld4 r3=[r3] // M0|1 r3 = current_thread_info()->flags + lfetch.fault.excl.nt1 [r22] // M0|1 prefetch register backing-store + nop.i 0 + ;; + mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + MOV_FROM_ITC(p0, p6, r30, r23) // M get cycle for accounting +#else + nop.m 0 +#endif + nop.i 0 + ;; + mov r23=ar.bspstore // M2 (12 cyc) save ar.bspstore + mov.m r24=ar.rnat // M2 (5 cyc) read ar.rnat (dual-issues!) + nop.i 0 + ;; + mov ar.bspstore=r22 // M2 (6 cyc) switch to kernel RBS + movl r8=PSR_ONE_BITS // X + ;; + mov r25=ar.unat // M2 (5 cyc) save ar.unat + mov r19=b6 // I0 save b6 (2 cyc) + mov r20=r1 // A save caller's gp in r20 + ;; + or r29=r8,r29 // A construct cr.ipsr value to save + mov b6=r18 // I0 copy syscall entry-point to b6 (7 cyc) + addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // A compute base of memory stack + + mov r18=ar.bsp // M2 save (kernel) ar.bsp (12 cyc) + cmp.ne pKStk,pUStk=r0,r0 // A set pKStk <- 0, pUStk <- 1 + br.call.sptk.many b7=ia64_syscall_setup // B + ;; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + // mov.m r30=ar.itc is called in advance + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r2 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r2 + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at leave kernel + ;; + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel + ;; + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r30,r19 // elapsed time in user mode + ;; + add r20=r20,r22 // sum stime + add r21=r21,r18 // sum utime + ;; + st8 [r16]=r20 // update stime + st8 [r17]=r21 // update utime + ;; +#endif + mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 + mov rp=r14 // I0 set the real return addr + and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A + ;; + SSM_PSR_I(p0, p6, r22) // M2 we're on kernel stacks now, reenable irqs + cmp.eq p8,p0=r3,r0 // A +(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT + + nop.m 0 +(p8) br.call.sptk.many b6=b6 // B (ignore return address) + br.cond.spnt ia64_trace_syscall // B +END(paravirt_fsys_bubble_down) + + .rodata + .align 8 + .globl paravirt_fsyscall_table + + data8 paravirt_fsys_bubble_down +paravirt_fsyscall_table: + data8 fsys_ni_syscall + data8 0 // exit // 1025 + data8 0 // read + data8 0 // write + data8 0 // open + data8 0 // close + data8 0 // creat // 1030 + data8 0 // link + data8 0 // unlink + data8 0 // execve + data8 0 // chdir + data8 0 // fchdir // 1035 + data8 0 // utimes + data8 0 // mknod + data8 0 // chmod + data8 0 // chown + data8 0 // lseek // 1040 + data8 fsys_getpid // getpid + data8 fsys_getppid // getppid + data8 0 // mount + data8 0 // umount + data8 0 // setuid // 1045 + data8 0 // getuid + data8 0 // geteuid + data8 0 // ptrace + data8 0 // access + data8 0 // sync // 1050 + data8 0 // fsync + data8 0 // fdatasync + data8 0 // kill + data8 0 // rename + data8 0 // mkdir // 1055 + data8 0 // rmdir + data8 0 // dup + data8 0 // pipe + data8 0 // times + data8 0 // brk // 1060 + data8 0 // setgid + data8 0 // getgid + data8 0 // getegid + data8 0 // acct + data8 0 // ioctl // 1065 + data8 0 // fcntl + data8 0 // umask + data8 0 // chroot + data8 0 // ustat + data8 0 // dup2 // 1070 + data8 0 // setreuid + data8 0 // setregid + data8 0 // getresuid + data8 0 // setresuid + data8 0 // getresgid // 1075 + data8 0 // setresgid + data8 0 // getgroups + data8 0 // setgroups + data8 0 // getpgid + data8 0 // setpgid // 1080 + data8 0 // setsid + data8 0 // getsid + data8 0 // sethostname + data8 0 // setrlimit + data8 0 // getrlimit // 1085 + data8 0 // getrusage + data8 fsys_gettimeofday // gettimeofday + data8 0 // settimeofday + data8 0 // select + data8 0 // poll // 1090 + data8 0 // symlink + data8 0 // readlink + data8 0 // uselib + data8 0 // swapon + data8 0 // swapoff // 1095 + data8 0 // reboot + data8 0 // truncate + data8 0 // ftruncate + data8 0 // fchmod + data8 0 // fchown // 1100 + data8 0 // getpriority + data8 0 // setpriority + data8 0 // statfs + data8 0 // fstatfs + data8 0 // gettid // 1105 + data8 0 // semget + data8 0 // semop + data8 0 // semctl + data8 0 // msgget + data8 0 // msgsnd // 1110 + data8 0 // msgrcv + data8 0 // msgctl + data8 0 // shmget + data8 0 // shmat + data8 0 // shmdt // 1115 + data8 0 // shmctl + data8 0 // syslog + data8 0 // setitimer + data8 0 // getitimer + data8 0 // 1120 + data8 0 + data8 0 + data8 0 // vhangup + data8 0 // lchown + data8 0 // remap_file_pages // 1125 + data8 0 // wait4 + data8 0 // sysinfo + data8 0 // clone + data8 0 // setdomainname + data8 0 // newuname // 1130 + data8 0 // adjtimex + data8 0 + data8 0 // init_module + data8 0 // delete_module + data8 0 // 1135 + data8 0 + data8 0 // quotactl + data8 0 // bdflush + data8 0 // sysfs + data8 0 // personality // 1140 + data8 0 // afs_syscall + data8 0 // setfsuid + data8 0 // setfsgid + data8 0 // getdents + data8 0 // flock // 1145 + data8 0 // readv + data8 0 // writev + data8 0 // pread64 + data8 0 // pwrite64 + data8 0 // sysctl // 1150 + data8 0 // mmap + data8 0 // munmap + data8 0 // mlock + data8 0 // mlockall + data8 0 // mprotect // 1155 + data8 0 // mremap + data8 0 // msync + data8 0 // munlock + data8 0 // munlockall + data8 0 // sched_getparam // 1160 + data8 0 // sched_setparam + data8 0 // sched_getscheduler + data8 0 // sched_setscheduler + data8 0 // sched_yield + data8 0 // sched_get_priority_max // 1165 + data8 0 // sched_get_priority_min + data8 0 // sched_rr_get_interval + data8 0 // nanosleep + data8 0 // nfsservctl + data8 0 // prctl // 1170 + data8 0 // getpagesize + data8 0 // mmap2 + data8 0 // pciconfig_read + data8 0 // pciconfig_write + data8 0 // perfmonctl // 1175 + data8 0 // sigaltstack + data8 0 // rt_sigaction + data8 0 // rt_sigpending + data8 fsys_rt_sigprocmask // rt_sigprocmask + data8 0 // rt_sigqueueinfo // 1180 + data8 0 // rt_sigreturn + data8 0 // rt_sigsuspend + data8 0 // rt_sigtimedwait + data8 0 // getcwd + data8 0 // capget // 1185 + data8 0 // capset + data8 0 // sendfile + data8 0 + data8 0 + data8 0 // socket // 1190 + data8 0 // bind + data8 0 // connect + data8 0 // listen + data8 0 // accept + data8 0 // getsockname // 1195 + data8 0 // getpeername + data8 0 // socketpair + data8 0 // send + data8 0 // sendto + data8 0 // recv // 1200 + data8 0 // recvfrom + data8 0 // shutdown + data8 0 // setsockopt + data8 0 // getsockopt + data8 0 // sendmsg // 1205 + data8 0 // recvmsg + data8 0 // pivot_root + data8 0 // mincore + data8 0 // madvise + data8 0 // newstat // 1210 + data8 0 // newlstat + data8 0 // newfstat + data8 0 // clone2 + data8 0 // getdents64 + data8 0 // getunwind // 1215 + data8 0 // readahead + data8 0 // setxattr + data8 0 // lsetxattr + data8 0 // fsetxattr + data8 0 // getxattr // 1220 + data8 0 // lgetxattr + data8 0 // fgetxattr + data8 0 // listxattr + data8 0 // llistxattr + data8 0 // flistxattr // 1225 + data8 0 // removexattr + data8 0 // lremovexattr + data8 0 // fremovexattr + data8 0 // tkill + data8 0 // futex // 1230 + data8 0 // sched_setaffinity + data8 0 // sched_getaffinity + data8 fsys_set_tid_address // set_tid_address + data8 0 // fadvise64_64 + data8 0 // tgkill // 1235 + data8 0 // exit_group + data8 0 // lookup_dcookie + data8 0 // io_setup + data8 0 // io_destroy + data8 0 // io_getevents // 1240 + data8 0 // io_submit + data8 0 // io_cancel + data8 0 // epoll_create + data8 0 // epoll_ctl + data8 0 // epoll_wait // 1245 + data8 0 // restart_syscall + data8 0 // semtimedop + data8 0 // timer_create + data8 0 // timer_settime + data8 0 // timer_gettime // 1250 + data8 0 // timer_getoverrun + data8 0 // timer_delete + data8 0 // clock_settime + data8 fsys_clock_gettime // clock_gettime + data8 0 // clock_getres // 1255 + data8 0 // clock_nanosleep + data8 0 // fstatfs64 + data8 0 // statfs64 + data8 0 // mbind + data8 0 // get_mempolicy // 1260 + data8 0 // set_mempolicy + data8 0 // mq_open + data8 0 // mq_unlink + data8 0 // mq_timedsend + data8 0 // mq_timedreceive // 1265 + data8 0 // mq_notify + data8 0 // mq_getsetattr + data8 0 // kexec_load + data8 0 // vserver + data8 0 // waitid // 1270 + data8 0 // add_key + data8 0 // request_key + data8 0 // keyctl + data8 0 // ioprio_set + data8 0 // ioprio_get // 1275 + data8 0 // move_pages + data8 0 // inotify_init + data8 0 // inotify_add_watch + data8 0 // inotify_rm_watch + data8 0 // migrate_pages // 1280 + data8 0 // openat + data8 0 // mkdirat + data8 0 // mknodat + data8 0 // fchownat + data8 0 // futimesat // 1285 + data8 0 // newfstatat + data8 0 // unlinkat + data8 0 // renameat + data8 0 // linkat + data8 0 // symlinkat // 1290 + data8 0 // readlinkat + data8 0 // fchmodat + data8 0 // faccessat + data8 0 + data8 0 // 1295 + data8 0 // unshare + data8 0 // splice + data8 0 // set_robust_list + data8 0 // get_robust_list + data8 0 // sync_file_range // 1300 + data8 0 // tee + data8 0 // vmsplice + data8 0 + data8 fsys_getcpu // getcpu // 1304 + + // fill in zeros for the remaining entries + .zero: + .space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0 diff --git a/arch/ia64/kernel/fsyscall_gtod_data.h b/arch/ia64/kernel/fsyscall_gtod_data.h new file mode 100644 index 00000000000..57d2ee6c83e --- /dev/null +++ b/arch/ia64/kernel/fsyscall_gtod_data.h @@ -0,0 +1,23 @@ +/* + * (c) Copyright 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Peter Keilty <peter.keilty@hp.com> + * + * fsyscall gettimeofday data + */ + +struct fsyscall_gtod_data_t { + seqlock_t lock; + struct timespec wall_time; + struct timespec monotonic_time; + cycle_t clk_mask; + u32 clk_mult; + u32 clk_shift; + void *clk_fsys_mmio; + cycle_t clk_cycle_last; +} ____cacheline_aligned; + +struct itc_jitter_data_t { + int itc_jitter; + cycle_t itc_lastcycle; +} ____cacheline_aligned; + diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c new file mode 100644 index 00000000000..7fc8c961b1f --- /dev/null +++ b/arch/ia64/kernel/ftrace.c @@ -0,0 +1,206 @@ +/* + * Dynamic function tracing support. + * + * Copyright (C) 2008 Shaohua Li <shaohua.li@intel.com> + * + * For licencing details, see COPYING. + * + * Defines low-level handling of mcount calls when the kernel + * is compiled with the -pg flag. When using dynamic ftrace, the + * mcount call-sites get patched lazily with NOP till they are + * enabled. All code mutation routines here take effect atomically. + */ + +#include <linux/uaccess.h> +#include <linux/ftrace.h> + +#include <asm/cacheflush.h> +#include <asm/patch.h> + +/* In IA64, each function will be added below two bundles with -pg option */ +static unsigned char __attribute__((aligned(8))) +ftrace_orig_code[MCOUNT_INSN_SIZE] = { + 0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */ + 0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */ + 0x05, 0x00, 0xc4, 0x00, /* mov r42=b0 */ + 0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */ + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */ + 0x08, 0x00, 0x00, 0x50 /* br.call.sptk.many b0 = _mcount;; */ +}; + +struct ftrace_orig_insn { + u64 dummy1, dummy2, dummy3; + u64 dummy4:64-41+13; + u64 imm20:20; + u64 dummy5:3; + u64 sign:1; + u64 dummy6:4; +}; + +/* mcount stub will be converted below for nop */ +static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */ + 0x00, 0x00, 0x04, 0x00 +}; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop_code; +} + +/* + * mcount stub will be converted below for call + * Note: Just the last instruction is changed against nop + * */ +static unsigned char __attribute__((aligned(8))) +ftrace_call_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/ + 0xf8, 0xff, 0xff, 0xc8 +}; + +struct ftrace_call_insn { + u64 dummy1, dummy2; + u64 dummy3:48; + u64 imm39_l:16; + u64 imm39_h:23; + u64 dummy4:13; + u64 imm20:20; + u64 dummy5:3; + u64 i:1; + u64 dummy6:4; +}; + +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + struct ftrace_call_insn *code = (void *)ftrace_call_code; + unsigned long offset = addr - (ip + 0x10); + + code->imm39_l = offset >> 24; + code->imm39_h = offset >> 40; + code->imm20 = offset >> 4; + code->i = offset >> 63; + return ftrace_call_code; +} + +static int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code, int do_check) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. We do this by using the + * probe_kernel_* functions. + * + * No real locking needed, this code is run through + * kstop_machine, or before SMP starts. + */ + + if (!do_check) + goto skip_check; + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + +skip_check: + /* replace the text with the new text */ + if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + flush_icache_range(ip, ip + MCOUNT_INSN_SIZE); + + return 0; +} + +static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE]; + unsigned long ip = rec->ip; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + if (rec->flags & FTRACE_FL_CONVERTED) { + struct ftrace_call_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_call_code; + tmp_call = (void *)replaced; + call_insn->imm39_l = tmp_call->imm39_l; + call_insn->imm39_h = tmp_call->imm39_h; + call_insn->imm20 = tmp_call->imm20; + call_insn->i = tmp_call->i; + if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } else { + struct ftrace_orig_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_orig_code; + tmp_call = (void *)replaced; + call_insn->sign = tmp_call->sign; + call_insn->imm20 = tmp_call->imm20; + if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + int ret; + char *new; + + ret = ftrace_make_nop_check(rec, addr); + if (ret) + return ret; + new = ftrace_nop_replace(); + return ftrace_modify_code(rec->ip, NULL, new, 0); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned char *old, *new; + + old= ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + return ftrace_modify_code(ip, old, new, 1); +} + +/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */ +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip; + unsigned long addr = ((struct fnptr *)ftrace_call)->ip; + + if (func == ftrace_stub) + return 0; + ip = ((struct fnptr *)func)->ip; + + ia64_patch_imm64(addr + 2, ip); + + flush_icache_range(addr, addr + 16); + return 0; +} + +/* run from kstop_machine */ +int __init ftrace_dyn_arch_init(void *data) +{ + *(unsigned long *)data = 0; + + return 0; +} diff --git a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S new file mode 100644 index 00000000000..b3ef1c72e13 --- /dev/null +++ b/arch/ia64/kernel/gate-data.S @@ -0,0 +1,3 @@ + .section .data..gate, "aw" + + .incbin "arch/ia64/kernel/gate.so" diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S new file mode 100644 index 00000000000..245d3e1ec7e --- /dev/null +++ b/arch/ia64/kernel/gate.S @@ -0,0 +1,385 @@ +/* + * This file contains the code that gets mapped at the upper end of each task's text + * region. For now, it contains the signal trampoline code only. + * + * Copyright (C) 1999-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ + + +#include <asm/asmmacro.h> +#include <asm/errno.h> +#include <asm/asm-offsets.h> +#include <asm/sigcontext.h> +#include <asm/system.h> +#include <asm/unistd.h> +#include "paravirt_inst.h" + +/* + * We can't easily refer to symbols inside the kernel. To avoid full runtime relocation, + * complications with the linker (which likes to create PLT stubs for branches + * to targets outside the shared object) and to avoid multi-phase kernel builds, we + * simply create minimalistic "patch lists" in special ELF sections. + */ + .section ".data..patch.fsyscall_table", "a" + .previous +#define LOAD_FSYSCALL_TABLE(reg) \ +[1:] movl reg=0; \ + .xdata4 ".data..patch.fsyscall_table", 1b-. + + .section ".data..patch.brl_fsys_bubble_down", "a" + .previous +#define BRL_COND_FSYS_BUBBLE_DOWN(pr) \ +[1:](pr)brl.cond.sptk 0; \ + ;; \ + .xdata4 ".data..patch.brl_fsys_bubble_down", 1b-. + +GLOBAL_ENTRY(__kernel_syscall_via_break) + .prologue + .altrp b6 + .body + /* + * Note: for (fast) syscall restart to work, the break instruction must be + * the first one in the bundle addressed by syscall_via_break. + */ +{ .mib + break 0x100000 + nop.i 0 + br.ret.sptk.many b6 +} +END(__kernel_syscall_via_break) + +# define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) +# define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) +# define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) +# define SIGHANDLER_OFF (16 + IA64_SIGFRAME_HANDLER_OFFSET) +# define SIGCONTEXT_OFF (16 + IA64_SIGFRAME_SIGCONTEXT_OFFSET) + +# define FLAGS_OFF IA64_SIGCONTEXT_FLAGS_OFFSET +# define CFM_OFF IA64_SIGCONTEXT_CFM_OFFSET +# define FR6_OFF IA64_SIGCONTEXT_FR6_OFFSET +# define BSP_OFF IA64_SIGCONTEXT_AR_BSP_OFFSET +# define RNAT_OFF IA64_SIGCONTEXT_AR_RNAT_OFFSET +# define UNAT_OFF IA64_SIGCONTEXT_AR_UNAT_OFFSET +# define FPSR_OFF IA64_SIGCONTEXT_AR_FPSR_OFFSET +# define PR_OFF IA64_SIGCONTEXT_PR_OFFSET +# define RP_OFF IA64_SIGCONTEXT_IP_OFFSET +# define SP_OFF IA64_SIGCONTEXT_R12_OFFSET +# define RBS_BASE_OFF IA64_SIGCONTEXT_RBS_BASE_OFFSET +# define LOADRS_OFF IA64_SIGCONTEXT_LOADRS_OFFSET +# define base0 r2 +# define base1 r3 + /* + * When we get here, the memory stack looks like this: + * + * +===============================+ + * | | + * // struct sigframe // + * | | + * +-------------------------------+ <-- sp+16 + * | 16 byte of scratch | + * | space | + * +-------------------------------+ <-- sp + * + * The register stack looks _exactly_ the way it looked at the time the signal + * occurred. In other words, we're treading on a potential mine-field: each + * incoming general register may be a NaT value (including sp, in which case the + * process ends up dying with a SIGSEGV). + * + * The first thing need to do is a cover to get the registers onto the backing + * store. Once that is done, we invoke the signal handler which may modify some + * of the machine state. After returning from the signal handler, we return + * control to the previous context by executing a sigreturn system call. A signal + * handler may call the rt_sigreturn() function to directly return to a given + * sigcontext. However, the user-level sigreturn() needs to do much more than + * calling the rt_sigreturn() system call as it needs to unwind the stack to + * restore preserved registers that may have been saved on the signal handler's + * call stack. + */ + +#define SIGTRAMP_SAVES \ + .unwabi 3, 's'; /* mark this as a sigtramp handler (saves scratch regs) */ \ + .unwabi @svr4, 's'; /* backwards compatibility with old unwinders (remove in v2.7) */ \ + .savesp ar.unat, UNAT_OFF+SIGCONTEXT_OFF; \ + .savesp ar.fpsr, FPSR_OFF+SIGCONTEXT_OFF; \ + .savesp pr, PR_OFF+SIGCONTEXT_OFF; \ + .savesp rp, RP_OFF+SIGCONTEXT_OFF; \ + .savesp ar.pfs, CFM_OFF+SIGCONTEXT_OFF; \ + .vframesp SP_OFF+SIGCONTEXT_OFF + +GLOBAL_ENTRY(__kernel_sigtramp) + // describe the state that is active when we get here: + .prologue + SIGTRAMP_SAVES + .body + + .label_state 1 + + adds base0=SIGHANDLER_OFF,sp + adds base1=RBS_BASE_OFF+SIGCONTEXT_OFF,sp + br.call.sptk.many rp=1f +1: + ld8 r17=[base0],(ARG0_OFF-SIGHANDLER_OFF) // get pointer to signal handler's plabel + ld8 r15=[base1] // get address of new RBS base (or NULL) + cover // push args in interrupted frame onto backing store + ;; + cmp.ne p1,p0=r15,r0 // do we need to switch rbs? (note: pr is saved by kernel) + mov.m r9=ar.bsp // fetch ar.bsp + .spillsp.p p1, ar.rnat, RNAT_OFF+SIGCONTEXT_OFF +(p1) br.cond.spnt setup_rbs // yup -> (clobbers p8, r14-r16, and r18-r20) +back_from_setup_rbs: + alloc r8=ar.pfs,0,0,3,0 + ld8 out0=[base0],16 // load arg0 (signum) + adds base1=(ARG1_OFF-(RBS_BASE_OFF+SIGCONTEXT_OFF)),base1 + ;; + ld8 out1=[base1] // load arg1 (siginfop) + ld8 r10=[r17],8 // get signal handler entry point + ;; + ld8 out2=[base0] // load arg2 (sigcontextp) + ld8 gp=[r17] // get signal handler's global pointer + adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp + ;; + .spillsp ar.bsp, BSP_OFF+SIGCONTEXT_OFF + st8 [base0]=r9 // save sc_ar_bsp + adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp + adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp + ;; + stf.spill [base0]=f6,32 + stf.spill [base1]=f7,32 + ;; + stf.spill [base0]=f8,32 + stf.spill [base1]=f9,32 + mov b6=r10 + ;; + stf.spill [base0]=f10,32 + stf.spill [base1]=f11,32 + ;; + stf.spill [base0]=f12,32 + stf.spill [base1]=f13,32 + ;; + stf.spill [base0]=f14,32 + stf.spill [base1]=f15,32 + br.call.sptk.many rp=b6 // call the signal handler +.ret0: adds base0=(BSP_OFF+SIGCONTEXT_OFF),sp + ;; + ld8 r15=[base0] // fetch sc_ar_bsp + mov r14=ar.bsp + ;; + cmp.ne p1,p0=r14,r15 // do we need to restore the rbs? +(p1) br.cond.spnt restore_rbs // yup -> (clobbers r14-r18, f6 & f7) + ;; +back_from_restore_rbs: + adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp + adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp + ;; + ldf.fill f6=[base0],32 + ldf.fill f7=[base1],32 + ;; + ldf.fill f8=[base0],32 + ldf.fill f9=[base1],32 + ;; + ldf.fill f10=[base0],32 + ldf.fill f11=[base1],32 + ;; + ldf.fill f12=[base0],32 + ldf.fill f13=[base1],32 + ;; + ldf.fill f14=[base0],32 + ldf.fill f15=[base1],32 + mov r15=__NR_rt_sigreturn + .restore sp // pop .prologue + break __BREAK_SYSCALL + + .prologue + SIGTRAMP_SAVES +setup_rbs: + mov ar.rsc=0 // put RSE into enforced lazy mode + ;; + .save ar.rnat, r19 + mov r19=ar.rnat // save RNaT before switching backing store area + adds r14=(RNAT_OFF+SIGCONTEXT_OFF),sp + + mov r18=ar.bspstore + mov ar.bspstore=r15 // switch over to new register backing store area + ;; + + .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF + st8 [r14]=r19 // save sc_ar_rnat + .body + mov.m r16=ar.bsp // sc_loadrs <- (new bsp - new bspstore) << 16 + adds r14=(LOADRS_OFF+SIGCONTEXT_OFF),sp + ;; + invala + sub r15=r16,r15 + extr.u r20=r18,3,6 + ;; + mov ar.rsc=0xf // set RSE into eager mode, pl 3 + cmp.eq p8,p0=63,r20 + shl r15=r15,16 + ;; + st8 [r14]=r15 // save sc_loadrs +(p8) st8 [r18]=r19 // if bspstore points at RNaT slot, store RNaT there now + .restore sp // pop .prologue + br.cond.sptk back_from_setup_rbs + + .prologue + SIGTRAMP_SAVES + .spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF + .body +restore_rbs: + // On input: + // r14 = bsp1 (bsp at the time of return from signal handler) + // r15 = bsp0 (bsp at the time the signal occurred) + // + // Here, we need to calculate bspstore0, the value that ar.bspstore needs + // to be set to, based on bsp0 and the size of the dirty partition on + // the alternate stack (sc_loadrs >> 16). This can be done with the + // following algorithm: + // + // bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1)); + // + // This is what the code below does. + // + alloc r2=ar.pfs,0,0,0,0 // alloc null frame + adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp + adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp + ;; + ld8 r17=[r16] + ld8 r16=[r18] // get new rnat + extr.u r18=r15,3,6 // r18 <- rse_slot_num(bsp0) + ;; + mov ar.rsc=r17 // put RSE into enforced lazy mode + shr.u r17=r17,16 + ;; + sub r14=r14,r17 // r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16) + shr.u r17=r17,3 // r17 <- (sc_loadrs >> 19) + ;; + loadrs // restore dirty partition + extr.u r14=r14,3,6 // r14 <- rse_slot_num(bspstore1) + ;; + add r14=r14,r17 // r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19) + ;; + shr.u r14=r14,6 // r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40 + ;; + sub r14=r14,r17 // r14 <- -rse_num_regs(bspstore1, bsp1) + movl r17=0x8208208208208209 + ;; + add r18=r18,r14 // r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1) + setf.sig f7=r17 + cmp.lt p7,p0=r14,r0 // p7 <- (r14 < 0)? + ;; +(p7) adds r18=-62,r18 // delta -= 62 + ;; + setf.sig f6=r18 + ;; + xmpy.h f6=f6,f7 + ;; + getf.sig r17=f6 + ;; + add r17=r17,r18 + shr r18=r18,63 + ;; + shr r17=r17,5 + ;; + sub r17=r17,r18 // r17 = delta/63 + ;; + add r17=r14,r17 // r17 <- delta/63 - rse_num_regs(bspstore1, bsp1) + ;; + shladd r15=r17,3,r15 // r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1)) + ;; + mov ar.bspstore=r15 // switch back to old register backing store area + ;; + mov ar.rnat=r16 // restore RNaT + mov ar.rsc=0xf // (will be restored later on from sc_ar_rsc) + // invala not necessary as that will happen when returning to user-mode + br.cond.sptk back_from_restore_rbs +END(__kernel_sigtramp) + +/* + * On entry: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * b6 = return address + * On exit: + * r11 = saved ar.pfs + * r15 = system call # + * b0 = saved return address + * all other "scratch" registers: undefined + * all "preserved" registers: same as on entry + */ + +GLOBAL_ENTRY(__kernel_syscall_via_epc) + .prologue + .altrp b6 + .body +{ + /* + * Note: the kernel cannot assume that the first two instructions in this + * bundle get executed. The remaining code must be safe even if + * they do not get executed. + */ + adds r17=-1024,r15 // A + mov r10=0 // A default to successful syscall execution + epc // B causes split-issue +} + ;; + RSM_PSR_BE_I(r20, r22) // M2 (5 cyc to srlz.d) + LOAD_FSYSCALL_TABLE(r14) // X + ;; + mov r16=IA64_KR(CURRENT) // M2 (12 cyc) + shladd r18=r17,3,r14 // A + mov r19=NR_syscalls-1 // A + ;; + lfetch [r18] // M0|1 + MOV_FROM_PSR(p0, r29, r8) // M2 (12 cyc) + // If r17 is a NaT, p6 will be zero + cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? + ;; + mov r21=ar.fpsr // M2 (12 cyc) + tnat.nz p10,p9=r15 // I0 + mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) + ;; + srlz.d // M0 (forces split-issue) ensure PSR.BE==0 +(p6) ld8 r18=[r18] // M0|1 + nop.i 0 + ;; + nop.m 0 +(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) + nop.i 0 + ;; + SSM_PSR_I(p8, p14, r25) +(p6) mov b7=r18 // I0 +(p8) br.dptk.many b7 // B + + mov r27=ar.rsc // M2 (12 cyc) +/* + * brl.cond doesn't work as intended because the linker would convert this branch + * into a branch to a PLT. Perhaps there will be a way to avoid this with some + * future version of the linker. In the meantime, we just use an indirect branch + * instead. + */ +#ifdef CONFIG_ITANIUM +(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; +(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down + ;; +(p6) mov b7=r14 +(p6) br.sptk.many b7 +#else + BRL_COND_FSYS_BUBBLE_DOWN(p6) +#endif + SSM_PSR_I(p0, p14, r10) + mov r10=-1 +(p10) mov r8=EINVAL +(p9) mov r8=ENOSYS + FSYS_RETURN + +#ifdef CONFIG_PARAVIRT + /* + * padd to make the size of this symbol constant + * independent of paravirtualization. + */ + .align PAGE_SIZE / 8 +#endif +END(__kernel_syscall_via_epc) diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S new file mode 100644 index 00000000000..d32b0855110 --- /dev/null +++ b/arch/ia64/kernel/gate.lds.S @@ -0,0 +1,109 @@ +/* + * Linker script for gate DSO. The gate pages are an ELF shared object + * prelinked to its virtual address, with only one read-only segment and + * one execute-only segment (both fit in one page). This script controls + * its layout. + */ + + +#include <asm/system.h> +#include "paravirt_patchlist.h" + +SECTIONS +{ + . = GATE_ADDR + SIZEOF_HEADERS; + + .hash : { *(.hash) } :readable + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note*) } :readable :note + + .dynamic : { *(.dynamic) } :readable :dynamic + + /* + * This linker script is used both with -r and with -shared. For + * the layouts to match, we need to skip more than enough space for + * the dynamic symbol table et al. If this amount is insufficient, + * ld -shared will barf. Just increase it here. + */ + . = GATE_ADDR + 0x600; + + .data..patch : { + __paravirt_start_gate_mckinley_e9_patchlist = .; + *(.data..patch.mckinley_e9) + __paravirt_end_gate_mckinley_e9_patchlist = .; + + __paravirt_start_gate_vtop_patchlist = .; + *(.data..patch.vtop) + __paravirt_end_gate_vtop_patchlist = .; + + __paravirt_start_gate_fsyscall_patchlist = .; + *(.data..patch.fsyscall_table) + __paravirt_end_gate_fsyscall_patchlist = .; + + __paravirt_start_gate_brl_fsys_bubble_down_patchlist = .; + *(.data..patch.brl_fsys_bubble_down) + __paravirt_end_gate_brl_fsys_bubble_down_patchlist = .; + } :readable + + .IA_64.unwind_info : { *(.IA_64.unwind_info*) } + .IA_64.unwind : { *(.IA_64.unwind*) } :readable :unwind +#ifdef HAVE_BUGGY_SEGREL + .text (GATE_ADDR + PAGE_SIZE) : { *(.text) *(.text.*) } :readable +#else + . = ALIGN(PERCPU_PAGE_SIZE) + (. & (PERCPU_PAGE_SIZE - 1)); + .text : { *(.text) *(.text.*) } :epc +#endif + + /DISCARD/ : { + *(.got.plt) *(.got) + *(.data .data.* .gnu.linkonce.d.*) + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(__ex_table) + *(__mca_table) + } +} + +/* + * ld does not recognize this name token; use the constant. + */ +#define PT_IA_64_UNWIND 0x70000001 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + readable PT_LOAD FILEHDR PHDRS FLAGS(4); /* PF_R */ +#ifndef HAVE_BUGGY_SEGREL + epc PT_LOAD FILEHDR PHDRS FLAGS(1); /* PF_X */ +#endif + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + unwind PT_IA_64_UNWIND; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_syscall_via_break; + __kernel_syscall_via_epc; + __kernel_sigtramp; + + local: *; + }; +} + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_syscall_via_epc) diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S new file mode 100644 index 00000000000..17a9fba3893 --- /dev/null +++ b/arch/ia64/kernel/head.S @@ -0,0 +1,1229 @@ +/* + * Here is where the ball gets rolling as far as the kernel is concerned. + * When control is transferred to _start, the bootload has already + * loaded us to the correct address. All that's left to do here is + * to set up the kernel's global pointer and jump to the kernel + * entry point. + * + * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999 Intel Corp. + * Copyright (C) 1999 Asit Mallick <Asit.K.Mallick@intel.com> + * Copyright (C) 1999 Don Dugger <Don.Dugger@intel.com> + * Copyright (C) 2002 Fenghua Yu <fenghua.yu@intel.com> + * -Optimize __ia64_save_fpu() and __ia64_load_fpu() for Itanium 2. + * Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com> + * Support for CPU Hotplug + */ + + +#include <asm/asmmacro.h> +#include <asm/fpu.h> +#include <asm/kregs.h> +#include <asm/mmu_context.h> +#include <asm/asm-offsets.h> +#include <asm/pal.h> +#include <asm/paravirt.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/system.h> +#include <asm/mca_asm.h> +#include <linux/init.h> +#include <linux/linkage.h> + +#ifdef CONFIG_HOTPLUG_CPU +#define SAL_PSR_BITS_TO_SET \ + (IA64_PSR_AC | IA64_PSR_BN | IA64_PSR_MFH | IA64_PSR_MFL) + +#define SAVE_FROM_REG(src, ptr, dest) \ + mov dest=src;; \ + st8 [ptr]=dest,0x08 + +#define RESTORE_REG(reg, ptr, _tmp) \ + ld8 _tmp=[ptr],0x08;; \ + mov reg=_tmp + +#define SAVE_BREAK_REGS(ptr, _idx, _breg, _dest)\ + mov ar.lc=IA64_NUM_DBG_REGS-1;; \ + mov _idx=0;; \ +1: \ + SAVE_FROM_REG(_breg[_idx], ptr, _dest);; \ + add _idx=1,_idx;; \ + br.cloop.sptk.many 1b + +#define RESTORE_BREAK_REGS(ptr, _idx, _breg, _tmp, _lbl)\ + mov ar.lc=IA64_NUM_DBG_REGS-1;; \ + mov _idx=0;; \ +_lbl: RESTORE_REG(_breg[_idx], ptr, _tmp);; \ + add _idx=1, _idx;; \ + br.cloop.sptk.many _lbl + +#define SAVE_ONE_RR(num, _reg, _tmp) \ + movl _tmp=(num<<61);; \ + mov _reg=rr[_tmp] + +#define SAVE_REGION_REGS(_tmp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) \ + SAVE_ONE_RR(0,_r0, _tmp);; \ + SAVE_ONE_RR(1,_r1, _tmp);; \ + SAVE_ONE_RR(2,_r2, _tmp);; \ + SAVE_ONE_RR(3,_r3, _tmp);; \ + SAVE_ONE_RR(4,_r4, _tmp);; \ + SAVE_ONE_RR(5,_r5, _tmp);; \ + SAVE_ONE_RR(6,_r6, _tmp);; \ + SAVE_ONE_RR(7,_r7, _tmp);; + +#define STORE_REGION_REGS(ptr, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) \ + st8 [ptr]=_r0, 8;; \ + st8 [ptr]=_r1, 8;; \ + st8 [ptr]=_r2, 8;; \ + st8 [ptr]=_r3, 8;; \ + st8 [ptr]=_r4, 8;; \ + st8 [ptr]=_r5, 8;; \ + st8 [ptr]=_r6, 8;; \ + st8 [ptr]=_r7, 8;; + +#define RESTORE_REGION_REGS(ptr, _idx1, _idx2, _tmp) \ + mov ar.lc=0x08-1;; \ + movl _idx1=0x00;; \ +RestRR: \ + dep.z _idx2=_idx1,61,3;; \ + ld8 _tmp=[ptr],8;; \ + mov rr[_idx2]=_tmp;; \ + srlz.d;; \ + add _idx1=1,_idx1;; \ + br.cloop.sptk.few RestRR + +#define SET_AREA_FOR_BOOTING_CPU(reg1, reg2) \ + movl reg1=sal_state_for_booting_cpu;; \ + ld8 reg2=[reg1];; + +/* + * Adjust region registers saved before starting to save + * break regs and rest of the states that need to be preserved. + */ +#define SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(_reg1,_reg2,_pred) \ + SAVE_FROM_REG(b0,_reg1,_reg2);; \ + SAVE_FROM_REG(b1,_reg1,_reg2);; \ + SAVE_FROM_REG(b2,_reg1,_reg2);; \ + SAVE_FROM_REG(b3,_reg1,_reg2);; \ + SAVE_FROM_REG(b4,_reg1,_reg2);; \ + SAVE_FROM_REG(b5,_reg1,_reg2);; \ + st8 [_reg1]=r1,0x08;; \ + st8 [_reg1]=r12,0x08;; \ + st8 [_reg1]=r13,0x08;; \ + SAVE_FROM_REG(ar.fpsr,_reg1,_reg2);; \ + SAVE_FROM_REG(ar.pfs,_reg1,_reg2);; \ + SAVE_FROM_REG(ar.rnat,_reg1,_reg2);; \ + SAVE_FROM_REG(ar.unat,_reg1,_reg2);; \ + SAVE_FROM_REG(ar.bspstore,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.dcr,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.iva,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.pta,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.itv,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.pmv,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.cmcv,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.lrr0,_reg1,_reg2);; \ + SAVE_FROM_REG(cr.lrr1,_reg1,_reg2);; \ + st8 [_reg1]=r4,0x08;; \ + st8 [_reg1]=r5,0x08;; \ + st8 [_reg1]=r6,0x08;; \ + st8 [_reg1]=r7,0x08;; \ + st8 [_reg1]=_pred,0x08;; \ + SAVE_FROM_REG(ar.lc, _reg1, _reg2);; \ + stf.spill.nta [_reg1]=f2,16;; \ + stf.spill.nta [_reg1]=f3,16;; \ + stf.spill.nta [_reg1]=f4,16;; \ + stf.spill.nta [_reg1]=f5,16;; \ + stf.spill.nta [_reg1]=f16,16;; \ + stf.spill.nta [_reg1]=f17,16;; \ + stf.spill.nta [_reg1]=f18,16;; \ + stf.spill.nta [_reg1]=f19,16;; \ + stf.spill.nta [_reg1]=f20,16;; \ + stf.spill.nta [_reg1]=f21,16;; \ + stf.spill.nta [_reg1]=f22,16;; \ + stf.spill.nta [_reg1]=f23,16;; \ + stf.spill.nta [_reg1]=f24,16;; \ + stf.spill.nta [_reg1]=f25,16;; \ + stf.spill.nta [_reg1]=f26,16;; \ + stf.spill.nta [_reg1]=f27,16;; \ + stf.spill.nta [_reg1]=f28,16;; \ + stf.spill.nta [_reg1]=f29,16;; \ + stf.spill.nta [_reg1]=f30,16;; \ + stf.spill.nta [_reg1]=f31,16;; + +#else +#define SET_AREA_FOR_BOOTING_CPU(a1, a2) +#define SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(a1,a2, a3) +#define SAVE_REGION_REGS(_tmp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) +#define STORE_REGION_REGS(ptr, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7) +#endif + +#define SET_ONE_RR(num, pgsize, _tmp1, _tmp2, vhpt) \ + movl _tmp1=(num << 61);; \ + mov _tmp2=((ia64_rid(IA64_REGION_ID_KERNEL, (num<<61)) << 8) | (pgsize << 2) | vhpt);; \ + mov rr[_tmp1]=_tmp2 + + __PAGE_ALIGNED_DATA + + .global empty_zero_page +empty_zero_page: + .skip PAGE_SIZE + + .global swapper_pg_dir +swapper_pg_dir: + .skip PAGE_SIZE + + .rodata +halt_msg: + stringz "Halting kernel\n" + + __REF + + .global start_ap + + /* + * Start the kernel. When the bootloader passes control to _start(), r28 + * points to the address of the boot parameter area. Execution reaches + * here in physical mode. + */ +GLOBAL_ENTRY(_start) +start_ap: + .prologue + .save rp, r0 // terminate unwind chain with a NULL rp + .body + + rsm psr.i | psr.ic + ;; + srlz.i + ;; + { + flushrs // must be first insn in group + srlz.i + } + ;; + /* + * Save the region registers, predicate before they get clobbered + */ + SAVE_REGION_REGS(r2, r8,r9,r10,r11,r12,r13,r14,r15); + mov r25=pr;; + + /* + * Initialize kernel region registers: + * rr[0]: VHPT enabled, page size = PAGE_SHIFT + * rr[1]: VHPT enabled, page size = PAGE_SHIFT + * rr[2]: VHPT enabled, page size = PAGE_SHIFT + * rr[3]: VHPT enabled, page size = PAGE_SHIFT + * rr[4]: VHPT enabled, page size = PAGE_SHIFT + * rr[5]: VHPT enabled, page size = PAGE_SHIFT + * rr[6]: VHPT disabled, page size = IA64_GRANULE_SHIFT + * rr[7]: VHPT disabled, page size = IA64_GRANULE_SHIFT + * We initialize all of them to prevent inadvertently assuming + * something about the state of address translation early in boot. + */ + SET_ONE_RR(0, PAGE_SHIFT, r2, r16, 1);; + SET_ONE_RR(1, PAGE_SHIFT, r2, r16, 1);; + SET_ONE_RR(2, PAGE_SHIFT, r2, r16, 1);; + SET_ONE_RR(3, PAGE_SHIFT, r2, r16, 1);; + SET_ONE_RR(4, PAGE_SHIFT, r2, r16, 1);; + SET_ONE_RR(5, PAGE_SHIFT, r2, r16, 1);; + SET_ONE_RR(6, IA64_GRANULE_SHIFT, r2, r16, 0);; + SET_ONE_RR(7, IA64_GRANULE_SHIFT, r2, r16, 0);; + /* + * Now pin mappings into the TLB for kernel text and data + */ + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + movl r17=KERNEL_START + ;; + mov cr.itir=r18 + mov cr.ifa=r17 + mov r16=IA64_TR_KERNEL + mov r3=ip + movl r18=PAGE_KERNEL + ;; + dep r2=0,r3,0,KERNEL_TR_PAGE_SHIFT + ;; + or r18=r2,r18 + ;; + srlz.i + ;; + itr.i itr[r16]=r18 + ;; + itr.d dtr[r16]=r18 + ;; + srlz.i + + /* + * Switch into virtual mode: + */ + movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \ + |IA64_PSR_DI|IA64_PSR_AC) + ;; + mov cr.ipsr=r16 + movl r17=1f + ;; + mov cr.iip=r17 + mov cr.ifs=r0 + ;; + rfi + ;; +1: // now we are in virtual mode + + SET_AREA_FOR_BOOTING_CPU(r2, r16); + + STORE_REGION_REGS(r16, r8,r9,r10,r11,r12,r13,r14,r15); + SAL_TO_OS_BOOT_HANDOFF_STATE_SAVE(r16,r17,r25) + ;; + + // set IVT entry point---can't access I/O ports without it + movl r3=ia64_ivt + ;; + mov cr.iva=r3 + movl r2=FPSR_DEFAULT + ;; + srlz.i + movl gp=__gp + + mov ar.fpsr=r2 + ;; + +#define isAP p2 // are we an Application Processor? +#define isBP p3 // are we the Bootstrap Processor? + +#ifdef CONFIG_SMP + /* + * Find the init_task for the currently booting CPU. At poweron, and in + * UP mode, task_for_booting_cpu is NULL. + */ + movl r3=task_for_booting_cpu + ;; + ld8 r3=[r3] + movl r2=init_task + ;; + cmp.eq isBP,isAP=r3,r0 + ;; +(isAP) mov r2=r3 +#else + movl r2=init_task + cmp.eq isBP,isAP=r0,r0 +#endif + ;; + tpa r3=r2 // r3 == phys addr of task struct + mov r16=-1 +(isBP) br.cond.dpnt .load_current // BP stack is on region 5 --- no need to map it + + // load mapping for stack (virtaddr in r2, physaddr in r3) + rsm psr.ic + movl r17=PAGE_KERNEL + ;; + srlz.d + dep r18=0,r3,0,12 + ;; + or r18=r17,r18 + dep r2=-1,r3,61,3 // IMVA of task + ;; + mov r17=rr[r2] + shr.u r16=r3,IA64_GRANULE_SHIFT + ;; + dep r17=0,r17,8,24 + ;; + mov cr.itir=r17 + mov cr.ifa=r2 + + mov r19=IA64_TR_CURRENT_STACK + ;; + itr.d dtr[r19]=r18 + ;; + ssm psr.ic + srlz.d + ;; + +.load_current: + // load the "current" pointer (r13) and ar.k6 with the current task + mov IA64_KR(CURRENT)=r2 // virtual address + mov IA64_KR(CURRENT_STACK)=r16 + mov r13=r2 + /* + * Reserve space at the top of the stack for "struct pt_regs". Kernel + * threads don't store interesting values in that structure, but the space + * still needs to be there because time-critical stuff such as the context + * switching can be implemented more efficiently (for example, __switch_to() + * always sets the psr.dfh bit of the task it is switching to). + */ + + addl r12=IA64_STK_OFFSET-IA64_PT_REGS_SIZE-16,r2 + addl r2=IA64_RBS_OFFSET,r2 // initialize the RSE + mov ar.rsc=0 // place RSE in enforced lazy mode + ;; + loadrs // clear the dirty partition + movl r19=__phys_per_cpu_start + mov r18=PERCPU_PAGE_SIZE + ;; +#ifndef CONFIG_SMP + add r19=r19,r18 + ;; +#else +(isAP) br.few 2f + movl r20=__cpu0_per_cpu + ;; + shr.u r18=r18,3 +1: + ld8 r21=[r19],8;; + st8[r20]=r21,8 + adds r18=-1,r18;; + cmp4.lt p7,p6=0,r18 +(p7) br.cond.dptk.few 1b + mov r19=r20 + ;; +2: +#endif + tpa r19=r19 + ;; + .pred.rel.mutex isBP,isAP +(isBP) mov IA64_KR(PER_CPU_DATA)=r19 // per-CPU base for cpu0 +(isAP) mov IA64_KR(PER_CPU_DATA)=r0 // clear physical per-CPU base + ;; + mov ar.bspstore=r2 // establish the new RSE stack + ;; + mov ar.rsc=0x3 // place RSE in eager mode + +(isBP) dep r28=-1,r28,61,3 // make address virtual +(isBP) movl r2=ia64_boot_param + ;; +(isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader + +#ifdef CONFIG_PARAVIRT + + movl r14=hypervisor_setup_hooks + movl r15=hypervisor_type + mov r16=num_hypervisor_hooks + ;; + ld8 r2=[r15] + ;; + cmp.ltu p7,p0=r2,r16 // array size check + shladd r8=r2,3,r14 + ;; +(p7) ld8 r9=[r8] + ;; +(p7) mov b1=r9 +(p7) cmp.ne.unc p7,p0=r9,r0 // no actual branch to NULL + ;; +(p7) br.call.sptk.many rp=b1 + + __INITDATA + +default_setup_hook = 0 // Currently nothing needs to be done. + + .weak xen_setup_hook + + .global hypervisor_type +hypervisor_type: + data8 PARAVIRT_HYPERVISOR_TYPE_DEFAULT + + // must have the same order with PARAVIRT_HYPERVISOR_TYPE_xxx + +hypervisor_setup_hooks: + data8 default_setup_hook + data8 xen_setup_hook +num_hypervisor_hooks = (. - hypervisor_setup_hooks) / 8 + .previous + +#endif + +#ifdef CONFIG_SMP +(isAP) br.call.sptk.many rp=start_secondary +.ret0: +(isAP) br.cond.sptk self +#endif + + // This is executed by the bootstrap processor (bsp) only: + +#ifdef CONFIG_IA64_FW_EMU + // initialize PAL & SAL emulator: + br.call.sptk.many rp=sys_fw_init +.ret1: +#endif + br.call.sptk.many rp=start_kernel +.ret2: addl r3=@ltoff(halt_msg),gp + ;; + alloc r2=ar.pfs,8,0,2,0 + ;; + ld8 out0=[r3] + br.call.sptk.many b0=console_print + +self: hint @pause + br.sptk.many self // endless loop +END(_start) + + .text + +GLOBAL_ENTRY(ia64_save_debug_regs) + alloc r16=ar.pfs,1,0,0,0 + mov r20=ar.lc // preserve ar.lc + mov ar.lc=IA64_NUM_DBG_REGS-1 + mov r18=0 + add r19=IA64_NUM_DBG_REGS*8,in0 + ;; +1: mov r16=dbr[r18] +#ifdef CONFIG_ITANIUM + ;; + srlz.d +#endif + mov r17=ibr[r18] + add r18=1,r18 + ;; + st8.nta [in0]=r16,8 + st8.nta [r19]=r17,8 + br.cloop.sptk.many 1b + ;; + mov ar.lc=r20 // restore ar.lc + br.ret.sptk.many rp +END(ia64_save_debug_regs) + +GLOBAL_ENTRY(ia64_load_debug_regs) + alloc r16=ar.pfs,1,0,0,0 + lfetch.nta [in0] + mov r20=ar.lc // preserve ar.lc + add r19=IA64_NUM_DBG_REGS*8,in0 + mov ar.lc=IA64_NUM_DBG_REGS-1 + mov r18=-1 + ;; +1: ld8.nta r16=[in0],8 + ld8.nta r17=[r19],8 + add r18=1,r18 + ;; + mov dbr[r18]=r16 +#ifdef CONFIG_ITANIUM + ;; + srlz.d // Errata 132 (NoFix status) +#endif + mov ibr[r18]=r17 + br.cloop.sptk.many 1b + ;; + mov ar.lc=r20 // restore ar.lc + br.ret.sptk.many rp +END(ia64_load_debug_regs) + +GLOBAL_ENTRY(__ia64_save_fpu) + alloc r2=ar.pfs,1,4,0,0 + adds loc0=96*16-16,in0 + adds loc1=96*16-16-128,in0 + ;; + stf.spill.nta [loc0]=f127,-256 + stf.spill.nta [loc1]=f119,-256 + ;; + stf.spill.nta [loc0]=f111,-256 + stf.spill.nta [loc1]=f103,-256 + ;; + stf.spill.nta [loc0]=f95,-256 + stf.spill.nta [loc1]=f87,-256 + ;; + stf.spill.nta [loc0]=f79,-256 + stf.spill.nta [loc1]=f71,-256 + ;; + stf.spill.nta [loc0]=f63,-256 + stf.spill.nta [loc1]=f55,-256 + adds loc2=96*16-32,in0 + ;; + stf.spill.nta [loc0]=f47,-256 + stf.spill.nta [loc1]=f39,-256 + adds loc3=96*16-32-128,in0 + ;; + stf.spill.nta [loc2]=f126,-256 + stf.spill.nta [loc3]=f118,-256 + ;; + stf.spill.nta [loc2]=f110,-256 + stf.spill.nta [loc3]=f102,-256 + ;; + stf.spill.nta [loc2]=f94,-256 + stf.spill.nta [loc3]=f86,-256 + ;; + stf.spill.nta [loc2]=f78,-256 + stf.spill.nta [loc3]=f70,-256 + ;; + stf.spill.nta [loc2]=f62,-256 + stf.spill.nta [loc3]=f54,-256 + adds loc0=96*16-48,in0 + ;; + stf.spill.nta [loc2]=f46,-256 + stf.spill.nta [loc3]=f38,-256 + adds loc1=96*16-48-128,in0 + ;; + stf.spill.nta [loc0]=f125,-256 + stf.spill.nta [loc1]=f117,-256 + ;; + stf.spill.nta [loc0]=f109,-256 + stf.spill.nta [loc1]=f101,-256 + ;; + stf.spill.nta [loc0]=f93,-256 + stf.spill.nta [loc1]=f85,-256 + ;; + stf.spill.nta [loc0]=f77,-256 + stf.spill.nta [loc1]=f69,-256 + ;; + stf.spill.nta [loc0]=f61,-256 + stf.spill.nta [loc1]=f53,-256 + adds loc2=96*16-64,in0 + ;; + stf.spill.nta [loc0]=f45,-256 + stf.spill.nta [loc1]=f37,-256 + adds loc3=96*16-64-128,in0 + ;; + stf.spill.nta [loc2]=f124,-256 + stf.spill.nta [loc3]=f116,-256 + ;; + stf.spill.nta [loc2]=f108,-256 + stf.spill.nta [loc3]=f100,-256 + ;; + stf.spill.nta [loc2]=f92,-256 + stf.spill.nta [loc3]=f84,-256 + ;; + stf.spill.nta [loc2]=f76,-256 + stf.spill.nta [loc3]=f68,-256 + ;; + stf.spill.nta [loc2]=f60,-256 + stf.spill.nta [loc3]=f52,-256 + adds loc0=96*16-80,in0 + ;; + stf.spill.nta [loc2]=f44,-256 + stf.spill.nta [loc3]=f36,-256 + adds loc1=96*16-80-128,in0 + ;; + stf.spill.nta [loc0]=f123,-256 + stf.spill.nta [loc1]=f115,-256 + ;; + stf.spill.nta [loc0]=f107,-256 + stf.spill.nta [loc1]=f99,-256 + ;; + stf.spill.nta [loc0]=f91,-256 + stf.spill.nta [loc1]=f83,-256 + ;; + stf.spill.nta [loc0]=f75,-256 + stf.spill.nta [loc1]=f67,-256 + ;; + stf.spill.nta [loc0]=f59,-256 + stf.spill.nta [loc1]=f51,-256 + adds loc2=96*16-96,in0 + ;; + stf.spill.nta [loc0]=f43,-256 + stf.spill.nta [loc1]=f35,-256 + adds loc3=96*16-96-128,in0 + ;; + stf.spill.nta [loc2]=f122,-256 + stf.spill.nta [loc3]=f114,-256 + ;; + stf.spill.nta [loc2]=f106,-256 + stf.spill.nta [loc3]=f98,-256 + ;; + stf.spill.nta [loc2]=f90,-256 + stf.spill.nta [loc3]=f82,-256 + ;; + stf.spill.nta [loc2]=f74,-256 + stf.spill.nta [loc3]=f66,-256 + ;; + stf.spill.nta [loc2]=f58,-256 + stf.spill.nta [loc3]=f50,-256 + adds loc0=96*16-112,in0 + ;; + stf.spill.nta [loc2]=f42,-256 + stf.spill.nta [loc3]=f34,-256 + adds loc1=96*16-112-128,in0 + ;; + stf.spill.nta [loc0]=f121,-256 + stf.spill.nta [loc1]=f113,-256 + ;; + stf.spill.nta [loc0]=f105,-256 + stf.spill.nta [loc1]=f97,-256 + ;; + stf.spill.nta [loc0]=f89,-256 + stf.spill.nta [loc1]=f81,-256 + ;; + stf.spill.nta [loc0]=f73,-256 + stf.spill.nta [loc1]=f65,-256 + ;; + stf.spill.nta [loc0]=f57,-256 + stf.spill.nta [loc1]=f49,-256 + adds loc2=96*16-128,in0 + ;; + stf.spill.nta [loc0]=f41,-256 + stf.spill.nta [loc1]=f33,-256 + adds loc3=96*16-128-128,in0 + ;; + stf.spill.nta [loc2]=f120,-256 + stf.spill.nta [loc3]=f112,-256 + ;; + stf.spill.nta [loc2]=f104,-256 + stf.spill.nta [loc3]=f96,-256 + ;; + stf.spill.nta [loc2]=f88,-256 + stf.spill.nta [loc3]=f80,-256 + ;; + stf.spill.nta [loc2]=f72,-256 + stf.spill.nta [loc3]=f64,-256 + ;; + stf.spill.nta [loc2]=f56,-256 + stf.spill.nta [loc3]=f48,-256 + ;; + stf.spill.nta [loc2]=f40 + stf.spill.nta [loc3]=f32 + br.ret.sptk.many rp +END(__ia64_save_fpu) + +GLOBAL_ENTRY(__ia64_load_fpu) + alloc r2=ar.pfs,1,2,0,0 + adds r3=128,in0 + adds r14=256,in0 + adds r15=384,in0 + mov loc0=512 + mov loc1=-1024+16 + ;; + ldf.fill.nta f32=[in0],loc0 + ldf.fill.nta f40=[ r3],loc0 + ldf.fill.nta f48=[r14],loc0 + ldf.fill.nta f56=[r15],loc0 + ;; + ldf.fill.nta f64=[in0],loc0 + ldf.fill.nta f72=[ r3],loc0 + ldf.fill.nta f80=[r14],loc0 + ldf.fill.nta f88=[r15],loc0 + ;; + ldf.fill.nta f96=[in0],loc1 + ldf.fill.nta f104=[ r3],loc1 + ldf.fill.nta f112=[r14],loc1 + ldf.fill.nta f120=[r15],loc1 + ;; + ldf.fill.nta f33=[in0],loc0 + ldf.fill.nta f41=[ r3],loc0 + ldf.fill.nta f49=[r14],loc0 + ldf.fill.nta f57=[r15],loc0 + ;; + ldf.fill.nta f65=[in0],loc0 + ldf.fill.nta f73=[ r3],loc0 + ldf.fill.nta f81=[r14],loc0 + ldf.fill.nta f89=[r15],loc0 + ;; + ldf.fill.nta f97=[in0],loc1 + ldf.fill.nta f105=[ r3],loc1 + ldf.fill.nta f113=[r14],loc1 + ldf.fill.nta f121=[r15],loc1 + ;; + ldf.fill.nta f34=[in0],loc0 + ldf.fill.nta f42=[ r3],loc0 + ldf.fill.nta f50=[r14],loc0 + ldf.fill.nta f58=[r15],loc0 + ;; + ldf.fill.nta f66=[in0],loc0 + ldf.fill.nta f74=[ r3],loc0 + ldf.fill.nta f82=[r14],loc0 + ldf.fill.nta f90=[r15],loc0 + ;; + ldf.fill.nta f98=[in0],loc1 + ldf.fill.nta f106=[ r3],loc1 + ldf.fill.nta f114=[r14],loc1 + ldf.fill.nta f122=[r15],loc1 + ;; + ldf.fill.nta f35=[in0],loc0 + ldf.fill.nta f43=[ r3],loc0 + ldf.fill.nta f51=[r14],loc0 + ldf.fill.nta f59=[r15],loc0 + ;; + ldf.fill.nta f67=[in0],loc0 + ldf.fill.nta f75=[ r3],loc0 + ldf.fill.nta f83=[r14],loc0 + ldf.fill.nta f91=[r15],loc0 + ;; + ldf.fill.nta f99=[in0],loc1 + ldf.fill.nta f107=[ r3],loc1 + ldf.fill.nta f115=[r14],loc1 + ldf.fill.nta f123=[r15],loc1 + ;; + ldf.fill.nta f36=[in0],loc0 + ldf.fill.nta f44=[ r3],loc0 + ldf.fill.nta f52=[r14],loc0 + ldf.fill.nta f60=[r15],loc0 + ;; + ldf.fill.nta f68=[in0],loc0 + ldf.fill.nta f76=[ r3],loc0 + ldf.fill.nta f84=[r14],loc0 + ldf.fill.nta f92=[r15],loc0 + ;; + ldf.fill.nta f100=[in0],loc1 + ldf.fill.nta f108=[ r3],loc1 + ldf.fill.nta f116=[r14],loc1 + ldf.fill.nta f124=[r15],loc1 + ;; + ldf.fill.nta f37=[in0],loc0 + ldf.fill.nta f45=[ r3],loc0 + ldf.fill.nta f53=[r14],loc0 + ldf.fill.nta f61=[r15],loc0 + ;; + ldf.fill.nta f69=[in0],loc0 + ldf.fill.nta f77=[ r3],loc0 + ldf.fill.nta f85=[r14],loc0 + ldf.fill.nta f93=[r15],loc0 + ;; + ldf.fill.nta f101=[in0],loc1 + ldf.fill.nta f109=[ r3],loc1 + ldf.fill.nta f117=[r14],loc1 + ldf.fill.nta f125=[r15],loc1 + ;; + ldf.fill.nta f38 =[in0],loc0 + ldf.fill.nta f46 =[ r3],loc0 + ldf.fill.nta f54 =[r14],loc0 + ldf.fill.nta f62 =[r15],loc0 + ;; + ldf.fill.nta f70 =[in0],loc0 + ldf.fill.nta f78 =[ r3],loc0 + ldf.fill.nta f86 =[r14],loc0 + ldf.fill.nta f94 =[r15],loc0 + ;; + ldf.fill.nta f102=[in0],loc1 + ldf.fill.nta f110=[ r3],loc1 + ldf.fill.nta f118=[r14],loc1 + ldf.fill.nta f126=[r15],loc1 + ;; + ldf.fill.nta f39 =[in0],loc0 + ldf.fill.nta f47 =[ r3],loc0 + ldf.fill.nta f55 =[r14],loc0 + ldf.fill.nta f63 =[r15],loc0 + ;; + ldf.fill.nta f71 =[in0],loc0 + ldf.fill.nta f79 =[ r3],loc0 + ldf.fill.nta f87 =[r14],loc0 + ldf.fill.nta f95 =[r15],loc0 + ;; + ldf.fill.nta f103=[in0] + ldf.fill.nta f111=[ r3] + ldf.fill.nta f119=[r14] + ldf.fill.nta f127=[r15] + br.ret.sptk.many rp +END(__ia64_load_fpu) + +GLOBAL_ENTRY(__ia64_init_fpu) + stf.spill [sp]=f0 // M3 + mov f32=f0 // F + nop.b 0 + + ldfps f33,f34=[sp] // M0 + ldfps f35,f36=[sp] // M1 + mov f37=f0 // F + ;; + + setf.s f38=r0 // M2 + setf.s f39=r0 // M3 + mov f40=f0 // F + + ldfps f41,f42=[sp] // M0 + ldfps f43,f44=[sp] // M1 + mov f45=f0 // F + + setf.s f46=r0 // M2 + setf.s f47=r0 // M3 + mov f48=f0 // F + + ldfps f49,f50=[sp] // M0 + ldfps f51,f52=[sp] // M1 + mov f53=f0 // F + + setf.s f54=r0 // M2 + setf.s f55=r0 // M3 + mov f56=f0 // F + + ldfps f57,f58=[sp] // M0 + ldfps f59,f60=[sp] // M1 + mov f61=f0 // F + + setf.s f62=r0 // M2 + setf.s f63=r0 // M3 + mov f64=f0 // F + + ldfps f65,f66=[sp] // M0 + ldfps f67,f68=[sp] // M1 + mov f69=f0 // F + + setf.s f70=r0 // M2 + setf.s f71=r0 // M3 + mov f72=f0 // F + + ldfps f73,f74=[sp] // M0 + ldfps f75,f76=[sp] // M1 + mov f77=f0 // F + + setf.s f78=r0 // M2 + setf.s f79=r0 // M3 + mov f80=f0 // F + + ldfps f81,f82=[sp] // M0 + ldfps f83,f84=[sp] // M1 + mov f85=f0 // F + + setf.s f86=r0 // M2 + setf.s f87=r0 // M3 + mov f88=f0 // F + + /* + * When the instructions are cached, it would be faster to initialize + * the remaining registers with simply mov instructions (F-unit). + * This gets the time down to ~29 cycles. However, this would use up + * 33 bundles, whereas continuing with the above pattern yields + * 10 bundles and ~30 cycles. + */ + + ldfps f89,f90=[sp] // M0 + ldfps f91,f92=[sp] // M1 + mov f93=f0 // F + + setf.s f94=r0 // M2 + setf.s f95=r0 // M3 + mov f96=f0 // F + + ldfps f97,f98=[sp] // M0 + ldfps f99,f100=[sp] // M1 + mov f101=f0 // F + + setf.s f102=r0 // M2 + setf.s f103=r0 // M3 + mov f104=f0 // F + + ldfps f105,f106=[sp] // M0 + ldfps f107,f108=[sp] // M1 + mov f109=f0 // F + + setf.s f110=r0 // M2 + setf.s f111=r0 // M3 + mov f112=f0 // F + + ldfps f113,f114=[sp] // M0 + ldfps f115,f116=[sp] // M1 + mov f117=f0 // F + + setf.s f118=r0 // M2 + setf.s f119=r0 // M3 + mov f120=f0 // F + + ldfps f121,f122=[sp] // M0 + ldfps f123,f124=[sp] // M1 + mov f125=f0 // F + + setf.s f126=r0 // M2 + setf.s f127=r0 // M3 + br.ret.sptk.many rp // F +END(__ia64_init_fpu) + +/* + * Switch execution mode from virtual to physical + * + * Inputs: + * r16 = new psr to establish + * Output: + * r19 = old virtual address of ar.bsp + * r20 = old virtual address of sp + * + * Note: RSE must already be in enforced lazy mode + */ +GLOBAL_ENTRY(ia64_switch_mode_phys) + { + rsm psr.i | psr.ic // disable interrupts and interrupt collection + mov r15=ip + } + ;; + { + flushrs // must be first insn in group + srlz.i + } + ;; + mov cr.ipsr=r16 // set new PSR + add r3=1f-ia64_switch_mode_phys,r15 + + mov r19=ar.bsp + mov r20=sp + mov r14=rp // get return address into a general register + ;; + + // going to physical mode, use tpa to translate virt->phys + tpa r17=r19 + tpa r3=r3 + tpa sp=sp + tpa r14=r14 + ;; + + mov r18=ar.rnat // save ar.rnat + mov ar.bspstore=r17 // this steps on ar.rnat + mov cr.iip=r3 + mov cr.ifs=r0 + ;; + mov ar.rnat=r18 // restore ar.rnat + rfi // must be last insn in group + ;; +1: mov rp=r14 + br.ret.sptk.many rp +END(ia64_switch_mode_phys) + +/* + * Switch execution mode from physical to virtual + * + * Inputs: + * r16 = new psr to establish + * r19 = new bspstore to establish + * r20 = new sp to establish + * + * Note: RSE must already be in enforced lazy mode + */ +GLOBAL_ENTRY(ia64_switch_mode_virt) + { + rsm psr.i | psr.ic // disable interrupts and interrupt collection + mov r15=ip + } + ;; + { + flushrs // must be first insn in group + srlz.i + } + ;; + mov cr.ipsr=r16 // set new PSR + add r3=1f-ia64_switch_mode_virt,r15 + + mov r14=rp // get return address into a general register + ;; + + // going to virtual + // - for code addresses, set upper bits of addr to KERNEL_START + // - for stack addresses, copy from input argument + movl r18=KERNEL_START + dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT + dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT + mov sp=r20 + ;; + or r3=r3,r18 + or r14=r14,r18 + ;; + + mov r18=ar.rnat // save ar.rnat + mov ar.bspstore=r19 // this steps on ar.rnat + mov cr.iip=r3 + mov cr.ifs=r0 + ;; + mov ar.rnat=r18 // restore ar.rnat + rfi // must be last insn in group + ;; +1: mov rp=r14 + br.ret.sptk.many rp +END(ia64_switch_mode_virt) + +GLOBAL_ENTRY(ia64_delay_loop) + .prologue +{ nop 0 // work around GAS unwind info generation bug... + .save ar.lc,r2 + mov r2=ar.lc + .body + ;; + mov ar.lc=r32 +} + ;; + // force loop to be 32-byte aligned (GAS bug means we cannot use .align + // inside function body without corrupting unwind info). +{ nop 0 } +1: br.cloop.sptk.few 1b + ;; + mov ar.lc=r2 + br.ret.sptk.many rp +END(ia64_delay_loop) + +/* + * Return a CPU-local timestamp in nano-seconds. This timestamp is + * NOT synchronized across CPUs its return value must never be + * compared against the values returned on another CPU. The usage in + * kernel/sched.c ensures that. + * + * The return-value of sched_clock() is NOT supposed to wrap-around. + * If it did, it would cause some scheduling hiccups (at the worst). + * Fortunately, with a 64-bit cycle-counter ticking at 100GHz, even + * that would happen only once every 5+ years. + * + * The code below basically calculates: + * + * (ia64_get_itc() * local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT + * + * except that the multiplication and the shift are done with 128-bit + * intermediate precision so that we can produce a full 64-bit result. + */ +GLOBAL_ENTRY(ia64_native_sched_clock) + addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 + mov.m r9=ar.itc // fetch cycle-counter (35 cyc) + ;; + ldf8 f8=[r8] + ;; + setf.sig f9=r9 // certain to stall, so issue it _after_ ldf8... + ;; + xmpy.lu f10=f9,f8 // calculate low 64 bits of 128-bit product (4 cyc) + xmpy.hu f11=f9,f8 // calculate high 64 bits of 128-bit product + ;; + getf.sig r8=f10 // (5 cyc) + getf.sig r9=f11 + ;; + shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT + br.ret.sptk.many rp +END(ia64_native_sched_clock) +#ifndef CONFIG_PARAVIRT + //unsigned long long + //sched_clock(void) __attribute__((alias("ia64_native_sched_clock"))); + .global sched_clock +sched_clock = ia64_native_sched_clock +#endif + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +GLOBAL_ENTRY(cycle_to_cputime) + alloc r16=ar.pfs,1,0,0,0 + addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 + ;; + ldf8 f8=[r8] + ;; + setf.sig f9=r32 + ;; + xmpy.lu f10=f9,f8 // calculate low 64 bits of 128-bit product (4 cyc) + xmpy.hu f11=f9,f8 // calculate high 64 bits of 128-bit product + ;; + getf.sig r8=f10 // (5 cyc) + getf.sig r9=f11 + ;; + shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT + br.ret.sptk.many rp +END(cycle_to_cputime) +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + +GLOBAL_ENTRY(start_kernel_thread) + .prologue + .save rp, r0 // this is the end of the call-chain + .body + alloc r2 = ar.pfs, 0, 0, 2, 0 + mov out0 = r9 + mov out1 = r11;; + br.call.sptk.many rp = kernel_thread_helper;; + mov out0 = r8 + br.call.sptk.many rp = sys_exit;; +1: br.sptk.few 1b // not reached +END(start_kernel_thread) + +#ifdef CONFIG_IA64_BRL_EMU + +/* + * Assembly routines used by brl_emu.c to set preserved register state. + */ + +#define SET_REG(reg) \ + GLOBAL_ENTRY(ia64_set_##reg); \ + alloc r16=ar.pfs,1,0,0,0; \ + mov reg=r32; \ + ;; \ + br.ret.sptk.many rp; \ + END(ia64_set_##reg) + +SET_REG(b1); +SET_REG(b2); +SET_REG(b3); +SET_REG(b4); +SET_REG(b5); + +#endif /* CONFIG_IA64_BRL_EMU */ + +#ifdef CONFIG_SMP + +#ifdef CONFIG_HOTPLUG_CPU +GLOBAL_ENTRY(ia64_jump_to_sal) + alloc r16=ar.pfs,1,0,0,0;; + rsm psr.i | psr.ic +{ + flushrs + srlz.i +} + tpa r25=in0 + movl r18=tlb_purge_done;; + DATA_VA_TO_PA(r18);; + mov b1=r18 // Return location + movl r18=ia64_do_tlb_purge;; + DATA_VA_TO_PA(r18);; + mov b2=r18 // doing tlb_flush work + mov ar.rsc=0 // Put RSE in enforced lazy, LE mode + movl r17=1f;; + DATA_VA_TO_PA(r17);; + mov cr.iip=r17 + movl r16=SAL_PSR_BITS_TO_SET;; + mov cr.ipsr=r16 + mov cr.ifs=r0;; + rfi;; // note: this unmask MCA/INIT (psr.mc) +1: + /* + * Invalidate all TLB data/inst + */ + br.sptk.many b2;; // jump to tlb purge code + +tlb_purge_done: + RESTORE_REGION_REGS(r25, r17,r18,r19);; + RESTORE_REG(b0, r25, r17);; + RESTORE_REG(b1, r25, r17);; + RESTORE_REG(b2, r25, r17);; + RESTORE_REG(b3, r25, r17);; + RESTORE_REG(b4, r25, r17);; + RESTORE_REG(b5, r25, r17);; + ld8 r1=[r25],0x08;; + ld8 r12=[r25],0x08;; + ld8 r13=[r25],0x08;; + RESTORE_REG(ar.fpsr, r25, r17);; + RESTORE_REG(ar.pfs, r25, r17);; + RESTORE_REG(ar.rnat, r25, r17);; + RESTORE_REG(ar.unat, r25, r17);; + RESTORE_REG(ar.bspstore, r25, r17);; + RESTORE_REG(cr.dcr, r25, r17);; + RESTORE_REG(cr.iva, r25, r17);; + RESTORE_REG(cr.pta, r25, r17);; + srlz.d;; // required not to violate RAW dependency + RESTORE_REG(cr.itv, r25, r17);; + RESTORE_REG(cr.pmv, r25, r17);; + RESTORE_REG(cr.cmcv, r25, r17);; + RESTORE_REG(cr.lrr0, r25, r17);; + RESTORE_REG(cr.lrr1, r25, r17);; + ld8 r4=[r25],0x08;; + ld8 r5=[r25],0x08;; + ld8 r6=[r25],0x08;; + ld8 r7=[r25],0x08;; + ld8 r17=[r25],0x08;; + mov pr=r17,-1;; + RESTORE_REG(ar.lc, r25, r17);; + /* + * Now Restore floating point regs + */ + ldf.fill.nta f2=[r25],16;; + ldf.fill.nta f3=[r25],16;; + ldf.fill.nta f4=[r25],16;; + ldf.fill.nta f5=[r25],16;; + ldf.fill.nta f16=[r25],16;; + ldf.fill.nta f17=[r25],16;; + ldf.fill.nta f18=[r25],16;; + ldf.fill.nta f19=[r25],16;; + ldf.fill.nta f20=[r25],16;; + ldf.fill.nta f21=[r25],16;; + ldf.fill.nta f22=[r25],16;; + ldf.fill.nta f23=[r25],16;; + ldf.fill.nta f24=[r25],16;; + ldf.fill.nta f25=[r25],16;; + ldf.fill.nta f26=[r25],16;; + ldf.fill.nta f27=[r25],16;; + ldf.fill.nta f28=[r25],16;; + ldf.fill.nta f29=[r25],16;; + ldf.fill.nta f30=[r25],16;; + ldf.fill.nta f31=[r25],16;; + + /* + * Now that we have done all the register restores + * we are now ready for the big DIVE to SAL Land + */ + ssm psr.ic;; + srlz.d;; + br.ret.sptk.many b0;; +END(ia64_jump_to_sal) +#endif /* CONFIG_HOTPLUG_CPU */ + +#endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c new file mode 100644 index 00000000000..552fe24b981 --- /dev/null +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -0,0 +1,101 @@ +/* + * Architecture-specific kernel symbols + * + * Don't put any exports here unless it's defined in an assembler file. + * All other exports should be put directly after the definition. + */ + +#include <linux/module.h> + +#include <linux/string.h> +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(memcpy); +EXPORT_SYMBOL(strlen); + +#include<asm/pgtable.h> +EXPORT_SYMBOL_GPL(empty_zero_page); + +#include <asm/checksum.h> +EXPORT_SYMBOL(ip_fast_csum); /* hand-coded assembly */ +EXPORT_SYMBOL(csum_ipv6_magic); + +#include <asm/page.h> +EXPORT_SYMBOL(clear_page); +EXPORT_SYMBOL(copy_page); + +#ifdef CONFIG_VIRTUAL_MEM_MAP +#include <linux/bootmem.h> +EXPORT_SYMBOL(min_low_pfn); /* defined by bootmem.c, but not exported by generic code */ +EXPORT_SYMBOL(max_low_pfn); /* defined by bootmem.c, but not exported by generic code */ +#endif + +#include <asm/processor.h> +EXPORT_SYMBOL(ia64_cpu_info); +#ifdef CONFIG_SMP +EXPORT_SYMBOL(local_per_cpu_offset); +#endif + +#include <asm/uaccess.h> +EXPORT_SYMBOL(__copy_user); +EXPORT_SYMBOL(__do_clear_user); +EXPORT_SYMBOL(__strlen_user); +EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__strnlen_user); + +/* from arch/ia64/lib */ +extern void __divsi3(void); +extern void __udivsi3(void); +extern void __modsi3(void); +extern void __umodsi3(void); +extern void __divdi3(void); +extern void __udivdi3(void); +extern void __moddi3(void); +extern void __umoddi3(void); + +EXPORT_SYMBOL(__divsi3); +EXPORT_SYMBOL(__udivsi3); +EXPORT_SYMBOL(__modsi3); +EXPORT_SYMBOL(__umodsi3); +EXPORT_SYMBOL(__divdi3); +EXPORT_SYMBOL(__udivdi3); +EXPORT_SYMBOL(__moddi3); +EXPORT_SYMBOL(__umoddi3); + +#if defined(CONFIG_MD_RAID456) || defined(CONFIG_MD_RAID456_MODULE) +extern void xor_ia64_2(void); +extern void xor_ia64_3(void); +extern void xor_ia64_4(void); +extern void xor_ia64_5(void); + +EXPORT_SYMBOL(xor_ia64_2); +EXPORT_SYMBOL(xor_ia64_3); +EXPORT_SYMBOL(xor_ia64_4); +EXPORT_SYMBOL(xor_ia64_5); +#endif + +#include <asm/pal.h> +EXPORT_SYMBOL(ia64_pal_call_phys_stacked); +EXPORT_SYMBOL(ia64_pal_call_phys_static); +EXPORT_SYMBOL(ia64_pal_call_stacked); +EXPORT_SYMBOL(ia64_pal_call_static); +EXPORT_SYMBOL(ia64_load_scratch_fpregs); +EXPORT_SYMBOL(ia64_save_scratch_fpregs); + +#include <asm/unwind.h> +EXPORT_SYMBOL(unw_init_running); + +#include <linux/efi.h> +EXPORT_SYMBOL_GPL(efi_mem_type); + +#if defined(CONFIG_IA64_ESI) || defined(CONFIG_IA64_ESI_MODULE) +extern void esi_call_phys (void); +EXPORT_SYMBOL_GPL(esi_call_phys); +#endif +extern char ia64_ivt[]; +EXPORT_SYMBOL(ia64_ivt); + +#include <asm/ftrace.h> +#ifdef CONFIG_FUNCTION_TRACER +/* mcount is defined in assembly */ +EXPORT_SYMBOL(_mcount); +#endif diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c new file mode 100644 index 00000000000..f9efe9739d3 --- /dev/null +++ b/arch/ia64/kernel/init_task.c @@ -0,0 +1,42 @@ +/* + * This is where we statically allocate and initialize the initial + * task. + * + * Copyright (C) 1999, 2002-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init_task.h> +#include <linux/mqueue.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> + +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +/* + * Initial task structure. + * + * We need to make sure that this is properly aligned due to the way process stacks are + * handled. This is done by having a special ".data..init_task" section... + */ +#define init_thread_info init_task_mem.s.thread_info + +union { + struct { + struct task_struct task; + struct thread_info thread_info; + } s; + unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; +} init_task_mem asm ("init_task") __init_task_data = + {{ + .task = INIT_TASK(init_task_mem.s.task), + .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) +}}; + +EXPORT_SYMBOL(init_task); diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c new file mode 100644 index 00000000000..b0f9afebb14 --- /dev/null +++ b/arch/ia64/kernel/iosapic.c @@ -0,0 +1,1119 @@ +/* + * I/O SAPIC support. + * + * Copyright (C) 1999 Intel Corp. + * Copyright (C) 1999 Asit Mallick <asit.k.mallick@intel.com> + * Copyright (C) 2000-2002 J.I. Lee <jung-ik.lee@intel.com> + * Copyright (C) 1999-2000, 2002-2003 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999,2000 Walt Drummond <drummond@valinux.com> + * + * 00/04/19 D. Mosberger Rewritten to mirror more closely the x86 I/O + * APIC code. In particular, we now have separate + * handlers for edge and level triggered + * interrupts. + * 00/10/27 Asit Mallick, Goutham Rao <goutham.rao@intel.com> IRQ vector + * allocation PCI to vector mapping, shared PCI + * interrupts. + * 00/10/27 D. Mosberger Document things a bit more to make them more + * understandable. Clean up much of the old + * IOSAPIC cruft. + * 01/07/27 J.I. Lee PCI irq routing, Platform/Legacy interrupts + * and fixes for ACPI S5(SoftOff) support. + * 02/01/23 J.I. Lee iosapic pgm fixes for PCI irq routing from _PRT + * 02/01/07 E. Focht <efocht@ess.nec.de> Redirectable interrupt + * vectors in iosapic_set_affinity(), + * initializations for /proc/irq/#/smp_affinity + * 02/04/02 P. Diefenbaugh Cleaned up ACPI PCI IRQ routing. + * 02/04/18 J.I. Lee bug fix in iosapic_init_pci_irq + * 02/04/30 J.I. Lee bug fix in find_iosapic to fix ACPI PCI IRQ to + * IOSAPIC mapping error + * 02/07/29 T. Kochi Allocate interrupt vectors dynamically + * 02/08/04 T. Kochi Cleaned up terminology (irq, global system + * interrupt, vector, etc.) + * 02/09/20 D. Mosberger Simplified by taking advantage of ACPI's + * pci_irq code. + * 03/02/19 B. Helgaas Make pcat_compat system-wide, not per-IOSAPIC. + * Remove iosapic_address & gsi_base from + * external interfaces. Rationalize + * __init/__devinit attributes. + * 04/12/04 Ashok Raj <ashok.raj@intel.com> Intel Corporation 2004 + * Updated to work with irq migration necessary + * for CPU Hotplug + */ +/* + * Here is what the interrupt logic between a PCI device and the kernel looks + * like: + * + * (1) A PCI device raises one of the four interrupt pins (INTA, INTB, INTC, + * INTD). The device is uniquely identified by its bus-, and slot-number + * (the function number does not matter here because all functions share + * the same interrupt lines). + * + * (2) The motherboard routes the interrupt line to a pin on a IOSAPIC + * controller. Multiple interrupt lines may have to share the same + * IOSAPIC pin (if they're level triggered and use the same polarity). + * Each interrupt line has a unique Global System Interrupt (GSI) number + * which can be calculated as the sum of the controller's base GSI number + * and the IOSAPIC pin number to which the line connects. + * + * (3) The IOSAPIC uses an internal routing table entries (RTEs) to map the + * IOSAPIC pin into the IA-64 interrupt vector. This interrupt vector is then + * sent to the CPU. + * + * (4) The kernel recognizes an interrupt as an IRQ. The IRQ interface is + * used as architecture-independent interrupt handling mechanism in Linux. + * As an IRQ is a number, we have to have + * IA-64 interrupt vector number <-> IRQ number mapping. On smaller + * systems, we use one-to-one mapping between IA-64 vector and IRQ. A + * platform can implement platform_irq_to_vector(irq) and + * platform_local_vector_to_irq(vector) APIs to differentiate the mapping. + * Please see also arch/ia64/include/asm/hw_irq.h for those APIs. + * + * To sum up, there are three levels of mappings involved: + * + * PCI pin -> global system interrupt (GSI) -> IA-64 vector <-> IRQ + * + * Note: The term "IRQ" is loosely used everywhere in Linux kernel to + * describeinterrupts. Now we use "IRQ" only for Linux IRQ's. ISA IRQ + * (isa_irq) is the only exception in this source code. + */ + +#include <linux/acpi.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/bootmem.h> + +#include <asm/delay.h> +#include <asm/hw_irq.h> +#include <asm/io.h> +#include <asm/iosapic.h> +#include <asm/machvec.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/system.h> + +#undef DEBUG_INTERRUPT_ROUTING + +#ifdef DEBUG_INTERRUPT_ROUTING +#define DBG(fmt...) printk(fmt) +#else +#define DBG(fmt...) +#endif + +static DEFINE_SPINLOCK(iosapic_lock); + +/* + * These tables map IA-64 vectors to the IOSAPIC pin that generates this + * vector. + */ + +#define NO_REF_RTE 0 + +static struct iosapic { + char __iomem *addr; /* base address of IOSAPIC */ + unsigned int gsi_base; /* GSI base */ + unsigned short num_rte; /* # of RTEs on this IOSAPIC */ + int rtes_inuse; /* # of RTEs in use on this IOSAPIC */ +#ifdef CONFIG_NUMA + unsigned short node; /* numa node association via pxm */ +#endif + spinlock_t lock; /* lock for indirect reg access */ +} iosapic_lists[NR_IOSAPICS]; + +struct iosapic_rte_info { + struct list_head rte_list; /* RTEs sharing the same vector */ + char rte_index; /* IOSAPIC RTE index */ + int refcnt; /* reference counter */ + struct iosapic *iosapic; +} ____cacheline_aligned; + +static struct iosapic_intr_info { + struct list_head rtes; /* RTEs using this vector (empty => + * not an IOSAPIC interrupt) */ + int count; /* # of registered RTEs */ + u32 low32; /* current value of low word of + * Redirection table entry */ + unsigned int dest; /* destination CPU physical ID */ + unsigned char dmode : 3; /* delivery mode (see iosapic.h) */ + unsigned char polarity: 1; /* interrupt polarity + * (see iosapic.h) */ + unsigned char trigger : 1; /* trigger mode (see iosapic.h) */ +} iosapic_intr_info[NR_IRQS]; + +static unsigned char pcat_compat __devinitdata; /* 8259 compatibility flag */ + +static inline void +iosapic_write(struct iosapic *iosapic, unsigned int reg, u32 val) +{ + unsigned long flags; + + spin_lock_irqsave(&iosapic->lock, flags); + __iosapic_write(iosapic->addr, reg, val); + spin_unlock_irqrestore(&iosapic->lock, flags); +} + +/* + * Find an IOSAPIC associated with a GSI + */ +static inline int +find_iosapic (unsigned int gsi) +{ + int i; + + for (i = 0; i < NR_IOSAPICS; i++) { + if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < + iosapic_lists[i].num_rte) + return i; + } + + return -1; +} + +static inline int __gsi_to_irq(unsigned int gsi) +{ + int irq; + struct iosapic_intr_info *info; + struct iosapic_rte_info *rte; + + for (irq = 0; irq < NR_IRQS; irq++) { + info = &iosapic_intr_info[irq]; + list_for_each_entry(rte, &info->rtes, rte_list) + if (rte->iosapic->gsi_base + rte->rte_index == gsi) + return irq; + } + return -1; +} + +int +gsi_to_irq (unsigned int gsi) +{ + unsigned long flags; + int irq; + + spin_lock_irqsave(&iosapic_lock, flags); + irq = __gsi_to_irq(gsi); + spin_unlock_irqrestore(&iosapic_lock, flags); + return irq; +} + +static struct iosapic_rte_info *find_rte(unsigned int irq, unsigned int gsi) +{ + struct iosapic_rte_info *rte; + + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) + if (rte->iosapic->gsi_base + rte->rte_index == gsi) + return rte; + return NULL; +} + +static void +set_rte (unsigned int gsi, unsigned int irq, unsigned int dest, int mask) +{ + unsigned long pol, trigger, dmode; + u32 low32, high32; + int rte_index; + char redir; + struct iosapic_rte_info *rte; + ia64_vector vector = irq_to_vector(irq); + + DBG(KERN_DEBUG"IOSAPIC: routing vector %d to 0x%x\n", vector, dest); + + rte = find_rte(irq, gsi); + if (!rte) + return; /* not an IOSAPIC interrupt */ + + rte_index = rte->rte_index; + pol = iosapic_intr_info[irq].polarity; + trigger = iosapic_intr_info[irq].trigger; + dmode = iosapic_intr_info[irq].dmode; + + redir = (dmode == IOSAPIC_LOWEST_PRIORITY) ? 1 : 0; + +#ifdef CONFIG_SMP + set_irq_affinity_info(irq, (int)(dest & 0xffff), redir); +#endif + + low32 = ((pol << IOSAPIC_POLARITY_SHIFT) | + (trigger << IOSAPIC_TRIGGER_SHIFT) | + (dmode << IOSAPIC_DELIVERY_SHIFT) | + ((mask ? 1 : 0) << IOSAPIC_MASK_SHIFT) | + vector); + + /* dest contains both id and eid */ + high32 = (dest << IOSAPIC_DEST_SHIFT); + + iosapic_write(rte->iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32); + iosapic_intr_info[irq].low32 = low32; + iosapic_intr_info[irq].dest = dest; +} + +static void +nop (struct irq_data *data) +{ + /* do nothing... */ +} + + +#ifdef CONFIG_KEXEC +void +kexec_disable_iosapic(void) +{ + struct iosapic_intr_info *info; + struct iosapic_rte_info *rte; + ia64_vector vec; + int irq; + + for (irq = 0; irq < NR_IRQS; irq++) { + info = &iosapic_intr_info[irq]; + vec = irq_to_vector(irq); + list_for_each_entry(rte, &info->rtes, + rte_list) { + iosapic_write(rte->iosapic, + IOSAPIC_RTE_LOW(rte->rte_index), + IOSAPIC_MASK|vec); + iosapic_eoi(rte->iosapic->addr, vec); + } + } +} +#endif + +static void +mask_irq (struct irq_data *data) +{ + unsigned int irq = data->irq; + u32 low32; + int rte_index; + struct iosapic_rte_info *rte; + + if (!iosapic_intr_info[irq].count) + return; /* not an IOSAPIC interrupt! */ + + /* set only the mask bit */ + low32 = iosapic_intr_info[irq].low32 |= IOSAPIC_MASK; + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) { + rte_index = rte->rte_index; + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32); + } +} + +static void +unmask_irq (struct irq_data *data) +{ + unsigned int irq = data->irq; + u32 low32; + int rte_index; + struct iosapic_rte_info *rte; + + if (!iosapic_intr_info[irq].count) + return; /* not an IOSAPIC interrupt! */ + + low32 = iosapic_intr_info[irq].low32 &= ~IOSAPIC_MASK; + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) { + rte_index = rte->rte_index; + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte_index), low32); + } +} + + +static int +iosapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ +#ifdef CONFIG_SMP + unsigned int irq = data->irq; + u32 high32, low32; + int cpu, dest, rte_index; + int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0; + struct iosapic_rte_info *rte; + struct iosapic *iosapic; + + irq &= (~IA64_IRQ_REDIRECTED); + + cpu = cpumask_first_and(cpu_online_mask, mask); + if (cpu >= nr_cpu_ids) + return -1; + + if (irq_prepare_move(irq, cpu)) + return -1; + + dest = cpu_physical_id(cpu); + + if (!iosapic_intr_info[irq].count) + return -1; /* not an IOSAPIC interrupt */ + + set_irq_affinity_info(irq, dest, redir); + + /* dest contains both id and eid */ + high32 = dest << IOSAPIC_DEST_SHIFT; + + low32 = iosapic_intr_info[irq].low32 & ~(7 << IOSAPIC_DELIVERY_SHIFT); + if (redir) + /* change delivery mode to lowest priority */ + low32 |= (IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT); + else + /* change delivery mode to fixed */ + low32 |= (IOSAPIC_FIXED << IOSAPIC_DELIVERY_SHIFT); + low32 &= IOSAPIC_VECTOR_MASK; + low32 |= irq_to_vector(irq); + + iosapic_intr_info[irq].low32 = low32; + iosapic_intr_info[irq].dest = dest; + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) { + iosapic = rte->iosapic; + rte_index = rte->rte_index; + iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32); + iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32); + } + +#endif + return 0; +} + +/* + * Handlers for level-triggered interrupts. + */ + +static unsigned int +iosapic_startup_level_irq (struct irq_data *data) +{ + unmask_irq(data); + return 0; +} + +static void +iosapic_unmask_level_irq (struct irq_data *data) +{ + unsigned int irq = data->irq; + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + int do_unmask_irq = 0; + + irq_complete_move(irq); + if (unlikely(irqd_is_setaffinity_pending(data))) { + do_unmask_irq = 1; + mask_irq(data); + } else + unmask_irq(data); + + list_for_each_entry(rte, &iosapic_intr_info[irq].rtes, rte_list) + iosapic_eoi(rte->iosapic->addr, vec); + + if (unlikely(do_unmask_irq)) { + irq_move_masked_irq(data); + unmask_irq(data); + } +} + +#define iosapic_shutdown_level_irq mask_irq +#define iosapic_enable_level_irq unmask_irq +#define iosapic_disable_level_irq mask_irq +#define iosapic_ack_level_irq nop + +static struct irq_chip irq_type_iosapic_level = { + .name = "IO-SAPIC-level", + .irq_startup = iosapic_startup_level_irq, + .irq_shutdown = iosapic_shutdown_level_irq, + .irq_enable = iosapic_enable_level_irq, + .irq_disable = iosapic_disable_level_irq, + .irq_ack = iosapic_ack_level_irq, + .irq_mask = mask_irq, + .irq_unmask = iosapic_unmask_level_irq, + .irq_set_affinity = iosapic_set_affinity +}; + +/* + * Handlers for edge-triggered interrupts. + */ + +static unsigned int +iosapic_startup_edge_irq (struct irq_data *data) +{ + unmask_irq(data); + /* + * IOSAPIC simply drops interrupts pended while the + * corresponding pin was masked, so we can't know if an + * interrupt is pending already. Let's hope not... + */ + return 0; +} + +static void +iosapic_ack_edge_irq (struct irq_data *data) +{ + irq_complete_move(data->irq); + irq_move_irq(data); +} + +#define iosapic_enable_edge_irq unmask_irq +#define iosapic_disable_edge_irq nop + +static struct irq_chip irq_type_iosapic_edge = { + .name = "IO-SAPIC-edge", + .irq_startup = iosapic_startup_edge_irq, + .irq_shutdown = iosapic_disable_edge_irq, + .irq_enable = iosapic_enable_edge_irq, + .irq_disable = iosapic_disable_edge_irq, + .irq_ack = iosapic_ack_edge_irq, + .irq_mask = mask_irq, + .irq_unmask = unmask_irq, + .irq_set_affinity = iosapic_set_affinity +}; + +static unsigned int +iosapic_version (char __iomem *addr) +{ + /* + * IOSAPIC Version Register return 32 bit structure like: + * { + * unsigned int version : 8; + * unsigned int reserved1 : 8; + * unsigned int max_redir : 8; + * unsigned int reserved2 : 8; + * } + */ + return __iosapic_read(addr, IOSAPIC_VERSION); +} + +static int iosapic_find_sharable_irq(unsigned long trigger, unsigned long pol) +{ + int i, irq = -ENOSPC, min_count = -1; + struct iosapic_intr_info *info; + + /* + * shared vectors for edge-triggered interrupts are not + * supported yet + */ + if (trigger == IOSAPIC_EDGE) + return -EINVAL; + + for (i = 0; i < NR_IRQS; i++) { + info = &iosapic_intr_info[i]; + if (info->trigger == trigger && info->polarity == pol && + (info->dmode == IOSAPIC_FIXED || + info->dmode == IOSAPIC_LOWEST_PRIORITY) && + can_request_irq(i, IRQF_SHARED)) { + if (min_count == -1 || info->count < min_count) { + irq = i; + min_count = info->count; + } + } + } + return irq; +} + +/* + * if the given vector is already owned by other, + * assign a new vector for the other and make the vector available + */ +static void __init +iosapic_reassign_vector (int irq) +{ + int new_irq; + + if (iosapic_intr_info[irq].count) { + new_irq = create_irq(); + if (new_irq < 0) + panic("%s: out of interrupt vectors!\n", __func__); + printk(KERN_INFO "Reassigning vector %d to %d\n", + irq_to_vector(irq), irq_to_vector(new_irq)); + memcpy(&iosapic_intr_info[new_irq], &iosapic_intr_info[irq], + sizeof(struct iosapic_intr_info)); + INIT_LIST_HEAD(&iosapic_intr_info[new_irq].rtes); + list_move(iosapic_intr_info[irq].rtes.next, + &iosapic_intr_info[new_irq].rtes); + memset(&iosapic_intr_info[irq], 0, + sizeof(struct iosapic_intr_info)); + iosapic_intr_info[irq].low32 = IOSAPIC_MASK; + INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes); + } +} + +static inline int irq_is_shared (int irq) +{ + return (iosapic_intr_info[irq].count > 1); +} + +struct irq_chip* +ia64_native_iosapic_get_irq_chip(unsigned long trigger) +{ + if (trigger == IOSAPIC_EDGE) + return &irq_type_iosapic_edge; + else + return &irq_type_iosapic_level; +} + +static int +register_intr (unsigned int gsi, int irq, unsigned char delivery, + unsigned long polarity, unsigned long trigger) +{ + struct irq_chip *chip, *irq_type; + int index; + struct iosapic_rte_info *rte; + + index = find_iosapic(gsi); + if (index < 0) { + printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", + __func__, gsi); + return -ENODEV; + } + + rte = find_rte(irq, gsi); + if (!rte) { + rte = kzalloc(sizeof (*rte), GFP_ATOMIC); + if (!rte) { + printk(KERN_WARNING "%s: cannot allocate memory\n", + __func__); + return -ENOMEM; + } + + rte->iosapic = &iosapic_lists[index]; + rte->rte_index = gsi - rte->iosapic->gsi_base; + rte->refcnt++; + list_add_tail(&rte->rte_list, &iosapic_intr_info[irq].rtes); + iosapic_intr_info[irq].count++; + iosapic_lists[index].rtes_inuse++; + } + else if (rte->refcnt == NO_REF_RTE) { + struct iosapic_intr_info *info = &iosapic_intr_info[irq]; + if (info->count > 0 && + (info->trigger != trigger || info->polarity != polarity)){ + printk (KERN_WARNING + "%s: cannot override the interrupt\n", + __func__); + return -EINVAL; + } + rte->refcnt++; + iosapic_intr_info[irq].count++; + iosapic_lists[index].rtes_inuse++; + } + + iosapic_intr_info[irq].polarity = polarity; + iosapic_intr_info[irq].dmode = delivery; + iosapic_intr_info[irq].trigger = trigger; + + irq_type = iosapic_get_irq_chip(trigger); + + chip = irq_get_chip(irq); + if (irq_type != NULL && chip != irq_type) { + if (chip != &no_irq_chip) + printk(KERN_WARNING + "%s: changing vector %d from %s to %s\n", + __func__, irq_to_vector(irq), + chip->name, irq_type->name); + chip = irq_type; + } + __irq_set_chip_handler_name_locked(irq, chip, trigger == IOSAPIC_EDGE ? + handle_edge_irq : handle_level_irq, + NULL); + return 0; +} + +static unsigned int +get_target_cpu (unsigned int gsi, int irq) +{ +#ifdef CONFIG_SMP + static int cpu = -1; + extern int cpe_vector; + cpumask_t domain = irq_to_domain(irq); + + /* + * In case of vector shared by multiple RTEs, all RTEs that + * share the vector need to use the same destination CPU. + */ + if (iosapic_intr_info[irq].count) + return iosapic_intr_info[irq].dest; + + /* + * If the platform supports redirection via XTP, let it + * distribute interrupts. + */ + if (smp_int_redirect & SMP_IRQ_REDIRECTION) + return cpu_physical_id(smp_processor_id()); + + /* + * Some interrupts (ACPI SCI, for instance) are registered + * before the BSP is marked as online. + */ + if (!cpu_online(smp_processor_id())) + return cpu_physical_id(smp_processor_id()); + +#ifdef CONFIG_ACPI + if (cpe_vector > 0 && irq_to_vector(irq) == IA64_CPEP_VECTOR) + return get_cpei_target_cpu(); +#endif + +#ifdef CONFIG_NUMA + { + int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0; + const struct cpumask *cpu_mask; + + iosapic_index = find_iosapic(gsi); + if (iosapic_index < 0 || + iosapic_lists[iosapic_index].node == MAX_NUMNODES) + goto skip_numa_setup; + + cpu_mask = cpumask_of_node(iosapic_lists[iosapic_index].node); + num_cpus = 0; + for_each_cpu_and(numa_cpu, cpu_mask, &domain) { + if (cpu_online(numa_cpu)) + num_cpus++; + } + + if (!num_cpus) + goto skip_numa_setup; + + /* Use irq assignment to distribute across cpus in node */ + cpu_index = irq % num_cpus; + + for_each_cpu_and(numa_cpu, cpu_mask, &domain) + if (cpu_online(numa_cpu) && i++ >= cpu_index) + break; + + if (numa_cpu < nr_cpu_ids) + return cpu_physical_id(numa_cpu); + } +skip_numa_setup: +#endif + /* + * Otherwise, round-robin interrupt vectors across all the + * processors. (It'd be nice if we could be smarter in the + * case of NUMA.) + */ + do { + if (++cpu >= nr_cpu_ids) + cpu = 0; + } while (!cpu_online(cpu) || !cpu_isset(cpu, domain)); + + return cpu_physical_id(cpu); +#else /* CONFIG_SMP */ + return cpu_physical_id(smp_processor_id()); +#endif +} + +static inline unsigned char choose_dmode(void) +{ +#ifdef CONFIG_SMP + if (smp_int_redirect & SMP_IRQ_REDIRECTION) + return IOSAPIC_LOWEST_PRIORITY; +#endif + return IOSAPIC_FIXED; +} + +/* + * ACPI can describe IOSAPIC interrupts via static tables and namespace + * methods. This provides an interface to register those interrupts and + * program the IOSAPIC RTE. + */ +int +iosapic_register_intr (unsigned int gsi, + unsigned long polarity, unsigned long trigger) +{ + int irq, mask = 1, err; + unsigned int dest; + unsigned long flags; + struct iosapic_rte_info *rte; + u32 low32; + unsigned char dmode; + struct irq_desc *desc; + + /* + * If this GSI has already been registered (i.e., it's a + * shared interrupt, or we lost a race to register it), + * don't touch the RTE. + */ + spin_lock_irqsave(&iosapic_lock, flags); + irq = __gsi_to_irq(gsi); + if (irq > 0) { + rte = find_rte(irq, gsi); + if(iosapic_intr_info[irq].count == 0) { + assign_irq_vector(irq); + dynamic_irq_init(irq); + } else if (rte->refcnt != NO_REF_RTE) { + rte->refcnt++; + goto unlock_iosapic_lock; + } + } else + irq = create_irq(); + + /* If vector is running out, we try to find a sharable vector */ + if (irq < 0) { + irq = iosapic_find_sharable_irq(trigger, polarity); + if (irq < 0) + goto unlock_iosapic_lock; + } + + desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); + dest = get_target_cpu(gsi, irq); + dmode = choose_dmode(); + err = register_intr(gsi, irq, dmode, polarity, trigger); + if (err < 0) { + raw_spin_unlock(&desc->lock); + irq = err; + goto unlock_iosapic_lock; + } + + /* + * If the vector is shared and already unmasked for other + * interrupt sources, don't mask it. + */ + low32 = iosapic_intr_info[irq].low32; + if (irq_is_shared(irq) && !(low32 & IOSAPIC_MASK)) + mask = 0; + set_rte(gsi, irq, dest, mask); + + printk(KERN_INFO "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d\n", + gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), + (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), + cpu_logical_id(dest), dest, irq_to_vector(irq)); + + raw_spin_unlock(&desc->lock); + unlock_iosapic_lock: + spin_unlock_irqrestore(&iosapic_lock, flags); + return irq; +} + +void +iosapic_unregister_intr (unsigned int gsi) +{ + unsigned long flags; + int irq, index; + u32 low32; + unsigned long trigger, polarity; + unsigned int dest; + struct iosapic_rte_info *rte; + + /* + * If the irq associated with the gsi is not found, + * iosapic_unregister_intr() is unbalanced. We need to check + * this again after getting locks. + */ + irq = gsi_to_irq(gsi); + if (irq < 0) { + printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", + gsi); + WARN_ON(1); + return; + } + + spin_lock_irqsave(&iosapic_lock, flags); + if ((rte = find_rte(irq, gsi)) == NULL) { + printk(KERN_ERR "iosapic_unregister_intr(%u) unbalanced\n", + gsi); + WARN_ON(1); + goto out; + } + + if (--rte->refcnt > 0) + goto out; + + rte->refcnt = NO_REF_RTE; + + /* Mask the interrupt */ + low32 = iosapic_intr_info[irq].low32 | IOSAPIC_MASK; + iosapic_write(rte->iosapic, IOSAPIC_RTE_LOW(rte->rte_index), low32); + + iosapic_intr_info[irq].count--; + index = find_iosapic(gsi); + iosapic_lists[index].rtes_inuse--; + WARN_ON(iosapic_lists[index].rtes_inuse < 0); + + trigger = iosapic_intr_info[irq].trigger; + polarity = iosapic_intr_info[irq].polarity; + dest = iosapic_intr_info[irq].dest; + printk(KERN_INFO + "GSI %u (%s, %s) -> CPU %d (0x%04x) vector %d unregistered\n", + gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), + (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), + cpu_logical_id(dest), dest, irq_to_vector(irq)); + + if (iosapic_intr_info[irq].count == 0) { +#ifdef CONFIG_SMP + /* Clear affinity */ + cpumask_setall(irq_get_irq_data(irq)->affinity); +#endif + /* Clear the interrupt information */ + iosapic_intr_info[irq].dest = 0; + iosapic_intr_info[irq].dmode = 0; + iosapic_intr_info[irq].polarity = 0; + iosapic_intr_info[irq].trigger = 0; + iosapic_intr_info[irq].low32 |= IOSAPIC_MASK; + + /* Destroy and reserve IRQ */ + destroy_and_reserve_irq(irq); + } + out: + spin_unlock_irqrestore(&iosapic_lock, flags); +} + +/* + * ACPI calls this when it finds an entry for a platform interrupt. + */ +int __init +iosapic_register_platform_intr (u32 int_type, unsigned int gsi, + int iosapic_vector, u16 eid, u16 id, + unsigned long polarity, unsigned long trigger) +{ + static const char * const name[] = {"unknown", "PMI", "INIT", "CPEI"}; + unsigned char delivery; + int irq, vector, mask = 0; + unsigned int dest = ((id << 8) | eid) & 0xffff; + + switch (int_type) { + case ACPI_INTERRUPT_PMI: + irq = vector = iosapic_vector; + bind_irq_vector(irq, vector, CPU_MASK_ALL); + /* + * since PMI vector is alloc'd by FW(ACPI) not by kernel, + * we need to make sure the vector is available + */ + iosapic_reassign_vector(irq); + delivery = IOSAPIC_PMI; + break; + case ACPI_INTERRUPT_INIT: + irq = create_irq(); + if (irq < 0) + panic("%s: out of interrupt vectors!\n", __func__); + vector = irq_to_vector(irq); + delivery = IOSAPIC_INIT; + break; + case ACPI_INTERRUPT_CPEI: + irq = vector = IA64_CPE_VECTOR; + BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL)); + delivery = IOSAPIC_FIXED; + mask = 1; + break; + default: + printk(KERN_ERR "%s: invalid int type 0x%x\n", __func__, + int_type); + return -1; + } + + register_intr(gsi, irq, delivery, polarity, trigger); + + printk(KERN_INFO + "PLATFORM int %s (0x%x): GSI %u (%s, %s) -> CPU %d (0x%04x)" + " vector %d\n", + int_type < ARRAY_SIZE(name) ? name[int_type] : "unknown", + int_type, gsi, (trigger == IOSAPIC_EDGE ? "edge" : "level"), + (polarity == IOSAPIC_POL_HIGH ? "high" : "low"), + cpu_logical_id(dest), dest, vector); + + set_rte(gsi, irq, dest, mask); + return vector; +} + +/* + * ACPI calls this when it finds an entry for a legacy ISA IRQ override. + */ +void __devinit +iosapic_override_isa_irq (unsigned int isa_irq, unsigned int gsi, + unsigned long polarity, + unsigned long trigger) +{ + int vector, irq; + unsigned int dest = cpu_physical_id(smp_processor_id()); + unsigned char dmode; + + irq = vector = isa_irq_to_vector(isa_irq); + BUG_ON(bind_irq_vector(irq, vector, CPU_MASK_ALL)); + dmode = choose_dmode(); + register_intr(gsi, irq, dmode, polarity, trigger); + + DBG("ISA: IRQ %u -> GSI %u (%s,%s) -> CPU %d (0x%04x) vector %d\n", + isa_irq, gsi, trigger == IOSAPIC_EDGE ? "edge" : "level", + polarity == IOSAPIC_POL_HIGH ? "high" : "low", + cpu_logical_id(dest), dest, vector); + + set_rte(gsi, irq, dest, 1); +} + +void __init +ia64_native_iosapic_pcat_compat_init(void) +{ + if (pcat_compat) { + /* + * Disable the compatibility mode interrupts (8259 style), + * needs IN/OUT support enabled. + */ + printk(KERN_INFO + "%s: Disabling PC-AT compatible 8259 interrupts\n", + __func__); + outb(0xff, 0xA1); + outb(0xff, 0x21); + } +} + +void __init +iosapic_system_init (int system_pcat_compat) +{ + int irq; + + for (irq = 0; irq < NR_IRQS; ++irq) { + iosapic_intr_info[irq].low32 = IOSAPIC_MASK; + /* mark as unused */ + INIT_LIST_HEAD(&iosapic_intr_info[irq].rtes); + + iosapic_intr_info[irq].count = 0; + } + + pcat_compat = system_pcat_compat; + if (pcat_compat) + iosapic_pcat_compat_init(); +} + +static inline int +iosapic_alloc (void) +{ + int index; + + for (index = 0; index < NR_IOSAPICS; index++) + if (!iosapic_lists[index].addr) + return index; + + printk(KERN_WARNING "%s: failed to allocate iosapic\n", __func__); + return -1; +} + +static inline void +iosapic_free (int index) +{ + memset(&iosapic_lists[index], 0, sizeof(iosapic_lists[0])); +} + +static inline int +iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver) +{ + int index; + unsigned int gsi_end, base, end; + + /* check gsi range */ + gsi_end = gsi_base + ((ver >> 16) & 0xff); + for (index = 0; index < NR_IOSAPICS; index++) { + if (!iosapic_lists[index].addr) + continue; + + base = iosapic_lists[index].gsi_base; + end = base + iosapic_lists[index].num_rte - 1; + + if (gsi_end < base || end < gsi_base) + continue; /* OK */ + + return -EBUSY; + } + return 0; +} + +int __devinit +iosapic_init (unsigned long phys_addr, unsigned int gsi_base) +{ + int num_rte, err, index; + unsigned int isa_irq, ver; + char __iomem *addr; + unsigned long flags; + + spin_lock_irqsave(&iosapic_lock, flags); + index = find_iosapic(gsi_base); + if (index >= 0) { + spin_unlock_irqrestore(&iosapic_lock, flags); + return -EBUSY; + } + + addr = ioremap(phys_addr, 0); + if (addr == NULL) { + spin_unlock_irqrestore(&iosapic_lock, flags); + return -ENOMEM; + } + ver = iosapic_version(addr); + if ((err = iosapic_check_gsi_range(gsi_base, ver))) { + iounmap(addr); + spin_unlock_irqrestore(&iosapic_lock, flags); + return err; + } + + /* + * The MAX_REDIR register holds the highest input pin number + * (starting from 0). We add 1 so that we can use it for + * number of pins (= RTEs) + */ + num_rte = ((ver >> 16) & 0xff) + 1; + + index = iosapic_alloc(); + iosapic_lists[index].addr = addr; + iosapic_lists[index].gsi_base = gsi_base; + iosapic_lists[index].num_rte = num_rte; +#ifdef CONFIG_NUMA + iosapic_lists[index].node = MAX_NUMNODES; +#endif + spin_lock_init(&iosapic_lists[index].lock); + spin_unlock_irqrestore(&iosapic_lock, flags); + + if ((gsi_base == 0) && pcat_compat) { + /* + * Map the legacy ISA devices into the IOSAPIC data. Some of + * these may get reprogrammed later on with data from the ACPI + * Interrupt Source Override table. + */ + for (isa_irq = 0; isa_irq < 16; ++isa_irq) + iosapic_override_isa_irq(isa_irq, isa_irq, + IOSAPIC_POL_HIGH, + IOSAPIC_EDGE); + } + return 0; +} + +#ifdef CONFIG_HOTPLUG +int +iosapic_remove (unsigned int gsi_base) +{ + int index, err = 0; + unsigned long flags; + + spin_lock_irqsave(&iosapic_lock, flags); + index = find_iosapic(gsi_base); + if (index < 0) { + printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n", + __func__, gsi_base); + goto out; + } + + if (iosapic_lists[index].rtes_inuse) { + err = -EBUSY; + printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n", + __func__, gsi_base); + goto out; + } + + iounmap(iosapic_lists[index].addr); + iosapic_free(index); + out: + spin_unlock_irqrestore(&iosapic_lock, flags); + return err; +} +#endif /* CONFIG_HOTPLUG */ + +#ifdef CONFIG_NUMA +void __devinit +map_iosapic_to_node(unsigned int gsi_base, int node) +{ + int index; + + index = find_iosapic(gsi_base); + if (index < 0) { + printk(KERN_WARNING "%s: No IOSAPIC for GSI %u\n", + __func__, gsi_base); + return; + } + iosapic_lists[index].node = node; + return; +} +#endif diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c new file mode 100644 index 00000000000..ad69606613e --- /dev/null +++ b/arch/ia64/kernel/irq.c @@ -0,0 +1,194 @@ +/* + * linux/arch/ia64/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the code used by various IRQ handling routines: + * asking for different IRQs should be done through these routines + * instead of just grabbing them. Thus setups with different IRQ numbers + * shouldn't result in any weird surprises, and installing new handlers + * should be easier. + * + * Copyright (C) Ashok Raj<ashok.raj@intel.com>, Intel Corporation 2004 + * + * 4/14/2004: Added code to handle cpu migration and do safe irq + * migration without losing interrupts for iosapic + * architecture. + */ + +#include <asm/delay.h> +#include <asm/uaccess.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "Unexpected irq vector 0x%x on CPU %u!\n", irq, smp_processor_id()); +} + +#ifdef CONFIG_IA64_GENERIC +ia64_vector __ia64_irq_to_vector(int irq) +{ + return irq_cfg[irq].vector; +} + +unsigned int __ia64_local_vector_to_irq (ia64_vector vec) +{ + return __get_cpu_var(vector_irq)[vec]; +} +#endif + +/* + * Interrupt statistics: + */ + +atomic_t irq_err_count; + +/* + * /proc/interrupts printing: + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + return 0; +} + +#ifdef CONFIG_SMP +static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; + +void set_irq_affinity_info (unsigned int irq, int hwid, int redir) +{ + if (irq < NR_IRQS) { + cpumask_copy(irq_get_irq_data(irq)->affinity, + cpumask_of(cpu_logical_id(hwid))); + irq_redir[irq] = (char) (redir & 0xff); + } +} + +bool is_affinity_mask_valid(const struct cpumask *cpumask) +{ + if (ia64_platform_is("sn2")) { + /* Only allow one CPU to be specified in the smp_affinity mask */ + if (cpumask_weight(cpumask) != 1) + return false; + } + return true; +} + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_HOTPLUG_CPU +unsigned int vectors_in_migration[NR_IRQS]; + +/* + * Since cpu_online_mask is already updated, we just need to check for + * affinity that has zeros + */ +static void migrate_irqs(void) +{ + int irq, new_cpu; + + for (irq=0; irq < NR_IRQS; irq++) { + struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + + if (irqd_irq_disabled(data)) + continue; + + /* + * No handling for now. + * TBD: Implement a disable function so we can now + * tell CPU not to respond to these local intr sources. + * such as ITV,CPEI,MCA etc. + */ + if (irqd_is_per_cpu(data)) + continue; + + if (cpumask_any_and(data->affinity, cpu_online_mask) + >= nr_cpu_ids) { + /* + * Save it for phase 2 processing + */ + vectors_in_migration[irq] = irq; + + new_cpu = cpumask_any(cpu_online_mask); + + /* + * Al three are essential, currently WARN_ON.. maybe panic? + */ + if (chip && chip->irq_disable && + chip->irq_enable && chip->irq_set_affinity) { + chip->irq_disable(data); + chip->irq_set_affinity(data, + cpumask_of(new_cpu), false); + chip->irq_enable(data); + } else { + WARN_ON((!chip || !chip->irq_disable || + !chip->irq_enable || + !chip->irq_set_affinity)); + } + } + } +} + +void fixup_irqs(void) +{ + unsigned int irq; + extern void ia64_process_pending_intr(void); + extern volatile int time_keeper_id; + + /* Mask ITV to disable timer */ + ia64_set_itv(1 << 16); + + /* + * Find a new timesync master + */ + if (smp_processor_id() == time_keeper_id) { + time_keeper_id = cpumask_first(cpu_online_mask); + printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id); + } + + /* + * Phase 1: Locate IRQs bound to this cpu and + * relocate them for cpu removal. + */ + migrate_irqs(); + + /* + * Phase 2: Perform interrupt processing for all entries reported in + * local APIC. + */ + ia64_process_pending_intr(); + + /* + * Phase 3: Now handle any interrupts not captured in local APIC. + * This is to account for cases that device interrupted during the time the + * rte was being disabled and re-programmed. + */ + for (irq=0; irq < NR_IRQS; irq++) { + if (vectors_in_migration[irq]) { + struct pt_regs *old_regs = set_irq_regs(NULL); + + vectors_in_migration[irq]=0; + generic_handle_irq(irq); + set_irq_regs(old_regs); + } + } + + /* + * Now let processor die. We do irq disable and max_xtp() to + * ensure there is no more interrupts routed to this processor. + * But the local timer interrupt can have 1 pending which we + * take care in timer_interrupt(). + */ + max_xtp(); + local_irq_disable(); +} +#endif diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c new file mode 100644 index 00000000000..782c3a357f2 --- /dev/null +++ b/arch/ia64/kernel/irq_ia64.c @@ -0,0 +1,689 @@ +/* + * linux/arch/ia64/kernel/irq_ia64.c + * + * Copyright (C) 1998-2001 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * 6/10/99: Updated to bring in sync with x86 version to facilitate + * support for SMP and different interrupt controllers. + * + * 09/15/00 Goutham Rao <goutham.rao@intel.com> Implemented pci_irq_to_vector + * PCI to vector allocation routine. + * 04/14/2004 Ashok Raj <ashok.raj@intel.com> + * Added CPU Hotplug handling for IPF. + */ + +#include <linux/module.h> + +#include <linux/jiffies.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/ioport.h> +#include <linux/kernel_stat.h> +#include <linux/ptrace.h> +#include <linux/random.h> /* for rand_initialize_irq() */ +#include <linux/signal.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/bitops.h> +#include <linux/irq.h> +#include <linux/ratelimit.h> +#include <linux/acpi.h> +#include <linux/sched.h> + +#include <asm/delay.h> +#include <asm/intrinsics.h> +#include <asm/io.h> +#include <asm/hw_irq.h> +#include <asm/machvec.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/tlbflush.h> + +#ifdef CONFIG_PERFMON +# include <asm/perfmon.h> +#endif + +#define IRQ_DEBUG 0 + +#define IRQ_VECTOR_UNASSIGNED (0) + +#define IRQ_UNUSED (0) +#define IRQ_USED (1) +#define IRQ_RSVD (2) + +/* These can be overridden in platform_irq_init */ +int ia64_first_device_vector = IA64_DEF_FIRST_DEVICE_VECTOR; +int ia64_last_device_vector = IA64_DEF_LAST_DEVICE_VECTOR; + +/* default base addr of IPI table */ +void __iomem *ipi_base_addr = ((void __iomem *) + (__IA64_UNCACHED_OFFSET | IA64_IPI_DEFAULT_BASE_ADDR)); + +static cpumask_t vector_allocation_domain(int cpu); + +/* + * Legacy IRQ to IA-64 vector translation table. + */ +__u8 isa_irq_to_vector_map[16] = { + /* 8259 IRQ translation, first 16 entries */ + 0x2f, 0x20, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, + 0x28, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21 +}; +EXPORT_SYMBOL(isa_irq_to_vector_map); + +DEFINE_SPINLOCK(vector_lock); + +struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { + [0 ... NR_IRQS - 1] = { + .vector = IRQ_VECTOR_UNASSIGNED, + .domain = CPU_MASK_NONE + } +}; + +DEFINE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq) = { + [0 ... IA64_NUM_VECTORS - 1] = -1 +}; + +static cpumask_t vector_table[IA64_NUM_VECTORS] = { + [0 ... IA64_NUM_VECTORS - 1] = CPU_MASK_NONE +}; + +static int irq_status[NR_IRQS] = { + [0 ... NR_IRQS -1] = IRQ_UNUSED +}; + +int check_irq_used(int irq) +{ + if (irq_status[irq] == IRQ_USED) + return 1; + + return -1; +} + +static inline int find_unassigned_irq(void) +{ + int irq; + + for (irq = IA64_FIRST_DEVICE_VECTOR; irq < NR_IRQS; irq++) + if (irq_status[irq] == IRQ_UNUSED) + return irq; + return -ENOSPC; +} + +static inline int find_unassigned_vector(cpumask_t domain) +{ + cpumask_t mask; + int pos, vector; + + cpus_and(mask, domain, cpu_online_map); + if (cpus_empty(mask)) + return -EINVAL; + + for (pos = 0; pos < IA64_NUM_DEVICE_VECTORS; pos++) { + vector = IA64_FIRST_DEVICE_VECTOR + pos; + cpus_and(mask, domain, vector_table[vector]); + if (!cpus_empty(mask)) + continue; + return vector; + } + return -ENOSPC; +} + +static int __bind_irq_vector(int irq, int vector, cpumask_t domain) +{ + cpumask_t mask; + int cpu; + struct irq_cfg *cfg = &irq_cfg[irq]; + + BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON((unsigned)vector >= IA64_NUM_VECTORS); + + cpus_and(mask, domain, cpu_online_map); + if (cpus_empty(mask)) + return -EINVAL; + if ((cfg->vector == vector) && cpus_equal(cfg->domain, domain)) + return 0; + if (cfg->vector != IRQ_VECTOR_UNASSIGNED) + return -EBUSY; + for_each_cpu_mask(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + irq_status[irq] = IRQ_USED; + cpus_or(vector_table[vector], vector_table[vector], domain); + return 0; +} + +int bind_irq_vector(int irq, int vector, cpumask_t domain) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&vector_lock, flags); + ret = __bind_irq_vector(irq, vector, domain); + spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +static void __clear_irq_vector(int irq) +{ + int vector, cpu; + cpumask_t mask; + cpumask_t domain; + struct irq_cfg *cfg = &irq_cfg[irq]; + + BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON(cfg->vector == IRQ_VECTOR_UNASSIGNED); + vector = cfg->vector; + domain = cfg->domain; + cpus_and(mask, cfg->domain, cpu_online_map); + for_each_cpu_mask(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + cfg->vector = IRQ_VECTOR_UNASSIGNED; + cfg->domain = CPU_MASK_NONE; + irq_status[irq] = IRQ_UNUSED; + cpus_andnot(vector_table[vector], vector_table[vector], domain); +} + +static void clear_irq_vector(int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + spin_unlock_irqrestore(&vector_lock, flags); +} + +int +ia64_native_assign_irq_vector (int irq) +{ + unsigned long flags; + int vector, cpu; + cpumask_t domain = CPU_MASK_NONE; + + vector = -ENOSPC; + + spin_lock_irqsave(&vector_lock, flags); + for_each_online_cpu(cpu) { + domain = vector_allocation_domain(cpu); + vector = find_unassigned_vector(domain); + if (vector >= 0) + break; + } + if (vector < 0) + goto out; + if (irq == AUTO_ASSIGN) + irq = vector; + BUG_ON(__bind_irq_vector(irq, vector, domain)); + out: + spin_unlock_irqrestore(&vector_lock, flags); + return vector; +} + +void +ia64_native_free_irq_vector (int vector) +{ + if (vector < IA64_FIRST_DEVICE_VECTOR || + vector > IA64_LAST_DEVICE_VECTOR) + return; + clear_irq_vector(vector); +} + +int +reserve_irq_vector (int vector) +{ + if (vector < IA64_FIRST_DEVICE_VECTOR || + vector > IA64_LAST_DEVICE_VECTOR) + return -EINVAL; + return !!bind_irq_vector(vector, vector, CPU_MASK_ALL); +} + +/* + * Initialize vector_irq on a new cpu. This function must be called + * with vector_lock held. + */ +void __setup_vector_irq(int cpu) +{ + int irq, vector; + + /* Clear vector_irq */ + for (vector = 0; vector < IA64_NUM_VECTORS; ++vector) + per_cpu(vector_irq, cpu)[vector] = -1; + /* Mark the inuse vectors */ + for (irq = 0; irq < NR_IRQS; ++irq) { + if (!cpu_isset(cpu, irq_cfg[irq].domain)) + continue; + vector = irq_to_vector(irq); + per_cpu(vector_irq, cpu)[vector] = irq; + } +} + +#if defined(CONFIG_SMP) && (defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_DIG)) + +static enum vector_domain_type { + VECTOR_DOMAIN_NONE, + VECTOR_DOMAIN_PERCPU +} vector_domain_type = VECTOR_DOMAIN_NONE; + +static cpumask_t vector_allocation_domain(int cpu) +{ + if (vector_domain_type == VECTOR_DOMAIN_PERCPU) + return cpumask_of_cpu(cpu); + return CPU_MASK_ALL; +} + +static int __irq_prepare_move(int irq, int cpu) +{ + struct irq_cfg *cfg = &irq_cfg[irq]; + int vector; + cpumask_t domain; + + if (cfg->move_in_progress || cfg->move_cleanup_count) + return -EBUSY; + if (cfg->vector == IRQ_VECTOR_UNASSIGNED || !cpu_online(cpu)) + return -EINVAL; + if (cpu_isset(cpu, cfg->domain)) + return 0; + domain = vector_allocation_domain(cpu); + vector = find_unassigned_vector(domain); + if (vector < 0) + return -ENOSPC; + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; + cfg->vector = IRQ_VECTOR_UNASSIGNED; + cfg->domain = CPU_MASK_NONE; + BUG_ON(__bind_irq_vector(irq, vector, domain)); + return 0; +} + +int irq_prepare_move(int irq, int cpu) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&vector_lock, flags); + ret = __irq_prepare_move(irq, cpu); + spin_unlock_irqrestore(&vector_lock, flags); + return ret; +} + +void irq_complete_move(unsigned irq) +{ + struct irq_cfg *cfg = &irq_cfg[irq]; + cpumask_t cleanup_mask; + int i; + + if (likely(!cfg->move_in_progress)) + return; + + if (unlikely(cpu_isset(smp_processor_id(), cfg->old_domain))) + return; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + for_each_cpu_mask(i, cleanup_mask) + platform_send_ipi(i, IA64_IRQ_MOVE_VECTOR, IA64_IPI_DM_INT, 0); + cfg->move_in_progress = 0; +} + +static irqreturn_t smp_irq_move_cleanup_interrupt(int irq, void *dev_id) +{ + int me = smp_processor_id(); + ia64_vector vector; + unsigned long flags; + + for (vector = IA64_FIRST_DEVICE_VECTOR; + vector < IA64_LAST_DEVICE_VECTOR; vector++) { + int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + if (irq < 0) + continue; + + desc = irq_to_desc(irq); + cfg = irq_cfg + irq; + raw_spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if (!cpu_isset(me, cfg->old_domain)) + goto unlock; + + spin_lock_irqsave(&vector_lock, flags); + __get_cpu_var(vector_irq)[vector] = -1; + cpu_clear(me, vector_table[vector]); + spin_unlock_irqrestore(&vector_lock, flags); + cfg->move_cleanup_count--; + unlock: + raw_spin_unlock(&desc->lock); + } + return IRQ_HANDLED; +} + +static struct irqaction irq_move_irqaction = { + .handler = smp_irq_move_cleanup_interrupt, + .flags = IRQF_DISABLED, + .name = "irq_move" +}; + +static int __init parse_vector_domain(char *arg) +{ + if (!arg) + return -EINVAL; + if (!strcmp(arg, "percpu")) { + vector_domain_type = VECTOR_DOMAIN_PERCPU; + no_int_routing = 1; + } + return 0; +} +early_param("vector", parse_vector_domain); +#else +static cpumask_t vector_allocation_domain(int cpu) +{ + return CPU_MASK_ALL; +} +#endif + + +void destroy_and_reserve_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + irq_status[irq] = IRQ_RSVD; + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + unsigned long flags; + int irq, vector, cpu; + cpumask_t domain = CPU_MASK_NONE; + + irq = vector = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for_each_online_cpu(cpu) { + domain = vector_allocation_domain(cpu); + vector = find_unassigned_vector(domain); + if (vector >= 0) + break; + } + if (vector < 0) + goto out; + irq = find_unassigned_irq(); + if (irq < 0) + goto out; + BUG_ON(__bind_irq_vector(irq, vector, domain)); + out: + spin_unlock_irqrestore(&vector_lock, flags); + if (irq >= 0) + dynamic_irq_init(irq); + return irq; +} + +void destroy_irq(unsigned int irq) +{ + dynamic_irq_cleanup(irq); + clear_irq_vector(irq); +} + +#ifdef CONFIG_SMP +# define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) +# define IS_LOCAL_TLB_FLUSH(vec) (vec == IA64_IPI_LOCAL_TLB_FLUSH) +#else +# define IS_RESCHEDULE(vec) (0) +# define IS_LOCAL_TLB_FLUSH(vec) (0) +#endif +/* + * That's where the IVT branches when we get an external + * interrupt. This branches to the correct hardware IRQ handler via + * function ptr. + */ +void +ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned long saved_tpr; + +#if IRQ_DEBUG + { + unsigned long bsp, sp; + + /* + * Note: if the interrupt happened while executing in + * the context switch routine (ia64_switch_to), we may + * get a spurious stack overflow here. This is + * because the register and the memory stack are not + * switched atomically. + */ + bsp = ia64_getreg(_IA64_REG_AR_BSP); + sp = ia64_getreg(_IA64_REG_SP); + + if ((sp - bsp) < 1024) { + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); + + if (__ratelimit(&ratelimit)) { + printk("ia64_handle_irq: DANGER: less than " + "1KB of free stack space!!\n" + "(bsp=0x%lx, sp=%lx)\n", bsp, sp); + } + } + } +#endif /* IRQ_DEBUG */ + + /* + * Always set TPR to limit maximum interrupt nesting depth to + * 16 (without this, it would be ~240, which could easily lead + * to kernel stack overflows). + */ + irq_enter(); + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); + while (vector != IA64_SPURIOUS_INT_VECTOR) { + int irq = local_vector_to_irq(vector); + struct irq_desc *desc = irq_to_desc(irq); + + if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { + smp_local_flush_tlb(); + kstat_incr_irqs_this_cpu(irq, desc); + } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); + kstat_incr_irqs_this_cpu(irq, desc); + } else { + ia64_setreg(_IA64_REG_CR_TPR, vector); + ia64_srlz_d(); + + if (unlikely(irq < 0)) { + printk(KERN_ERR "%s: Unexpected interrupt " + "vector %d on CPU %d is not mapped " + "to any IRQ!\n", __func__, vector, + smp_processor_id()); + } else + generic_handle_irq(irq); + + /* + * Disable interrupts and send EOI: + */ + local_irq_disable(); + ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); + } + ia64_eoi(); + vector = ia64_get_ivr(); + } + /* + * This must be done *after* the ia64_eoi(). For example, the keyboard softirq + * handler needs to be able to wait for further keyboard interrupts, which can't + * come through until ia64_eoi() has been done. + */ + irq_exit(); + set_irq_regs(old_regs); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * This function emulates a interrupt processing when a cpu is about to be + * brought down. + */ +void ia64_process_pending_intr(void) +{ + ia64_vector vector; + unsigned long saved_tpr; + extern unsigned int vectors_in_migration[NR_IRQS]; + + vector = ia64_get_ivr(); + + irq_enter(); + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); + + /* + * Perform normal interrupt style processing + */ + while (vector != IA64_SPURIOUS_INT_VECTOR) { + int irq = local_vector_to_irq(vector); + struct irq_desc *desc = irq_to_desc(irq); + + if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) { + smp_local_flush_tlb(); + kstat_incr_irqs_this_cpu(irq, desc); + } else if (unlikely(IS_RESCHEDULE(vector))) { + kstat_incr_irqs_this_cpu(irq, desc); + } else { + struct pt_regs *old_regs = set_irq_regs(NULL); + + ia64_setreg(_IA64_REG_CR_TPR, vector); + ia64_srlz_d(); + + /* + * Now try calling normal ia64_handle_irq as it would have got called + * from a real intr handler. Try passing null for pt_regs, hopefully + * it will work. I hope it works!. + * Probably could shared code. + */ + if (unlikely(irq < 0)) { + printk(KERN_ERR "%s: Unexpected interrupt " + "vector %d on CPU %d not being mapped " + "to any IRQ!!\n", __func__, vector, + smp_processor_id()); + } else { + vectors_in_migration[irq]=0; + generic_handle_irq(irq); + } + set_irq_regs(old_regs); + + /* + * Disable interrupts and send EOI + */ + local_irq_disable(); + ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); + } + ia64_eoi(); + vector = ia64_get_ivr(); + } + irq_exit(); +} +#endif + + +#ifdef CONFIG_SMP + +static irqreturn_t dummy_handler (int irq, void *dev_id) +{ + BUG(); +} + +static struct irqaction ipi_irqaction = { + .handler = handle_IPI, + .flags = IRQF_DISABLED, + .name = "IPI" +}; + +/* + * KVM uses this interrupt to force a cpu out of guest mode + */ +static struct irqaction resched_irqaction = { + .handler = dummy_handler, + .flags = IRQF_DISABLED, + .name = "resched" +}; + +static struct irqaction tlb_irqaction = { + .handler = dummy_handler, + .flags = IRQF_DISABLED, + .name = "tlb_flush" +}; + +#endif + +void +ia64_native_register_percpu_irq (ia64_vector vec, struct irqaction *action) +{ + unsigned int irq; + + irq = vec; + BUG_ON(bind_irq_vector(irq, vec, CPU_MASK_ALL)); + irq_set_status_flags(irq, IRQ_PER_CPU); + irq_set_chip(irq, &irq_type_ia64_lsapic); + if (action) + setup_irq(irq, action); + irq_set_handler(irq, handle_percpu_irq); +} + +void __init +ia64_native_register_ipi(void) +{ +#ifdef CONFIG_SMP + register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction); + register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction); + register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, &tlb_irqaction); +#endif +} + +void __init +init_IRQ (void) +{ +#ifdef CONFIG_ACPI + acpi_boot_init(); +#endif + ia64_register_ipi(); + register_percpu_irq(IA64_SPURIOUS_INT_VECTOR, NULL); +#ifdef CONFIG_SMP +#if defined(CONFIG_IA64_GENERIC) || defined(CONFIG_IA64_DIG) + if (vector_domain_type != VECTOR_DOMAIN_NONE) + register_percpu_irq(IA64_IRQ_MOVE_VECTOR, &irq_move_irqaction); +#endif +#endif +#ifdef CONFIG_PERFMON + pfm_init_percpu(); +#endif + platform_irq_init(); +} + +void +ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect) +{ + void __iomem *ipi_addr; + unsigned long ipi_data; + unsigned long phys_cpu_id; + + phys_cpu_id = cpu_physical_id(cpu); + + /* + * cpu number is in 8bit ID and 8bit EID + */ + + ipi_data = (delivery_mode << 8) | (vector & 0xff); + ipi_addr = ipi_base_addr + ((phys_cpu_id << 4) | ((redirect & 1) << 3)); + + writeq(ipi_data, ipi_addr); +} diff --git a/arch/ia64/kernel/irq_lsapic.c b/arch/ia64/kernel/irq_lsapic.c new file mode 100644 index 00000000000..1b3a776e516 --- /dev/null +++ b/arch/ia64/kernel/irq_lsapic.c @@ -0,0 +1,44 @@ +/* + * LSAPIC Interrupt Controller + * + * This takes care of interrupts that are generated by the CPU's + * internal Streamlined Advanced Programmable Interrupt Controller + * (LSAPIC), such as the ITC and IPI interrupts. + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 2000 Hewlett-Packard Co + * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> + */ + +#include <linux/sched.h> +#include <linux/irq.h> + +static unsigned int +lsapic_noop_startup (struct irq_data *data) +{ + return 0; +} + +static void +lsapic_noop (struct irq_data *data) +{ + /* nothing to do... */ +} + +static int lsapic_retrigger(struct irq_data *data) +{ + ia64_resend_irq(data->irq); + + return 1; +} + +struct irq_chip irq_type_ia64_lsapic = { + .name = "LSAPIC", + .irq_startup = lsapic_noop_startup, + .irq_shutdown = lsapic_noop, + .irq_enable = lsapic_noop, + .irq_disable = lsapic_noop, + .irq_ack = lsapic_noop, + .irq_retrigger = lsapic_retrigger, +}; diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S new file mode 100644 index 00000000000..d93e396bf59 --- /dev/null +++ b/arch/ia64/kernel/ivt.S @@ -0,0 +1,1689 @@ +/* + * arch/ia64/kernel/ivt.S + * + * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger <davidm@hpl.hp.com> + * Copyright (C) 2000, 2002-2003 Intel Co + * Asit Mallick <asit.k.mallick@intel.com> + * Suresh Siddha <suresh.b.siddha@intel.com> + * Kenneth Chen <kenneth.w.chen@intel.com> + * Fenghua Yu <fenghua.yu@intel.com> + * + * 00/08/23 Asit Mallick <asit.k.mallick@intel.com> TLB handling for SMP + * 00/12/20 David Mosberger-Tang <davidm@hpl.hp.com> DTLB/ITLB handler now uses virtual PT. + * + * Copyright (C) 2005 Hewlett-Packard Co + * Dan Magenheimer <dan.magenheimer@hp.com> + * Xen paravirtualization + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan K.K. + * pv_ops. + * Yaozu (Eddie) Dong <eddie.dong@intel.com> + */ +/* + * This file defines the interruption vector table used by the CPU. + * It does not include one entry per possible cause of interruption. + * + * The first 20 entries of the table contain 64 bundles each while the + * remaining 48 entries contain only 16 bundles each. + * + * The 64 bundles are used to allow inlining the whole handler for critical + * interruptions like TLB misses. + * + * For each entry, the comment is as follows: + * + * // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) + * entry offset ----/ / / / / + * entry number ---------/ / / / + * size of the entry -------------/ / / + * vector name -------------------------------------/ / + * interruptions triggering this vector ----------------------/ + * + * The table is 32KB in size and must be aligned on 32KB boundary. + * (The CPU ignores the 15 lower bits of the address) + * + * Table is based upon EAS2.6 (Oct 1999) + */ + + +#include <asm/asmmacro.h> +#include <asm/break.h> +#include <asm/kregs.h> +#include <asm/asm-offsets.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/system.h> +#include <asm/thread_info.h> +#include <asm/unistd.h> +#include <asm/errno.h> + +#if 1 +# define PSR_DEFAULT_BITS psr.ac +#else +# define PSR_DEFAULT_BITS 0 +#endif + +#if 0 + /* + * This lets you track the last eight faults that occurred on the CPU. Make sure ar.k2 isn't + * needed for something else before enabling this... + */ +# define DBG_FAULT(i) mov r16=ar.k2;; shl r16=r16,8;; add r16=(i),r16;;mov ar.k2=r16 +#else +# define DBG_FAULT(i) +#endif + +#include "minstate.h" + +#define FAULT(n) \ + mov r31=pr; \ + mov r19=n;; /* prepare to save predicates */ \ + br.sptk.many dispatch_to_fault_handler + + .section .text..ivt,"ax" + + .align 32768 // align on 32KB boundary + .global ia64_ivt +ia64_ivt: +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47) +ENTRY(vhpt_miss) + DBG_FAULT(0) + /* + * The VHPT vector is invoked when the TLB entry for the virtual page table + * is missing. This happens only as a result of a previous + * (the "original") TLB miss, which may either be caused by an instruction + * fetch or a data access (or non-access). + * + * What we do here is normal TLB miss handing for the _original_ miss, + * followed by inserting the TLB entry for the virtual page table page + * that the VHPT walker was attempting to access. The latter gets + * inserted as long as page table entry above pte level have valid + * mappings for the faulting address. The TLB entry for the original + * miss gets inserted only if the pte entry indicates that the page is + * present. + * + * do_page_fault gets invoked in the following cases: + * - the faulting virtual address uses unimplemented address bits + * - the faulting virtual address has no valid page table mapping + */ + MOV_FROM_IFA(r16) // get address that caused the TLB miss +#ifdef CONFIG_HUGETLB_PAGE + movl r18=PAGE_SHIFT + MOV_FROM_ITIR(r25) +#endif + ;; + RSM_PSR_DT // use physical addressing for data + mov r31=pr // save the predicate registers + mov r19=IA64_KR(PT_BASE) // get page table base address + shl r21=r16,3 // shift bit 60 into sign bit + shr.u r17=r16,61 // get the region number into r17 + ;; + shr.u r22=r21,3 +#ifdef CONFIG_HUGETLB_PAGE + extr.u r26=r25,2,6 + ;; + cmp.ne p8,p0=r18,r26 + sub r27=r26,r18 + ;; +(p8) dep r25=r18,r25,2,6 +(p8) shr r22=r22,r27 +#endif + ;; + cmp.eq p6,p7=5,r17 // is IFA pointing into to region 5? + shr.u r18=r22,PGDIR_SHIFT // get bottom portion of pgd index bit + ;; +(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place + + srlz.d + LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir + + .pred.rel "mutex", p6, p7 +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 + ;; +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] + cmp.eq p7,p6=0,r21 // unused address bits all zeroes? +#ifdef CONFIG_PGTABLE_4 + shr.u r28=r22,PUD_SHIFT // shift pud index into position +#else + shr.u r18=r22,PMD_SHIFT // shift pmd index into position +#endif + ;; + ld8 r17=[r17] // get *pgd (may be 0) + ;; +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? +#ifdef CONFIG_PGTABLE_4 + dep r28=r28,r17,3,(PAGE_SHIFT-3) // r28=pud_offset(pgd,addr) + ;; + shr.u r18=r22,PMD_SHIFT // shift pmd index into position +(p7) ld8 r29=[r28] // get *pud (may be 0) + ;; +(p7) cmp.eq.or.andcm p6,p7=r29,r0 // was pud_present(*pud) == NULL? + dep r17=r18,r29,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr) +#else + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pgd,addr) +#endif + ;; +(p7) ld8 r20=[r17] // get *pmd (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift pte index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r20,r0 // was pmd_present(*pmd) == NULL? + dep r21=r19,r20,3,(PAGE_SHIFT-3) // r21=pte_offset(pmd,addr) + ;; +(p7) ld8 r18=[r21] // read *pte + MOV_FROM_ISR(r19) // cr.isr bit 32 tells us if this is an insn miss + ;; +(p7) tbit.z p6,p7=r18,_PAGE_P_BIT // page present bit cleared? + MOV_FROM_IHA(r22) // get the VHPT address that caused the TLB miss + ;; // avoid RAW on p7 +(p7) tbit.nz.unc p10,p11=r19,32 // is it an instruction TLB miss? + dep r23=0,r20,0,PAGE_SHIFT // clear low bits to get page address + ;; + ITC_I_AND_D(p10, p11, r18, r24) // insert the instruction TLB entry and + // insert the data TLB entry +(p6) br.cond.spnt.many page_fault // handle bad address/page not present (page fault) + MOV_TO_IFA(r22, r24) + +#ifdef CONFIG_HUGETLB_PAGE + MOV_TO_ITIR(p8, r25, r24) // change to default page-size for VHPT +#endif + + /* + * Now compute and insert the TLB entry for the virtual page table. We never + * execute in a page table page so there is no need to set the exception deferral + * bit. + */ + adds r24=__DIRTY_BITS_NO_ED|_PAGE_PL_0|_PAGE_AR_RW,r23 + ;; + ITC_D(p7, r24, r25) + ;; +#ifdef CONFIG_SMP + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + /* + * Re-check pagetable entry. If they changed, we may have received a ptc.g + * between reading the pagetable and the "itc". If so, flush the entry we + * inserted and retry. At this point, we have: + * + * r28 = equivalent of pud_offset(pgd, ifa) + * r17 = equivalent of pmd_offset(pud, ifa) + * r21 = equivalent of pte_offset(pmd, ifa) + * + * r29 = *pud + * r20 = *pmd + * r18 = *pte + */ + ld8 r25=[r21] // read *pte again + ld8 r26=[r17] // read *pmd again +#ifdef CONFIG_PGTABLE_4 + ld8 r19=[r28] // read *pud again +#endif + cmp.ne p6,p7=r0,r0 + ;; + cmp.ne.or.andcm p6,p7=r26,r20 // did *pmd change +#ifdef CONFIG_PGTABLE_4 + cmp.ne.or.andcm p6,p7=r19,r29 // did *pud change +#endif + mov r27=PAGE_SHIFT<<2 + ;; +(p6) ptc.l r22,r27 // purge PTE page translation +(p7) cmp.ne.or.andcm p6,p7=r25,r18 // did *pte change + ;; +(p6) ptc.l r16,r27 // purge translation +#endif + + mov pr=r31,-1 // restore predicate registers + RFI +END(vhpt_miss) + + .org ia64_ivt+0x400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0400 Entry 1 (size 64 bundles) ITLB (21) +ENTRY(itlb_miss) + DBG_FAULT(1) + /* + * The ITLB handler accesses the PTE via the virtually mapped linear + * page table. If a nested TLB miss occurs, we switch into physical + * mode, walk the page table, and then re-execute the PTE read and + * go on normally after that. + */ + MOV_FROM_IFA(r16) // get virtual address + mov r29=b0 // save b0 + mov r31=pr // save predicates +.itlb_fault: + MOV_FROM_IHA(r17) // get virtual address of PTE + movl r30=1f // load nested fault continuation point + ;; +1: ld8 r18=[r17] // read *pte + ;; + mov b0=r29 + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? +(p6) br.cond.spnt page_fault + ;; + ITC_I(p0, r18, r19) + ;; +#ifdef CONFIG_SMP + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + ld8 r19=[r17] // read *pte again and see if same + mov r20=PAGE_SHIFT<<2 // setup page size for purge + ;; + cmp.ne p7,p0=r18,r19 + ;; +(p7) ptc.l r16,r20 +#endif + mov pr=r31,-1 + RFI +END(itlb_miss) + + .org ia64_ivt+0x0800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0800 Entry 2 (size 64 bundles) DTLB (9,48) +ENTRY(dtlb_miss) + DBG_FAULT(2) + /* + * The DTLB handler accesses the PTE via the virtually mapped linear + * page table. If a nested TLB miss occurs, we switch into physical + * mode, walk the page table, and then re-execute the PTE read and + * go on normally after that. + */ + MOV_FROM_IFA(r16) // get virtual address + mov r29=b0 // save b0 + mov r31=pr // save predicates +dtlb_fault: + MOV_FROM_IHA(r17) // get virtual address of PTE + movl r30=1f // load nested fault continuation point + ;; +1: ld8 r18=[r17] // read *pte + ;; + mov b0=r29 + tbit.z p6,p0=r18,_PAGE_P_BIT // page present bit cleared? +(p6) br.cond.spnt page_fault + ;; + ITC_D(p0, r18, r19) + ;; +#ifdef CONFIG_SMP + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + ld8 r19=[r17] // read *pte again and see if same + mov r20=PAGE_SHIFT<<2 // setup page size for purge + ;; + cmp.ne p7,p0=r18,r19 + ;; +(p7) ptc.l r16,r20 +#endif + mov pr=r31,-1 + RFI +END(dtlb_miss) + + .org ia64_ivt+0x0c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19) +ENTRY(alt_itlb_miss) + DBG_FAULT(3) + MOV_FROM_IFA(r16) // get address that caused the TLB miss + movl r17=PAGE_KERNEL + MOV_FROM_IPSR(p0, r21) + movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) + mov r31=pr + ;; +#ifdef CONFIG_DISABLE_VHPT + shr.u r22=r16,61 // get the region number into r21 + ;; + cmp.gt p8,p0=6,r22 // user mode + ;; + THASH(p8, r17, r16, r23) + ;; + MOV_TO_IHA(p8, r17, r23) +(p8) mov r29=b0 // save b0 +(p8) br.cond.dptk .itlb_fault +#endif + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl + and r19=r19,r16 // clear ed, reserved bits, and PTE control bits + shr.u r18=r16,57 // move address bit 61 to bit 4 + ;; + andcm r18=0x10,r18 // bit 4=~address-bit(61) + cmp.ne p8,p0=r0,r23 // psr.cpl != 0? + or r19=r17,r19 // insert PTE control bits into r19 + ;; + or r19=r19,r18 // set bit 4 (uncached) if the access was to region 6 +(p8) br.cond.spnt page_fault + ;; + ITC_I(p0, r19, r18) // insert the TLB entry + mov pr=r31,-1 + RFI +END(alt_itlb_miss) + + .org ia64_ivt+0x1000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46) +ENTRY(alt_dtlb_miss) + DBG_FAULT(4) + MOV_FROM_IFA(r16) // get address that caused the TLB miss + movl r17=PAGE_KERNEL + MOV_FROM_ISR(r20) + movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff) + MOV_FROM_IPSR(p0, r21) + mov r31=pr + mov r24=PERCPU_ADDR + ;; +#ifdef CONFIG_DISABLE_VHPT + shr.u r22=r16,61 // get the region number into r21 + ;; + cmp.gt p8,p0=6,r22 // access to region 0-5 + ;; + THASH(p8, r17, r16, r25) + ;; + MOV_TO_IHA(p8, r17, r25) +(p8) mov r29=b0 // save b0 +(p8) br.cond.dptk dtlb_fault +#endif + cmp.ge p10,p11=r16,r24 // access to per_cpu_data? + tbit.z p12,p0=r16,61 // access to region 6? + mov r25=PERCPU_PAGE_SHIFT << 2 + mov r26=PERCPU_PAGE_SIZE + nop.m 0 + nop.b 0 + ;; +(p10) mov r19=IA64_KR(PER_CPU_DATA) +(p11) and r19=r19,r16 // clear non-ppn fields + extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl + and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field + tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? + tbit.nz p9,p0=r20,IA64_ISR_NA_BIT // is non-access bit on? + ;; +(p10) sub r19=r19,r26 + MOV_TO_ITIR(p10, r25, r24) + cmp.ne p8,p0=r0,r23 +(p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field +(p12) dep r17=-1,r17,4,1 // set ma=UC for region 6 addr +(p8) br.cond.spnt page_fault + + dep r21=-1,r21,IA64_PSR_ED_BIT,1 + ;; + or r19=r19,r17 // insert PTE control bits into r19 + MOV_TO_IPSR(p6, r21, r24) + ;; + ITC_D(p7, r19, r18) // insert the TLB entry + mov pr=r31,-1 + RFI +END(alt_dtlb_miss) + + .org ia64_ivt+0x1400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45) +ENTRY(nested_dtlb_miss) + /* + * In the absence of kernel bugs, we get here when the virtually mapped linear + * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction + * Access-bit, or Data Access-bit faults). If the DTLB entry for the virtual page + * table is missing, a nested TLB miss fault is triggered and control is + * transferred to this point. When this happens, we lookup the pte for the + * faulting address by walking the page table in physical mode and return to the + * continuation point passed in register r30 (or call page_fault if the address is + * not mapped). + * + * Input: r16: faulting address + * r29: saved b0 + * r30: continuation address + * r31: saved pr + * + * Output: r17: physical address of PTE of faulting address + * r29: saved b0 + * r30: continuation address + * r31: saved pr + * + * Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared) + */ + RSM_PSR_DT // switch to using physical data addressing + mov r19=IA64_KR(PT_BASE) // get the page table base address + shl r21=r16,3 // shift bit 60 into sign bit + MOV_FROM_ITIR(r18) + ;; + shr.u r17=r16,61 // get the region number into r17 + extr.u r18=r18,2,6 // get the faulting page size + ;; + cmp.eq p6,p7=5,r17 // is faulting address in region 5? + add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address + add r18=PGDIR_SHIFT-PAGE_SHIFT,r18 + ;; + shr.u r22=r16,r22 + shr.u r18=r16,r18 +(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place + + srlz.d + LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir + + .pred.rel "mutex", p6, p7 +(p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT +(p7) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 + ;; +(p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 +(p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] + cmp.eq p7,p6=0,r21 // unused address bits all zeroes? +#ifdef CONFIG_PGTABLE_4 + shr.u r18=r22,PUD_SHIFT // shift pud index into position +#else + shr.u r18=r22,PMD_SHIFT // shift pmd index into position +#endif + ;; + ld8 r17=[r17] // get *pgd (may be 0) + ;; +(p7) cmp.eq p6,p7=r17,r0 // was pgd_present(*pgd) == NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=p[u|m]d_offset(pgd,addr) + ;; +#ifdef CONFIG_PGTABLE_4 +(p7) ld8 r17=[r17] // get *pud (may be 0) + shr.u r18=r22,PMD_SHIFT // shift pmd index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pud_present(*pud) == NULL? + dep r17=r18,r17,3,(PAGE_SHIFT-3) // r17=pmd_offset(pud,addr) + ;; +#endif +(p7) ld8 r17=[r17] // get *pmd (may be 0) + shr.u r19=r22,PAGE_SHIFT // shift pte index into position + ;; +(p7) cmp.eq.or.andcm p6,p7=r17,r0 // was pmd_present(*pmd) == NULL? + dep r17=r19,r17,3,(PAGE_SHIFT-3) // r17=pte_offset(pmd,addr); +(p6) br.cond.spnt page_fault + mov b0=r30 + br.sptk.many b0 // return to continuation point +END(nested_dtlb_miss) + + .org ia64_ivt+0x1800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24) +ENTRY(ikey_miss) + DBG_FAULT(6) + FAULT(6) +END(ikey_miss) + + .org ia64_ivt+0x1c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) +ENTRY(dkey_miss) + DBG_FAULT(7) + FAULT(7) +END(dkey_miss) + + .org ia64_ivt+0x2000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54) +ENTRY(dirty_bit) + DBG_FAULT(8) + /* + * What we do here is to simply turn on the dirty bit in the PTE. We need to + * update both the page-table and the TLB entry. To efficiently access the PTE, + * we address it through the virtual page table. Most likely, the TLB entry for + * the relevant virtual page table page is still present in the TLB so we can + * normally do this without additional TLB misses. In case the necessary virtual + * page table TLB entry isn't present, we take a nested TLB miss hit where we look + * up the physical address of the L3 PTE and then continue at label 1 below. + */ + MOV_FROM_IFA(r16) // get the address that caused the fault + movl r30=1f // load continuation point in case of nested fault + ;; + THASH(p0, r17, r16, r18) // compute virtual address of L3 PTE + mov r29=b0 // save b0 in case of nested fault + mov r31=pr // save pr +#ifdef CONFIG_SMP + mov r28=ar.ccv // save ar.ccv + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + mov ar.ccv=r18 // set compare value for cmpxchg + or r25=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit + ;; +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only update if page is present + mov r24=PAGE_SHIFT<<2 + ;; +(p6) cmp.eq p6,p7=r26,r18 // Only compare if page is present + ;; + ITC_D(p6, r25, r18) // install updated PTE + ;; + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + ld8 r18=[r17] // read PTE again + ;; + cmp.eq p6,p7=r18,r25 // is it same as the newly installed + ;; +(p7) ptc.l r16,r24 + mov b0=r29 // restore b0 + mov ar.ccv=r28 +#else + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + or r18=_PAGE_D|_PAGE_A,r18 // set the dirty and accessed bits + mov b0=r29 // restore b0 + ;; + st8 [r17]=r18 // store back updated PTE + ITC_D(p0, r18, r16) // install updated PTE +#endif + mov pr=r31,-1 // restore pr + RFI +END(dirty_bit) + + .org ia64_ivt+0x2400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27) +ENTRY(iaccess_bit) + DBG_FAULT(9) + // Like Entry 8, except for instruction access + MOV_FROM_IFA(r16) // get the address that caused the fault + movl r30=1f // load continuation point in case of nested fault + mov r31=pr // save predicates +#ifdef CONFIG_ITANIUM + /* + * Erratum 10 (IFA may contain incorrect address) has "NoFix" status. + */ + MOV_FROM_IPSR(p0, r17) + ;; + MOV_FROM_IIP(r18) + tbit.z p6,p0=r17,IA64_PSR_IS_BIT // IA64 instruction set? + ;; +(p6) mov r16=r18 // if so, use cr.iip instead of cr.ifa +#endif /* CONFIG_ITANIUM */ + ;; + THASH(p0, r17, r16, r18) // compute virtual address of L3 PTE + mov r29=b0 // save b0 in case of nested fault) +#ifdef CONFIG_SMP + mov r28=ar.ccv // save ar.ccv + ;; +1: ld8 r18=[r17] + ;; + mov ar.ccv=r18 // set compare value for cmpxchg + or r25=_PAGE_A,r18 // set the accessed bit + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit + ;; +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page present + mov r24=PAGE_SHIFT<<2 + ;; +(p6) cmp.eq p6,p7=r26,r18 // Only if page present + ;; + ITC_I(p6, r25, r26) // install updated PTE + ;; + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + + ld8 r18=[r17] // read PTE again + ;; + cmp.eq p6,p7=r18,r25 // is it same as the newly installed + ;; +(p7) ptc.l r16,r24 + mov b0=r29 // restore b0 + mov ar.ccv=r28 +#else /* !CONFIG_SMP */ + ;; +1: ld8 r18=[r17] + ;; + or r18=_PAGE_A,r18 // set the accessed bit + mov b0=r29 // restore b0 + ;; + st8 [r17]=r18 // store back updated PTE + ITC_I(p0, r18, r16) // install updated PTE +#endif /* !CONFIG_SMP */ + mov pr=r31,-1 + RFI +END(iaccess_bit) + + .org ia64_ivt+0x2800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55) +ENTRY(daccess_bit) + DBG_FAULT(10) + // Like Entry 8, except for data access + MOV_FROM_IFA(r16) // get the address that caused the fault + movl r30=1f // load continuation point in case of nested fault + ;; + THASH(p0, r17, r16, r18) // compute virtual address of L3 PTE + mov r31=pr + mov r29=b0 // save b0 in case of nested fault) +#ifdef CONFIG_SMP + mov r28=ar.ccv // save ar.ccv + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + mov ar.ccv=r18 // set compare value for cmpxchg + or r25=_PAGE_A,r18 // set the dirty bit + tbit.z p7,p6 = r18,_PAGE_P_BIT // Check present bit + ;; +(p6) cmpxchg8.acq r26=[r17],r25,ar.ccv // Only if page is present + mov r24=PAGE_SHIFT<<2 + ;; +(p6) cmp.eq p6,p7=r26,r18 // Only if page is present + ;; + ITC_D(p6, r25, r26) // install updated PTE + /* + * Tell the assemblers dependency-violation checker that the above "itc" instructions + * cannot possibly affect the following loads: + */ + dv_serialize_data + ;; + ld8 r18=[r17] // read PTE again + ;; + cmp.eq p6,p7=r18,r25 // is it same as the newly installed + ;; +(p7) ptc.l r16,r24 + mov ar.ccv=r28 +#else + ;; +1: ld8 r18=[r17] + ;; // avoid RAW on r18 + or r18=_PAGE_A,r18 // set the accessed bit + ;; + st8 [r17]=r18 // store back updated PTE + ITC_D(p0, r18, r16) // install updated PTE +#endif + mov b0=r29 // restore b0 + mov pr=r31,-1 + RFI +END(daccess_bit) + + .org ia64_ivt+0x2c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x2c00 Entry 11 (size 64 bundles) Break instruction (33) +ENTRY(break_fault) + /* + * The streamlined system call entry/exit paths only save/restore the initial part + * of pt_regs. This implies that the callers of system-calls must adhere to the + * normal procedure calling conventions. + * + * Registers to be saved & restored: + * CR registers: cr.ipsr, cr.iip, cr.ifs + * AR registers: ar.unat, ar.pfs, ar.rsc, ar.rnat, ar.bspstore, ar.fpsr + * others: pr, b0, b6, loadrs, r1, r11, r12, r13, r15 + * Registers to be restored only: + * r8-r11: output value from the system call. + * + * During system call exit, scratch registers (including r15) are modified/cleared + * to prevent leaking bits from kernel to user level. + */ + DBG_FAULT(11) + mov.m r16=IA64_KR(CURRENT) // M2 r16 <- current task (12 cyc) + MOV_FROM_IPSR(p0, r29) // M2 (12 cyc) + mov r31=pr // I0 (2 cyc) + + MOV_FROM_IIM(r17) // M2 (2 cyc) + mov.m r27=ar.rsc // M2 (12 cyc) + mov r18=__IA64_BREAK_SYSCALL // A + + mov.m ar.rsc=0 // M2 + mov.m r21=ar.fpsr // M2 (12 cyc) + mov r19=b6 // I0 (2 cyc) + ;; + mov.m r23=ar.bspstore // M2 (12 cyc) + mov.m r24=ar.rnat // M2 (5 cyc) + mov.i r26=ar.pfs // I0 (2 cyc) + + invala // M0|1 + nop.m 0 // M + mov r20=r1 // A save r1 + + nop.m 0 + movl r30=sys_call_table // X + + MOV_FROM_IIP(r28) // M2 (2 cyc) + cmp.eq p0,p7=r18,r17 // I0 is this a system call? +(p7) br.cond.spnt non_syscall // B no -> + // + // From this point on, we are definitely on the syscall-path + // and we can use (non-banked) scratch registers. + // +/////////////////////////////////////////////////////////////////////// + mov r1=r16 // A move task-pointer to "addl"-addressable reg + mov r2=r16 // A setup r2 for ia64_syscall_setup + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // A r9 = ¤t_thread_info()->flags + + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 + adds r15=-1024,r15 // A subtract 1024 from syscall number + mov r3=NR_syscalls - 1 + ;; + ld1.bias r17=[r16] // M0|1 r17 = current->thread.on_ustack flag + ld4 r9=[r9] // M0|1 r9 = current_thread_info()->flags + extr.u r8=r29,41,2 // I0 extract ei field from cr.ipsr + + shladd r30=r15,3,r30 // A r30 = sys_call_table + 8*(syscall-1024) + addl r22=IA64_RBS_OFFSET,r1 // A compute base of RBS + cmp.leu p6,p7=r15,r3 // A syscall number in range? + ;; + + lfetch.fault.excl.nt1 [r22] // M0|1 prefetch RBS +(p6) ld8 r30=[r30] // M0|1 load address of syscall entry point + tnat.nz.or p7,p0=r15 // I0 is syscall nr a NaT? + + mov.m ar.bspstore=r22 // M2 switch to kernel RBS + cmp.eq p8,p9=2,r8 // A isr.ei==2? + ;; + +(p8) mov r8=0 // A clear ei to 0 +(p7) movl r30=sys_ni_syscall // X + +(p8) adds r28=16,r28 // A switch cr.iip to next bundle +(p9) adds r8=1,r8 // A increment ei to next slot +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + ;; + mov b6=r30 // I0 setup syscall handler branch reg early +#else + nop.i 0 + ;; +#endif + + mov.m r25=ar.unat // M2 (5 cyc) + dep r29=r8,r29,41,2 // I0 insert new ei into cr.ipsr + adds r15=1024,r15 // A restore original syscall number + // + // If any of the above loads miss in L1D, we'll stall here until + // the data arrives. + // +/////////////////////////////////////////////////////////////////////// + st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + MOV_FROM_ITC(p0, p14, r30, r18) // M get cycle for accounting +#else + mov b6=r30 // I0 setup syscall handler branch reg early +#endif + cmp.eq pKStk,pUStk=r0,r17 // A were we on kernel stacks already? + + and r9=_TIF_SYSCALL_TRACEAUDIT,r9 // A mask trace or audit + mov r18=ar.bsp // M2 (12 cyc) +(pKStk) br.cond.spnt .break_fixup // B we're already in kernel-mode -- fix up RBS + ;; +.back_from_break_fixup: +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A compute base of memory stack + cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited? + br.call.sptk.many b7=ia64_syscall_setup // B +1: +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + // mov.m r30=ar.itc is called in advance, and r13 is current + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 // A + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 // A +(pKStk) br.cond.spnt .skip_accounting // B unlikely skip + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // M get last stamp + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // M time at leave + ;; + ld8 r20=[r16],TI_AC_STAMP-TI_AC_STIME // M cumulated stime + ld8 r21=[r17] // M cumulated utime + sub r22=r19,r18 // A stime before leave + ;; + st8 [r16]=r30,TI_AC_STIME-TI_AC_STAMP // M update stamp + sub r18=r30,r19 // A elapsed time in user + ;; + add r20=r20,r22 // A sum stime + add r21=r21,r18 // A sum utime + ;; + st8 [r16]=r20 // M update stime + st8 [r17]=r21 // M update utime + ;; +.skip_accounting: +#endif + mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 + nop 0 + BSW_1(r2, r14) // B (6 cyc) regs are saved, switch to bank 1 + ;; + + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r16) // M2 now it's safe to re-enable intr.-collection + // M0 ensure interruption collection is on + movl r3=ia64_ret_from_syscall // X + ;; + mov rp=r3 // I0 set the real return addr +(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT + + SSM_PSR_I(p15, p15, r16) // M2 restore psr.i +(p14) br.call.sptk.many b6=b6 // B invoke syscall-handker (ignore return addr) + br.cond.spnt.many ia64_trace_syscall // B do syscall-tracing thingamagic + // NOT REACHED +/////////////////////////////////////////////////////////////////////// + // On entry, we optimistically assumed that we're coming from user-space. + // For the rare cases where a system-call is done from within the kernel, + // we fix things up at this point: +.break_fixup: + add r1=-IA64_PT_REGS_SIZE,sp // A allocate space for pt_regs structure + mov ar.rnat=r24 // M2 restore kernel's AR.RNAT + ;; + mov ar.bspstore=r23 // M2 restore kernel's AR.BSPSTORE + br.cond.sptk .back_from_break_fixup +END(break_fault) + + .org ia64_ivt+0x3000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3000 Entry 12 (size 64 bundles) External Interrupt (4) +ENTRY(interrupt) + /* interrupt handler has become too big to fit this area. */ + br.sptk.many __interrupt +END(interrupt) + + .org ia64_ivt+0x3400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3400 Entry 13 (size 64 bundles) Reserved + DBG_FAULT(13) + FAULT(13) + + .org ia64_ivt+0x3800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3800 Entry 14 (size 64 bundles) Reserved + DBG_FAULT(14) + FAULT(14) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... + * + * ia64_syscall_setup() is a separate subroutine so that it can + * allocate stacked registers so it can safely demine any + * potential NaT values from the input registers. + * + * On entry: + * - executing on bank 0 or bank 1 register set (doesn't matter) + * - r1: stack pointer + * - r2: current task pointer + * - r3: preserved + * - r11: original contents (saved ar.pfs to be saved) + * - r12: original contents (sp to be saved) + * - r13: original contents (tp to be saved) + * - r15: original contents (syscall # to be saved) + * - r18: saved bsp (after switching to kernel stack) + * - r19: saved b6 + * - r20: saved r1 (gp) + * - r21: saved ar.fpsr + * - r22: kernel's register backing store base (krbs_base) + * - r23: saved ar.bspstore + * - r24: saved ar.rnat + * - r25: saved ar.unat + * - r26: saved ar.pfs + * - r27: saved ar.rsc + * - r28: saved cr.iip + * - r29: saved cr.ipsr + * - r30: ar.itc for accounting (don't touch) + * - r31: saved pr + * - b0: original contents (to be saved) + * On exit: + * - p10: TRUE if syscall is invoked with more than 8 out + * registers or r15's Nat is true + * - r1: kernel's gp + * - r3: preserved (same as on entry) + * - r8: -EINVAL if p10 is true + * - r12: points to kernel stack + * - r13: points to current task + * - r14: preserved (same as on entry) + * - p13: preserved + * - p15: TRUE if interrupts need to be re-enabled + * - ar.fpsr: set to kernel settings + * - b6: preserved (same as on entry) + */ +#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE +GLOBAL_ENTRY(ia64_syscall_setup) +#if PT(B6) != 0 +# error This code assumes that b6 is the first field in pt_regs. +#endif + st8 [r1]=r19 // save b6 + add r16=PT(CR_IPSR),r1 // initialize first base pointer + add r17=PT(R11),r1 // initialize second base pointer + ;; + alloc r19=ar.pfs,8,0,0,0 // ensure in0-in7 are writable + st8 [r16]=r29,PT(AR_PFS)-PT(CR_IPSR) // save cr.ipsr + tnat.nz p8,p0=in0 + + st8.spill [r17]=r11,PT(CR_IIP)-PT(R11) // save r11 + tnat.nz p9,p0=in1 +(pKStk) mov r18=r0 // make sure r18 isn't NaT + ;; + + st8 [r16]=r26,PT(CR_IFS)-PT(AR_PFS) // save ar.pfs + st8 [r17]=r28,PT(AR_UNAT)-PT(CR_IIP) // save cr.iip + mov r28=b0 // save b0 (2 cyc) + ;; + + st8 [r17]=r25,PT(AR_RSC)-PT(AR_UNAT) // save ar.unat + dep r19=0,r19,38,26 // clear all bits but 0..37 [I0] +(p8) mov in0=-1 + ;; + + st8 [r16]=r19,PT(AR_RNAT)-PT(CR_IFS) // store ar.pfs.pfm in cr.ifs + extr.u r11=r19,7,7 // I0 // get sol of ar.pfs + and r8=0x7f,r19 // A // get sof of ar.pfs + + st8 [r17]=r27,PT(AR_BSPSTORE)-PT(AR_RSC)// save ar.rsc + tbit.nz p15,p0=r29,IA64_PSR_I_BIT // I0 +(p9) mov in1=-1 + ;; + +(pUStk) sub r18=r18,r22 // r18=RSE.ndirty*8 + tnat.nz p10,p0=in2 + add r11=8,r11 + ;; +(pKStk) adds r16=PT(PR)-PT(AR_RNAT),r16 // skip over ar_rnat field +(pKStk) adds r17=PT(B0)-PT(AR_BSPSTORE),r17 // skip over ar_bspstore field + tnat.nz p11,p0=in3 + ;; +(p10) mov in2=-1 + tnat.nz p12,p0=in4 // [I0] +(p11) mov in3=-1 + ;; +(pUStk) st8 [r16]=r24,PT(PR)-PT(AR_RNAT) // save ar.rnat +(pUStk) st8 [r17]=r23,PT(B0)-PT(AR_BSPSTORE) // save ar.bspstore + shl r18=r18,16 // compute ar.rsc to be used for "loadrs" + ;; + st8 [r16]=r31,PT(LOADRS)-PT(PR) // save predicates + st8 [r17]=r28,PT(R1)-PT(B0) // save b0 + tnat.nz p13,p0=in5 // [I0] + ;; + st8 [r16]=r18,PT(R12)-PT(LOADRS) // save ar.rsc value for "loadrs" + st8.spill [r17]=r20,PT(R13)-PT(R1) // save original r1 +(p12) mov in4=-1 + ;; + +.mem.offset 0,0; st8.spill [r16]=r12,PT(AR_FPSR)-PT(R12) // save r12 +.mem.offset 8,0; st8.spill [r17]=r13,PT(R15)-PT(R13) // save r13 +(p13) mov in5=-1 + ;; + st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr + tnat.nz p13,p0=in6 + cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8 + ;; + mov r8=1 +(p9) tnat.nz p10,p0=r15 + adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch) + + st8.spill [r17]=r15 // save r15 + tnat.nz p8,p0=in7 + nop.i 0 + + mov r13=r2 // establish `current' + movl r1=__gp // establish kernel global pointer + ;; + st8 [r16]=r8 // ensure pt_regs.r8 != 0 (see handle_syscall_error) +(p13) mov in6=-1 +(p8) mov in7=-1 + + cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 + movl r17=FPSR_DEFAULT + ;; + mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value +(p10) mov r8=-EINVAL + br.ret.sptk.many b7 +END(ia64_syscall_setup) +#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ + + .org ia64_ivt+0x3c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x3c00 Entry 15 (size 64 bundles) Reserved + DBG_FAULT(15) + FAULT(15) + + .org ia64_ivt+0x4000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4000 Entry 16 (size 64 bundles) Reserved + DBG_FAULT(16) + FAULT(16) + +#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(__IA64_ASM_PARAVIRTUALIZED_NATIVE) + /* + * There is no particular reason for this code to be here, other than + * that there happens to be space here that would go unused otherwise. + * If this fault ever gets "unreserved", simply moved the following + * code to a more suitable spot... + * + * account_sys_enter is called from SAVE_MIN* macros if accounting is + * enabled and if the macro is entered from user mode. + */ +GLOBAL_ENTRY(account_sys_enter) + // mov.m r20=ar.itc is called in advance, and r13 is current + add r16=TI_AC_STAMP+IA64_TASK_SIZE,r13 + add r17=TI_AC_LEAVE+IA64_TASK_SIZE,r13 + ;; + ld8 r18=[r16],TI_AC_STIME-TI_AC_STAMP // time at last check in kernel + ld8 r19=[r17],TI_AC_UTIME-TI_AC_LEAVE // time at left from kernel + ;; + ld8 r23=[r16],TI_AC_STAMP-TI_AC_STIME // cumulated stime + ld8 r21=[r17] // cumulated utime + sub r22=r19,r18 // stime before leave kernel + ;; + st8 [r16]=r20,TI_AC_STIME-TI_AC_STAMP // update stamp + sub r18=r20,r19 // elapsed time in user mode + ;; + add r23=r23,r22 // sum stime + add r21=r21,r18 // sum utime + ;; + st8 [r16]=r23 // update stime + st8 [r17]=r21 // update utime + ;; + br.ret.sptk.many rp +END(account_sys_enter) +#endif + + .org ia64_ivt+0x4400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4400 Entry 17 (size 64 bundles) Reserved + DBG_FAULT(17) + FAULT(17) + + .org ia64_ivt+0x4800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4800 Entry 18 (size 64 bundles) Reserved + DBG_FAULT(18) + FAULT(18) + + .org ia64_ivt+0x4c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x4c00 Entry 19 (size 64 bundles) Reserved + DBG_FAULT(19) + FAULT(19) + +// +// --- End of long entries, Beginning of short entries +// + + .org ia64_ivt+0x5000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49) +ENTRY(page_not_present) + DBG_FAULT(20) + MOV_FROM_IFA(r16) + RSM_PSR_DT + /* + * The Linux page fault handler doesn't expect non-present pages to be in + * the TLB. Flush the existing entry now, so we meet that expectation. + */ + mov r17=PAGE_SHIFT<<2 + ;; + ptc.l r16,r17 + ;; + mov r31=pr + srlz.d + br.sptk.many page_fault +END(page_not_present) + + .org ia64_ivt+0x5100 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52) +ENTRY(key_permission) + DBG_FAULT(21) + MOV_FROM_IFA(r16) + RSM_PSR_DT + mov r31=pr + ;; + srlz.d + br.sptk.many page_fault +END(key_permission) + + .org ia64_ivt+0x5200 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26) +ENTRY(iaccess_rights) + DBG_FAULT(22) + MOV_FROM_IFA(r16) + RSM_PSR_DT + mov r31=pr + ;; + srlz.d + br.sptk.many page_fault +END(iaccess_rights) + + .org ia64_ivt+0x5300 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53) +ENTRY(daccess_rights) + DBG_FAULT(23) + MOV_FROM_IFA(r16) + RSM_PSR_DT + mov r31=pr + ;; + srlz.d + br.sptk.many page_fault +END(daccess_rights) + + .org ia64_ivt+0x5400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39) +ENTRY(general_exception) + DBG_FAULT(24) + MOV_FROM_ISR(r16) + mov r31=pr + ;; + cmp4.eq p6,p0=0,r16 +(p6) br.sptk.many dispatch_illegal_op_fault + ;; + mov r19=24 // fault number + br.sptk.many dispatch_to_fault_handler +END(general_exception) + + .org ia64_ivt+0x5500 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35) +ENTRY(disabled_fp_reg) + DBG_FAULT(25) + rsm psr.dfh // ensure we can access fph + ;; + srlz.d + mov r31=pr + mov r19=25 + br.sptk.many dispatch_to_fault_handler +END(disabled_fp_reg) + + .org ia64_ivt+0x5600 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50) +ENTRY(nat_consumption) + DBG_FAULT(26) + + MOV_FROM_IPSR(p0, r16) + MOV_FROM_ISR(r17) + mov r31=pr // save PR + ;; + and r18=0xf,r17 // r18 = cr.ipsr.code{3:0} + tbit.z p6,p0=r17,IA64_ISR_NA_BIT + ;; + cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18 + dep r16=-1,r16,IA64_PSR_ED_BIT,1 +(p6) br.cond.spnt 1f // branch if (cr.ispr.na == 0 || cr.ipsr.code{3:0} != LFETCH) + ;; + MOV_TO_IPSR(p0, r16, r18) + mov pr=r31,-1 + ;; + RFI + +1: mov pr=r31,-1 + ;; + FAULT(26) +END(nat_consumption) + + .org ia64_ivt+0x5700 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5700 Entry 27 (size 16 bundles) Speculation (40) +ENTRY(speculation_vector) + DBG_FAULT(27) + /* + * A [f]chk.[as] instruction needs to take the branch to the recovery code but + * this part of the architecture is not implemented in hardware on some CPUs, such + * as Itanium. Thus, in general we need to emulate the behavior. IIM contains + * the relative target (not yet sign extended). So after sign extending it we + * simply add it to IIP. We also need to reset the EI field of the IPSR to zero, + * i.e., the slot to restart into. + * + * cr.imm contains zero_ext(imm21) + */ + MOV_FROM_IIM(r18) + ;; + MOV_FROM_IIP(r17) + shl r18=r18,43 // put sign bit in position (43=64-21) + ;; + + MOV_FROM_IPSR(p0, r16) + shr r18=r18,39 // sign extend (39=43-4) + ;; + + add r17=r17,r18 // now add the offset + ;; + MOV_TO_IIP(r17, r19) + dep r16=0,r16,41,2 // clear EI + ;; + + MOV_TO_IPSR(p0, r16, r19) + ;; + + RFI +END(speculation_vector) + + .org ia64_ivt+0x5800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5800 Entry 28 (size 16 bundles) Reserved + DBG_FAULT(28) + FAULT(28) + + .org ia64_ivt+0x5900 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56) +ENTRY(debug_vector) + DBG_FAULT(29) + FAULT(29) +END(debug_vector) + + .org ia64_ivt+0x5a00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57) +ENTRY(unaligned_access) + DBG_FAULT(30) + mov r31=pr // prepare to save predicates + ;; + br.sptk.many dispatch_unaligned_handler +END(unaligned_access) + + .org ia64_ivt+0x5b00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57) +ENTRY(unsupported_data_reference) + DBG_FAULT(31) + FAULT(31) +END(unsupported_data_reference) + + .org ia64_ivt+0x5c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64) +ENTRY(floating_point_fault) + DBG_FAULT(32) + FAULT(32) +END(floating_point_fault) + + .org ia64_ivt+0x5d00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66) +ENTRY(floating_point_trap) + DBG_FAULT(33) + FAULT(33) +END(floating_point_trap) + + .org ia64_ivt+0x5e00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66) +ENTRY(lower_privilege_trap) + DBG_FAULT(34) + FAULT(34) +END(lower_privilege_trap) + + .org ia64_ivt+0x5f00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68) +ENTRY(taken_branch_trap) + DBG_FAULT(35) + FAULT(35) +END(taken_branch_trap) + + .org ia64_ivt+0x6000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69) +ENTRY(single_step_trap) + DBG_FAULT(36) + FAULT(36) +END(single_step_trap) + + .org ia64_ivt+0x6100 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6100 Entry 37 (size 16 bundles) Reserved + DBG_FAULT(37) + FAULT(37) + + .org ia64_ivt+0x6200 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6200 Entry 38 (size 16 bundles) Reserved + DBG_FAULT(38) + FAULT(38) + + .org ia64_ivt+0x6300 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6300 Entry 39 (size 16 bundles) Reserved + DBG_FAULT(39) + FAULT(39) + + .org ia64_ivt+0x6400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6400 Entry 40 (size 16 bundles) Reserved + DBG_FAULT(40) + FAULT(40) + + .org ia64_ivt+0x6500 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6500 Entry 41 (size 16 bundles) Reserved + DBG_FAULT(41) + FAULT(41) + + .org ia64_ivt+0x6600 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6600 Entry 42 (size 16 bundles) Reserved + DBG_FAULT(42) + FAULT(42) + + .org ia64_ivt+0x6700 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6700 Entry 43 (size 16 bundles) Reserved + DBG_FAULT(43) + FAULT(43) + + .org ia64_ivt+0x6800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6800 Entry 44 (size 16 bundles) Reserved + DBG_FAULT(44) + FAULT(44) + + .org ia64_ivt+0x6900 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77) +ENTRY(ia32_exception) + DBG_FAULT(45) + FAULT(45) +END(ia32_exception) + + .org ia64_ivt+0x6a00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71) +ENTRY(ia32_intercept) + DBG_FAULT(46) + FAULT(46) +END(ia32_intercept) + + .org ia64_ivt+0x6b00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74) +ENTRY(ia32_interrupt) + DBG_FAULT(47) + FAULT(47) +END(ia32_interrupt) + + .org ia64_ivt+0x6c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6c00 Entry 48 (size 16 bundles) Reserved + DBG_FAULT(48) + FAULT(48) + + .org ia64_ivt+0x6d00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6d00 Entry 49 (size 16 bundles) Reserved + DBG_FAULT(49) + FAULT(49) + + .org ia64_ivt+0x6e00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6e00 Entry 50 (size 16 bundles) Reserved + DBG_FAULT(50) + FAULT(50) + + .org ia64_ivt+0x6f00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x6f00 Entry 51 (size 16 bundles) Reserved + DBG_FAULT(51) + FAULT(51) + + .org ia64_ivt+0x7000 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7000 Entry 52 (size 16 bundles) Reserved + DBG_FAULT(52) + FAULT(52) + + .org ia64_ivt+0x7100 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7100 Entry 53 (size 16 bundles) Reserved + DBG_FAULT(53) + FAULT(53) + + .org ia64_ivt+0x7200 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7200 Entry 54 (size 16 bundles) Reserved + DBG_FAULT(54) + FAULT(54) + + .org ia64_ivt+0x7300 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7300 Entry 55 (size 16 bundles) Reserved + DBG_FAULT(55) + FAULT(55) + + .org ia64_ivt+0x7400 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7400 Entry 56 (size 16 bundles) Reserved + DBG_FAULT(56) + FAULT(56) + + .org ia64_ivt+0x7500 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7500 Entry 57 (size 16 bundles) Reserved + DBG_FAULT(57) + FAULT(57) + + .org ia64_ivt+0x7600 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7600 Entry 58 (size 16 bundles) Reserved + DBG_FAULT(58) + FAULT(58) + + .org ia64_ivt+0x7700 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7700 Entry 59 (size 16 bundles) Reserved + DBG_FAULT(59) + FAULT(59) + + .org ia64_ivt+0x7800 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7800 Entry 60 (size 16 bundles) Reserved + DBG_FAULT(60) + FAULT(60) + + .org ia64_ivt+0x7900 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7900 Entry 61 (size 16 bundles) Reserved + DBG_FAULT(61) + FAULT(61) + + .org ia64_ivt+0x7a00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7a00 Entry 62 (size 16 bundles) Reserved + DBG_FAULT(62) + FAULT(62) + + .org ia64_ivt+0x7b00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7b00 Entry 63 (size 16 bundles) Reserved + DBG_FAULT(63) + FAULT(63) + + .org ia64_ivt+0x7c00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7c00 Entry 64 (size 16 bundles) Reserved + DBG_FAULT(64) + FAULT(64) + + .org ia64_ivt+0x7d00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7d00 Entry 65 (size 16 bundles) Reserved + DBG_FAULT(65) + FAULT(65) + + .org ia64_ivt+0x7e00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7e00 Entry 66 (size 16 bundles) Reserved + DBG_FAULT(66) + FAULT(66) + + .org ia64_ivt+0x7f00 +///////////////////////////////////////////////////////////////////////////////////////// +// 0x7f00 Entry 67 (size 16 bundles) Reserved + DBG_FAULT(67) + FAULT(67) + + //----------------------------------------------------------------------------------- + // call do_page_fault (predicates are in r31, psr.dt may be off, r16 is faulting address) +ENTRY(page_fault) + SSM_PSR_DT_AND_SRLZ_I + ;; + SAVE_MIN_WITH_COVER + alloc r15=ar.pfs,0,0,3,0 + MOV_FROM_IFA(out0) + MOV_FROM_ISR(out1) + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r14, r3) + adds r3=8,r2 // set up second base pointer + SSM_PSR_I(p15, p15, r14) // restore psr.i + movl r14=ia64_leave_kernel + ;; + SAVE_REST + mov rp=r14 + ;; + adds out2=16,r12 // out2 = pointer to pt_regs + br.call.sptk.many b6=ia64_do_page_fault // ignore return address +END(page_fault) + +ENTRY(non_syscall) + mov ar.rsc=r27 // restore ar.rsc before SAVE_MIN_WITH_COVER + ;; + SAVE_MIN_WITH_COVER + + // There is no particular reason for this code to be here, other than that + // there happens to be space here that would go unused otherwise. If this + // fault ever gets "unreserved", simply moved the following code to a more + // suitable spot... + + alloc r14=ar.pfs,0,0,2,0 + MOV_FROM_IIM(out0) + add out1=16,sp + adds r3=8,r2 // set up second base pointer for SAVE_REST + + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r15, r24) + // guarantee that interruption collection is on + SSM_PSR_I(p15, p15, r15) // restore psr.i + movl r15=ia64_leave_kernel + ;; + SAVE_REST + mov rp=r15 + ;; + br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr +END(non_syscall) + +ENTRY(__interrupt) + DBG_FAULT(12) + mov r31=pr // prepare to save predicates + ;; + SAVE_MIN_WITH_COVER // uses r31; defines r2 and r3 + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r14) + // ensure everybody knows psr.ic is back on + adds r3=8,r2 // set up second base pointer for SAVE_REST + ;; + SAVE_REST + ;; + MCA_RECOVER_RANGE(interrupt) + alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group + MOV_FROM_IVR(out0, r8) // pass cr.ivr as first arg + add out1=16,sp // pass pointer to pt_regs as second arg + ;; + srlz.d // make sure we see the effect of cr.ivr + movl r14=ia64_leave_kernel + ;; + mov rp=r14 + br.call.sptk.many b6=ia64_handle_irq +END(__interrupt) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... + */ + +ENTRY(dispatch_unaligned_handler) + SAVE_MIN_WITH_COVER + ;; + alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!) + MOV_FROM_IFA(out0) + adds out1=16,sp + + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24) + // guarantee that interruption collection is on + SSM_PSR_I(p15, p15, r3) // restore psr.i + adds r3=8,r2 // set up second base pointer + ;; + SAVE_REST + movl r14=ia64_leave_kernel + ;; + mov rp=r14 + br.sptk.many ia64_prepare_handle_unaligned +END(dispatch_unaligned_handler) + + /* + * There is no particular reason for this code to be here, other than that + * there happens to be space here that would go unused otherwise. If this + * fault ever gets "unreserved", simply moved the following code to a more + * suitable spot... + */ + +ENTRY(dispatch_to_fault_handler) + /* + * Input: + * psr.ic: off + * r19: fault vector number (e.g., 24 for General Exception) + * r31: contains saved predicates (pr) + */ + SAVE_MIN_WITH_COVER_R19 + alloc r14=ar.pfs,0,0,5,0 + MOV_FROM_ISR(out1) + MOV_FROM_IFA(out2) + MOV_FROM_IIM(out3) + MOV_FROM_ITIR(out4) + ;; + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, out0) + // guarantee that interruption collection is on + mov out0=r15 + ;; + SSM_PSR_I(p15, p15, r3) // restore psr.i + adds r3=8,r2 // set up second base pointer for SAVE_REST + ;; + SAVE_REST + movl r14=ia64_leave_kernel + ;; + mov rp=r14 + br.call.sptk.many b6=ia64_fault +END(dispatch_to_fault_handler) + + /* + * Squatting in this space ... + * + * This special case dispatcher for illegal operation faults allows preserved + * registers to be modified through a callback function (asm only) that is handed + * back from the fault handler in r8. Up to three arguments can be passed to the + * callback function by returning an aggregate with the callback as its first + * element, followed by the arguments. + */ +ENTRY(dispatch_illegal_op_fault) + .prologue + .body + SAVE_MIN_WITH_COVER + SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(r3, r24) + // guarantee that interruption collection is on + ;; + SSM_PSR_I(p15, p15, r3) // restore psr.i + adds r3=8,r2 // set up second base pointer for SAVE_REST + ;; + alloc r14=ar.pfs,0,0,1,0 // must be first in insn group + mov out0=ar.ec + ;; + SAVE_REST + PT_REGS_UNWIND_INFO(0) + ;; + br.call.sptk.many rp=ia64_illegal_op_fault +.ret0: ;; + alloc r14=ar.pfs,0,0,3,0 // must be first in insn group + mov out0=r9 + mov out1=r10 + mov out2=r11 + movl r15=ia64_leave_kernel + ;; + mov rp=r15 + mov b6=r8 + ;; + cmp.ne p6,p0=0,r8 +(p6) br.call.dpnt.many b6=b6 // call returns to ia64_leave_kernel + br.sptk.many ia64_leave_kernel +END(dispatch_illegal_op_fault) diff --git a/arch/ia64/kernel/jprobes.S b/arch/ia64/kernel/jprobes.S new file mode 100644 index 00000000000..f69389c7be1 --- /dev/null +++ b/arch/ia64/kernel/jprobes.S @@ -0,0 +1,90 @@ +/* + * Jprobe specific operations + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Intel Corporation, 2005 + * + * 2005-May Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy + * <anil.s.keshavamurthy@intel.com> initial implementation + * + * Jprobes (a.k.a. "jump probes" which is built on-top of kprobes) allow a + * probe to be inserted into the beginning of a function call. The fundamental + * difference between a jprobe and a kprobe is the jprobe handler is executed + * in the same context as the target function, while the kprobe handlers + * are executed in interrupt context. + * + * For jprobes we initially gain control by placing a break point in the + * first instruction of the targeted function. When we catch that specific + * break, we: + * * set the return address to our jprobe_inst_return() function + * * jump to the jprobe handler function + * + * Since we fixed up the return address, the jprobe handler will return to our + * jprobe_inst_return() function, giving us control again. At this point we + * are back in the parents frame marker, so we do yet another call to our + * jprobe_break() function to fix up the frame marker as it would normally + * exist in the target function. + * + * Our jprobe_return function then transfers control back to kprobes.c by + * executing a break instruction using one of our reserved numbers. When we + * catch that break in kprobes.c, we continue like we do for a normal kprobe + * by single stepping the emulated instruction, and then returning execution + * to the correct location. + */ +#include <asm/asmmacro.h> +#include <asm/break.h> + + /* + * void jprobe_break(void) + */ + .section .kprobes.text, "ax" +ENTRY(jprobe_break) + break.m __IA64_BREAK_JPROBE +END(jprobe_break) + + /* + * void jprobe_inst_return(void) + */ +GLOBAL_ENTRY(jprobe_inst_return) + br.call.sptk.many b0=jprobe_break +END(jprobe_inst_return) + +GLOBAL_ENTRY(invalidate_stacked_regs) + movl r16=invalidate_restore_cfm + ;; + mov b6=r16 + ;; + br.ret.sptk.many b6 + ;; +invalidate_restore_cfm: + mov r16=ar.rsc + ;; + mov ar.rsc=r0 + ;; + loadrs + ;; + mov ar.rsc=r16 + ;; + br.cond.sptk.many rp +END(invalidate_stacked_regs) + +GLOBAL_ENTRY(flush_register_stack) + // flush dirty regs to backing store (must be first in insn group) + flushrs + ;; + br.ret.sptk.many rp +END(flush_register_stack) + diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c new file mode 100644 index 00000000000..7026b29e277 --- /dev/null +++ b/arch/ia64/kernel/kprobes.c @@ -0,0 +1,1129 @@ +/* + * Kernel Probes (KProbes) + * arch/ia64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy + * <anil.s.keshavamurthy@intel.com> adapted from i386 + */ + +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/preempt.h> +#include <linux/moduleloader.h> +#include <linux/kdebug.h> + +#include <asm/pgtable.h> +#include <asm/sections.h> +#include <asm/uaccess.h> + +extern void jprobe_inst_return(void); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; + +enum instruction_type {A, I, M, F, B, L, X, u}; +static enum instruction_type bundle_encoding[32][3] = { + { M, I, I }, /* 00 */ + { M, I, I }, /* 01 */ + { M, I, I }, /* 02 */ + { M, I, I }, /* 03 */ + { M, L, X }, /* 04 */ + { M, L, X }, /* 05 */ + { u, u, u }, /* 06 */ + { u, u, u }, /* 07 */ + { M, M, I }, /* 08 */ + { M, M, I }, /* 09 */ + { M, M, I }, /* 0A */ + { M, M, I }, /* 0B */ + { M, F, I }, /* 0C */ + { M, F, I }, /* 0D */ + { M, M, F }, /* 0E */ + { M, M, F }, /* 0F */ + { M, I, B }, /* 10 */ + { M, I, B }, /* 11 */ + { M, B, B }, /* 12 */ + { M, B, B }, /* 13 */ + { u, u, u }, /* 14 */ + { u, u, u }, /* 15 */ + { B, B, B }, /* 16 */ + { B, B, B }, /* 17 */ + { M, M, B }, /* 18 */ + { M, M, B }, /* 19 */ + { u, u, u }, /* 1A */ + { u, u, u }, /* 1B */ + { M, F, B }, /* 1C */ + { M, F, B }, /* 1D */ + { u, u, u }, /* 1E */ + { u, u, u }, /* 1F */ +}; + +/* Insert a long branch code */ +static void __kprobes set_brl_inst(void *from, void *to) +{ + s64 rel = ((s64) to - (s64) from) >> 4; + bundle_t *brl; + brl = (bundle_t *) ((u64) from & ~0xf); + brl->quad0.template = 0x05; /* [MLX](stop) */ + brl->quad0.slot0 = NOP_M_INST; /* nop.m 0x0 */ + brl->quad0.slot1_p0 = ((rel >> 20) & 0x7fffffffff) << 2; + brl->quad1.slot1_p1 = (((rel >> 20) & 0x7fffffffff) << 2) >> (64 - 46); + /* brl.cond.sptk.many.clr rel<<4 (qp=0) */ + brl->quad1.slot2 = BRL_INST(rel >> 59, rel & 0xfffff); +} + +/* + * In this function we check to see if the instruction + * is IP relative instruction and update the kprobe + * inst flag accordingly + */ +static void __kprobes update_kprobe_inst_flag(uint template, uint slot, + uint major_opcode, + unsigned long kprobe_inst, + struct kprobe *p) +{ + p->ainsn.inst_flag = 0; + p->ainsn.target_br_reg = 0; + p->ainsn.slot = slot; + + /* Check for Break instruction + * Bits 37:40 Major opcode to be zero + * Bits 27:32 X6 to be zero + * Bits 32:35 X3 to be zero + */ + if ((!major_opcode) && (!((kprobe_inst >> 27) & 0x1FF)) ) { + /* is a break instruction */ + p->ainsn.inst_flag |= INST_FLAG_BREAK_INST; + return; + } + + if (bundle_encoding[template][slot] == B) { + switch (major_opcode) { + case INDIRECT_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + case IP_RELATIVE_PREDICT_OPCODE: + case IP_RELATIVE_BRANCH_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + break; + case IP_RELATIVE_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } else if (bundle_encoding[template][slot] == X) { + switch (major_opcode) { + case LONG_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } + return; +} + +/* + * In this function we check to see if the instruction + * (qp) cmpx.crel.ctype p1,p2=r2,r3 + * on which we are inserting kprobe is cmp instruction + * with ctype as unc. + */ +static uint __kprobes is_cmp_ctype_unc_inst(uint template, uint slot, + uint major_opcode, + unsigned long kprobe_inst) +{ + cmp_inst_t cmp_inst; + uint ctype_unc = 0; + + if (!((bundle_encoding[template][slot] == I) || + (bundle_encoding[template][slot] == M))) + goto out; + + if (!((major_opcode == 0xC) || (major_opcode == 0xD) || + (major_opcode == 0xE))) + goto out; + + cmp_inst.l = kprobe_inst; + if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) { + /* Integer compare - Register Register (A6 type)*/ + if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0) + &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) { + /* Integer compare - Immediate Register (A8 type)*/ + if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } +out: + return ctype_unc; +} + +/* + * In this function we check to see if the instruction + * on which we are inserting kprobe is supported. + * Returns qp value if supported + * Returns -EINVAL if unsupported + */ +static int __kprobes unsupported_inst(uint template, uint slot, + uint major_opcode, + unsigned long kprobe_inst, + unsigned long addr) +{ + int qp; + + qp = kprobe_inst & 0x3f; + if (is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) { + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on cmp unc " + "instruction on slot 1 at <0x%lx> " + "is not supported\n", addr); + return -EINVAL; + + } + qp = 0; + } + else if (bundle_encoding[template][slot] == I) { + if (major_opcode == 0) { + /* + * Check for Integer speculation instruction + * - Bit 33-35 to be equal to 0x1 + */ + if (((kprobe_inst >> 33) & 0x7) == 1) { + printk(KERN_WARNING + "Kprobes on speculation inst at <0x%lx> not supported\n", + addr); + return -EINVAL; + } + /* + * IP relative mov instruction + * - Bit 27-35 to be equal to 0x30 + */ + if (((kprobe_inst >> 27) & 0x1FF) == 0x30) { + printk(KERN_WARNING + "Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n", + addr); + return -EINVAL; + + } + } + else if ((major_opcode == 5) && !(kprobe_inst & (0xFUl << 33)) && + (kprobe_inst & (0x1UL << 12))) { + /* test bit instructions, tbit,tnat,tf + * bit 33-36 to be equal to 0 + * bit 12 to be equal to 1 + */ + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on test bit " + "instruction on slot at <0x%lx> " + "is not supported\n", addr); + return -EINVAL; + } + qp = 0; + } + } + else if (bundle_encoding[template][slot] == B) { + if (major_opcode == 7) { + /* IP-Relative Predict major code is 7 */ + printk(KERN_WARNING "Kprobes on IP-Relative" + "Predict is not supported\n"); + return -EINVAL; + } + else if (major_opcode == 2) { + /* Indirect Predict, major code is 2 + * bit 27-32 to be equal to 10 or 11 + */ + int x6=(kprobe_inst >> 27) & 0x3F; + if ((x6 == 0x10) || (x6 == 0x11)) { + printk(KERN_WARNING "Kprobes on " + "Indirect Predict is not supported\n"); + return -EINVAL; + } + } + } + /* kernel does not use float instruction, here for safety kprobe + * will judge whether it is fcmp/flass/float approximation instruction + */ + else if (unlikely(bundle_encoding[template][slot] == F)) { + if ((major_opcode == 4 || major_opcode == 5) && + (kprobe_inst & (0x1 << 12))) { + /* fcmp/fclass unc instruction */ + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on fcmp/fclass " + "instruction on slot at <0x%lx> " + "is not supported\n", addr); + return -EINVAL; + + } + qp = 0; + } + if ((major_opcode == 0 || major_opcode == 1) && + (kprobe_inst & (0x1UL << 33))) { + /* float Approximation instruction */ + if (slot == 1 && qp) { + printk(KERN_WARNING "Kprobes on float Approx " + "instr at <0x%lx> is not supported\n", + addr); + return -EINVAL; + } + qp = 0; + } + } + return qp; +} + +/* + * In this function we override the bundle with + * the break instruction at the given slot. + */ +static void __kprobes prepare_break_inst(uint template, uint slot, + uint major_opcode, + unsigned long kprobe_inst, + struct kprobe *p, + int qp) +{ + unsigned long break_inst = BREAK_INST; + bundle_t *bundle = &p->opcode.bundle; + + /* + * Copy the original kprobe_inst qualifying predicate(qp) + * to the break instruction + */ + break_inst |= qp; + + switch (slot) { + case 0: + bundle->quad0.slot0 = break_inst; + break; + case 1: + bundle->quad0.slot1_p0 = break_inst; + bundle->quad1.slot1_p1 = break_inst >> (64-46); + break; + case 2: + bundle->quad1.slot2 = break_inst; + break; + } + + /* + * Update the instruction flag, so that we can + * emulate the instruction properly after we + * single step on original instruction + */ + update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p); +} + +static void __kprobes get_kprobe_inst(bundle_t *bundle, uint slot, + unsigned long *kprobe_inst, uint *major_opcode) +{ + unsigned long kprobe_inst_p0, kprobe_inst_p1; + unsigned int template; + + template = bundle->quad0.template; + + switch (slot) { + case 0: + *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); + *kprobe_inst = bundle->quad0.slot0; + break; + case 1: + *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst_p0 = bundle->quad0.slot1_p0; + kprobe_inst_p1 = bundle->quad1.slot1_p1; + *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); + break; + case 2: + *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); + *kprobe_inst = bundle->quad1.slot2; + break; + } +} + +/* Returns non-zero if the addr is in the Interrupt Vector Table */ +static int __kprobes in_ivt_functions(unsigned long addr) +{ + return (addr >= (unsigned long)__start_ivt_text + && addr < (unsigned long)__end_ivt_text); +} + +static int __kprobes valid_kprobe_addr(int template, int slot, + unsigned long addr) +{ + if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) { + printk(KERN_WARNING "Attempting to insert unaligned kprobe " + "at 0x%lx\n", addr); + return -EINVAL; + } + + if (in_ivt_functions(addr)) { + printk(KERN_WARNING "Kprobes can't be inserted inside " + "IVT functions at 0x%lx\n", addr); + return -EINVAL; + } + + return 0; +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + unsigned int i; + i = atomic_add_return(1, &kcb->prev_kprobe_index); + kcb->prev_kprobe[i-1].kp = kprobe_running(); + kcb->prev_kprobe[i-1].status = kcb->kprobe_status; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + unsigned int i; + i = atomic_read(&kcb->prev_kprobe_index); + __get_cpu_var(current_kprobe) = kcb->prev_kprobe[i-1].kp; + kcb->kprobe_status = kcb->prev_kprobe[i-1].status; + atomic_sub(1, &kcb->prev_kprobe_index); +} + +static void __kprobes set_current_kprobe(struct kprobe *p, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; +} + +static void kretprobe_trampoline(void) +{ +} + +/* + * At this point the target function has been tricked into + * returning into our trampoline. Lookup the associated instance + * and then: + * - call the handler function + * - cleanup by marking the instance as unused + * - long jump back to the original return address + */ +int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address = + ((struct fnptr *)kretprobe_trampoline)->ip; + + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more than one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + regs->cr_iip = orig_ret_address; + + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + reset_current_kprobe(); + kretprobe_hash_unlock(current, &flags); + preempt_enable_no_resched(); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; +} + +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + ri->ret_addr = (kprobe_opcode_t *)regs->b0; + + /* Replace the return addr with trampoline addr */ + regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; +} + +/* Check the instruction in the slot is break */ +static int __kprobes __is_ia64_break_inst(bundle_t *bundle, uint slot) +{ + unsigned int major_opcode; + unsigned int template = bundle->quad0.template; + unsigned long kprobe_inst; + + /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; + + /* Get Kprobe probe instruction at given slot*/ + get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + + /* For break instruction, + * Bits 37:40 Major opcode to be zero + * Bits 27:32 X6 to be zero + * Bits 32:35 X3 to be zero + */ + if (major_opcode || ((kprobe_inst >> 27) & 0x1FF)) { + /* Not a break instruction */ + return 0; + } + + /* Is a break instruction */ + return 1; +} + +/* + * In this function, we check whether the target bundle modifies IP or + * it triggers an exception. If so, it cannot be boostable. + */ +static int __kprobes can_boost(bundle_t *bundle, uint slot, + unsigned long bundle_addr) +{ + unsigned int template = bundle->quad0.template; + + do { + if (search_exception_tables(bundle_addr + slot) || + __is_ia64_break_inst(bundle, slot)) + return 0; /* exception may occur in this bundle*/ + } while ((++slot) < 3); + template &= 0x1e; + if (template >= 0x10 /* including B unit */ || + template == 0x04 /* including X unit */ || + template == 0x06) /* undefined */ + return 0; + + return 1; +} + +/* Prepare long jump bundle and disables other boosters if need */ +static void __kprobes prepare_booster(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr & ~0xFULL; + unsigned int slot = (unsigned long)p->addr & 0xf; + struct kprobe *other_kp; + + if (can_boost(&p->ainsn.insn[0].bundle, slot, addr)) { + set_brl_inst(&p->ainsn.insn[1].bundle, (bundle_t *)addr + 1); + p->ainsn.inst_flag |= INST_FLAG_BOOSTABLE; + } + + /* disables boosters in previous slots */ + for (; addr < (unsigned long)p->addr; addr++) { + other_kp = get_kprobe((void *)addr); + if (other_kp) + other_kp->ainsn.inst_flag &= ~INST_FLAG_BOOSTABLE; + } +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long) p->addr; + unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL); + unsigned long kprobe_inst=0; + unsigned int slot = addr & 0xf, template, major_opcode = 0; + bundle_t *bundle; + int qp; + + bundle = &((kprobe_opcode_t *)kprobe_addr)->bundle; + template = bundle->quad0.template; + + if(valid_kprobe_addr(template, slot, addr)) + return -EINVAL; + + /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; + + /* Get kprobe_inst and major_opcode from the bundle */ + get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + + qp = unsupported_inst(template, slot, major_opcode, kprobe_inst, addr); + if (qp < 0) + return -EINVAL; + + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + memcpy(&p->opcode, kprobe_addr, sizeof(kprobe_opcode_t)); + memcpy(p->ainsn.insn, kprobe_addr, sizeof(kprobe_opcode_t)); + + prepare_break_inst(template, slot, major_opcode, kprobe_inst, p, qp); + + prepare_booster(p); + + return 0; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + unsigned long arm_addr; + bundle_t *src, *dest; + + arm_addr = ((unsigned long)p->addr) & ~0xFUL; + dest = &((kprobe_opcode_t *)arm_addr)->bundle; + src = &p->opcode.bundle; + + flush_icache_range((unsigned long)p->ainsn.insn, + (unsigned long)p->ainsn.insn + + sizeof(kprobe_opcode_t) * MAX_INSN_SIZE); + + switch (p->ainsn.slot) { + case 0: + dest->quad0.slot0 = src->quad0.slot0; + break; + case 1: + dest->quad1.slot1_p1 = src->quad1.slot1_p1; + break; + case 2: + dest->quad1.slot2 = src->quad1.slot2; + break; + } + flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + unsigned long arm_addr; + bundle_t *src, *dest; + + arm_addr = ((unsigned long)p->addr) & ~0xFUL; + dest = &((kprobe_opcode_t *)arm_addr)->bundle; + /* p->ainsn.insn contains the original unaltered kprobe_opcode_t */ + src = &p->ainsn.insn->bundle; + switch (p->ainsn.slot) { + case 0: + dest->quad0.slot0 = src->quad0.slot0; + break; + case 1: + dest->quad1.slot1_p1 = src->quad1.slot1_p1; + break; + case 2: + dest->quad1.slot2 = src->quad1.slot2; + break; + } + flush_icache_range(arm_addr, arm_addr + sizeof(kprobe_opcode_t)); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, + p->ainsn.inst_flag & INST_FLAG_BOOSTABLE); + p->ainsn.insn = NULL; + } +} +/* + * We are resuming execution after a single step fault, so the pt_regs + * structure reflects the register state after we executed the instruction + * located in the kprobe (p->ainsn.insn->bundle). We still need to adjust + * the ip to point back to the original stack address. To set the IP address + * to original stack address, handle the case where we need to fixup the + * relative IP address and/or fixup branch register. + */ +static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle_addr = (unsigned long) (&p->ainsn.insn->bundle); + unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; + unsigned long template; + int slot = ((unsigned long)p->addr & 0xf); + + template = p->ainsn.insn->bundle.quad0.template; + + if (slot == 1 && bundle_encoding[template][1] == L) + slot = 2; + + if (p->ainsn.inst_flag & ~INST_FLAG_BOOSTABLE) { + + if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) { + /* Fix relative IP address */ + regs->cr_iip = (regs->cr_iip - bundle_addr) + + resume_addr; + } + + if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) { + /* + * Fix target branch register, software convention is + * to use either b0 or b6 or b7, so just checking + * only those registers + */ + switch (p->ainsn.target_br_reg) { + case 0: + if ((regs->b0 == bundle_addr) || + (regs->b0 == bundle_addr + 0x10)) { + regs->b0 = (regs->b0 - bundle_addr) + + resume_addr; + } + break; + case 6: + if ((regs->b6 == bundle_addr) || + (regs->b6 == bundle_addr + 0x10)) { + regs->b6 = (regs->b6 - bundle_addr) + + resume_addr; + } + break; + case 7: + if ((regs->b7 == bundle_addr) || + (regs->b7 == bundle_addr + 0x10)) { + regs->b7 = (regs->b7 - bundle_addr) + + resume_addr; + } + break; + } /* end switch */ + } + goto turn_ss_off; + } + + if (slot == 2) { + if (regs->cr_iip == bundle_addr + 0x10) { + regs->cr_iip = resume_addr + 0x10; + } + } else { + if (regs->cr_iip == bundle_addr) { + regs->cr_iip = resume_addr; + } + } + +turn_ss_off: + /* Turn off Single Step bit */ + ia64_psr(regs)->ss = 0; +} + +static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle_addr = (unsigned long) &p->ainsn.insn->bundle; + unsigned long slot = (unsigned long)p->addr & 0xf; + + /* single step inline if break instruction */ + if (p->ainsn.inst_flag == INST_FLAG_BREAK_INST) + regs->cr_iip = (unsigned long)p->addr & ~0xFULL; + else + regs->cr_iip = bundle_addr & ~0xFULL; + + if (slot > 2) + slot = 0; + + ia64_psr(regs)->ri = slot; + + /* turn on single stepping */ + ia64_psr(regs)->ss = 1; +} + +static int __kprobes is_ia64_break_inst(struct pt_regs *regs) +{ + unsigned int slot = ia64_psr(regs)->ri; + unsigned long *kprobe_addr = (unsigned long *)regs->cr_iip; + bundle_t bundle; + + memcpy(&bundle, kprobe_addr, sizeof(bundle_t)); + + return __is_ia64_break_inst(&bundle, slot); +} + +static int __kprobes pre_kprobes_handler(struct die_args *args) +{ + struct kprobe *p; + int ret = 0; + struct pt_regs *regs = args->regs; + kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); + struct kprobe_ctlblk *kcb; + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Handle recursion cases */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + if ((kcb->kprobe_status == KPROBE_HIT_SS) && + (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) { + ia64_psr(regs)->ss = 0; + goto no_kprobe; + } + /* We have reentered the pre_kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, kcb); + kprobes_inc_nmissed_count(p); + prepare_ss(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } else if (args->err == __IA64_BREAK_JPROBE) { + /* + * jprobe instrumented function just completed + */ + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } else if (!is_ia64_break_inst(regs)) { + /* The breakpoint instruction was removed by + * another cpu right after we hit, no further + * handling of this interrupt is appropriate + */ + ret = 1; + goto no_kprobe; + } else { + /* Not our break */ + goto no_kprobe; + } + } + + p = get_kprobe(addr); + if (!p) { + if (!is_ia64_break_inst(regs)) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + */ + ret = 1; + + } + + /* Not one of our break, let kernel handle it */ + goto no_kprobe; + } + + set_current_kprobe(p, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + if (p->pre_handler && p->pre_handler(p, regs)) + /* + * Our pre-handler is specifically requesting that we just + * do a return. This is used for both the jprobe pre-handler + * and the kretprobe trampoline + */ + return 1; + +ss_probe: +#if !defined(CONFIG_PREEMPT) + if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + ia64_psr(regs)->ri = p->ainsn.slot; + regs->cr_iip = (unsigned long)&p->ainsn.insn->bundle & ~0xFULL; + /* turn single stepping off */ + ia64_psr(regs)->ss = 0; + + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } +#endif + prepare_ss(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +static int __kprobes post_kprobes_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs); + + /*Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); + +out: + preempt_enable_no_resched(); + return 1; +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + + switch(kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the instruction pointer points back to + * the probe address and allow the page fault handler + * to continue as a normal page fault. + */ + regs->cr_iip = ((unsigned long)cur->addr) & ~0xFULL; + ia64_psr(regs)->ri = ((unsigned long)cur->addr) & 0xf; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accouting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (ia64_done_with_exception(regs)) + return 1; + + /* + * Let ia64_do_page_fault() fix it. + */ + break; + default: + break; + } + + return 0; +} + +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode(args->regs)) + return ret; + + switch(val) { + case DIE_BREAK: + /* err is break number from ia64_bad_break() */ + if ((args->err >> 12) == (__IA64_BREAK_KPROBE >> 12) + || args->err == __IA64_BREAK_JPROBE + || args->err == 0) + if (pre_kprobes_handler(args)) + ret = NOTIFY_STOP; + break; + case DIE_FAULT: + /* err is vector number from ia64_fault() */ + if (args->err == 36) + if (post_kprobes_handler(args->regs)) + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; +} + +struct param_bsp_cfm { + unsigned long ip; + unsigned long *bsp; + unsigned long cfm; +}; + +static void ia64_get_bsp_cfm(struct unw_frame_info *info, void *arg) +{ + unsigned long ip; + struct param_bsp_cfm *lp = arg; + + do { + unw_get_ip(info, &ip); + if (ip == 0) + break; + if (ip == lp->ip) { + unw_get_bsp(info, (unsigned long*)&lp->bsp); + unw_get_cfm(info, (unsigned long*)&lp->cfm); + return; + } + } while (unw_unwind(info) >= 0); + lp->bsp = NULL; + lp->cfm = 0; + return; +} + +unsigned long arch_deref_entry_point(void *entry) +{ + return ((struct fnptr *)entry)->ip; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr = arch_deref_entry_point(jp->entry); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct param_bsp_cfm pa; + int bytes; + + /* + * Callee owns the argument space and could overwrite it, eg + * tail call optimization. So to be absolutely safe + * we save the argument space before transferring the control + * to instrumented jprobe function which runs in + * the process context + */ + pa.ip = regs->cr_iip; + unw_init_running(ia64_get_bsp_cfm, &pa); + bytes = (char *)ia64_rse_skip_regs(pa.bsp, pa.cfm & 0x3f) + - (char *)pa.bsp; + memcpy( kcb->jprobes_saved_stacked_regs, + pa.bsp, + bytes ); + kcb->bsp = pa.bsp; + kcb->cfm = pa.cfm; + + /* save architectural state */ + kcb->jprobe_saved_regs = *regs; + + /* after rfi, execute the jprobe instrumented function */ + regs->cr_iip = addr & ~0xFULL; + ia64_psr(regs)->ri = addr & 0xf; + regs->r1 = ((struct fnptr *)(jp->entry))->gp; + + /* + * fix the return address to our jprobe_inst_return() function + * in the jprobes.S file + */ + regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; + + return 1; +} + +/* ia64 does not need this */ +void __kprobes jprobe_return(void) +{ +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + int bytes; + + /* restoring architectural state */ + *regs = kcb->jprobe_saved_regs; + + /* restoring the original argument space */ + flush_register_stack(); + bytes = (char *)ia64_rse_skip_regs(kcb->bsp, kcb->cfm & 0x3f) + - (char *)kcb->bsp; + memcpy( kcb->bsp, + kcb->jprobes_saved_stacked_regs, + bytes ); + invalidate_stacked_regs(); + + preempt_enable_no_resched(); + return 1; +} + +static struct kprobe trampoline_p = { + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + trampoline_p.addr = + (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; + return register_kprobe(&trampoline_p); +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + if (p->addr == + (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip) + return 1; + + return 0; +} diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c new file mode 100644 index 00000000000..4eed3581499 --- /dev/null +++ b/arch/ia64/kernel/machine_kexec.c @@ -0,0 +1,169 @@ +/* + * arch/ia64/kernel/machine_kexec.c + * + * Handle transition of Linux booting another kernel + * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P. + * Copyright (C) 2005 Khalid Aziz <khalid.aziz@hp.com> + * Copyright (C) 2006 Intel Corp, Zou Nan hai <nanhai.zou@intel.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/cpu.h> +#include <linux/irq.h> +#include <linux/efi.h> +#include <linux/numa.h> +#include <linux/mmzone.h> + +#include <asm/numa.h> +#include <asm/mmu_context.h> +#include <asm/setup.h> +#include <asm/delay.h> +#include <asm/meminit.h> +#include <asm/processor.h> +#include <asm/sal.h> +#include <asm/mca.h> + +typedef void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long start_address, + struct ia64_boot_param *boot_param, + unsigned long pal_addr) __noreturn; + +struct kimage *ia64_kimage; + +struct resource efi_memmap_res = { + .name = "EFI Memory Map", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +struct resource boot_param_res = { + .name = "Boot parameter", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + + +/* + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + */ +int machine_kexec_prepare(struct kimage *image) +{ + void *control_code_buffer; + const unsigned long *func; + + func = (unsigned long *)&relocate_new_kernel; + /* Pre-load control code buffer to minimize work in kexec path */ + control_code_buffer = page_address(image->control_code_page); + memcpy((void *)control_code_buffer, (const void *)func[0], + relocate_new_kernel_size); + flush_icache_range((unsigned long)control_code_buffer, + (unsigned long)control_code_buffer + relocate_new_kernel_size); + ia64_kimage = image; + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +static void ia64_machine_kexec(struct unw_frame_info *info, void *arg) +{ + struct kimage *image = arg; + relocate_new_kernel_t rnk; + void *pal_addr = efi_get_pal_addr(); + unsigned long code_addr = (unsigned long)page_address(image->control_code_page); + int ii; + u64 fp, gp; + ia64_fptr_t *init_handler = (ia64_fptr_t *)ia64_os_init_on_kdump; + + BUG_ON(!image); + if (image->type == KEXEC_TYPE_CRASH) { + crash_save_this_cpu(); + current->thread.ksp = (__u64)info->sw - 16; + + /* Register noop init handler */ + fp = ia64_tpa(init_handler->fp); + gp = ia64_tpa(ia64_getreg(_IA64_REG_GP)); + ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, fp, gp, 0, fp, gp, 0); + } else { + /* Unregister init handlers of current kernel */ + ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, 0, 0, 0, 0, 0, 0); + } + + /* Unregister mca handler - No more recovery on current kernel */ + ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, 0, 0, 0, 0, 0, 0); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Mask CMC and Performance Monitor interrupts */ + ia64_setreg(_IA64_REG_CR_PMV, 1 << 16); + ia64_setreg(_IA64_REG_CR_CMCV, 1 << 16); + + /* Mask ITV and Local Redirect Registers */ + ia64_set_itv(1 << 16); + ia64_set_lrr0(1 << 16); + ia64_set_lrr1(1 << 16); + + /* terminate possible nested in-service interrupts */ + for (ii = 0; ii < 16; ii++) + ia64_eoi(); + + /* unmask TPR and clear any pending interrupts */ + ia64_setreg(_IA64_REG_CR_TPR, 0); + ia64_srlz_d(); + while (ia64_get_ivr() != IA64_SPURIOUS_INT_VECTOR) + ia64_eoi(); + platform_kernel_launch_event(); + rnk = (relocate_new_kernel_t)&code_addr; + (*rnk)(image->head, image->start, ia64_boot_param, + GRANULEROUNDDOWN((unsigned long) pal_addr)); + BUG(); +} + +void machine_kexec(struct kimage *image) +{ + BUG_ON(!image); + unw_init_running(ia64_machine_kexec, image); + for(;;); +} + +void arch_crash_save_vmcoreinfo(void) +{ +#if defined(CONFIG_DISCONTIGMEM) || defined(CONFIG_SPARSEMEM) + VMCOREINFO_SYMBOL(pgdat_list); + VMCOREINFO_LENGTH(pgdat_list, MAX_NUMNODES); +#endif +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_memblk); + VMCOREINFO_LENGTH(node_memblk, NR_NODE_MEMBLKS); + VMCOREINFO_STRUCT_SIZE(node_memblk_s); + VMCOREINFO_OFFSET(node_memblk_s, start_paddr); + VMCOREINFO_OFFSET(node_memblk_s, size); +#endif +#ifdef CONFIG_PGTABLE_3 + VMCOREINFO_CONFIG(PGTABLE_3); +#elif CONFIG_PGTABLE_4 + VMCOREINFO_CONFIG(PGTABLE_4); +#endif +} + +unsigned long paddr_vmcoreinfo_note(void) +{ + return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); +} + diff --git a/arch/ia64/kernel/machvec.c b/arch/ia64/kernel/machvec.c new file mode 100644 index 00000000000..d41a40ef80c --- /dev/null +++ b/arch/ia64/kernel/machvec.c @@ -0,0 +1,91 @@ +#include <linux/module.h> +#include <linux/dma-mapping.h> +#include <asm/machvec.h> +#include <asm/system.h> + +#ifdef CONFIG_IA64_GENERIC + +#include <linux/kernel.h> +#include <linux/string.h> + +#include <asm/page.h> + +struct ia64_machine_vector ia64_mv; +EXPORT_SYMBOL(ia64_mv); + +static struct ia64_machine_vector * __init +lookup_machvec (const char *name) +{ + extern struct ia64_machine_vector machvec_start[]; + extern struct ia64_machine_vector machvec_end[]; + struct ia64_machine_vector *mv; + + for (mv = machvec_start; mv < machvec_end; ++mv) + if (strcmp (mv->name, name) == 0) + return mv; + + return 0; +} + +void __init +machvec_init (const char *name) +{ + struct ia64_machine_vector *mv; + + if (!name) + name = acpi_get_sysname(); + mv = lookup_machvec(name); + if (!mv) + panic("generic kernel failed to find machine vector for" + " platform %s!", name); + + ia64_mv = *mv; + printk(KERN_INFO "booting generic kernel on platform %s\n", name); +} + +void __init +machvec_init_from_cmdline(const char *cmdline) +{ + char str[64]; + const char *start; + char *end; + + if (! (start = strstr(cmdline, "machvec=")) ) + return machvec_init(NULL); + + strlcpy(str, start + strlen("machvec="), sizeof(str)); + if ( (end = strchr(str, ' ')) ) + *end = '\0'; + + return machvec_init(str); +} + +#endif /* CONFIG_IA64_GENERIC */ + +void +machvec_setup (char **arg) +{ +} +EXPORT_SYMBOL(machvec_setup); + +void +machvec_timer_interrupt (int irq, void *dev_id) +{ +} +EXPORT_SYMBOL(machvec_timer_interrupt); + +void +machvec_dma_sync_single(struct device *hwdev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir) +{ + mb(); +} +EXPORT_SYMBOL(machvec_dma_sync_single); + +void +machvec_dma_sync_sg(struct device *hwdev, struct scatterlist *sg, int n, + enum dma_data_direction dir) +{ + mb(); +} +EXPORT_SYMBOL(machvec_dma_sync_sg); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c new file mode 100644 index 00000000000..84fb405eee8 --- /dev/null +++ b/arch/ia64/kernel/mca.c @@ -0,0 +1,2158 @@ +/* + * File: mca.c + * Purpose: Generic MCA handling layer + * + * Copyright (C) 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * Copyright (C) 2002 Dell Inc. + * Copyright (C) Matt Domsch <Matt_Domsch@dell.com> + * + * Copyright (C) 2002 Intel + * Copyright (C) Jenna Hall <jenna.s.hall@intel.com> + * + * Copyright (C) 2001 Intel + * Copyright (C) Fred Lewis <frederick.v.lewis@intel.com> + * + * Copyright (C) 2000 Intel + * Copyright (C) Chuck Fleckenstein <cfleck@co.intel.com> + * + * Copyright (C) 1999, 2004-2008 Silicon Graphics, Inc. + * Copyright (C) Vijay Chander <vijay@engr.sgi.com> + * + * Copyright (C) 2006 FUJITSU LIMITED + * Copyright (C) Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> + * + * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com> + * Fixed PAL/SAL update issues, began MCA bug fixes, logging issues, + * added min save state dump, added INIT handler. + * + * 2001-01-03 Fred Lewis <frederick.v.lewis@intel.com> + * Added setup of CMCI and CPEI IRQs, logging of corrected platform + * errors, completed code for logging of corrected & uncorrected + * machine check errors, and updated for conformance with Nov. 2000 + * revision of the SAL 3.0 spec. + * + * 2002-01-04 Jenna Hall <jenna.s.hall@intel.com> + * Aligned MCA stack to 16 bytes, added platform vs. CPU error flag, + * set SAL default return values, changed error record structure to + * linked list, added init call to sal_get_state_info_size(). + * + * 2002-03-25 Matt Domsch <Matt_Domsch@dell.com> + * GUID cleanups. + * + * 2003-04-15 David Mosberger-Tang <davidm@hpl.hp.com> + * Added INIT backtrace support. + * + * 2003-12-08 Keith Owens <kaos@sgi.com> + * smp_call_function() must not be called from interrupt context + * (can deadlock on tasklist_lock). + * Use keventd to call smp_call_function(). + * + * 2004-02-01 Keith Owens <kaos@sgi.com> + * Avoid deadlock when using printk() for MCA and INIT records. + * Delete all record printing code, moved to salinfo_decode in user + * space. Mark variables and functions static where possible. + * Delete dead variables and functions. Reorder to remove the need + * for forward declarations and to consolidate related code. + * + * 2005-08-12 Keith Owens <kaos@sgi.com> + * Convert MCA/INIT handlers to use per event stacks and SAL/OS + * state. + * + * 2005-10-07 Keith Owens <kaos@sgi.com> + * Add notify_die() hooks. + * + * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> + * Add printing support for MCA/INIT. + * + * 2007-04-27 Russ Anderson <rja@sgi.com> + * Support multiple cpus going through OS_MCA in the same event. + */ +#include <linux/jiffies.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/workqueue.h> +#include <linux/cpumask.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <linux/gfp.h> + +#include <asm/delay.h> +#include <asm/machvec.h> +#include <asm/meminit.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/system.h> +#include <asm/sal.h> +#include <asm/mca.h> +#include <asm/kexec.h> + +#include <asm/irq.h> +#include <asm/hw_irq.h> +#include <asm/tlb.h> + +#include "mca_drv.h" +#include "entry.h" + +#if defined(IA64_MCA_DEBUG_INFO) +# define IA64_MCA_DEBUG(fmt...) printk(fmt) +#else +# define IA64_MCA_DEBUG(fmt...) +#endif + +#define NOTIFY_INIT(event, regs, arg, spin) \ +do { \ + if ((notify_die((event), "INIT", (regs), (arg), 0, 0) \ + == NOTIFY_STOP) && ((spin) == 1)) \ + ia64_mca_spin(__func__); \ +} while (0) + +#define NOTIFY_MCA(event, regs, arg, spin) \ +do { \ + if ((notify_die((event), "MCA", (regs), (arg), 0, 0) \ + == NOTIFY_STOP) && ((spin) == 1)) \ + ia64_mca_spin(__func__); \ +} while (0) + +/* Used by mca_asm.S */ +DEFINE_PER_CPU(u64, ia64_mca_data); /* == __per_cpu_mca[smp_processor_id()] */ +DEFINE_PER_CPU(u64, ia64_mca_per_cpu_pte); /* PTE to map per-CPU area */ +DEFINE_PER_CPU(u64, ia64_mca_pal_pte); /* PTE to map PAL code */ +DEFINE_PER_CPU(u64, ia64_mca_pal_base); /* vaddr PAL code granule */ +DEFINE_PER_CPU(u64, ia64_mca_tr_reload); /* Flag for TR reload */ + +unsigned long __per_cpu_mca[NR_CPUS]; + +/* In mca_asm.S */ +extern void ia64_os_init_dispatch_monarch (void); +extern void ia64_os_init_dispatch_slave (void); + +static int monarch_cpu = -1; + +static ia64_mc_info_t ia64_mc_info; + +#define MAX_CPE_POLL_INTERVAL (15*60*HZ) /* 15 minutes */ +#define MIN_CPE_POLL_INTERVAL (2*60*HZ) /* 2 minutes */ +#define CMC_POLL_INTERVAL (1*60*HZ) /* 1 minute */ +#define CPE_HISTORY_LENGTH 5 +#define CMC_HISTORY_LENGTH 5 + +#ifdef CONFIG_ACPI +static struct timer_list cpe_poll_timer; +#endif +static struct timer_list cmc_poll_timer; +/* + * This variable tells whether we are currently in polling mode. + * Start with this in the wrong state so we won't play w/ timers + * before the system is ready. + */ +static int cmc_polling_enabled = 1; + +/* + * Clearing this variable prevents CPE polling from getting activated + * in mca_late_init. Use it if your system doesn't provide a CPEI, + * but encounters problems retrieving CPE logs. This should only be + * necessary for debugging. + */ +static int cpe_poll_enabled = 1; + +extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe); + +static int mca_init __initdata; + +/* + * limited & delayed printing support for MCA/INIT handler + */ + +#define mprintk(fmt...) ia64_mca_printk(fmt) + +#define MLOGBUF_SIZE (512+256*NR_CPUS) +#define MLOGBUF_MSGMAX 256 +static char mlogbuf[MLOGBUF_SIZE]; +static DEFINE_SPINLOCK(mlogbuf_wlock); /* mca context only */ +static DEFINE_SPINLOCK(mlogbuf_rlock); /* normal context only */ +static unsigned long mlogbuf_start; +static unsigned long mlogbuf_end; +static unsigned int mlogbuf_finished = 0; +static unsigned long mlogbuf_timestamp = 0; + +static int loglevel_save = -1; +#define BREAK_LOGLEVEL(__console_loglevel) \ + oops_in_progress = 1; \ + if (loglevel_save < 0) \ + loglevel_save = __console_loglevel; \ + __console_loglevel = 15; + +#define RESTORE_LOGLEVEL(__console_loglevel) \ + if (loglevel_save >= 0) { \ + __console_loglevel = loglevel_save; \ + loglevel_save = -1; \ + } \ + mlogbuf_finished = 0; \ + oops_in_progress = 0; + +/* + * Push messages into buffer, print them later if not urgent. + */ +void ia64_mca_printk(const char *fmt, ...) +{ + va_list args; + int printed_len; + char temp_buf[MLOGBUF_MSGMAX]; + char *p; + + va_start(args, fmt); + printed_len = vscnprintf(temp_buf, sizeof(temp_buf), fmt, args); + va_end(args); + + /* Copy the output into mlogbuf */ + if (oops_in_progress) { + /* mlogbuf was abandoned, use printk directly instead. */ + printk(temp_buf); + } else { + spin_lock(&mlogbuf_wlock); + for (p = temp_buf; *p; p++) { + unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE; + if (next != mlogbuf_start) { + mlogbuf[mlogbuf_end] = *p; + mlogbuf_end = next; + } else { + /* buffer full */ + break; + } + } + mlogbuf[mlogbuf_end] = '\0'; + spin_unlock(&mlogbuf_wlock); + } +} +EXPORT_SYMBOL(ia64_mca_printk); + +/* + * Print buffered messages. + * NOTE: call this after returning normal context. (ex. from salinfod) + */ +void ia64_mlogbuf_dump(void) +{ + char temp_buf[MLOGBUF_MSGMAX]; + char *p; + unsigned long index; + unsigned long flags; + unsigned int printed_len; + + /* Get output from mlogbuf */ + while (mlogbuf_start != mlogbuf_end) { + temp_buf[0] = '\0'; + p = temp_buf; + printed_len = 0; + + spin_lock_irqsave(&mlogbuf_rlock, flags); + + index = mlogbuf_start; + while (index != mlogbuf_end) { + *p = mlogbuf[index]; + index = (index + 1) % MLOGBUF_SIZE; + if (!*p) + break; + p++; + if (++printed_len >= MLOGBUF_MSGMAX - 1) + break; + } + *p = '\0'; + if (temp_buf[0]) + printk(temp_buf); + mlogbuf_start = index; + + mlogbuf_timestamp = 0; + spin_unlock_irqrestore(&mlogbuf_rlock, flags); + } +} +EXPORT_SYMBOL(ia64_mlogbuf_dump); + +/* + * Call this if system is going to down or if immediate flushing messages to + * console is required. (ex. recovery was failed, crash dump is going to be + * invoked, long-wait rendezvous etc.) + * NOTE: this should be called from monarch. + */ +static void ia64_mlogbuf_finish(int wait) +{ + BREAK_LOGLEVEL(console_loglevel); + + spin_lock_init(&mlogbuf_rlock); + ia64_mlogbuf_dump(); + printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, " + "MCA/INIT might be dodgy or fail.\n"); + + if (!wait) + return; + + /* wait for console */ + printk("Delaying for 5 seconds...\n"); + udelay(5*1000000); + + mlogbuf_finished = 1; +} + +/* + * Print buffered messages from INIT context. + */ +static void ia64_mlogbuf_dump_from_init(void) +{ + if (mlogbuf_finished) + return; + + if (mlogbuf_timestamp && + time_before(jiffies, mlogbuf_timestamp + 30 * HZ)) { + printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT " + " and the system seems to be messed up.\n"); + ia64_mlogbuf_finish(0); + return; + } + + if (!spin_trylock(&mlogbuf_rlock)) { + printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT. " + "Generated messages other than stack dump will be " + "buffered to mlogbuf and will be printed later.\n"); + printk(KERN_ERR "INIT: If messages would not printed after " + "this INIT, wait 30sec and assert INIT again.\n"); + if (!mlogbuf_timestamp) + mlogbuf_timestamp = jiffies; + return; + } + spin_unlock(&mlogbuf_rlock); + ia64_mlogbuf_dump(); +} + +static void inline +ia64_mca_spin(const char *func) +{ + if (monarch_cpu == smp_processor_id()) + ia64_mlogbuf_finish(0); + mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func); + while (1) + cpu_relax(); +} +/* + * IA64_MCA log support + */ +#define IA64_MAX_LOGS 2 /* Double-buffering for nested MCAs */ +#define IA64_MAX_LOG_TYPES 4 /* MCA, INIT, CMC, CPE */ + +typedef struct ia64_state_log_s +{ + spinlock_t isl_lock; + int isl_index; + unsigned long isl_count; + ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ +} ia64_state_log_t; + +static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; + +#define IA64_LOG_ALLOCATE(it, size) \ + {ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] = \ + (ia64_err_rec_t *)alloc_bootmem(size); \ + ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] = \ + (ia64_err_rec_t *)alloc_bootmem(size);} +#define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock) +#define IA64_LOG_LOCK(it) spin_lock_irqsave(&ia64_state_log[it].isl_lock, s) +#define IA64_LOG_UNLOCK(it) spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s) +#define IA64_LOG_NEXT_INDEX(it) ia64_state_log[it].isl_index +#define IA64_LOG_CURR_INDEX(it) 1 - ia64_state_log[it].isl_index +#define IA64_LOG_INDEX_INC(it) \ + {ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index; \ + ia64_state_log[it].isl_count++;} +#define IA64_LOG_INDEX_DEC(it) \ + ia64_state_log[it].isl_index = 1 - ia64_state_log[it].isl_index +#define IA64_LOG_NEXT_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)])) +#define IA64_LOG_CURR_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)])) +#define IA64_LOG_COUNT(it) ia64_state_log[it].isl_count + +/* + * ia64_log_init + * Reset the OS ia64 log buffer + * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE}) + * Outputs : None + */ +static void __init +ia64_log_init(int sal_info_type) +{ + u64 max_size = 0; + + IA64_LOG_NEXT_INDEX(sal_info_type) = 0; + IA64_LOG_LOCK_INIT(sal_info_type); + + // SAL will tell us the maximum size of any error record of this type + max_size = ia64_sal_get_state_info_size(sal_info_type); + if (!max_size) + /* alloc_bootmem() doesn't like zero-sized allocations! */ + return; + + // set up OS data structures to hold error info + IA64_LOG_ALLOCATE(sal_info_type, max_size); + memset(IA64_LOG_CURR_BUFFER(sal_info_type), 0, max_size); + memset(IA64_LOG_NEXT_BUFFER(sal_info_type), 0, max_size); +} + +/* + * ia64_log_get + * + * Get the current MCA log from SAL and copy it into the OS log buffer. + * + * Inputs : info_type (SAL_INFO_TYPE_{MCA,INIT,CMC,CPE}) + * irq_safe whether you can use printk at this point + * Outputs : size (total record length) + * *buffer (ptr to error record) + * + */ +static u64 +ia64_log_get(int sal_info_type, u8 **buffer, int irq_safe) +{ + sal_log_record_header_t *log_buffer; + u64 total_len = 0; + unsigned long s; + + IA64_LOG_LOCK(sal_info_type); + + /* Get the process state information */ + log_buffer = IA64_LOG_NEXT_BUFFER(sal_info_type); + + total_len = ia64_sal_get_state_info(sal_info_type, (u64 *)log_buffer); + + if (total_len) { + IA64_LOG_INDEX_INC(sal_info_type); + IA64_LOG_UNLOCK(sal_info_type); + if (irq_safe) { + IA64_MCA_DEBUG("%s: SAL error record type %d retrieved. Record length = %ld\n", + __func__, sal_info_type, total_len); + } + *buffer = (u8 *) log_buffer; + return total_len; + } else { + IA64_LOG_UNLOCK(sal_info_type); + return 0; + } +} + +/* + * ia64_mca_log_sal_error_record + * + * This function retrieves a specified error record type from SAL + * and wakes up any processes waiting for error records. + * + * Inputs : sal_info_type (Type of error record MCA/CMC/CPE) + * FIXME: remove MCA and irq_safe. + */ +static void +ia64_mca_log_sal_error_record(int sal_info_type) +{ + u8 *buffer; + sal_log_record_header_t *rh; + u64 size; + int irq_safe = sal_info_type != SAL_INFO_TYPE_MCA; +#ifdef IA64_MCA_DEBUG_INFO + static const char * const rec_name[] = { "MCA", "INIT", "CMC", "CPE" }; +#endif + + size = ia64_log_get(sal_info_type, &buffer, irq_safe); + if (!size) + return; + + salinfo_log_wakeup(sal_info_type, buffer, size, irq_safe); + + if (irq_safe) + IA64_MCA_DEBUG("CPU %d: SAL log contains %s error record\n", + smp_processor_id(), + sal_info_type < ARRAY_SIZE(rec_name) ? rec_name[sal_info_type] : "UNKNOWN"); + + /* Clear logs from corrected errors in case there's no user-level logger */ + rh = (sal_log_record_header_t *)buffer; + if (rh->severity == sal_log_severity_corrected) + ia64_sal_clear_state_info(sal_info_type); +} + +/* + * search_mca_table + * See if the MCA surfaced in an instruction range + * that has been tagged as recoverable. + * + * Inputs + * first First address range to check + * last Last address range to check + * ip Instruction pointer, address we are looking for + * + * Return value: + * 1 on Success (in the table)/ 0 on Failure (not in the table) + */ +int +search_mca_table (const struct mca_table_entry *first, + const struct mca_table_entry *last, + unsigned long ip) +{ + const struct mca_table_entry *curr; + u64 curr_start, curr_end; + + curr = first; + while (curr <= last) { + curr_start = (u64) &curr->start_addr + curr->start_addr; + curr_end = (u64) &curr->end_addr + curr->end_addr; + + if ((ip >= curr_start) && (ip <= curr_end)) { + return 1; + } + curr++; + } + return 0; +} + +/* Given an address, look for it in the mca tables. */ +int mca_recover_range(unsigned long addr) +{ + extern struct mca_table_entry __start___mca_table[]; + extern struct mca_table_entry __stop___mca_table[]; + + return search_mca_table(__start___mca_table, __stop___mca_table-1, addr); +} +EXPORT_SYMBOL_GPL(mca_recover_range); + +#ifdef CONFIG_ACPI + +int cpe_vector = -1; +int ia64_cpe_irq = -1; + +static irqreturn_t +ia64_mca_cpe_int_handler (int cpe_irq, void *arg) +{ + static unsigned long cpe_history[CPE_HISTORY_LENGTH]; + static int index; + static DEFINE_SPINLOCK(cpe_history_lock); + + IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", + __func__, cpe_irq, smp_processor_id()); + + /* SAL spec states this should run w/ interrupts enabled */ + local_irq_enable(); + + spin_lock(&cpe_history_lock); + if (!cpe_poll_enabled && cpe_vector >= 0) { + + int i, count = 1; /* we know 1 happened now */ + unsigned long now = jiffies; + + for (i = 0; i < CPE_HISTORY_LENGTH; i++) { + if (now - cpe_history[i] <= HZ) + count++; + } + + IA64_MCA_DEBUG(KERN_INFO "CPE threshold %d/%d\n", count, CPE_HISTORY_LENGTH); + if (count >= CPE_HISTORY_LENGTH) { + + cpe_poll_enabled = 1; + spin_unlock(&cpe_history_lock); + disable_irq_nosync(local_vector_to_irq(IA64_CPE_VECTOR)); + + /* + * Corrected errors will still be corrected, but + * make sure there's a log somewhere that indicates + * something is generating more than we can handle. + */ + printk(KERN_WARNING "WARNING: Switching to polling CPE handler; error records may be lost\n"); + + mod_timer(&cpe_poll_timer, jiffies + MIN_CPE_POLL_INTERVAL); + + /* lock already released, get out now */ + goto out; + } else { + cpe_history[index++] = now; + if (index == CPE_HISTORY_LENGTH) + index = 0; + } + } + spin_unlock(&cpe_history_lock); +out: + /* Get the CPE error record and log it */ + ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE); + + local_irq_disable(); + + return IRQ_HANDLED; +} + +#endif /* CONFIG_ACPI */ + +#ifdef CONFIG_ACPI +/* + * ia64_mca_register_cpev + * + * Register the corrected platform error vector with SAL. + * + * Inputs + * cpev Corrected Platform Error Vector number + * + * Outputs + * None + */ +void +ia64_mca_register_cpev (int cpev) +{ + /* Register the CPE interrupt vector with SAL */ + struct ia64_sal_retval isrv; + + isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_CPE_INT, SAL_MC_PARAM_MECHANISM_INT, cpev, 0, 0); + if (isrv.status) { + printk(KERN_ERR "Failed to register Corrected Platform " + "Error interrupt vector with SAL (status %ld)\n", isrv.status); + return; + } + + IA64_MCA_DEBUG("%s: corrected platform error " + "vector %#x registered\n", __func__, cpev); +} +#endif /* CONFIG_ACPI */ + +/* + * ia64_mca_cmc_vector_setup + * + * Setup the corrected machine check vector register in the processor. + * (The interrupt is masked on boot. ia64_mca_late_init unmask this.) + * This function is invoked on a per-processor basis. + * + * Inputs + * None + * + * Outputs + * None + */ +void __cpuinit +ia64_mca_cmc_vector_setup (void) +{ + cmcv_reg_t cmcv; + + cmcv.cmcv_regval = 0; + cmcv.cmcv_mask = 1; /* Mask/disable interrupt at first */ + cmcv.cmcv_vector = IA64_CMC_VECTOR; + ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); + + IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x registered.\n", + __func__, smp_processor_id(), IA64_CMC_VECTOR); + + IA64_MCA_DEBUG("%s: CPU %d CMCV = %#016lx\n", + __func__, smp_processor_id(), ia64_getreg(_IA64_REG_CR_CMCV)); +} + +/* + * ia64_mca_cmc_vector_disable + * + * Mask the corrected machine check vector register in the processor. + * This function is invoked on a per-processor basis. + * + * Inputs + * dummy(unused) + * + * Outputs + * None + */ +static void +ia64_mca_cmc_vector_disable (void *dummy) +{ + cmcv_reg_t cmcv; + + cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV); + + cmcv.cmcv_mask = 1; /* Mask/disable interrupt */ + ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); + + IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x disabled.\n", + __func__, smp_processor_id(), cmcv.cmcv_vector); +} + +/* + * ia64_mca_cmc_vector_enable + * + * Unmask the corrected machine check vector register in the processor. + * This function is invoked on a per-processor basis. + * + * Inputs + * dummy(unused) + * + * Outputs + * None + */ +static void +ia64_mca_cmc_vector_enable (void *dummy) +{ + cmcv_reg_t cmcv; + + cmcv.cmcv_regval = ia64_getreg(_IA64_REG_CR_CMCV); + + cmcv.cmcv_mask = 0; /* Unmask/enable interrupt */ + ia64_setreg(_IA64_REG_CR_CMCV, cmcv.cmcv_regval); + + IA64_MCA_DEBUG("%s: CPU %d corrected machine check vector %#x enabled.\n", + __func__, smp_processor_id(), cmcv.cmcv_vector); +} + +/* + * ia64_mca_cmc_vector_disable_keventd + * + * Called via keventd (smp_call_function() is not safe in interrupt context) to + * disable the cmc interrupt vector. + */ +static void +ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) +{ + on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0); +} + +/* + * ia64_mca_cmc_vector_enable_keventd + * + * Called via keventd (smp_call_function() is not safe in interrupt context) to + * enable the cmc interrupt vector. + */ +static void +ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused) +{ + on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0); +} + +/* + * ia64_mca_wakeup + * + * Send an inter-cpu interrupt to wake-up a particular cpu. + * + * Inputs : cpuid + * Outputs : None + */ +static void +ia64_mca_wakeup(int cpu) +{ + platform_send_ipi(cpu, IA64_MCA_WAKEUP_VECTOR, IA64_IPI_DM_INT, 0); +} + +/* + * ia64_mca_wakeup_all + * + * Wakeup all the slave cpus which have rendez'ed previously. + * + * Inputs : None + * Outputs : None + */ +static void +ia64_mca_wakeup_all(void) +{ + int cpu; + + /* Clear the Rendez checkin flag for all cpus */ + for_each_online_cpu(cpu) { + if (ia64_mc_info.imi_rendez_checkin[cpu] == IA64_MCA_RENDEZ_CHECKIN_DONE) + ia64_mca_wakeup(cpu); + } + +} + +/* + * ia64_mca_rendez_interrupt_handler + * + * This is handler used to put slave processors into spinloop + * while the monarch processor does the mca handling and later + * wake each slave up once the monarch is done. The state + * IA64_MCA_RENDEZ_CHECKIN_DONE indicates the cpu is rendez'ed + * in SAL. The state IA64_MCA_RENDEZ_CHECKIN_NOTDONE indicates + * the cpu has come out of OS rendezvous. + * + * Inputs : None + * Outputs : None + */ +static irqreturn_t +ia64_mca_rendez_int_handler(int rendez_irq, void *arg) +{ + unsigned long flags; + int cpu = smp_processor_id(); + struct ia64_mca_notify_die nd = + { .sos = NULL, .monarch_cpu = &monarch_cpu }; + + /* Mask all interrupts */ + local_irq_save(flags); + + NOTIFY_MCA(DIE_MCA_RENDZVOUS_ENTER, get_irq_regs(), (long)&nd, 1); + + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_DONE; + /* Register with the SAL monarch that the slave has + * reached SAL + */ + ia64_sal_mc_rendez(); + + NOTIFY_MCA(DIE_MCA_RENDZVOUS_PROCESS, get_irq_regs(), (long)&nd, 1); + + /* Wait for the monarch cpu to exit. */ + while (monarch_cpu != -1) + cpu_relax(); /* spin until monarch leaves */ + + NOTIFY_MCA(DIE_MCA_RENDZVOUS_LEAVE, get_irq_regs(), (long)&nd, 1); + + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + /* Enable all interrupts */ + local_irq_restore(flags); + return IRQ_HANDLED; +} + +/* + * ia64_mca_wakeup_int_handler + * + * The interrupt handler for processing the inter-cpu interrupt to the + * slave cpu which was spinning in the rendez loop. + * Since this spinning is done by turning off the interrupts and + * polling on the wakeup-interrupt bit in the IRR, there is + * nothing useful to be done in the handler. + * + * Inputs : wakeup_irq (Wakeup-interrupt bit) + * arg (Interrupt handler specific argument) + * Outputs : None + * + */ +static irqreturn_t +ia64_mca_wakeup_int_handler(int wakeup_irq, void *arg) +{ + return IRQ_HANDLED; +} + +/* Function pointer for extra MCA recovery */ +int (*ia64_mca_ucmc_extension) + (void*,struct ia64_sal_os_state*) + = NULL; + +int +ia64_reg_MCA_extension(int (*fn)(void *, struct ia64_sal_os_state *)) +{ + if (ia64_mca_ucmc_extension) + return 1; + + ia64_mca_ucmc_extension = fn; + return 0; +} + +void +ia64_unreg_MCA_extension(void) +{ + if (ia64_mca_ucmc_extension) + ia64_mca_ucmc_extension = NULL; +} + +EXPORT_SYMBOL(ia64_reg_MCA_extension); +EXPORT_SYMBOL(ia64_unreg_MCA_extension); + + +static inline void +copy_reg(const u64 *fr, u64 fnat, unsigned long *tr, unsigned long *tnat) +{ + u64 fslot, tslot, nat; + *tr = *fr; + fslot = ((unsigned long)fr >> 3) & 63; + tslot = ((unsigned long)tr >> 3) & 63; + *tnat &= ~(1UL << tslot); + nat = (fnat >> fslot) & 1; + *tnat |= (nat << tslot); +} + +/* Change the comm field on the MCA/INT task to include the pid that + * was interrupted, it makes for easier debugging. If that pid was 0 + * (swapper or nested MCA/INIT) then use the start of the previous comm + * field suffixed with its cpu. + */ + +static void +ia64_mca_modify_comm(const struct task_struct *previous_current) +{ + char *p, comm[sizeof(current->comm)]; + if (previous_current->pid) + snprintf(comm, sizeof(comm), "%s %d", + current->comm, previous_current->pid); + else { + int l; + if ((p = strchr(previous_current->comm, ' '))) + l = p - previous_current->comm; + else + l = strlen(previous_current->comm); + snprintf(comm, sizeof(comm), "%s %*s %d", + current->comm, l, previous_current->comm, + task_thread_info(previous_current)->cpu); + } + memcpy(current->comm, comm, sizeof(current->comm)); +} + +static void +finish_pt_regs(struct pt_regs *regs, struct ia64_sal_os_state *sos, + unsigned long *nat) +{ + const pal_min_state_area_t *ms = sos->pal_min_state; + const u64 *bank; + + /* If ipsr.ic then use pmsa_{iip,ipsr,ifs}, else use + * pmsa_{xip,xpsr,xfs} + */ + if (ia64_psr(regs)->ic) { + regs->cr_iip = ms->pmsa_iip; + regs->cr_ipsr = ms->pmsa_ipsr; + regs->cr_ifs = ms->pmsa_ifs; + } else { + regs->cr_iip = ms->pmsa_xip; + regs->cr_ipsr = ms->pmsa_xpsr; + regs->cr_ifs = ms->pmsa_xfs; + + sos->iip = ms->pmsa_iip; + sos->ipsr = ms->pmsa_ipsr; + sos->ifs = ms->pmsa_ifs; + } + regs->pr = ms->pmsa_pr; + regs->b0 = ms->pmsa_br0; + regs->ar_rsc = ms->pmsa_rsc; + copy_reg(&ms->pmsa_gr[1-1], ms->pmsa_nat_bits, ®s->r1, nat); + copy_reg(&ms->pmsa_gr[2-1], ms->pmsa_nat_bits, ®s->r2, nat); + copy_reg(&ms->pmsa_gr[3-1], ms->pmsa_nat_bits, ®s->r3, nat); + copy_reg(&ms->pmsa_gr[8-1], ms->pmsa_nat_bits, ®s->r8, nat); + copy_reg(&ms->pmsa_gr[9-1], ms->pmsa_nat_bits, ®s->r9, nat); + copy_reg(&ms->pmsa_gr[10-1], ms->pmsa_nat_bits, ®s->r10, nat); + copy_reg(&ms->pmsa_gr[11-1], ms->pmsa_nat_bits, ®s->r11, nat); + copy_reg(&ms->pmsa_gr[12-1], ms->pmsa_nat_bits, ®s->r12, nat); + copy_reg(&ms->pmsa_gr[13-1], ms->pmsa_nat_bits, ®s->r13, nat); + copy_reg(&ms->pmsa_gr[14-1], ms->pmsa_nat_bits, ®s->r14, nat); + copy_reg(&ms->pmsa_gr[15-1], ms->pmsa_nat_bits, ®s->r15, nat); + if (ia64_psr(regs)->bn) + bank = ms->pmsa_bank1_gr; + else + bank = ms->pmsa_bank0_gr; + copy_reg(&bank[16-16], ms->pmsa_nat_bits, ®s->r16, nat); + copy_reg(&bank[17-16], ms->pmsa_nat_bits, ®s->r17, nat); + copy_reg(&bank[18-16], ms->pmsa_nat_bits, ®s->r18, nat); + copy_reg(&bank[19-16], ms->pmsa_nat_bits, ®s->r19, nat); + copy_reg(&bank[20-16], ms->pmsa_nat_bits, ®s->r20, nat); + copy_reg(&bank[21-16], ms->pmsa_nat_bits, ®s->r21, nat); + copy_reg(&bank[22-16], ms->pmsa_nat_bits, ®s->r22, nat); + copy_reg(&bank[23-16], ms->pmsa_nat_bits, ®s->r23, nat); + copy_reg(&bank[24-16], ms->pmsa_nat_bits, ®s->r24, nat); + copy_reg(&bank[25-16], ms->pmsa_nat_bits, ®s->r25, nat); + copy_reg(&bank[26-16], ms->pmsa_nat_bits, ®s->r26, nat); + copy_reg(&bank[27-16], ms->pmsa_nat_bits, ®s->r27, nat); + copy_reg(&bank[28-16], ms->pmsa_nat_bits, ®s->r28, nat); + copy_reg(&bank[29-16], ms->pmsa_nat_bits, ®s->r29, nat); + copy_reg(&bank[30-16], ms->pmsa_nat_bits, ®s->r30, nat); + copy_reg(&bank[31-16], ms->pmsa_nat_bits, ®s->r31, nat); +} + +/* On entry to this routine, we are running on the per cpu stack, see + * mca_asm.h. The original stack has not been touched by this event. Some of + * the original stack's registers will be in the RBS on this stack. This stack + * also contains a partial pt_regs and switch_stack, the rest of the data is in + * PAL minstate. + * + * The first thing to do is modify the original stack to look like a blocked + * task so we can run backtrace on the original task. Also mark the per cpu + * stack as current to ensure that we use the correct task state, it also means + * that we can do backtrace on the MCA/INIT handler code itself. + */ + +static struct task_struct * +ia64_mca_modify_original_stack(struct pt_regs *regs, + const struct switch_stack *sw, + struct ia64_sal_os_state *sos, + const char *type) +{ + char *p; + ia64_va va; + extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ + const pal_min_state_area_t *ms = sos->pal_min_state; + struct task_struct *previous_current; + struct pt_regs *old_regs; + struct switch_stack *old_sw; + unsigned size = sizeof(struct pt_regs) + + sizeof(struct switch_stack) + 16; + unsigned long *old_bspstore, *old_bsp; + unsigned long *new_bspstore, *new_bsp; + unsigned long old_unat, old_rnat, new_rnat, nat; + u64 slots, loadrs = regs->loadrs; + u64 r12 = ms->pmsa_gr[12-1], r13 = ms->pmsa_gr[13-1]; + u64 ar_bspstore = regs->ar_bspstore; + u64 ar_bsp = regs->ar_bspstore + (loadrs >> 16); + const char *msg; + int cpu = smp_processor_id(); + + previous_current = curr_task(cpu); + set_curr_task(cpu, current); + if ((p = strchr(current->comm, ' '))) + *p = '\0'; + + /* Best effort attempt to cope with MCA/INIT delivered while in + * physical mode. + */ + regs->cr_ipsr = ms->pmsa_ipsr; + if (ia64_psr(regs)->dt == 0) { + va.l = r12; + if (va.f.reg == 0) { + va.f.reg = 7; + r12 = va.l; + } + va.l = r13; + if (va.f.reg == 0) { + va.f.reg = 7; + r13 = va.l; + } + } + if (ia64_psr(regs)->rt == 0) { + va.l = ar_bspstore; + if (va.f.reg == 0) { + va.f.reg = 7; + ar_bspstore = va.l; + } + va.l = ar_bsp; + if (va.f.reg == 0) { + va.f.reg = 7; + ar_bsp = va.l; + } + } + + /* mca_asm.S ia64_old_stack() cannot assume that the dirty registers + * have been copied to the old stack, the old stack may fail the + * validation tests below. So ia64_old_stack() must restore the dirty + * registers from the new stack. The old and new bspstore probably + * have different alignments, so loadrs calculated on the old bsp + * cannot be used to restore from the new bsp. Calculate a suitable + * loadrs for the new stack and save it in the new pt_regs, where + * ia64_old_stack() can get it. + */ + old_bspstore = (unsigned long *)ar_bspstore; + old_bsp = (unsigned long *)ar_bsp; + slots = ia64_rse_num_regs(old_bspstore, old_bsp); + new_bspstore = (unsigned long *)((u64)current + IA64_RBS_OFFSET); + new_bsp = ia64_rse_skip_regs(new_bspstore, slots); + regs->loadrs = (new_bsp - new_bspstore) * 8 << 16; + + /* Verify the previous stack state before we change it */ + if (user_mode(regs)) { + msg = "occurred in user space"; + /* previous_current is guaranteed to be valid when the task was + * in user space, so ... + */ + ia64_mca_modify_comm(previous_current); + goto no_mod; + } + + if (r13 != sos->prev_IA64_KR_CURRENT) { + msg = "inconsistent previous current and r13"; + goto no_mod; + } + + if (!mca_recover_range(ms->pmsa_iip)) { + if ((r12 - r13) >= KERNEL_STACK_SIZE) { + msg = "inconsistent r12 and r13"; + goto no_mod; + } + if ((ar_bspstore - r13) >= KERNEL_STACK_SIZE) { + msg = "inconsistent ar.bspstore and r13"; + goto no_mod; + } + va.p = old_bspstore; + if (va.f.reg < 5) { + msg = "old_bspstore is in the wrong region"; + goto no_mod; + } + if ((ar_bsp - r13) >= KERNEL_STACK_SIZE) { + msg = "inconsistent ar.bsp and r13"; + goto no_mod; + } + size += (ia64_rse_skip_regs(old_bspstore, slots) - old_bspstore) * 8; + if (ar_bspstore + size > r12) { + msg = "no room for blocked state"; + goto no_mod; + } + } + + ia64_mca_modify_comm(previous_current); + + /* Make the original task look blocked. First stack a struct pt_regs, + * describing the state at the time of interrupt. mca_asm.S built a + * partial pt_regs, copy it and fill in the blanks using minstate. + */ + p = (char *)r12 - sizeof(*regs); + old_regs = (struct pt_regs *)p; + memcpy(old_regs, regs, sizeof(*regs)); + old_regs->loadrs = loadrs; + old_unat = old_regs->ar_unat; + finish_pt_regs(old_regs, sos, &old_unat); + + /* Next stack a struct switch_stack. mca_asm.S built a partial + * switch_stack, copy it and fill in the blanks using pt_regs and + * minstate. + * + * In the synthesized switch_stack, b0 points to ia64_leave_kernel, + * ar.pfs is set to 0. + * + * unwind.c::unw_unwind() does special processing for interrupt frames. + * It checks if the PRED_NON_SYSCALL predicate is set, if the predicate + * is clear then unw_unwind() does _not_ adjust bsp over pt_regs. Not + * that this is documented, of course. Set PRED_NON_SYSCALL in the + * switch_stack on the original stack so it will unwind correctly when + * unwind.c reads pt_regs. + * + * thread.ksp is updated to point to the synthesized switch_stack. + */ + p -= sizeof(struct switch_stack); + old_sw = (struct switch_stack *)p; + memcpy(old_sw, sw, sizeof(*sw)); + old_sw->caller_unat = old_unat; + old_sw->ar_fpsr = old_regs->ar_fpsr; + copy_reg(&ms->pmsa_gr[4-1], ms->pmsa_nat_bits, &old_sw->r4, &old_unat); + copy_reg(&ms->pmsa_gr[5-1], ms->pmsa_nat_bits, &old_sw->r5, &old_unat); + copy_reg(&ms->pmsa_gr[6-1], ms->pmsa_nat_bits, &old_sw->r6, &old_unat); + copy_reg(&ms->pmsa_gr[7-1], ms->pmsa_nat_bits, &old_sw->r7, &old_unat); + old_sw->b0 = (u64)ia64_leave_kernel; + old_sw->b1 = ms->pmsa_br1; + old_sw->ar_pfs = 0; + old_sw->ar_unat = old_unat; + old_sw->pr = old_regs->pr | (1UL << PRED_NON_SYSCALL); + previous_current->thread.ksp = (u64)p - 16; + + /* Finally copy the original stack's registers back to its RBS. + * Registers from ar.bspstore through ar.bsp at the time of the event + * are in the current RBS, copy them back to the original stack. The + * copy must be done register by register because the original bspstore + * and the current one have different alignments, so the saved RNAT + * data occurs at different places. + * + * mca_asm does cover, so the old_bsp already includes all registers at + * the time of MCA/INIT. It also does flushrs, so all registers before + * this function have been written to backing store on the MCA/INIT + * stack. + */ + new_rnat = ia64_get_rnat(ia64_rse_rnat_addr(new_bspstore)); + old_rnat = regs->ar_rnat; + while (slots--) { + if (ia64_rse_is_rnat_slot(new_bspstore)) { + new_rnat = ia64_get_rnat(new_bspstore++); + } + if (ia64_rse_is_rnat_slot(old_bspstore)) { + *old_bspstore++ = old_rnat; + old_rnat = 0; + } + nat = (new_rnat >> ia64_rse_slot_num(new_bspstore)) & 1UL; + old_rnat &= ~(1UL << ia64_rse_slot_num(old_bspstore)); + old_rnat |= (nat << ia64_rse_slot_num(old_bspstore)); + *old_bspstore++ = *new_bspstore++; + } + old_sw->ar_bspstore = (unsigned long)old_bspstore; + old_sw->ar_rnat = old_rnat; + + sos->prev_task = previous_current; + return previous_current; + +no_mod: + mprintk(KERN_INFO "cpu %d, %s %s, original stack not modified\n", + smp_processor_id(), type, msg); + old_unat = regs->ar_unat; + finish_pt_regs(regs, sos, &old_unat); + return previous_current; +} + +/* The monarch/slave interaction is based on monarch_cpu and requires that all + * slaves have entered rendezvous before the monarch leaves. If any cpu has + * not entered rendezvous yet then wait a bit. The assumption is that any + * slave that has not rendezvoused after a reasonable time is never going to do + * so. In this context, slave includes cpus that respond to the MCA rendezvous + * interrupt, as well as cpus that receive the INIT slave event. + */ + +static void +ia64_wait_for_slaves(int monarch, const char *type) +{ + int c, i , wait; + + /* + * wait 5 seconds total for slaves (arbitrary) + */ + for (i = 0; i < 5000; i++) { + wait = 0; + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] + == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { + udelay(1000); /* short wait */ + wait = 1; + break; + } + } + if (!wait) + goto all_in; + } + + /* + * Maybe slave(s) dead. Print buffered messages immediately. + */ + ia64_mlogbuf_finish(0); + mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type); + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + mprintk(" %d", c); + } + mprintk("\n"); + return; + +all_in: + mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type); + return; +} + +/* mca_insert_tr + * + * Switch rid when TR reload and needed! + * iord: 1: itr, 2: itr; + * +*/ +static void mca_insert_tr(u64 iord) +{ + + int i; + u64 old_rr; + struct ia64_tr_entry *p; + unsigned long psr; + int cpu = smp_processor_id(); + + if (!ia64_idtrs[cpu]) + return; + + psr = ia64_clear_ic(); + for (i = IA64_TR_ALLOC_BASE; i < IA64_TR_ALLOC_MAX; i++) { + p = ia64_idtrs[cpu] + (iord - 1) * IA64_TR_ALLOC_MAX; + if (p->pte & 0x1) { + old_rr = ia64_get_rr(p->ifa); + if (old_rr != p->rr) { + ia64_set_rr(p->ifa, p->rr); + ia64_srlz_d(); + } + ia64_ptr(iord, p->ifa, p->itir >> 2); + ia64_srlz_i(); + if (iord & 0x1) { + ia64_itr(0x1, i, p->ifa, p->pte, p->itir >> 2); + ia64_srlz_i(); + } + if (iord & 0x2) { + ia64_itr(0x2, i, p->ifa, p->pte, p->itir >> 2); + ia64_srlz_i(); + } + if (old_rr != p->rr) { + ia64_set_rr(p->ifa, old_rr); + ia64_srlz_d(); + } + } + } + ia64_set_psr(psr); +} + +/* + * ia64_mca_handler + * + * This is uncorrectable machine check handler called from OS_MCA + * dispatch code which is in turn called from SAL_CHECK(). + * This is the place where the core of OS MCA handling is done. + * Right now the logs are extracted and displayed in a well-defined + * format. This handler code is supposed to be run only on the + * monarch processor. Once the monarch is done with MCA handling + * further MCA logging is enabled by clearing logs. + * Monarch also has the duty of sending wakeup-IPIs to pull the + * slave processors out of rendezvous spinloop. + * + * If multiple processors call into OS_MCA, the first will become + * the monarch. Subsequent cpus will be recorded in the mca_cpu + * bitmask. After the first monarch has processed its MCA, it + * will wake up the next cpu in the mca_cpu bitmask and then go + * into the rendezvous loop. When all processors have serviced + * their MCA, the last monarch frees up the rest of the processors. + */ +void +ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, + struct ia64_sal_os_state *sos) +{ + int recover, cpu = smp_processor_id(); + struct task_struct *previous_current; + struct ia64_mca_notify_die nd = + { .sos = sos, .monarch_cpu = &monarch_cpu, .data = &recover }; + static atomic_t mca_count; + static cpumask_t mca_cpu; + + if (atomic_add_return(1, &mca_count) == 1) { + monarch_cpu = cpu; + sos->monarch = 1; + } else { + cpu_set(cpu, mca_cpu); + sos->monarch = 0; + } + mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d " + "monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch); + + previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); + + NOTIFY_MCA(DIE_MCA_MONARCH_ENTER, regs, (long)&nd, 1); + + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_CONCURRENT_MCA; + if (sos->monarch) { + ia64_wait_for_slaves(cpu, "MCA"); + + /* Wakeup all the processors which are spinning in the + * rendezvous loop. They will leave SAL, then spin in the OS + * with interrupts disabled until this monarch cpu leaves the + * MCA handler. That gets control back to the OS so we can + * backtrace the other cpus, backtrace when spinning in SAL + * does not work. + */ + ia64_mca_wakeup_all(); + } else { + while (cpu_isset(cpu, mca_cpu)) + cpu_relax(); /* spin until monarch wakes us */ + } + + NOTIFY_MCA(DIE_MCA_MONARCH_PROCESS, regs, (long)&nd, 1); + + /* Get the MCA error record and log it */ + ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA); + + /* MCA error recovery */ + recover = (ia64_mca_ucmc_extension + && ia64_mca_ucmc_extension( + IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA), + sos)); + + if (recover) { + sal_log_record_header_t *rh = IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_MCA); + rh->severity = sal_log_severity_corrected; + ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA); + sos->os_status = IA64_MCA_CORRECTED; + } else { + /* Dump buffered message to console */ + ia64_mlogbuf_finish(1); + } + + if (__get_cpu_var(ia64_mca_tr_reload)) { + mca_insert_tr(0x1); /*Reload dynamic itrs*/ + mca_insert_tr(0x2); /*Reload dynamic itrs*/ + } + + NOTIFY_MCA(DIE_MCA_MONARCH_LEAVE, regs, (long)&nd, 1); + + if (atomic_dec_return(&mca_count) > 0) { + int i; + + /* wake up the next monarch cpu, + * and put this cpu in the rendez loop. + */ + for_each_online_cpu(i) { + if (cpu_isset(i, mca_cpu)) { + monarch_cpu = i; + cpu_clear(i, mca_cpu); /* wake next cpu */ + while (monarch_cpu != -1) + cpu_relax(); /* spin until last cpu leaves */ + set_curr_task(cpu, previous_current); + ia64_mc_info.imi_rendez_checkin[cpu] + = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + return; + } + } + } + set_curr_task(cpu, previous_current); + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + monarch_cpu = -1; /* This frees the slaves and previous monarchs */ +} + +static DECLARE_WORK(cmc_disable_work, ia64_mca_cmc_vector_disable_keventd); +static DECLARE_WORK(cmc_enable_work, ia64_mca_cmc_vector_enable_keventd); + +/* + * ia64_mca_cmc_int_handler + * + * This is corrected machine check interrupt handler. + * Right now the logs are extracted and displayed in a well-defined + * format. + * + * Inputs + * interrupt number + * client data arg ptr + * + * Outputs + * None + */ +static irqreturn_t +ia64_mca_cmc_int_handler(int cmc_irq, void *arg) +{ + static unsigned long cmc_history[CMC_HISTORY_LENGTH]; + static int index; + static DEFINE_SPINLOCK(cmc_history_lock); + + IA64_MCA_DEBUG("%s: received interrupt vector = %#x on CPU %d\n", + __func__, cmc_irq, smp_processor_id()); + + /* SAL spec states this should run w/ interrupts enabled */ + local_irq_enable(); + + spin_lock(&cmc_history_lock); + if (!cmc_polling_enabled) { + int i, count = 1; /* we know 1 happened now */ + unsigned long now = jiffies; + + for (i = 0; i < CMC_HISTORY_LENGTH; i++) { + if (now - cmc_history[i] <= HZ) + count++; + } + + IA64_MCA_DEBUG(KERN_INFO "CMC threshold %d/%d\n", count, CMC_HISTORY_LENGTH); + if (count >= CMC_HISTORY_LENGTH) { + + cmc_polling_enabled = 1; + spin_unlock(&cmc_history_lock); + /* If we're being hit with CMC interrupts, we won't + * ever execute the schedule_work() below. Need to + * disable CMC interrupts on this processor now. + */ + ia64_mca_cmc_vector_disable(NULL); + schedule_work(&cmc_disable_work); + + /* + * Corrected errors will still be corrected, but + * make sure there's a log somewhere that indicates + * something is generating more than we can handle. + */ + printk(KERN_WARNING "WARNING: Switching to polling CMC handler; error records may be lost\n"); + + mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL); + + /* lock already released, get out now */ + goto out; + } else { + cmc_history[index++] = now; + if (index == CMC_HISTORY_LENGTH) + index = 0; + } + } + spin_unlock(&cmc_history_lock); +out: + /* Get the CMC error record and log it */ + ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC); + + return IRQ_HANDLED; +} + +/* + * ia64_mca_cmc_int_caller + * + * Triggered by sw interrupt from CMC polling routine. Calls + * real interrupt handler and either triggers a sw interrupt + * on the next cpu or does cleanup at the end. + * + * Inputs + * interrupt number + * client data arg ptr + * Outputs + * handled + */ +static irqreturn_t +ia64_mca_cmc_int_caller(int cmc_irq, void *arg) +{ + static int start_count = -1; + unsigned int cpuid; + + cpuid = smp_processor_id(); + + /* If first cpu, update count */ + if (start_count == -1) + start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CMC); + + ia64_mca_cmc_int_handler(cmc_irq, arg); + + cpuid = cpumask_next(cpuid+1, cpu_online_mask); + + if (cpuid < nr_cpu_ids) { + platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); + } else { + /* If no log record, switch out of polling mode */ + if (start_count == IA64_LOG_COUNT(SAL_INFO_TYPE_CMC)) { + + printk(KERN_WARNING "Returning to interrupt driven CMC handler\n"); + schedule_work(&cmc_enable_work); + cmc_polling_enabled = 0; + + } else { + + mod_timer(&cmc_poll_timer, jiffies + CMC_POLL_INTERVAL); + } + + start_count = -1; + } + + return IRQ_HANDLED; +} + +/* + * ia64_mca_cmc_poll + * + * Poll for Corrected Machine Checks (CMCs) + * + * Inputs : dummy(unused) + * Outputs : None + * + */ +static void +ia64_mca_cmc_poll (unsigned long dummy) +{ + /* Trigger a CMC interrupt cascade */ + platform_send_ipi(first_cpu(cpu_online_map), IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0); +} + +/* + * ia64_mca_cpe_int_caller + * + * Triggered by sw interrupt from CPE polling routine. Calls + * real interrupt handler and either triggers a sw interrupt + * on the next cpu or does cleanup at the end. + * + * Inputs + * interrupt number + * client data arg ptr + * Outputs + * handled + */ +#ifdef CONFIG_ACPI + +static irqreturn_t +ia64_mca_cpe_int_caller(int cpe_irq, void *arg) +{ + static int start_count = -1; + static int poll_time = MIN_CPE_POLL_INTERVAL; + unsigned int cpuid; + + cpuid = smp_processor_id(); + + /* If first cpu, update count */ + if (start_count == -1) + start_count = IA64_LOG_COUNT(SAL_INFO_TYPE_CPE); + + ia64_mca_cpe_int_handler(cpe_irq, arg); + + cpuid = cpumask_next(cpuid+1, cpu_online_mask); + + if (cpuid < NR_CPUS) { + platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); + } else { + /* + * If a log was recorded, increase our polling frequency, + * otherwise, backoff or return to interrupt mode. + */ + if (start_count != IA64_LOG_COUNT(SAL_INFO_TYPE_CPE)) { + poll_time = max(MIN_CPE_POLL_INTERVAL, poll_time / 2); + } else if (cpe_vector < 0) { + poll_time = min(MAX_CPE_POLL_INTERVAL, poll_time * 2); + } else { + poll_time = MIN_CPE_POLL_INTERVAL; + + printk(KERN_WARNING "Returning to interrupt driven CPE handler\n"); + enable_irq(local_vector_to_irq(IA64_CPE_VECTOR)); + cpe_poll_enabled = 0; + } + + if (cpe_poll_enabled) + mod_timer(&cpe_poll_timer, jiffies + poll_time); + start_count = -1; + } + + return IRQ_HANDLED; +} + +/* + * ia64_mca_cpe_poll + * + * Poll for Corrected Platform Errors (CPEs), trigger interrupt + * on first cpu, from there it will trickle through all the cpus. + * + * Inputs : dummy(unused) + * Outputs : None + * + */ +static void +ia64_mca_cpe_poll (unsigned long dummy) +{ + /* Trigger a CPE interrupt cascade */ + platform_send_ipi(first_cpu(cpu_online_map), IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0); +} + +#endif /* CONFIG_ACPI */ + +static int +default_monarch_init_process(struct notifier_block *self, unsigned long val, void *data) +{ + int c; + struct task_struct *g, *t; + if (val != DIE_INIT_MONARCH_PROCESS) + return NOTIFY_DONE; +#ifdef CONFIG_KEXEC + if (atomic_read(&kdump_in_progress)) + return NOTIFY_DONE; +#endif + + /* + * FIXME: mlogbuf will brim over with INIT stack dumps. + * To enable show_stack from INIT, we use oops_in_progress which should + * be used in real oops. This would cause something wrong after INIT. + */ + BREAK_LOGLEVEL(console_loglevel); + ia64_mlogbuf_dump_from_init(); + + printk(KERN_ERR "Processes interrupted by INIT -"); + for_each_online_cpu(c) { + struct ia64_sal_os_state *s; + t = __va(__per_cpu_mca[c] + IA64_MCA_CPU_INIT_STACK_OFFSET); + s = (struct ia64_sal_os_state *)((char *)t + MCA_SOS_OFFSET); + g = s->prev_task; + if (g) { + if (g->pid) + printk(" %d", g->pid); + else + printk(" %d (cpu %d task 0x%p)", g->pid, task_cpu(g), g); + } + } + printk("\n\n"); + if (read_trylock(&tasklist_lock)) { + do_each_thread (g, t) { + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); + } while_each_thread (g, t); + read_unlock(&tasklist_lock); + } + /* FIXME: This will not restore zapped printk locks. */ + RESTORE_LOGLEVEL(console_loglevel); + return NOTIFY_DONE; +} + +/* + * C portion of the OS INIT handler + * + * Called from ia64_os_init_dispatch + * + * Inputs: pointer to pt_regs where processor info was saved. SAL/OS state for + * this event. This code is used for both monarch and slave INIT events, see + * sos->monarch. + * + * All INIT events switch to the INIT stack and change the previous process to + * blocked status. If one of the INIT events is the monarch then we are + * probably processing the nmi button/command. Use the monarch cpu to dump all + * the processes. The slave INIT events all spin until the monarch cpu + * returns. We can also get INIT slave events for MCA, in which case the MCA + * process is the monarch. + */ + +void +ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, + struct ia64_sal_os_state *sos) +{ + static atomic_t slaves; + static atomic_t monarchs; + struct task_struct *previous_current; + int cpu = smp_processor_id(); + struct ia64_mca_notify_die nd = + { .sos = sos, .monarch_cpu = &monarch_cpu }; + + NOTIFY_INIT(DIE_INIT_ENTER, regs, (long)&nd, 0); + + mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n", + sos->proc_state_param, cpu, sos->monarch); + salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0); + + previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "INIT"); + sos->os_status = IA64_INIT_RESUME; + + /* FIXME: Workaround for broken proms that drive all INIT events as + * slaves. The last slave that enters is promoted to be a monarch. + * Remove this code in September 2006, that gives platforms a year to + * fix their proms and get their customers updated. + */ + if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) { + mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n", + __func__, cpu); + atomic_dec(&slaves); + sos->monarch = 1; + } + + /* FIXME: Workaround for broken proms that drive all INIT events as + * monarchs. Second and subsequent monarchs are demoted to slaves. + * Remove this code in September 2006, that gives platforms a year to + * fix their proms and get their customers updated. + */ + if (sos->monarch && atomic_add_return(1, &monarchs) > 1) { + mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n", + __func__, cpu); + atomic_dec(&monarchs); + sos->monarch = 0; + } + + if (!sos->monarch) { + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_INIT; + +#ifdef CONFIG_KEXEC + while (monarch_cpu == -1 && !atomic_read(&kdump_in_progress)) + udelay(1000); +#else + while (monarch_cpu == -1) + cpu_relax(); /* spin until monarch enters */ +#endif + + NOTIFY_INIT(DIE_INIT_SLAVE_ENTER, regs, (long)&nd, 1); + NOTIFY_INIT(DIE_INIT_SLAVE_PROCESS, regs, (long)&nd, 1); + +#ifdef CONFIG_KEXEC + while (monarch_cpu != -1 && !atomic_read(&kdump_in_progress)) + udelay(1000); +#else + while (monarch_cpu != -1) + cpu_relax(); /* spin until monarch leaves */ +#endif + + NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1); + + mprintk("Slave on cpu %d returning to normal service.\n", cpu); + set_curr_task(cpu, previous_current); + ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + atomic_dec(&slaves); + return; + } + + monarch_cpu = cpu; + NOTIFY_INIT(DIE_INIT_MONARCH_ENTER, regs, (long)&nd, 1); + + /* + * Wait for a bit. On some machines (e.g., HP's zx2000 and zx6000, INIT can be + * generated via the BMC's command-line interface, but since the console is on the + * same serial line, the user will need some time to switch out of the BMC before + * the dump begins. + */ + mprintk("Delaying for 5 seconds...\n"); + udelay(5*1000000); + ia64_wait_for_slaves(cpu, "INIT"); + /* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through + * to default_monarch_init_process() above and just print all the + * tasks. + */ + NOTIFY_INIT(DIE_INIT_MONARCH_PROCESS, regs, (long)&nd, 1); + NOTIFY_INIT(DIE_INIT_MONARCH_LEAVE, regs, (long)&nd, 1); + + mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); + atomic_dec(&monarchs); + set_curr_task(cpu, previous_current); + monarch_cpu = -1; + return; +} + +static int __init +ia64_mca_disable_cpe_polling(char *str) +{ + cpe_poll_enabled = 0; + return 1; +} + +__setup("disable_cpe_poll", ia64_mca_disable_cpe_polling); + +static struct irqaction cmci_irqaction = { + .handler = ia64_mca_cmc_int_handler, + .flags = IRQF_DISABLED, + .name = "cmc_hndlr" +}; + +static struct irqaction cmcp_irqaction = { + .handler = ia64_mca_cmc_int_caller, + .flags = IRQF_DISABLED, + .name = "cmc_poll" +}; + +static struct irqaction mca_rdzv_irqaction = { + .handler = ia64_mca_rendez_int_handler, + .flags = IRQF_DISABLED, + .name = "mca_rdzv" +}; + +static struct irqaction mca_wkup_irqaction = { + .handler = ia64_mca_wakeup_int_handler, + .flags = IRQF_DISABLED, + .name = "mca_wkup" +}; + +#ifdef CONFIG_ACPI +static struct irqaction mca_cpe_irqaction = { + .handler = ia64_mca_cpe_int_handler, + .flags = IRQF_DISABLED, + .name = "cpe_hndlr" +}; + +static struct irqaction mca_cpep_irqaction = { + .handler = ia64_mca_cpe_int_caller, + .flags = IRQF_DISABLED, + .name = "cpe_poll" +}; +#endif /* CONFIG_ACPI */ + +/* Minimal format of the MCA/INIT stacks. The pseudo processes that run on + * these stacks can never sleep, they cannot return from the kernel to user + * space, they do not appear in a normal ps listing. So there is no need to + * format most of the fields. + */ + +static void __cpuinit +format_mca_init_stack(void *mca_data, unsigned long offset, + const char *type, int cpu) +{ + struct task_struct *p = (struct task_struct *)((char *)mca_data + offset); + struct thread_info *ti; + memset(p, 0, KERNEL_STACK_SIZE); + ti = task_thread_info(p); + ti->flags = _TIF_MCA_INIT; + ti->preempt_count = 1; + ti->task = p; + ti->cpu = cpu; + p->stack = ti; + p->state = TASK_UNINTERRUPTIBLE; + cpu_set(cpu, p->cpus_allowed); + INIT_LIST_HEAD(&p->tasks); + p->parent = p->real_parent = p->group_leader = p; + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); + strncpy(p->comm, type, sizeof(p->comm)-1); +} + +/* Caller prevents this from being called after init */ +static void * __init_refok mca_bootmem(void) +{ + return __alloc_bootmem(sizeof(struct ia64_mca_cpu), + KERNEL_STACK_SIZE, 0); +} + +/* Do per-CPU MCA-related initialization. */ +void __cpuinit +ia64_mca_cpu_init(void *cpu_data) +{ + void *pal_vaddr; + void *data; + long sz = sizeof(struct ia64_mca_cpu); + int cpu = smp_processor_id(); + static int first_time = 1; + + /* + * Structure will already be allocated if cpu has been online, + * then offlined. + */ + if (__per_cpu_mca[cpu]) { + data = __va(__per_cpu_mca[cpu]); + } else { + if (first_time) { + data = mca_bootmem(); + first_time = 0; + } else + data = (void *)__get_free_pages(GFP_KERNEL, + get_order(sz)); + if (!data) + panic("Could not allocate MCA memory for cpu %d\n", + cpu); + } + format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, mca_stack), + "MCA", cpu); + format_mca_init_stack(data, offsetof(struct ia64_mca_cpu, init_stack), + "INIT", cpu); + __get_cpu_var(ia64_mca_data) = __per_cpu_mca[cpu] = __pa(data); + + /* + * Stash away a copy of the PTE needed to map the per-CPU page. + * We may need it during MCA recovery. + */ + __get_cpu_var(ia64_mca_per_cpu_pte) = + pte_val(mk_pte_phys(__pa(cpu_data), PAGE_KERNEL)); + + /* + * Also, stash away a copy of the PAL address and the PTE + * needed to map it. + */ + pal_vaddr = efi_get_pal_addr(); + if (!pal_vaddr) + return; + __get_cpu_var(ia64_mca_pal_base) = + GRANULEROUNDDOWN((unsigned long) pal_vaddr); + __get_cpu_var(ia64_mca_pal_pte) = pte_val(mk_pte_phys(__pa(pal_vaddr), + PAGE_KERNEL)); +} + +static void __cpuinit ia64_mca_cmc_vector_adjust(void *dummy) +{ + unsigned long flags; + + local_irq_save(flags); + if (!cmc_polling_enabled) + ia64_mca_cmc_vector_enable(NULL); + local_irq_restore(flags); +} + +static int __cpuinit mca_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long) hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + smp_call_function_single(hotcpu, ia64_mca_cmc_vector_adjust, + NULL, 0); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mca_cpu_notifier __cpuinitdata = { + .notifier_call = mca_cpu_callback +}; + +/* + * ia64_mca_init + * + * Do all the system level mca specific initialization. + * + * 1. Register spinloop and wakeup request interrupt vectors + * + * 2. Register OS_MCA handler entry point + * + * 3. Register OS_INIT handler entry point + * + * 4. Initialize MCA/CMC/INIT related log buffers maintained by the OS. + * + * Note that this initialization is done very early before some kernel + * services are available. + * + * Inputs : None + * + * Outputs : None + */ +void __init +ia64_mca_init(void) +{ + ia64_fptr_t *init_hldlr_ptr_monarch = (ia64_fptr_t *)ia64_os_init_dispatch_monarch; + ia64_fptr_t *init_hldlr_ptr_slave = (ia64_fptr_t *)ia64_os_init_dispatch_slave; + ia64_fptr_t *mca_hldlr_ptr = (ia64_fptr_t *)ia64_os_mca_dispatch; + int i; + long rc; + struct ia64_sal_retval isrv; + unsigned long timeout = IA64_MCA_RENDEZ_TIMEOUT; /* platform specific */ + static struct notifier_block default_init_monarch_nb = { + .notifier_call = default_monarch_init_process, + .priority = 0/* we need to notified last */ + }; + + IA64_MCA_DEBUG("%s: begin\n", __func__); + + /* Clear the Rendez checkin flag for all cpus */ + for(i = 0 ; i < NR_CPUS; i++) + ia64_mc_info.imi_rendez_checkin[i] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; + + /* + * Register the rendezvous spinloop and wakeup mechanism with SAL + */ + + /* Register the rendezvous interrupt vector with SAL */ + while (1) { + isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_INT, + SAL_MC_PARAM_MECHANISM_INT, + IA64_MCA_RENDEZ_VECTOR, + timeout, + SAL_MC_PARAM_RZ_ALWAYS); + rc = isrv.status; + if (rc == 0) + break; + if (rc == -2) { + printk(KERN_INFO "Increasing MCA rendezvous timeout from " + "%ld to %ld milliseconds\n", timeout, isrv.v0); + timeout = isrv.v0; + NOTIFY_MCA(DIE_MCA_NEW_TIMEOUT, NULL, timeout, 0); + continue; + } + printk(KERN_ERR "Failed to register rendezvous interrupt " + "with SAL (status %ld)\n", rc); + return; + } + + /* Register the wakeup interrupt vector with SAL */ + isrv = ia64_sal_mc_set_params(SAL_MC_PARAM_RENDEZ_WAKEUP, + SAL_MC_PARAM_MECHANISM_INT, + IA64_MCA_WAKEUP_VECTOR, + 0, 0); + rc = isrv.status; + if (rc) { + printk(KERN_ERR "Failed to register wakeup interrupt with SAL " + "(status %ld)\n", rc); + return; + } + + IA64_MCA_DEBUG("%s: registered MCA rendezvous spinloop and wakeup mech.\n", __func__); + + ia64_mc_info.imi_mca_handler = ia64_tpa(mca_hldlr_ptr->fp); + /* + * XXX - disable SAL checksum by setting size to 0; should be + * ia64_tpa(ia64_os_mca_dispatch_end) - ia64_tpa(ia64_os_mca_dispatch); + */ + ia64_mc_info.imi_mca_handler_size = 0; + + /* Register the os mca handler with SAL */ + if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_MCA, + ia64_mc_info.imi_mca_handler, + ia64_tpa(mca_hldlr_ptr->gp), + ia64_mc_info.imi_mca_handler_size, + 0, 0, 0))) + { + printk(KERN_ERR "Failed to register OS MCA handler with SAL " + "(status %ld)\n", rc); + return; + } + + IA64_MCA_DEBUG("%s: registered OS MCA handler with SAL at 0x%lx, gp = 0x%lx\n", __func__, + ia64_mc_info.imi_mca_handler, ia64_tpa(mca_hldlr_ptr->gp)); + + /* + * XXX - disable SAL checksum by setting size to 0, should be + * size of the actual init handler in mca_asm.S. + */ + ia64_mc_info.imi_monarch_init_handler = ia64_tpa(init_hldlr_ptr_monarch->fp); + ia64_mc_info.imi_monarch_init_handler_size = 0; + ia64_mc_info.imi_slave_init_handler = ia64_tpa(init_hldlr_ptr_slave->fp); + ia64_mc_info.imi_slave_init_handler_size = 0; + + IA64_MCA_DEBUG("%s: OS INIT handler at %lx\n", __func__, + ia64_mc_info.imi_monarch_init_handler); + + /* Register the os init handler with SAL */ + if ((rc = ia64_sal_set_vectors(SAL_VECTOR_OS_INIT, + ia64_mc_info.imi_monarch_init_handler, + ia64_tpa(ia64_getreg(_IA64_REG_GP)), + ia64_mc_info.imi_monarch_init_handler_size, + ia64_mc_info.imi_slave_init_handler, + ia64_tpa(ia64_getreg(_IA64_REG_GP)), + ia64_mc_info.imi_slave_init_handler_size))) + { + printk(KERN_ERR "Failed to register m/s INIT handlers with SAL " + "(status %ld)\n", rc); + return; + } + if (register_die_notifier(&default_init_monarch_nb)) { + printk(KERN_ERR "Failed to register default monarch INIT process\n"); + return; + } + + IA64_MCA_DEBUG("%s: registered OS INIT handler with SAL\n", __func__); + + /* Initialize the areas set aside by the OS to buffer the + * platform/processor error states for MCA/INIT/CMC + * handling. + */ + ia64_log_init(SAL_INFO_TYPE_MCA); + ia64_log_init(SAL_INFO_TYPE_INIT); + ia64_log_init(SAL_INFO_TYPE_CMC); + ia64_log_init(SAL_INFO_TYPE_CPE); + + mca_init = 1; + printk(KERN_INFO "MCA related initialization done\n"); +} + +/* + * ia64_mca_late_init + * + * Opportunity to setup things that require initialization later + * than ia64_mca_init. Setup a timer to poll for CPEs if the + * platform doesn't support an interrupt driven mechanism. + * + * Inputs : None + * Outputs : Status + */ +static int __init +ia64_mca_late_init(void) +{ + if (!mca_init) + return 0; + + /* + * Configure the CMCI/P vector and handler. Interrupts for CMC are + * per-processor, so AP CMC interrupts are setup in smp_callin() (smpboot.c). + */ + register_percpu_irq(IA64_CMC_VECTOR, &cmci_irqaction); + register_percpu_irq(IA64_CMCP_VECTOR, &cmcp_irqaction); + ia64_mca_cmc_vector_setup(); /* Setup vector on BSP */ + + /* Setup the MCA rendezvous interrupt vector */ + register_percpu_irq(IA64_MCA_RENDEZ_VECTOR, &mca_rdzv_irqaction); + + /* Setup the MCA wakeup interrupt vector */ + register_percpu_irq(IA64_MCA_WAKEUP_VECTOR, &mca_wkup_irqaction); + +#ifdef CONFIG_ACPI + /* Setup the CPEI/P handler */ + register_percpu_irq(IA64_CPEP_VECTOR, &mca_cpep_irqaction); +#endif + + register_hotcpu_notifier(&mca_cpu_notifier); + + /* Setup the CMCI/P vector and handler */ + init_timer(&cmc_poll_timer); + cmc_poll_timer.function = ia64_mca_cmc_poll; + + /* Unmask/enable the vector */ + cmc_polling_enabled = 0; + schedule_work(&cmc_enable_work); + + IA64_MCA_DEBUG("%s: CMCI/P setup and enabled.\n", __func__); + +#ifdef CONFIG_ACPI + /* Setup the CPEI/P vector and handler */ + cpe_vector = acpi_request_vector(ACPI_INTERRUPT_CPEI); + init_timer(&cpe_poll_timer); + cpe_poll_timer.function = ia64_mca_cpe_poll; + + { + unsigned int irq; + + if (cpe_vector >= 0) { + /* If platform supports CPEI, enable the irq. */ + irq = local_vector_to_irq(cpe_vector); + if (irq > 0) { + cpe_poll_enabled = 0; + irq_set_status_flags(irq, IRQ_PER_CPU); + setup_irq(irq, &mca_cpe_irqaction); + ia64_cpe_irq = irq; + ia64_mca_register_cpev(cpe_vector); + IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", + __func__); + return 0; + } + printk(KERN_ERR "%s: Failed to find irq for CPE " + "interrupt handler, vector %d\n", + __func__, cpe_vector); + } + /* If platform doesn't support CPEI, get the timer going. */ + if (cpe_poll_enabled) { + ia64_mca_cpe_poll(0UL); + IA64_MCA_DEBUG("%s: CPEP setup and enabled.\n", __func__); + } + } +#endif + + return 0; +} + +device_initcall(ia64_mca_late_init); diff --git a/arch/ia64/kernel/mca_asm.S b/arch/ia64/kernel/mca_asm.S new file mode 100644 index 00000000000..d5bdf9de36b --- /dev/null +++ b/arch/ia64/kernel/mca_asm.S @@ -0,0 +1,1122 @@ +/* + * File: mca_asm.S + * Purpose: assembly portion of the IA64 MCA handling + * + * Mods by cfleck to integrate into kernel build + * + * 2000-03-15 David Mosberger-Tang <davidm@hpl.hp.com> + * Added various stop bits to get a clean compile + * + * 2000-03-29 Chuck Fleckenstein <cfleck@co.intel.com> + * Added code to save INIT handoff state in pt_regs format, + * switch to temp kstack, switch modes, jump to C INIT handler + * + * 2002-01-04 J.Hall <jenna.s.hall@intel.com> + * Before entering virtual mode code: + * 1. Check for TLB CPU error + * 2. Restore current thread pointer to kr6 + * 3. Move stack ptr 16 bytes to conform to C calling convention + * + * 2004-11-12 Russ Anderson <rja@sgi.com> + * Added per cpu MCA/INIT stack save areas. + * + * 2005-12-08 Keith Owens <kaos@sgi.com> + * Use per cpu MCA/INIT stacks for all data. + */ +#include <linux/threads.h> + +#include <asm/asmmacro.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/mca_asm.h> +#include <asm/mca.h> + +#include "entry.h" + +#define GET_IA64_MCA_DATA(reg) \ + GET_THIS_PADDR(reg, ia64_mca_data) \ + ;; \ + ld8 reg=[reg] + + .global ia64_do_tlb_purge + .global ia64_os_mca_dispatch + .global ia64_os_init_on_kdump + .global ia64_os_init_dispatch_monarch + .global ia64_os_init_dispatch_slave + + .text + .align 16 + +//StartMain//////////////////////////////////////////////////////////////////// + +/* + * Just the TLB purge part is moved to a separate function + * so we can re-use the code for cpu hotplug code as well + * Caller should now setup b1, so we can branch once the + * tlb flush is complete. + */ + +ia64_do_tlb_purge: +#define O(member) IA64_CPUINFO_##member##_OFFSET + + GET_THIS_PADDR(r2, ia64_cpu_info) // load phys addr of cpu_info into r2 + ;; + addl r17=O(PTCE_STRIDE),r2 + addl r2=O(PTCE_BASE),r2 + ;; + ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));; // r18=ptce_base + ld4 r19=[r2],4 // r19=ptce_count[0] + ld4 r21=[r17],4 // r21=ptce_stride[0] + ;; + ld4 r20=[r2] // r20=ptce_count[1] + ld4 r22=[r17] // r22=ptce_stride[1] + mov r24=0 + ;; + adds r20=-1,r20 + ;; +#undef O + +2: + cmp.ltu p6,p7=r24,r19 +(p7) br.cond.dpnt.few 4f + mov ar.lc=r20 +3: + ptc.e r18 + ;; + add r18=r22,r18 + br.cloop.sptk.few 3b + ;; + add r18=r21,r18 + add r24=1,r24 + ;; + br.sptk.few 2b +4: + srlz.i // srlz.i implies srlz.d + ;; + + // Now purge addresses formerly mapped by TR registers + // 1. Purge ITR&DTR for kernel. + movl r16=KERNEL_START + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + ;; + ptr.i r16, r18 + ptr.d r16, r18 + ;; + srlz.i + ;; + srlz.d + ;; + // 3. Purge ITR for PAL code. + GET_THIS_PADDR(r2, ia64_mca_pal_base) + ;; + ld8 r16=[r2] + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.i r16,r18 + ;; + srlz.i + ;; + // 4. Purge DTR for stack. + mov r16=IA64_KR(CURRENT_STACK) + ;; + shl r16=r16,IA64_GRANULE_SHIFT + movl r19=PAGE_OFFSET + ;; + add r16=r19,r16 + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.i + ;; + // Now branch away to caller. + br.sptk.many b1 + ;; + +//EndMain////////////////////////////////////////////////////////////////////// + +//StartMain//////////////////////////////////////////////////////////////////// + +ia64_os_mca_dispatch: + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + mov r19=1 // All MCA events are treated as monarch (for now) + br.sptk ia64_state_save // save the state that is not in minstate +1: + + GET_IA64_MCA_DATA(r2) + // Using MCA stack, struct ia64_sal_os_state, variable proc_state_param + ;; + add r3=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET+SOS(PROC_STATE_PARAM), r2 + ;; + ld8 r18=[r3] // Get processor state parameter on existing PALE_CHECK. + ;; + tbit.nz p6,p7=r18,60 +(p7) br.spnt done_tlb_purge_and_reload + + // The following code purges TC and TR entries. Then reload all TC entries. + // Purge percpu data TC entries. +begin_tlb_purge_and_reload: + movl r18=ia64_reload_tr;; + LOAD_PHYSICAL(p0,r18,ia64_reload_tr);; + mov b1=r18;; + br.sptk.many ia64_do_tlb_purge;; + +ia64_reload_tr: + // Finally reload the TR registers. + // 1. Reload DTR/ITR registers for kernel. + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + movl r17=KERNEL_START + ;; + mov cr.itir=r18 + mov cr.ifa=r17 + mov r16=IA64_TR_KERNEL + mov r19=ip + movl r18=PAGE_KERNEL + ;; + dep r17=0,r19,0, KERNEL_TR_PAGE_SHIFT + ;; + or r18=r17,r18 + ;; + itr.i itr[r16]=r18 + ;; + itr.d dtr[r16]=r18 + ;; + srlz.i + srlz.d + ;; + // 3. Reload ITR for PAL code. + GET_THIS_PADDR(r2, ia64_mca_pal_pte) + ;; + ld8 r18=[r2] // load PAL PTE + ;; + GET_THIS_PADDR(r2, ia64_mca_pal_base) + ;; + ld8 r16=[r2] // load PAL vaddr + mov r19=IA64_GRANULE_SHIFT<<2 + ;; + mov cr.itir=r19 + mov cr.ifa=r16 + mov r20=IA64_TR_PALCODE + ;; + itr.i itr[r20]=r18 + ;; + srlz.i + ;; + // 4. Reload DTR for stack. + mov r16=IA64_KR(CURRENT_STACK) + ;; + shl r16=r16,IA64_GRANULE_SHIFT + movl r19=PAGE_OFFSET + ;; + add r18=r19,r16 + movl r20=PAGE_KERNEL + ;; + add r16=r20,r16 + mov r19=IA64_GRANULE_SHIFT<<2 + ;; + mov cr.itir=r19 + mov cr.ifa=r18 + mov r20=IA64_TR_CURRENT_STACK + ;; + itr.d dtr[r20]=r16 + GET_THIS_PADDR(r2, ia64_mca_tr_reload) + mov r18 = 1 + ;; + srlz.d + ;; + st8 [r2] =r18 + ;; + +done_tlb_purge_and_reload: + + // switch to per cpu MCA stack + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_new_stack +1: + + // everything saved, now we can set the kernel registers + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_set_kernel_registers +1: + + // This must be done in physical mode + GET_IA64_MCA_DATA(r2) + ;; + mov r7=r2 + + // Enter virtual mode from physical mode + VIRTUAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_begin, r4) + + // This code returns to SAL via SOS r2, in general SAL has no unwind + // data. To get a clean termination when backtracing the C MCA/INIT + // handler, set a dummy return address of 0 in this routine. That + // requires that ia64_os_mca_virtual_begin be a global function. +ENTRY(ia64_os_mca_virtual_begin) + .prologue + .save rp,r0 + .body + + mov ar.rsc=3 // set eager mode for C handler + mov r2=r7 // see GET_IA64_MCA_DATA above + ;; + + // Call virtual mode handler + alloc r14=ar.pfs,0,0,3,0 + ;; + DATA_PA_TO_VA(r2,r7) + ;; + add out0=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2 + add out1=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2 + add out2=IA64_MCA_CPU_MCA_STACK_OFFSET+MCA_SOS_OFFSET, r2 + br.call.sptk.many b0=ia64_mca_handler + + // Revert back to physical mode before going back to SAL + PHYSICAL_MODE_ENTER(r2, r3, ia64_os_mca_virtual_end, r4) +ia64_os_mca_virtual_end: + +END(ia64_os_mca_virtual_begin) + + // switch back to previous stack + alloc r14=ar.pfs,0,0,0,0 // remove the MCA handler frame + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_old_stack +1: + + mov r3=IA64_MCA_CPU_MCA_STACK_OFFSET // use the MCA stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_state_restore // restore the SAL state +1: + + mov b0=r12 // SAL_CHECK return address + + br b0 + +//EndMain////////////////////////////////////////////////////////////////////// + +//StartMain//////////////////////////////////////////////////////////////////// + +// +// NOP init handler for kdump. In panic situation, we may receive INIT +// while kernel transition. Since we initialize registers on leave from +// current kernel, no longer monarch/slave handlers of current kernel in +// virtual mode are called safely. +// We can unregister these init handlers from SAL, however then the INIT +// will result in warmboot by SAL and we cannot retrieve the crashdump. +// Therefore register this NOP function to SAL, to prevent entering virtual +// mode and resulting warmboot by SAL. +// +ia64_os_init_on_kdump: + mov r8=r0 // IA64_INIT_RESUME + mov r9=r10 // SAL_GP + mov r22=r17 // *minstate + ;; + mov r10=r0 // return to same context + mov b0=r12 // SAL_CHECK return address + br b0 + +// +// SAL to OS entry point for INIT on all processors. This has been defined for +// registration purposes with SAL as a part of ia64_mca_init. Monarch and +// slave INIT have identical processing, except for the value of the +// sos->monarch flag in r19. +// + +ia64_os_init_dispatch_monarch: + mov r19=1 // Bow, bow, ye lower middle classes! + br.sptk ia64_os_init_dispatch + +ia64_os_init_dispatch_slave: + mov r19=0 // <igor>yeth, mathter</igor> + +ia64_os_init_dispatch: + + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_state_save // save the state that is not in minstate +1: + + // switch to per cpu INIT stack + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_new_stack +1: + + // everything saved, now we can set the kernel registers + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_set_kernel_registers +1: + + // This must be done in physical mode + GET_IA64_MCA_DATA(r2) + ;; + mov r7=r2 + + // Enter virtual mode from physical mode + VIRTUAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_begin, r4) + + // This code returns to SAL via SOS r2, in general SAL has no unwind + // data. To get a clean termination when backtracing the C MCA/INIT + // handler, set a dummy return address of 0 in this routine. That + // requires that ia64_os_init_virtual_begin be a global function. +ENTRY(ia64_os_init_virtual_begin) + .prologue + .save rp,r0 + .body + + mov ar.rsc=3 // set eager mode for C handler + mov r2=r7 // see GET_IA64_MCA_DATA above + ;; + + // Call virtual mode handler + alloc r14=ar.pfs,0,0,3,0 + ;; + DATA_PA_TO_VA(r2,r7) + ;; + add out0=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_PT_REGS_OFFSET, r2 + add out1=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SWITCH_STACK_OFFSET, r2 + add out2=IA64_MCA_CPU_INIT_STACK_OFFSET+MCA_SOS_OFFSET, r2 + br.call.sptk.many b0=ia64_init_handler + + // Revert back to physical mode before going back to SAL + PHYSICAL_MODE_ENTER(r2, r3, ia64_os_init_virtual_end, r4) +ia64_os_init_virtual_end: + +END(ia64_os_init_virtual_begin) + + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_state_restore // restore the SAL state +1: + + // switch back to previous stack + alloc r14=ar.pfs,0,0,0,0 // remove the INIT handler frame + mov r3=IA64_MCA_CPU_INIT_STACK_OFFSET // use the INIT stack + LOAD_PHYSICAL(p0,r2,1f) // return address + br.sptk ia64_old_stack +1: + + mov b0=r12 // SAL_CHECK return address + br b0 + +//EndMain////////////////////////////////////////////////////////////////////// + +// common defines for the stubs +#define ms r4 +#define regs r5 +#define temp1 r2 /* careful, it overlaps with input registers */ +#define temp2 r3 /* careful, it overlaps with input registers */ +#define temp3 r7 +#define temp4 r14 + + +//++ +// Name: +// ia64_state_save() +// +// Stub Description: +// +// Save the state that is not in minstate. This is sensitive to the layout of +// struct ia64_sal_os_state in mca.h. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +// The OS to SAL section of struct ia64_sal_os_state is set to a default +// value of cold boot (MCA) or warm boot (INIT) and return to the same +// context. ia64_sal_os_state is also used to hold some registers that +// need to be saved and restored across the stack switches. +// +// Most input registers to this stub come from PAL/SAL +// r1 os gp, physical +// r8 pal_proc entry point +// r9 sal_proc entry point +// r10 sal gp +// r11 MCA - rendevzous state, INIT - reason code +// r12 sal return address +// r17 pal min_state +// r18 processor state parameter +// r19 monarch flag, set by the caller of this routine +// +// In addition to the SAL to OS state, this routine saves all the +// registers that appear in struct pt_regs and struct switch_stack, +// excluding those that are already in the PAL minstate area. This +// results in a partial pt_regs and switch_stack, the C code copies the +// remaining registers from PAL minstate to pt_regs and switch_stack. The +// resulting structures contain all the state of the original process when +// MCA/INIT occurred. +// +//-- + +ia64_state_save: + add regs=MCA_SOS_OFFSET, r3 + add ms=MCA_SOS_OFFSET+8, r3 + mov b0=r2 // save return address + cmp.eq p1,p2=IA64_MCA_CPU_MCA_STACK_OFFSET, r3 + ;; + GET_IA64_MCA_DATA(temp2) + ;; + add temp1=temp2, regs // struct ia64_sal_os_state on MCA or INIT stack + add temp2=temp2, ms // struct ia64_sal_os_state+8 on MCA or INIT stack + ;; + mov regs=temp1 // save the start of sos + st8 [temp1]=r1,16 // os_gp + st8 [temp2]=r8,16 // pal_proc + ;; + st8 [temp1]=r9,16 // sal_proc + st8 [temp2]=r11,16 // rv_rc + mov r11=cr.iipa + ;; + st8 [temp1]=r18 // proc_state_param + st8 [temp2]=r19 // monarch + mov r6=IA64_KR(CURRENT) + add temp1=SOS(SAL_RA), regs + add temp2=SOS(SAL_GP), regs + ;; + st8 [temp1]=r12,16 // sal_ra + st8 [temp2]=r10,16 // sal_gp + mov r12=cr.isr + ;; + st8 [temp1]=r17,16 // pal_min_state + st8 [temp2]=r6,16 // prev_IA64_KR_CURRENT + mov r6=IA64_KR(CURRENT_STACK) + ;; + st8 [temp1]=r6,16 // prev_IA64_KR_CURRENT_STACK + st8 [temp2]=r0,16 // prev_task, starts off as NULL + mov r6=cr.ifa + ;; + st8 [temp1]=r12,16 // cr.isr + st8 [temp2]=r6,16 // cr.ifa + mov r12=cr.itir + ;; + st8 [temp1]=r12,16 // cr.itir + st8 [temp2]=r11,16 // cr.iipa + mov r12=cr.iim + ;; + st8 [temp1]=r12 // cr.iim +(p1) mov r12=IA64_MCA_COLD_BOOT +(p2) mov r12=IA64_INIT_WARM_BOOT + mov r6=cr.iha + add temp1=SOS(OS_STATUS), regs + ;; + st8 [temp2]=r6 // cr.iha + add temp2=SOS(CONTEXT), regs + st8 [temp1]=r12 // os_status, default is cold boot + mov r6=IA64_MCA_SAME_CONTEXT + ;; + st8 [temp2]=r6 // context, default is same context + + // Save the pt_regs data that is not in minstate. The previous code + // left regs at sos. + add regs=MCA_PT_REGS_OFFSET-MCA_SOS_OFFSET, regs + ;; + add temp1=PT(B6), regs + mov temp3=b6 + mov temp4=b7 + add temp2=PT(B7), regs + ;; + st8 [temp1]=temp3,PT(AR_CSD)-PT(B6) // save b6 + st8 [temp2]=temp4,PT(AR_SSD)-PT(B7) // save b7 + mov temp3=ar.csd + mov temp4=ar.ssd + cover // must be last in group + ;; + st8 [temp1]=temp3,PT(AR_UNAT)-PT(AR_CSD) // save ar.csd + st8 [temp2]=temp4,PT(AR_PFS)-PT(AR_SSD) // save ar.ssd + mov temp3=ar.unat + mov temp4=ar.pfs + ;; + st8 [temp1]=temp3,PT(AR_RNAT)-PT(AR_UNAT) // save ar.unat + st8 [temp2]=temp4,PT(AR_BSPSTORE)-PT(AR_PFS) // save ar.pfs + mov temp3=ar.rnat + mov temp4=ar.bspstore + ;; + st8 [temp1]=temp3,PT(LOADRS)-PT(AR_RNAT) // save ar.rnat + st8 [temp2]=temp4,PT(AR_FPSR)-PT(AR_BSPSTORE) // save ar.bspstore + mov temp3=ar.bsp + ;; + sub temp3=temp3, temp4 // ar.bsp - ar.bspstore + mov temp4=ar.fpsr + ;; + shl temp3=temp3,16 // compute ar.rsc to be used for "loadrs" + ;; + st8 [temp1]=temp3,PT(AR_CCV)-PT(LOADRS) // save loadrs + st8 [temp2]=temp4,PT(F6)-PT(AR_FPSR) // save ar.fpsr + mov temp3=ar.ccv + ;; + st8 [temp1]=temp3,PT(F7)-PT(AR_CCV) // save ar.ccv + stf.spill [temp2]=f6,PT(F8)-PT(F6) + ;; + stf.spill [temp1]=f7,PT(F9)-PT(F7) + stf.spill [temp2]=f8,PT(F10)-PT(F8) + ;; + stf.spill [temp1]=f9,PT(F11)-PT(F9) + stf.spill [temp2]=f10 + ;; + stf.spill [temp1]=f11 + + // Save the switch_stack data that is not in minstate nor pt_regs. The + // previous code left regs at pt_regs. + add regs=MCA_SWITCH_STACK_OFFSET-MCA_PT_REGS_OFFSET, regs + ;; + add temp1=SW(F2), regs + add temp2=SW(F3), regs + ;; + stf.spill [temp1]=f2,32 + stf.spill [temp2]=f3,32 + ;; + stf.spill [temp1]=f4,32 + stf.spill [temp2]=f5,32 + ;; + stf.spill [temp1]=f12,32 + stf.spill [temp2]=f13,32 + ;; + stf.spill [temp1]=f14,32 + stf.spill [temp2]=f15,32 + ;; + stf.spill [temp1]=f16,32 + stf.spill [temp2]=f17,32 + ;; + stf.spill [temp1]=f18,32 + stf.spill [temp2]=f19,32 + ;; + stf.spill [temp1]=f20,32 + stf.spill [temp2]=f21,32 + ;; + stf.spill [temp1]=f22,32 + stf.spill [temp2]=f23,32 + ;; + stf.spill [temp1]=f24,32 + stf.spill [temp2]=f25,32 + ;; + stf.spill [temp1]=f26,32 + stf.spill [temp2]=f27,32 + ;; + stf.spill [temp1]=f28,32 + stf.spill [temp2]=f29,32 + ;; + stf.spill [temp1]=f30,SW(B2)-SW(F30) + stf.spill [temp2]=f31,SW(B3)-SW(F31) + mov temp3=b2 + mov temp4=b3 + ;; + st8 [temp1]=temp3,16 // save b2 + st8 [temp2]=temp4,16 // save b3 + mov temp3=b4 + mov temp4=b5 + ;; + st8 [temp1]=temp3,SW(AR_LC)-SW(B4) // save b4 + st8 [temp2]=temp4 // save b5 + mov temp3=ar.lc + ;; + st8 [temp1]=temp3 // save ar.lc + + // FIXME: Some proms are incorrectly accessing the minstate area as + // cached data. The C code uses region 6, uncached virtual. Ensure + // that there is no cache data lying around for the first 1K of the + // minstate area. + // Remove this code in September 2006, that gives platforms a year to + // fix their proms and get their customers updated. + + add r1=32*1,r17 + add r2=32*2,r17 + add r3=32*3,r17 + add r4=32*4,r17 + add r5=32*5,r17 + add r6=32*6,r17 + add r7=32*7,r17 + ;; + fc r17 + fc r1 + fc r2 + fc r3 + fc r4 + fc r5 + fc r6 + fc r7 + add r17=32*8,r17 + add r1=32*8,r1 + add r2=32*8,r2 + add r3=32*8,r3 + add r4=32*8,r4 + add r5=32*8,r5 + add r6=32*8,r6 + add r7=32*8,r7 + ;; + fc r17 + fc r1 + fc r2 + fc r3 + fc r4 + fc r5 + fc r6 + fc r7 + add r17=32*8,r17 + add r1=32*8,r1 + add r2=32*8,r2 + add r3=32*8,r3 + add r4=32*8,r4 + add r5=32*8,r5 + add r6=32*8,r6 + add r7=32*8,r7 + ;; + fc r17 + fc r1 + fc r2 + fc r3 + fc r4 + fc r5 + fc r6 + fc r7 + add r17=32*8,r17 + add r1=32*8,r1 + add r2=32*8,r2 + add r3=32*8,r3 + add r4=32*8,r4 + add r5=32*8,r5 + add r6=32*8,r6 + add r7=32*8,r7 + ;; + fc r17 + fc r1 + fc r2 + fc r3 + fc r4 + fc r5 + fc r6 + fc r7 + + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// + + +//++ +// Name: +// ia64_state_restore() +// +// Stub Description: +// +// Restore the SAL/OS state. This is sensitive to the layout of struct +// ia64_sal_os_state in mca.h. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +// In addition to the SAL to OS state, this routine restores all the +// registers that appear in struct pt_regs and struct switch_stack, +// excluding those in the PAL minstate area. +// +//-- + +ia64_state_restore: + // Restore the switch_stack data that is not in minstate nor pt_regs. + add regs=MCA_SWITCH_STACK_OFFSET, r3 + mov b0=r2 // save return address + ;; + GET_IA64_MCA_DATA(temp2) + ;; + add regs=temp2, regs + ;; + add temp1=SW(F2), regs + add temp2=SW(F3), regs + ;; + ldf.fill f2=[temp1],32 + ldf.fill f3=[temp2],32 + ;; + ldf.fill f4=[temp1],32 + ldf.fill f5=[temp2],32 + ;; + ldf.fill f12=[temp1],32 + ldf.fill f13=[temp2],32 + ;; + ldf.fill f14=[temp1],32 + ldf.fill f15=[temp2],32 + ;; + ldf.fill f16=[temp1],32 + ldf.fill f17=[temp2],32 + ;; + ldf.fill f18=[temp1],32 + ldf.fill f19=[temp2],32 + ;; + ldf.fill f20=[temp1],32 + ldf.fill f21=[temp2],32 + ;; + ldf.fill f22=[temp1],32 + ldf.fill f23=[temp2],32 + ;; + ldf.fill f24=[temp1],32 + ldf.fill f25=[temp2],32 + ;; + ldf.fill f26=[temp1],32 + ldf.fill f27=[temp2],32 + ;; + ldf.fill f28=[temp1],32 + ldf.fill f29=[temp2],32 + ;; + ldf.fill f30=[temp1],SW(B2)-SW(F30) + ldf.fill f31=[temp2],SW(B3)-SW(F31) + ;; + ld8 temp3=[temp1],16 // restore b2 + ld8 temp4=[temp2],16 // restore b3 + ;; + mov b2=temp3 + mov b3=temp4 + ld8 temp3=[temp1],SW(AR_LC)-SW(B4) // restore b4 + ld8 temp4=[temp2] // restore b5 + ;; + mov b4=temp3 + mov b5=temp4 + ld8 temp3=[temp1] // restore ar.lc + ;; + mov ar.lc=temp3 + + // Restore the pt_regs data that is not in minstate. The previous code + // left regs at switch_stack. + add regs=MCA_PT_REGS_OFFSET-MCA_SWITCH_STACK_OFFSET, regs + ;; + add temp1=PT(B6), regs + add temp2=PT(B7), regs + ;; + ld8 temp3=[temp1],PT(AR_CSD)-PT(B6) // restore b6 + ld8 temp4=[temp2],PT(AR_SSD)-PT(B7) // restore b7 + ;; + mov b6=temp3 + mov b7=temp4 + ld8 temp3=[temp1],PT(AR_UNAT)-PT(AR_CSD) // restore ar.csd + ld8 temp4=[temp2],PT(AR_PFS)-PT(AR_SSD) // restore ar.ssd + ;; + mov ar.csd=temp3 + mov ar.ssd=temp4 + ld8 temp3=[temp1] // restore ar.unat + add temp1=PT(AR_CCV)-PT(AR_UNAT), temp1 + ld8 temp4=[temp2],PT(AR_FPSR)-PT(AR_PFS) // restore ar.pfs + ;; + mov ar.unat=temp3 + mov ar.pfs=temp4 + // ar.rnat, ar.bspstore, loadrs are restore in ia64_old_stack. + ld8 temp3=[temp1],PT(F6)-PT(AR_CCV) // restore ar.ccv + ld8 temp4=[temp2],PT(F7)-PT(AR_FPSR) // restore ar.fpsr + ;; + mov ar.ccv=temp3 + mov ar.fpsr=temp4 + ldf.fill f6=[temp1],PT(F8)-PT(F6) + ldf.fill f7=[temp2],PT(F9)-PT(F7) + ;; + ldf.fill f8=[temp1],PT(F10)-PT(F8) + ldf.fill f9=[temp2],PT(F11)-PT(F9) + ;; + ldf.fill f10=[temp1] + ldf.fill f11=[temp2] + + // Restore the SAL to OS state. The previous code left regs at pt_regs. + add regs=MCA_SOS_OFFSET-MCA_PT_REGS_OFFSET, regs + ;; + add temp1=SOS(SAL_RA), regs + add temp2=SOS(SAL_GP), regs + ;; + ld8 r12=[temp1],16 // sal_ra + ld8 r9=[temp2],16 // sal_gp + ;; + ld8 r22=[temp1],16 // pal_min_state, virtual + ld8 r13=[temp2],16 // prev_IA64_KR_CURRENT + ;; + ld8 r16=[temp1],16 // prev_IA64_KR_CURRENT_STACK + ld8 r20=[temp2],16 // prev_task + ;; + ld8 temp3=[temp1],16 // cr.isr + ld8 temp4=[temp2],16 // cr.ifa + ;; + mov cr.isr=temp3 + mov cr.ifa=temp4 + ld8 temp3=[temp1],16 // cr.itir + ld8 temp4=[temp2],16 // cr.iipa + ;; + mov cr.itir=temp3 + mov cr.iipa=temp4 + ld8 temp3=[temp1] // cr.iim + ld8 temp4=[temp2] // cr.iha + add temp1=SOS(OS_STATUS), regs + add temp2=SOS(CONTEXT), regs + ;; + mov cr.iim=temp3 + mov cr.iha=temp4 + dep r22=0,r22,62,1 // pal_min_state, physical, uncached + mov IA64_KR(CURRENT)=r13 + ld8 r8=[temp1] // os_status + ld8 r10=[temp2] // context + + /* Wire IA64_TR_CURRENT_STACK to the stack that we are resuming to. To + * avoid any dependencies on the algorithm in ia64_switch_to(), just + * purge any existing CURRENT_STACK mapping and insert the new one. + * + * r16 contains prev_IA64_KR_CURRENT_STACK, r13 contains + * prev_IA64_KR_CURRENT, these values may have been changed by the C + * code. Do not use r8, r9, r10, r22, they contain values ready for + * the return to SAL. + */ + + mov r15=IA64_KR(CURRENT_STACK) // physical granule mapped by IA64_TR_CURRENT_STACK + ;; + shl r15=r15,IA64_GRANULE_SHIFT + ;; + dep r15=-1,r15,61,3 // virtual granule + mov r18=IA64_GRANULE_SHIFT<<2 // for cr.itir.ps + ;; + ptr.d r15,r18 + ;; + srlz.d + + extr.u r19=r13,61,3 // r13 = prev_IA64_KR_CURRENT + shl r20=r16,IA64_GRANULE_SHIFT // r16 = prev_IA64_KR_CURRENT_STACK + movl r21=PAGE_KERNEL // page properties + ;; + mov IA64_KR(CURRENT_STACK)=r16 + cmp.ne p6,p0=RGN_KERNEL,r19 // new stack is in the kernel region? + or r21=r20,r21 // construct PA | page properties +(p6) br.spnt 1f // the dreaded cpu 0 idle task in region 5:( + ;; + mov cr.itir=r18 + mov cr.ifa=r13 + mov r20=IA64_TR_CURRENT_STACK + ;; + itr.d dtr[r20]=r21 + ;; + srlz.d +1: + + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// + + +//++ +// Name: +// ia64_new_stack() +// +// Stub Description: +// +// Switch to the MCA/INIT stack. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +// On entry RBS is still on the original stack, this routine switches RBS +// to use the MCA/INIT stack. +// +// On entry, sos->pal_min_state is physical, on exit it is virtual. +// +//-- + +ia64_new_stack: + add regs=MCA_PT_REGS_OFFSET, r3 + add temp2=MCA_SOS_OFFSET+SOS(PAL_MIN_STATE), r3 + mov b0=r2 // save return address + GET_IA64_MCA_DATA(temp1) + invala + ;; + add temp2=temp2, temp1 // struct ia64_sal_os_state.pal_min_state on MCA or INIT stack + add regs=regs, temp1 // struct pt_regs on MCA or INIT stack + ;; + // Address of minstate area provided by PAL is physical, uncacheable. + // Convert to Linux virtual address in region 6 for C code. + ld8 ms=[temp2] // pal_min_state, physical + ;; + dep temp1=-1,ms,62,2 // set region 6 + mov temp3=IA64_RBS_OFFSET-MCA_PT_REGS_OFFSET + ;; + st8 [temp2]=temp1 // pal_min_state, virtual + + add temp4=temp3, regs // start of bspstore on new stack + ;; + mov ar.bspstore=temp4 // switch RBS to MCA/INIT stack + ;; + flushrs // must be first in group + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// + + +//++ +// Name: +// ia64_old_stack() +// +// Stub Description: +// +// Switch to the old stack. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +// On entry, pal_min_state is virtual, on exit it is physical. +// +// On entry RBS is on the MCA/INIT stack, this routine switches RBS +// back to the previous stack. +// +// The psr is set to all zeroes. SAL return requires either all zeroes or +// just psr.mc set. Leaving psr.mc off allows INIT to be issued if this +// code does not perform correctly. +// +// The dirty registers at the time of the event were flushed to the +// MCA/INIT stack in ia64_pt_regs_save(). Restore the dirty registers +// before reverting to the previous bspstore. +//-- + +ia64_old_stack: + add regs=MCA_PT_REGS_OFFSET, r3 + mov b0=r2 // save return address + GET_IA64_MCA_DATA(temp2) + LOAD_PHYSICAL(p0,temp1,1f) + ;; + mov cr.ipsr=r0 + mov cr.ifs=r0 + mov cr.iip=temp1 + ;; + invala + rfi +1: + + add regs=regs, temp2 // struct pt_regs on MCA or INIT stack + ;; + add temp1=PT(LOADRS), regs + ;; + ld8 temp2=[temp1],PT(AR_BSPSTORE)-PT(LOADRS) // restore loadrs + ;; + ld8 temp3=[temp1],PT(AR_RNAT)-PT(AR_BSPSTORE) // restore ar.bspstore + mov ar.rsc=temp2 + ;; + loadrs + ld8 temp4=[temp1] // restore ar.rnat + ;; + mov ar.bspstore=temp3 // back to old stack + ;; + mov ar.rnat=temp4 + ;; + + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// + + +//++ +// Name: +// ia64_set_kernel_registers() +// +// Stub Description: +// +// Set the registers that are required by the C code in order to run on an +// MCA/INIT stack. +// +// r2 contains the return address, r3 contains either +// IA64_MCA_CPU_MCA_STACK_OFFSET or IA64_MCA_CPU_INIT_STACK_OFFSET. +// +//-- + +ia64_set_kernel_registers: + add temp3=MCA_SP_OFFSET, r3 + mov b0=r2 // save return address + GET_IA64_MCA_DATA(temp1) + ;; + add r12=temp1, temp3 // kernel stack pointer on MCA/INIT stack + add r13=temp1, r3 // set current to start of MCA/INIT stack + add r20=temp1, r3 // physical start of MCA/INIT stack + ;; + DATA_PA_TO_VA(r12,temp2) + DATA_PA_TO_VA(r13,temp3) + ;; + mov IA64_KR(CURRENT)=r13 + + /* Wire IA64_TR_CURRENT_STACK to the MCA/INIT handler stack. To avoid + * any dependencies on the algorithm in ia64_switch_to(), just purge + * any existing CURRENT_STACK mapping and insert the new one. + */ + + mov r16=IA64_KR(CURRENT_STACK) // physical granule mapped by IA64_TR_CURRENT_STACK + ;; + shl r16=r16,IA64_GRANULE_SHIFT + ;; + dep r16=-1,r16,61,3 // virtual granule + mov r18=IA64_GRANULE_SHIFT<<2 // for cr.itir.ps + ;; + ptr.d r16,r18 + ;; + srlz.d + + shr.u r16=r20,IA64_GRANULE_SHIFT // r20 = physical start of MCA/INIT stack + movl r21=PAGE_KERNEL // page properties + ;; + mov IA64_KR(CURRENT_STACK)=r16 + or r21=r20,r21 // construct PA | page properties + ;; + mov cr.itir=r18 + mov cr.ifa=r13 + mov r20=IA64_TR_CURRENT_STACK + + movl r17=FPSR_DEFAULT + ;; + mov.m ar.fpsr=r17 // set ar.fpsr to kernel default value + ;; + itr.d dtr[r20]=r21 + ;; + srlz.d + + br.sptk b0 + +//EndStub////////////////////////////////////////////////////////////////////// + +#undef ms +#undef regs +#undef temp1 +#undef temp2 +#undef temp3 +#undef temp4 + + +// Support function for mca.c, it is here to avoid using inline asm. Given the +// address of an rnat slot, if that address is below the current ar.bspstore +// then return the contents of that slot, otherwise return the contents of +// ar.rnat. +GLOBAL_ENTRY(ia64_get_rnat) + alloc r14=ar.pfs,1,0,0,0 + mov ar.rsc=0 + ;; + mov r14=ar.bspstore + ;; + cmp.lt p6,p7=in0,r14 + ;; +(p6) ld8 r8=[in0] +(p7) mov r8=ar.rnat + mov ar.rsc=3 + br.ret.sptk.many rp +END(ia64_get_rnat) + + +// void ia64_set_psr_mc(void) +// +// Set psr.mc bit to mask MCA/INIT. +GLOBAL_ENTRY(ia64_set_psr_mc) + rsm psr.i | psr.ic // disable interrupts + ;; + srlz.d + ;; + mov r14 = psr // get psr{36:35,31:0} + movl r15 = 1f + ;; + dep r14 = -1, r14, PSR_MC, 1 // set psr.mc + ;; + dep r14 = -1, r14, PSR_IC, 1 // set psr.ic + ;; + dep r14 = -1, r14, PSR_BN, 1 // keep bank1 in use + ;; + mov cr.ipsr = r14 + mov cr.ifs = r0 + mov cr.iip = r15 + ;; + rfi +1: + br.ret.sptk.many rp +END(ia64_set_psr_mc) diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c new file mode 100644 index 00000000000..09b4d6828c4 --- /dev/null +++ b/arch/ia64/kernel/mca_drv.c @@ -0,0 +1,795 @@ +/* + * File: mca_drv.c + * Purpose: Generic MCA handling layer + * + * Copyright (C) 2004 FUJITSU LIMITED + * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> + * Copyright (C) 2005 Silicon Graphics, Inc + * Copyright (C) 2005 Keith Owens <kaos@sgi.com> + * Copyright (C) 2006 Russ Anderson <rja@sgi.com> + */ +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kallsyms.h> +#include <linux/bootmem.h> +#include <linux/acpi.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/workqueue.h> +#include <linux/mm.h> +#include <linux/slab.h> + +#include <asm/delay.h> +#include <asm/machvec.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/system.h> +#include <asm/sal.h> +#include <asm/mca.h> + +#include <asm/irq.h> +#include <asm/hw_irq.h> + +#include "mca_drv.h" + +/* max size of SAL error record (default) */ +static int sal_rec_max = 10000; + +/* from mca_drv_asm.S */ +extern void *mca_handler_bhhook(void); + +static DEFINE_SPINLOCK(mca_bh_lock); + +typedef enum { + MCA_IS_LOCAL = 0, + MCA_IS_GLOBAL = 1 +} mca_type_t; + +#define MAX_PAGE_ISOLATE 1024 + +static struct page *page_isolate[MAX_PAGE_ISOLATE]; +static int num_page_isolate = 0; + +typedef enum { + ISOLATE_NG, + ISOLATE_OK, + ISOLATE_NONE +} isolate_status_t; + +typedef enum { + MCA_NOT_RECOVERED = 0, + MCA_RECOVERED = 1 +} recovery_status_t; + +/* + * This pool keeps pointers to the section part of SAL error record + */ +static struct { + slidx_list_t *buffer; /* section pointer list pool */ + int cur_idx; /* Current index of section pointer list pool */ + int max_idx; /* Maximum index of section pointer list pool */ +} slidx_pool; + +static int +fatal_mca(const char *fmt, ...) +{ + va_list args; + char buf[256]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf); + + return MCA_NOT_RECOVERED; +} + +static int +mca_recovered(const char *fmt, ...) +{ + va_list args; + char buf[256]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + ia64_mca_printk(KERN_INFO "MCA: %s\n", buf); + + return MCA_RECOVERED; +} + +/** + * mca_page_isolate - isolate a poisoned page in order not to use it later + * @paddr: poisoned memory location + * + * Return value: + * one of isolate_status_t, ISOLATE_OK/NG/NONE. + */ + +static isolate_status_t +mca_page_isolate(unsigned long paddr) +{ + int i; + struct page *p; + + /* whether physical address is valid or not */ + if (!ia64_phys_addr_valid(paddr)) + return ISOLATE_NONE; + + if (!pfn_valid(paddr >> PAGE_SHIFT)) + return ISOLATE_NONE; + + /* convert physical address to physical page number */ + p = pfn_to_page(paddr>>PAGE_SHIFT); + + /* check whether a page number have been already registered or not */ + for (i = 0; i < num_page_isolate; i++) + if (page_isolate[i] == p) + return ISOLATE_OK; /* already listed */ + + /* limitation check */ + if (num_page_isolate == MAX_PAGE_ISOLATE) + return ISOLATE_NG; + + /* kick pages having attribute 'SLAB' or 'Reserved' */ + if (PageSlab(p) || PageReserved(p)) + return ISOLATE_NG; + + /* add attribute 'Reserved' and register the page */ + get_page(p); + SetPageReserved(p); + page_isolate[num_page_isolate++] = p; + + return ISOLATE_OK; +} + +/** + * mca_hanlder_bh - Kill the process which occurred memory read error + * @paddr: poisoned address received from MCA Handler + */ + +void +mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr) +{ + ia64_mlogbuf_dump(); + printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, " + "iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n", + raw_smp_processor_id(), current->pid, current_uid(), + iip, ipsr, paddr, current->comm); + + spin_lock(&mca_bh_lock); + switch (mca_page_isolate(paddr)) { + case ISOLATE_OK: + printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr); + break; + case ISOLATE_NG: + printk(KERN_CRIT "Page isolation: ( %lx ) failure.\n", paddr); + break; + default: + break; + } + spin_unlock(&mca_bh_lock); + + /* This process is about to be killed itself */ + do_exit(SIGKILL); +} + +/** + * mca_make_peidx - Make index of processor error section + * @slpi: pointer to record of processor error section + * @peidx: pointer to index of processor error section + */ + +static void +mca_make_peidx(sal_log_processor_info_t *slpi, peidx_table_t *peidx) +{ + /* + * calculate the start address of + * "struct cpuid_info" and "sal_processor_static_info_t". + */ + u64 total_check_num = slpi->valid.num_cache_check + + slpi->valid.num_tlb_check + + slpi->valid.num_bus_check + + slpi->valid.num_reg_file_check + + slpi->valid.num_ms_check; + u64 head_size = sizeof(sal_log_mod_error_info_t) * total_check_num + + sizeof(sal_log_processor_info_t); + u64 mid_size = slpi->valid.cpuid_info * sizeof(struct sal_cpuid_info); + + peidx_head(peidx) = slpi; + peidx_mid(peidx) = (struct sal_cpuid_info *) + (slpi->valid.cpuid_info ? ((char*)slpi + head_size) : NULL); + peidx_bottom(peidx) = (sal_processor_static_info_t *) + (slpi->valid.psi_static_struct ? + ((char*)slpi + head_size + mid_size) : NULL); +} + +/** + * mca_make_slidx - Make index of SAL error record + * @buffer: pointer to SAL error record + * @slidx: pointer to index of SAL error record + * + * Return value: + * 1 if record has platform error / 0 if not + */ +#define LOG_INDEX_ADD_SECT_PTR(sect, ptr) \ + {slidx_list_t *hl = &slidx_pool.buffer[slidx_pool.cur_idx]; \ + hl->hdr = ptr; \ + list_add(&hl->list, &(sect)); \ + slidx_pool.cur_idx = (slidx_pool.cur_idx + 1)%slidx_pool.max_idx; } + +static int +mca_make_slidx(void *buffer, slidx_table_t *slidx) +{ + int platform_err = 0; + int record_len = ((sal_log_record_header_t*)buffer)->len; + u32 ercd_pos; + int sects; + sal_log_section_hdr_t *sp; + + /* + * Initialize index referring current record + */ + INIT_LIST_HEAD(&(slidx->proc_err)); + INIT_LIST_HEAD(&(slidx->mem_dev_err)); + INIT_LIST_HEAD(&(slidx->sel_dev_err)); + INIT_LIST_HEAD(&(slidx->pci_bus_err)); + INIT_LIST_HEAD(&(slidx->smbios_dev_err)); + INIT_LIST_HEAD(&(slidx->pci_comp_err)); + INIT_LIST_HEAD(&(slidx->plat_specific_err)); + INIT_LIST_HEAD(&(slidx->host_ctlr_err)); + INIT_LIST_HEAD(&(slidx->plat_bus_err)); + INIT_LIST_HEAD(&(slidx->unsupported)); + + /* + * Extract a Record Header + */ + slidx->header = buffer; + + /* + * Extract each section records + * (arranged from "int ia64_log_platform_info_print()") + */ + for (ercd_pos = sizeof(sal_log_record_header_t), sects = 0; + ercd_pos < record_len; ercd_pos += sp->len, sects++) { + sp = (sal_log_section_hdr_t *)((char*)buffer + ercd_pos); + if (!efi_guidcmp(sp->guid, SAL_PROC_DEV_ERR_SECT_GUID)) { + LOG_INDEX_ADD_SECT_PTR(slidx->proc_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_MEM_DEV_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->mem_dev_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_SEL_DEV_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->sel_dev_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_PCI_BUS_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->pci_bus_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_SMBIOS_DEV_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->smbios_dev_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_PCI_COMP_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->pci_comp_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_SPECIFIC_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->plat_specific_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_HOST_CTLR_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->host_ctlr_err, sp); + } else if (!efi_guidcmp(sp->guid, + SAL_PLAT_BUS_ERR_SECT_GUID)) { + platform_err = 1; + LOG_INDEX_ADD_SECT_PTR(slidx->plat_bus_err, sp); + } else { + LOG_INDEX_ADD_SECT_PTR(slidx->unsupported, sp); + } + } + slidx->n_sections = sects; + + return platform_err; +} + +/** + * init_record_index_pools - Initialize pool of lists for SAL record index + * + * Return value: + * 0 on Success / -ENOMEM on Failure + */ +static int +init_record_index_pools(void) +{ + int i; + int rec_max_size; /* Maximum size of SAL error records */ + int sect_min_size; /* Minimum size of SAL error sections */ + /* minimum size table of each section */ + static int sal_log_sect_min_sizes[] = { + sizeof(sal_log_processor_info_t) + + sizeof(sal_processor_static_info_t), + sizeof(sal_log_mem_dev_err_info_t), + sizeof(sal_log_sel_dev_err_info_t), + sizeof(sal_log_pci_bus_err_info_t), + sizeof(sal_log_smbios_dev_err_info_t), + sizeof(sal_log_pci_comp_err_info_t), + sizeof(sal_log_plat_specific_err_info_t), + sizeof(sal_log_host_ctlr_err_info_t), + sizeof(sal_log_plat_bus_err_info_t), + }; + + /* + * MCA handler cannot allocate new memory on flight, + * so we preallocate enough memory to handle a SAL record. + * + * Initialize a handling set of slidx_pool: + * 1. Pick up the max size of SAL error records + * 2. Pick up the min size of SAL error sections + * 3. Allocate the pool as enough to 2 SAL records + * (now we can estimate the maxinum of section in a record.) + */ + + /* - 1 - */ + rec_max_size = sal_rec_max; + + /* - 2 - */ + sect_min_size = sal_log_sect_min_sizes[0]; + for (i = 1; i < sizeof sal_log_sect_min_sizes/sizeof(size_t); i++) + if (sect_min_size > sal_log_sect_min_sizes[i]) + sect_min_size = sal_log_sect_min_sizes[i]; + + /* - 3 - */ + slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1; + slidx_pool.buffer = (slidx_list_t *) + kmalloc(slidx_pool.max_idx * sizeof(slidx_list_t), GFP_KERNEL); + + return slidx_pool.buffer ? 0 : -ENOMEM; +} + + +/***************************************************************************** + * Recovery functions * + *****************************************************************************/ + +/** + * is_mca_global - Check whether this MCA is global or not + * @peidx: pointer of index of processor error section + * @pbci: pointer to pal_bus_check_info_t + * @sos: pointer to hand off struct between SAL and OS + * + * Return value: + * MCA_IS_LOCAL / MCA_IS_GLOBAL + */ + +static mca_type_t +is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) +{ + pal_processor_state_info_t *psp = + (pal_processor_state_info_t*)peidx_psp(peidx); + + /* + * PAL can request a rendezvous, if the MCA has a global scope. + * If "rz_always" flag is set, SAL requests MCA rendezvous + * in spite of global MCA. + * Therefore it is local MCA when rendezvous has not been requested. + * Failed to rendezvous, the system must be down. + */ + switch (sos->rv_rc) { + case -1: /* SAL rendezvous unsuccessful */ + return MCA_IS_GLOBAL; + case 0: /* SAL rendezvous not required */ + return MCA_IS_LOCAL; + case 1: /* SAL rendezvous successful int */ + case 2: /* SAL rendezvous successful int with init */ + default: + break; + } + + /* + * If One or more Cache/TLB/Reg_File/Uarch_Check is here, + * it would be a local MCA. (i.e. processor internal error) + */ + if (psp->tc || psp->cc || psp->rc || psp->uc) + return MCA_IS_LOCAL; + + /* + * Bus_Check structure with Bus_Check.ib (internal bus error) flag set + * would be a global MCA. (e.g. a system bus address parity error) + */ + if (!pbci || pbci->ib) + return MCA_IS_GLOBAL; + + /* + * Bus_Check structure with Bus_Check.eb (external bus error) flag set + * could be either a local MCA or a global MCA. + * + * Referring Bus_Check.bsi: + * 0: Unknown/unclassified + * 1: BERR# + * 2: BINIT# + * 3: Hard Fail + * (FIXME: Are these SGI specific or generic bsi values?) + */ + if (pbci->eb) + switch (pbci->bsi) { + case 0: + /* e.g. a load from poisoned memory */ + return MCA_IS_LOCAL; + case 1: + case 2: + case 3: + return MCA_IS_GLOBAL; + } + + return MCA_IS_GLOBAL; +} + +/** + * get_target_identifier - Get the valid Cache or Bus check target identifier. + * @peidx: pointer of index of processor error section + * + * Return value: + * target address on Success / 0 on Failure + */ +static u64 +get_target_identifier(peidx_table_t *peidx) +{ + u64 target_address = 0; + sal_log_mod_error_info_t *smei; + pal_cache_check_info_t *pcci; + int i, level = 9; + + /* + * Look through the cache checks for a valid target identifier + * If more than one valid target identifier, return the one + * with the lowest cache level. + */ + for (i = 0; i < peidx_cache_check_num(peidx); i++) { + smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i); + if (smei->valid.target_identifier && smei->target_identifier) { + pcci = (pal_cache_check_info_t *)&(smei->check_info); + if (!target_address || (pcci->level < level)) { + target_address = smei->target_identifier; + level = pcci->level; + continue; + } + } + } + if (target_address) + return target_address; + + /* + * Look at the bus check for a valid target identifier + */ + smei = peidx_bus_check(peidx, 0); + if (smei && smei->valid.target_identifier) + return smei->target_identifier; + + return 0; +} + +/** + * recover_from_read_error - Try to recover the errors which type are "read"s. + * @slidx: pointer of index of SAL error record + * @peidx: pointer of index of processor error section + * @pbci: pointer of pal_bus_check_info + * @sos: pointer to hand off struct between SAL and OS + * + * Return value: + * 1 on Success / 0 on Failure + */ + +static int +recover_from_read_error(slidx_table_t *slidx, + peidx_table_t *peidx, pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) +{ + u64 target_identifier; + pal_min_state_area_t *pmsa; + struct ia64_psr *psr1, *psr2; + ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook; + + /* Is target address valid? */ + target_identifier = get_target_identifier(peidx); + if (!target_identifier) + return fatal_mca("target address not valid"); + + /* + * cpu read or memory-mapped io read + * + * offending process affected process OS MCA do + * kernel mode kernel mode down system + * kernel mode user mode kill the process + * user mode kernel mode down system (*) + * user mode user mode kill the process + * + * (*) You could terminate offending user-mode process + * if (pbci->pv && pbci->pl != 0) *and* if you sure + * the process not have any locks of kernel. + */ + + /* Is minstate valid? */ + if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) + return fatal_mca("minstate not valid"); + psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); + psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); + + /* + * Check the privilege level of interrupted context. + * If it is user-mode, then terminate affected process. + */ + + pmsa = sos->pal_min_state; + if (psr1->cpl != 0 || + ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) { + /* + * setup for resume to bottom half of MCA, + * "mca_handler_bhhook" + */ + /* pass to bhhook as argument (gr8, ...) */ + pmsa->pmsa_gr[8-1] = target_identifier; + pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip; + pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr; + /* set interrupted return address (but no use) */ + pmsa->pmsa_br0 = pmsa->pmsa_iip; + /* change resume address to bottom half */ + pmsa->pmsa_iip = mca_hdlr_bh->fp; + pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp; + /* set cpl with kernel mode */ + psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; + psr2->cpl = 0; + psr2->ri = 0; + psr2->bn = 1; + psr2->i = 0; + + return mca_recovered("user memory corruption. " + "kill affected process - recovered."); + } + + return fatal_mca("kernel context not recovered, iip 0x%lx\n", + pmsa->pmsa_iip); +} + +/** + * recover_from_platform_error - Recover from platform error. + * @slidx: pointer of index of SAL error record + * @peidx: pointer of index of processor error section + * @pbci: pointer of pal_bus_check_info + * @sos: pointer to hand off struct between SAL and OS + * + * Return value: + * 1 on Success / 0 on Failure + */ + +static int +recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, + pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) +{ + int status = 0; + pal_processor_state_info_t *psp = + (pal_processor_state_info_t*)peidx_psp(peidx); + + if (psp->bc && pbci->eb && pbci->bsi == 0) { + switch(pbci->type) { + case 1: /* partial read */ + case 3: /* full line(cpu) read */ + case 9: /* I/O space read */ + status = recover_from_read_error(slidx, peidx, pbci, + sos); + break; + case 0: /* unknown */ + case 2: /* partial write */ + case 4: /* full line write */ + case 5: /* implicit or explicit write-back operation */ + case 6: /* snoop probe */ + case 7: /* incoming or outgoing ptc.g */ + case 8: /* write coalescing transactions */ + case 10: /* I/O space write */ + case 11: /* inter-processor interrupt message(IPI) */ + case 12: /* interrupt acknowledge or + external task priority cycle */ + default: + break; + } + } else if (psp->cc && !psp->bc) { /* Cache error */ + status = recover_from_read_error(slidx, peidx, pbci, sos); + } + + return status; +} + +/* + * recover_from_tlb_check + * @peidx: pointer of index of processor error section + * + * Return value: + * 1 on Success / 0 on Failure + */ +static int +recover_from_tlb_check(peidx_table_t *peidx) +{ + sal_log_mod_error_info_t *smei; + pal_tlb_check_info_t *ptci; + + smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0); + ptci = (pal_tlb_check_info_t *)&(smei->check_info); + + /* + * Look for signature of a duplicate TLB DTC entry, which is + * a SW bug and always fatal. + */ + if (ptci->op == PAL_TLB_CHECK_OP_PURGE + && !(ptci->itr || ptci->dtc || ptci->itc)) + return fatal_mca("Duplicate TLB entry"); + + return mca_recovered("TLB check recovered"); +} + +/** + * recover_from_processor_error + * @platform: whether there are some platform error section or not + * @slidx: pointer of index of SAL error record + * @peidx: pointer of index of processor error section + * @pbci: pointer of pal_bus_check_info + * @sos: pointer to hand off struct between SAL and OS + * + * Return value: + * 1 on Success / 0 on Failure + */ + +static int +recover_from_processor_error(int platform, slidx_table_t *slidx, + peidx_table_t *peidx, pal_bus_check_info_t *pbci, + struct ia64_sal_os_state *sos) +{ + pal_processor_state_info_t *psp = + (pal_processor_state_info_t*)peidx_psp(peidx); + + /* + * Processor recovery status must key off of the PAL recovery + * status in the Processor State Parameter. + */ + + /* + * The machine check is corrected. + */ + if (psp->cm == 1) + return mca_recovered("machine check is already corrected."); + + /* + * The error was not contained. Software must be reset. + */ + if (psp->us || psp->ci == 0) + return fatal_mca("error not contained"); + + /* + * Look for recoverable TLB check + */ + if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) + return recover_from_tlb_check(peidx); + + /* + * The cache check and bus check bits have four possible states + * cc bc + * 1 1 Memory error, attempt recovery + * 1 0 Cache error, attempt recovery + * 0 1 I/O error, attempt recovery + * 0 0 Other error type, not recovered + */ + if (psp->cc == 0 && (psp->bc == 0 || pbci == NULL)) + return fatal_mca("No cache or bus check"); + + /* + * Cannot handle more than one bus check. + */ + if (peidx_bus_check_num(peidx) > 1) + return fatal_mca("Too many bus checks"); + + if (pbci->ib) + return fatal_mca("Internal Bus error"); + if (pbci->eb && pbci->bsi > 0) + return fatal_mca("External bus check fatal status"); + + /* + * This is a local MCA and estimated as a recoverable error. + */ + if (platform) + return recover_from_platform_error(slidx, peidx, pbci, sos); + + /* + * On account of strange SAL error record, we cannot recover. + */ + return fatal_mca("Strange SAL record"); +} + +/** + * mca_try_to_recover - Try to recover from MCA + * @rec: pointer to a SAL error record + * @sos: pointer to hand off struct between SAL and OS + * + * Return value: + * 1 on Success / 0 on Failure + */ + +static int +mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) +{ + int platform_err; + int n_proc_err; + slidx_table_t slidx; + peidx_table_t peidx; + pal_bus_check_info_t pbci; + + /* Make index of SAL error record */ + platform_err = mca_make_slidx(rec, &slidx); + + /* Count processor error sections */ + n_proc_err = slidx_count(&slidx, proc_err); + + /* Now, OS can recover when there is one processor error section */ + if (n_proc_err > 1) + return fatal_mca("Too Many Errors"); + else if (n_proc_err == 0) + /* Weird SAL record ... We can't do anything */ + return fatal_mca("Weird SAL record"); + + /* Make index of processor error section */ + mca_make_peidx((sal_log_processor_info_t*) + slidx_first_entry(&slidx.proc_err)->hdr, &peidx); + + /* Extract Processor BUS_CHECK[0] */ + *((u64*)&pbci) = peidx_check_info(&peidx, bus_check, 0); + + /* Check whether MCA is global or not */ + if (is_mca_global(&peidx, &pbci, sos)) + return fatal_mca("global MCA"); + + /* Try to recover a processor error */ + return recover_from_processor_error(platform_err, &slidx, &peidx, + &pbci, sos); +} + +/* + * ============================================================================= + */ + +int __init mca_external_handler_init(void) +{ + if (init_record_index_pools()) + return -ENOMEM; + + /* register external mca handlers */ |