diff options
author | Frank Ch. Eigler <fche@elastic.org> | 2008-02-19 14:26:42 -0500 |
---|---|---|
committer | Frank Ch. Eigler <fche@elastic.org> | 2008-02-19 14:26:42 -0500 |
commit | 661f6234c23e077f088859da0fe6f95511de6679 (patch) | |
tree | 69adc1e0c0b2912b0b07b8752cf9e782f9f289c9 | |
parent | 0f05501579dc0a4e66ccbbd8e0b29d052d9b5920 (diff) | |
parent | a73014758df9ba7f832c8f13305652a777b574a9 (diff) | |
download | systemtap-steved-661f6234c23e077f088859da0fe6f95511de6679.tar.gz systemtap-steved-661f6234c23e077f088859da0fe6f95511de6679.tar.xz systemtap-steved-661f6234c23e077f088859da0fe6f95511de6679.zip |
Merge branch 'master' of git://sources.redhat.com/git/systemtap
-rw-r--r-- | ChangeLog | 19 | ||||
-rw-r--r-- | NEWS | 11 | ||||
-rw-r--r-- | runtime/uprobes/uprobes_x86.c | 717 | ||||
-rw-r--r-- | runtime/uprobes/uprobes_x86.h | 111 | ||||
-rw-r--r-- | stapprobes.5.in | 14 | ||||
-rw-r--r-- | tapsets.cxx | 172 | ||||
-rw-r--r-- | testsuite/ChangeLog | 11 | ||||
-rwxr-xr-x | testsuite/semko/fortyfive.stp | 3 | ||||
-rwxr-xr-x | testsuite/semko/zero.stp (renamed from testsuite/transko/one.stp) | 2 |
9 files changed, 992 insertions, 68 deletions
@@ -1,3 +1,22 @@ +2008-02-19 Frank Ch. Eigler <fche@elastic.org> + + PR 5766. + * tapsets.cxx (build_blacklist): Switch (back) to regexp-based + blacklist construction ... + (blacklist_p): ... and querying. + +2008-02-19 David Smith <dsmith@redhat.com> + + PR 5672. + * tapsets.cxx (mark_derived_probe): Call probe_point copy ctor to + shallow-copy incoming base probe location before + recomputing/overwriting it. + +2008-02-18 Frank Ch. Eigler <fche@elastic.org> + + * NEWS, stapprobes.5.in: Document basic (non-symbolic prototype) + user-space probe points. + 2008-02-15 Frank Ch. Eigler <fche@elastic.org> * tapsets.cxx (function_name_last_match): New function. @@ -1,5 +1,16 @@ * What's new in version 0.6 / 0.6.1 +- Prototype support for user-space probing is showing some progress. + No symbolic notations are supported yet (so no probing by function names, + file names, process names, and no access to $context variables), but at + least it's something: + + probe process(PID).statement(ADDRESS).absolute { } + + This will set a uprobe on the given process-id and given virtual address. + The probe handler runs in kernel-space as usual, and can generally use + existing tapset functions. + - Crash utility can retrieve systemtap's relay buffer from a kernel dump image by using staplog which is a crash extension module. To use this feature, type commands as below from crash(8)'s command line: diff --git a/runtime/uprobes/uprobes_x86.c b/runtime/uprobes/uprobes_x86.c new file mode 100644 index 00000000..ebb10d4e --- /dev/null +++ b/runtime/uprobes/uprobes_x86.c @@ -0,0 +1,717 @@ +/* + * Userspace Probes (UProbes) + * uprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006-2008 + */ + +#ifdef CONFIG_X86_32 +#define is_32bit_app(tsk) 1 +#else +#define is_32bit_app(tsk) (test_tsk_thread_flag(tsk, TIF_IA32)) +#endif + +/* Adapted from arch/x86_64/kprobes.c */ +#undef W +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##ULL << 0x0)|(b1##ULL << 0x1)|(b2##ULL << 0x2)|(b3##ULL << 0x3) | \ + (b4##ULL << 0x4)|(b5##ULL << 0x5)|(b6##ULL << 0x6)|(b7##ULL << 0x7) | \ + (b8##ULL << 0x8)|(b9##ULL << 0x9)|(ba##ULL << 0xa)|(bb##ULL << 0xb) | \ + (bc##ULL << 0xc)|(bd##ULL << 0xd)|(be##ULL << 0xe)|(bf##ULL << 0xf)) \ + << (row % 64)) + +static const unsigned long long good_insns_64[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 00 */ + W(0x10, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 10 */ + W(0x20, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 20 */ + W(0x30, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0), /* 30 */ + W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */ + W(0x60, 0,0,0,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 60 */ + W(0x70, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1)| /* 90 */ + W(0xa0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 
1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,1,1,0,1,1,1,1,1,0,0,1,1,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* Good-instruction tables for 32-bit apps -- copied from i386 uprobes */ + +static const unsigned long long good_insns_32[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0)| /* 00 */ + W(0x10, 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0)| /* 10 */ + W(0x20, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1)| /* 20 */ + W(0x30, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1), /* 30 */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */ + W(0x60, 1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0)| /* 60 */ + W(0x70, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1)| /* 90 */ + W(0xa0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,1,1,0,1,1,1,1,1,0,0,1,1,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* Using this for both 64-bit and 32-bit apps */ +static const unsigned long long good_2byte_insns[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1)| /* 00 */ + W(0x10, 1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1)| /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* 20 */ + W(0x30, 0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 60 */ + W(0x70, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 
*/ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 90 */ + W(0xa0, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* c0 */ + W(0xd0, 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* e0 */ + W(0xf0, 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* + * opcodes we'll probably never support: + * 6c-6d, e4-e5, ec-ed - in + * 6e-6f, e6-e7, ee-ef - out + * cc, cd - int3, int + * cf - iret + * d6 - illegal instruction + * f1 - int1/icebp + * f4 - hlt + * fa, fb - cli, sti + * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 + * + * invalid opcodes in 64-bit mode: + * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, c4-c5, d4-d5 + * + * 63 - we support this opcode in x86_64 but not in i386. + * opcodes we may need to refine support for: + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK + * + * others -- Do we need to support these? + * 0f - (floating-point?) prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e, 64, 65 - es:, cs:, ss:, ds:, fs:, gs: segment prefixes + * 67 - addr16 prefix + * 9b - wait/fwait + * ce - into + * f0 - lock prefix + */ + +/* + * TODO: + * - Where necessary, examine the modrm byte and allow only valid instructions + * in the different Groups and fpu instructions. 
+ * - Note: If we go past the first byte, do we need to verify that + * subsequent bytes were actually there, rather than off the last page? + * - Be clearer about which instructions we'll never probe. + */ + +/* + * Return 1 if this is a legacy instruction prefix we support, -1 if + * it's one we don't support, or 0 if it's not a prefix at all. + */ +static inline int check_legacy_prefix(u8 byte) +{ + switch (byte) { + case 0x26: + case 0x2e: + case 0x36: + case 0x3e: + case 0x64: + case 0x65: + case 0xf0: + return -1; + case 0x66: + case 0x67: + case 0xf2: + case 0xf3: + return 1; + default: + return 0; + } +} + +static void report_bad_1byte_opcode(int mode, uprobe_opcode_t op) +{ + printk(KERN_ERR "In %d-bit apps, " + "uprobes does not currently support probing " + "instructions whose first byte is 0x%2.2x\n", mode, op); +} + +static void report_bad_2byte_opcode(uprobe_opcode_t op) +{ + printk(KERN_ERR "uprobes does not currently support probing " + "instructions with the 2-byte opcode 0x0f 0x%2.2x\n", op); +} + +static int validate_insn_32bits(struct uprobe_probept *ppt) +{ + uprobe_opcode_t *insn = ppt->insn; + int pfx; + + /* Skip good instruction prefixes; reject "bad" ones. */ + while ((pfx = check_legacy_prefix(insn[0])) == 1) + insn++; + if (pfx < 0) { + report_bad_1byte_opcode(32, insn[0]); + return -EPERM; + } + if (test_bit(insn[0], (unsigned long*)good_insns_32)) + return 0; + if (insn[0] == 0x0f) { + if (test_bit(insn[1], (unsigned long*)good_2byte_insns)) + return 0; + report_bad_2byte_opcode(insn[1]); + } else + report_bad_1byte_opcode(32, insn[0]); + return -EPERM; +} + +static int validate_insn_64bits(struct uprobe_probept *ppt) +{ + uprobe_opcode_t *insn = ppt->insn; + int pfx; + + /* Skip good instruction prefixes; reject "bad" ones. */ + while ((pfx = check_legacy_prefix(insn[0])) == 1) + insn++; + if (pfx < 0) { + report_bad_1byte_opcode(64, insn[0]); + return -EPERM; + } + /* Skip REX prefix. 
*/ + if ((insn[0] & 0xf0) == 0x40) + insn++; + if (test_bit(insn[0], (unsigned long*)good_insns_64)) + return 0; + if (insn[0] == 0x0f) { + if (test_bit(insn[1], (unsigned long*)good_2byte_insns)) + return 0; + report_bad_2byte_opcode(insn[1]); + } else + report_bad_1byte_opcode(64, insn[0]); + return -EPERM; +} + +#ifdef CONFIG_X86_64 +static int handle_riprel_insn(struct uprobe_probept *ppt); +#endif + +static +int arch_validate_probed_insn(struct uprobe_probept *ppt, + struct task_struct *tsk) +{ + int ret; + +#ifdef CONFIG_X86_64 + ppt->arch_info.flags = 0x0; + ppt->arch_info.rip_target_address = 0x0; +#endif + + if (is_32bit_app(tsk)) + return validate_insn_32bits(ppt); + if ((ret = validate_insn_64bits(ppt)) != 0) + return ret; +#ifdef CONFIG_X86_64 + (void) handle_riprel_insn(ppt); +#endif + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * Returns 0 if the indicated instruction has no immediate operand + * and/or can't use rip-relative addressing. Otherwise returns + * the size of the immediate operand in the instruction. (Note that + * for instructions such as "movq $7,xxxx(%rip)" the immediate-operand + * field is 4 bytes, even though 8 bytes are stored.) + */ +static int immediate_operand_size(u8 opcode1, u8 opcode2, u8 reg, + int operand_size_prefix) +{ + switch (opcode1) { + case 0x6b: /* imul immed,mem,reg */ + case 0x80: /* Group 1 */ + case 0x83: /* Group 1 */ + case 0xc0: /* Group 2 */ + case 0xc1: /* Group 2 */ + case 0xc6: /* Group 11 */ + return 1; + case 0x69: /* imul immed,mem,reg */ + case 0x81: /* Group 1 */ + case 0xc7: /* Group 11 */ + return (operand_size_prefix ? 2 : 4); + case 0xf6: /* Group 3, reg field == 0 or 1 */ + return (reg > 1 ? 0 : 1); + case 0xf7: /* Group 3, reg field == 0 or 1 */ + if (reg > 1) + return 0; + return (operand_size_prefix ? 2 : 4); + case 0x0f: + /* 2-byte opcodes */ + switch (opcode2) { + /* + * Note: 0x71-73 (Groups 12-14) have immediate operands, + * but not memory operands. 
+ */ + case 0x70: /* pshuf* immed,mem,reg */ + case 0xa4: /* shld immed,reg,mem */ + case 0xac: /* shrd immed,reg,mem */ + case 0xc2: /* cmpps or cmppd */ + case 0xc4: /* pinsrw */ + case 0xc5: /* pextrw */ + case 0xc6: /* shufps or shufpd */ + case 0x0f: /* 3DNow extensions */ + return 1; + default: + return 0; + } + } + return 0; +} + +/* + * TODO: These tables are common for kprobes and uprobes and can be moved + * to a common place. + */ +static const u64 onebyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ + W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ + W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ + W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ + W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ + W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ + W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ + W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ + W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u64 twobyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ + W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ + W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ + W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| 
/* 6f */ + W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ + W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ + W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ + W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ + W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ + W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* + * If pp->insn doesn't use rip-relative addressing, return 0. Otherwise, + * rewrite the instruction so that it accesses its memory operand + * indirectly through a scratch register. Set flags and rip_target_address + * in ppt->arch_info accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, and + * restored afterward.) Return 1. + * + * We do this because a rip-relative instruction can access only a + * relatively small area (+/- 2 GB from the instruction), and the SSOL + * area typically lies beyond that area. At least for instructions + * that store to memory, we can't single-step the original instruction + * and "fix things up" later, because the misdirected store could be + * disastrous. + * + * Some useful facts about rip-relative instructions: + * - There's always a modrm byte. + * - There's never a SIB byte. + * - The offset is always 4 bytes. + */ +static int handle_riprel_insn(struct uprobe_probept *ppt) +{ + u8 *insn = (u8*) ppt->insn; + u8 opcode1, opcode2, modrm, reg; + int need_modrm; + int operand_size_prefix = 0; + int immed_size, instruction_size; + + /* + * Skip legacy instruction prefixes. Some of these we don't + * support (yet), but here we pretend to support all of them. + * Skip the REX prefix, if any. 
+ */ + while (check_legacy_prefix(*insn)) { + if (*insn == 0x66) + operand_size_prefix = 1; + insn++; + } + if ((*insn & 0xf0) == 0x40) + insn++; + + opcode1 = *insn; + if (opcode1 == 0x0f) { /* Two-byte opcode. */ + opcode2 = *++insn; + need_modrm = test_bit(opcode2, twobyte_has_modrm); + } else { /* One-byte opcode. */ + opcode2 = 0x0; + need_modrm = test_bit(opcode1, onebyte_has_modrm); + } + + if (!need_modrm) + return 0; + + modrm = *++insn; + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + if ((modrm & 0xc7) != 0x5) + return 0; + + /* + * We have a rip-relative instruction. insn points at the + * modrm byte. The next 4 bytes are the offset. Beyond the + * offset, for some instructions, is the immediate operand. + */ + reg = (modrm >> 3) & 0x7; + immed_size = immediate_operand_size(opcode1, opcode2, reg, + operand_size_prefix); + instruction_size = + (insn - (u8*) ppt->insn) /* prefixes + opcodes */ + + 1 /* modrm byte */ + + 4 /* offset */ + + immed_size; /* immediate field */ +#undef DEBUG_UPROBES_RIP +#ifdef DEBUG_UPROBES_RIP +{ + int i; + BUG_ON(instruction_size > 15); + printk(KERN_INFO "Munging rip-relative insn:"); + for (i = 0; i < instruction_size; i++) + printk(" %2.2x", ppt->insn[i]); + printk("\n"); +} +#endif + + /* + * Convert from rip-relative addressing to indirect addressing + * via a scratch register. Change the r/m field from 0x5 (%rip) + * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + */ + if (reg == 0) { + /* + * The register operand (if any) is either the A register + * (%rax, %eax, etc.) or (if the 0x4 bit is set in the + * REX prefix) %r8. In any case, we know the C register + * is NOT the register operand, so we use %rcx (register + * #1) for the scratch register. + */ + ppt->arch_info.flags = UPFIX_RIP_RCX; + /* Change modrm from 00 000 101 to 00 000 001. */ + *insn = 0x1; + } else { + /* Use %rax (register #0) for the scratch register. 
*/ + ppt->arch_info.flags = UPFIX_RIP_RAX; + /* Change modrm from 00 xxx 101 to 00 xxx 000 */ + *insn = (reg << 3); + } + + /* Target address = address of next instruction + (signed) offset */ + insn++; + ppt->arch_info.rip_target_address = + (long) ppt->vaddr + instruction_size + *((s32*)insn); + if (immed_size) + memmove(insn, insn+4, immed_size); +#ifdef DEBUG_UPROBES_RIP +{ + int i; + printk(KERN_INFO "Munged rip-relative insn: "); + for (i = 0; i < instruction_size-4; i++) + printk(" %2.2x", ppt->insn[i]); + printk("\n"); + printk(KERN_INFO "Target address = %#lx\n", + ppt->arch_info.rip_target_address); +} +#endif + return 1; +} +#endif + +/* + * Get an instruction slot from the process's SSOL area, containing the + * instruction at ppt's probepoint. Point the rip at that slot, in + * preparation for single-stepping out of line. + * + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static +void uprobe_pre_ssout(struct uprobe_task *utask, struct uprobe_probept *ppt, + struct pt_regs *regs) +{ + struct uprobe_ssol_slot *slot; + + slot = uprobe_get_insn_slot(ppt); + if (!slot) { + utask->doomed = 1; + return; + } + + REGS_IP = (long)slot->insn; + utask->singlestep_addr = REGS_IP; +#ifdef CONFIG_X86_64 + if (ppt->arch_info.flags == UPFIX_RIP_RAX) { + utask->arch_info.saved_scratch_register = REGS_AX; + REGS_AX = ppt->arch_info.rip_target_address; + } else if (ppt->arch_info.flags == UPFIX_RIP_RCX) { + utask->arch_info.saved_scratch_register = REGS_CX; + REGS_CX = ppt->arch_info.rip_target_address; + } +#endif +} + +/* + * Called by uprobe_post_ssout() to adjust the return address + * pushed by a call instruction executed out of line. 
+ */ +static void adjust_ret_addr(unsigned long rsp, long correction, + struct uprobe_task *utask) +{ + unsigned long nleft; + if (is_32bit_app(current)) { + s32 ra; + nleft = copy_from_user(&ra, (const void __user *) rsp, 4); + if (unlikely(nleft != 0)) + goto fail; + ra += (s32) correction; + nleft = copy_to_user((void __user *) rsp, &ra, 4); + if (unlikely(nleft != 0)) + goto fail; + } else { + s64 ra; + nleft = copy_from_user(&ra, (const void __user *) rsp, 8); + if (unlikely(nleft != 0)) + goto fail; + ra += correction; + nleft = copy_to_user((void __user *) rsp, &ra, 8); + if (unlikely(nleft != 0)) + goto fail; + } + return; + +fail: + printk(KERN_ERR + "uprobes: Failed to adjust return address after" + " single-stepping call instruction;" + " pid=%d, rsp=%#lx\n", current->pid, rsp); + utask->doomed = 1; +} + +/* + * Called after single-stepping. ppt->vaddr is the address of the + * instruction whose first byte has been replaced by the "int3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is utask->singlestep_addr. + * + * This function prepares to return from the post-single-step + * trap. We have to fix things up as follows: + * + * 0) Typically, the new rip is relative to the copied instruction. We + * need to make it relative to the original instruction. Exceptions are + * return instructions and absolute or indirect jump or call instructions. + * + * 1) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + * + * 2) If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,(%rax)". 
+ * We need to restore the contents of the scratch register and adjust + * the rip, keeping in mind that the instruction we executed is 4 bytes + * shorter than the original instruction (since we squeezed out the offset + * field). + */ +static +void uprobe_post_ssout(struct uprobe_task *utask, struct uprobe_probept *ppt, + struct pt_regs *regs) +{ + unsigned long next_rip = 0; + unsigned long copy_rip = utask->singlestep_addr; + unsigned long orig_rip = ppt->vaddr; + long correction = (long) (orig_rip - copy_rip); + uprobe_opcode_t *insn = ppt->insn; +#ifdef CONFIG_X86_64 + unsigned long flags = ppt->arch_info.flags; +#endif + + up_read(&ppt->slot->rwsem); +#ifdef CONFIG_X86_64 + if (flags & (UPFIX_RIP_RAX | UPFIX_RIP_RCX)) { + if (flags & UPFIX_RIP_RAX) + REGS_AX = utask->arch_info.saved_scratch_register; + else + REGS_CX = utask->arch_info.saved_scratch_register; + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Fall through to handle stuff like "jmpq *...(%rip)" and + * "callq *...(%rip)". + */ + correction += 4; + } +#endif + /* + * TODO: Move all this instruction parsing to + * arch_validate_probed_insn(), and store what we learn in + * ppt->arch_info.flags. + * + * We don't bother skipping prefixes here because none of the + * instructions that require special treatment (other than + * rip-relative instructions, handled above) involve prefixes. + */ + + switch (*insn) { + case 0xc3: /* ret/lret */ + case 0xcb: + case 0xc2: + case 0xca: + /* rip is correct */ + next_rip = REGS_IP; + break; + case 0xe8: /* call relative - Fix return addr */ + adjust_ret_addr(REGS_SP, correction, utask); + break; + case 0x9a: /* call absolute - Fix return addr */ + adjust_ret_addr(REGS_SP, correction, utask); + next_rip = REGS_IP; + break; + case 0xff: + if ((insn[1] & 0x30) == 0x10) { + /* call absolute, indirect */ + /* Fix return addr; rip is correct. 
*/ + next_rip = REGS_IP; + adjust_ret_addr(REGS_SP, correction, utask); + } else if ((insn[1] & 0x31) == 0x20 || /* jmp near, absolute indirect */ + (insn[1] & 0x31) == 0x21) { /* jmp far, absolute indirect */ + /* rip is correct. */ + next_rip = REGS_IP; + } + break; + case 0xea: /* jmp absolute -- rip is correct */ + next_rip = REGS_IP; + break; + default: + break; + } + + if (next_rip) + REGS_IP = next_rip; + else + REGS_IP += correction; +} + +/* + * Replace the return address with the trampoline address. Returns + * the original return address. + */ +static +unsigned long arch_hijack_uret_addr(unsigned long trampoline_address, + struct pt_regs *regs, struct uprobe_task *utask) +{ + int nleft; + unsigned long orig_ret_addr = 0; /* clear high bits for 32-bit apps */ + size_t rasize; + + if (is_32bit_app(current)) + rasize = 4; + else + rasize = 8; + nleft = copy_from_user(&orig_ret_addr, (const void __user *) REGS_SP, + rasize); + if (unlikely(nleft != 0)) + return 0; + if (orig_ret_addr == trampoline_address) + /* + * There's another uretprobe on this function, and it was + * processed first, so the return address has already + * been hijacked. 
+ */ + return orig_ret_addr; + + nleft = copy_to_user((void __user *) REGS_SP, &trampoline_address, + rasize); + if (unlikely(nleft != 0)) { + if (nleft != rasize) { + printk(KERN_ERR "uretprobe_entry_handler: " + "return address partially clobbered -- " + "pid=%d, %%sp=%#lx, %%ip=%#lx\n", + current->pid, REGS_SP, REGS_IP); + utask->doomed = 1; + } // else nothing written, so no harm + return 0; + } + return orig_ret_addr; +} diff --git a/runtime/uprobes/uprobes_x86.h b/runtime/uprobes/uprobes_x86.h new file mode 100644 index 00000000..30541157 --- /dev/null +++ b/runtime/uprobes/uprobes_x86.h @@ -0,0 +1,111 @@ +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H +/* + * Userspace Probes (UProbes) + * uprobes.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2008 + */ +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <asm/thread_info.h> + +/* Normally defined in Kconfig */ +#define CONFIG_URETPROBES 1 +#define CONFIG_UPROBES_SSOL 1 + + +typedef u8 uprobe_opcode_t; +#define BREAKPOINT_INSTRUCTION 0xcc +#define BP_INSN_SIZE 1 +#define MAX_UINSN_BYTES 16 + +#ifdef STAPCONF_X86_UNIREGS +#define REGS_IP regs->ip +#define REGS_SP regs->sp +#define REGS_AX regs->ax +#define REGS_CX regs->cx +#else +#ifdef CONFIG_X86_32 +#define REGS_IP regs->eip +#define REGS_SP regs->esp +#else +#define REGS_IP regs->rip +#define REGS_SP regs->rsp +#define REGS_AX regs->rax +#define REGS_CX regs->rcx +#endif +#endif /*STAPCONF_X86_UNIREGS */ + +// SLOT_IP should be 16 for 64-bit apps (include/asm-x86_64/elf.h) +// but 12 for 32-bit apps (arch/x86_64/ia32/ia32_binfmt.c) +#ifdef CONFIG_X86_32 +#define SLOT_IP(tsk) 12 +#else +#define SLOT_IP(tsk) (test_tsk_thread_flag(tsk, TIF_IA32) ? 12 : 16) +#endif + +#define BREAKPOINT_SIGNAL SIGTRAP +#define SSTEP_SIGNAL SIGTRAP + +/* Architecture specific switch for where the IP points after a bp hit */ +#define ARCH_BP_INST_PTR(inst_ptr) (inst_ptr - BP_INSN_SIZE) + +#define UPFIX_RIP_RAX 0x1 /* (%rip) insn rewritten to use (%rax) */ +#define UPFIX_RIP_RCX 0x2 /* (%rip) insn rewritten to use (%rcx) */ + +#ifdef CONFIG_X86_64 +struct uprobe_probept_arch_info { + unsigned long flags; + unsigned long rip_target_address; +}; + +struct uprobe_task_arch_info { + unsigned long saved_scratch_register; +}; +#else +struct uprobe_probept_arch_info {}; +struct uprobe_task_arch_info {}; +#endif + +struct uprobe_probept; +struct uprobe_task; + +static int arch_validate_probed_insn(struct uprobe_probept *ppt, + struct task_struct *tsk); + +/* On x86_64, the int3 traps leaves rip pointing past the int3 instruction. 
*/ +static inline unsigned long arch_get_probept(struct pt_regs *regs) +{ + return (unsigned long) (REGS_IP - BP_INSN_SIZE); +} + +static inline void arch_reset_ip_for_sstep(struct pt_regs *regs) +{ + REGS_IP -= BP_INSN_SIZE; +} + +static inline void arch_restore_uret_addr(unsigned long ret_addr, + struct pt_regs *regs) +{ + REGS_IP = ret_addr; +} + +static unsigned long arch_hijack_uret_addr(unsigned long trampoline_addr, + struct pt_regs*, struct uprobe_task*); +#endif /* _ASM_UPROBES_H */ diff --git a/stapprobes.5.in b/stapprobes.5.in index 276358a0..642ed739 100644 --- a/stapprobes.5.in +++ b/stapprobes.5.in @@ -283,6 +283,20 @@ $var[N] indexes into an array. The index is given with a literal number. +.SS USER-SPACE +Early prototype support for user-space probing is available in the +form of a non-symbolic probe point: +.SAMPLE +process(PID).statement(ADDRESS).absolute +.ESAMPLE +is analogous to +.IR +kernel.statement(ADDRESS).absolute +in that both use raw (unverified) virtual addresses and provide +no $variables. The target PID parameter must identify a running +process, and ADDRESS should identify a valid instruction address. +All threads of that process will be probed. 
+ .SS PROCFS These probe points allow procfs "files" in diff --git a/tapsets.cxx b/tapsets.cxx index 4ff53e6f..e89cfe90 100644 --- a/tapsets.cxx +++ b/tapsets.cxx @@ -2158,8 +2158,9 @@ struct dwarf_query : public base_query Dwarf_Addr addr); string get_blacklist_section(Dwarf_Addr addr); - set<string> blacklisted_probes; - set<string> blacklisted_return_probes; + regex_t blacklist_func; // function/statement probes + regex_t blacklist_func_ret; // only for .return probes + regex_t blacklist_file; // file name void build_blacklist(); bool blacklisted_p(const string& funcname, @@ -2374,70 +2375,110 @@ dwarf_query::handle_query_module() void dwarf_query::build_blacklist() { - // FIXME: it would be nice if these blacklisted functions were pulled in - // dynamically, instead of being statically defined here. + // We build up the regexps in these strings + + // Add ^ anchors at the front; $ will be added just before regcomp. + + // NB: all the regexp strings will start with "^(|foo|bar)$", note the + // empty first alternative. It's a bit unsightly, but it lets all the + // regexp concatenation statements look uniform, and we should have no + // empty actual strings to match against anyway. + + string blfn = "^("; + string blfn_ret = "^("; + string blfile = "^("; + + blfile += "|kernel/kprobes.c"; + blfile += "|arch/.*/kernel/kprobes.c"; + + // XXX: it would be nice if these blacklisted functions were pulled + // in dynamically, instead of being statically defined here. + // Perhaps it could be populated from script files. A "noprobe + // kernel.function("...")" construct might do the trick. // Most of these are marked __kprobes in newer kernels. We list - // them here so the translator can block them on older kernels that - // don't have the __kprobes function decorator. This also allows - // detection of problems at translate- rather than run-time. 
- blacklisted_probes.insert("__raw_spin_is_locked"); - blacklisted_probes.insert("atomic_notifier_call_chain"); - blacklisted_probes.insert("default_do_nmi"); - blacklisted_probes.insert("__die"); - blacklisted_probes.insert("die_nmi"); - blacklisted_probes.insert("do_debug"); - blacklisted_probes.insert("do_general_protection"); - blacklisted_probes.insert("do_int3"); - blacklisted_probes.insert("do_IRQ"); - blacklisted_probes.insert("do_page_fault"); - blacklisted_probes.insert("do_sparc64_fault"); - blacklisted_probes.insert("do_trap"); - blacklisted_probes.insert("dummy_nmi_callback"); - blacklisted_probes.insert("flush_icache_range"); - blacklisted_probes.insert("ia64_bad_break"); - blacklisted_probes.insert("ia64_do_page_fault"); - blacklisted_probes.insert("ia64_fault"); - blacklisted_probes.insert("io_check_error"); - blacklisted_probes.insert("mem_parity_error"); - blacklisted_probes.insert("nmi_watchdog_tick"); - blacklisted_probes.insert("notifier_call_chain"); - blacklisted_probes.insert("oops_begin"); - blacklisted_probes.insert("oops_end"); - blacklisted_probes.insert("program_check_exception"); - blacklisted_probes.insert("single_step_exception"); - blacklisted_probes.insert("sync_regs"); - blacklisted_probes.insert("unhandled_fault"); - blacklisted_probes.insert("unknown_nmi_error"); - - blacklisted_probes.insert("_read_trylock"); - blacklisted_probes.insert("_read_lock"); - blacklisted_probes.insert("_read_unlock"); - blacklisted_probes.insert("_write_trylock"); - blacklisted_probes.insert("_write_lock"); - blacklisted_probes.insert("_write_unlock"); - blacklisted_probes.insert("_spin_lock"); - blacklisted_probes.insert("_spin_lock_irqsave"); - blacklisted_probes.insert("_spin_trylock"); - blacklisted_probes.insert("_spin_unlock"); - blacklisted_probes.insert("_spin_unlock_irqrestore"); + // them here (anyway) so the translator can block them on older + // kernels that don't have the __kprobes function decorator. 
This + // also allows detection of problems at translate- rather than + // run-time. + + blfn += "|atomic_notifier_call_chain"; + blfn += "|default_do_nmi"; + blfn += "|__die"; + blfn += "|die_nmi"; + blfn += "|do_debug"; + blfn += "|do_general_protection"; + blfn += "|do_int3"; + blfn += "|do_IRQ"; + blfn += "|do_page_fault"; + blfn += "|do_sparc64_fault"; + blfn += "|do_trap"; + blfn += "|dummy_nmi_callback"; + blfn += "|flush_icache_range"; + blfn += "|ia64_bad_break"; + blfn += "|ia64_do_page_fault"; + blfn += "|ia64_fault"; + blfn += "|io_check_error"; + blfn += "|mem_parity_error"; + blfn += "|nmi_watchdog_tick"; + blfn += "|notifier_call_chain"; + blfn += "|oops_begin"; + blfn += "|oops_end"; + blfn += "|program_check_exception"; + blfn += "|single_step_exception"; + blfn += "|sync_regs"; + blfn += "|unhandled_fault"; + blfn += "|unknown_nmi_error"; + + // Lots of locks + blfn += "|.*raw_.*lock.*"; + blfn += "|.*read_.*lock.*"; + blfn += "|.*write_.*lock.*"; + blfn += "|.*spin_.*lock.*"; + blfn += "|.*rwlock_.*lock.*"; + blfn += "|.*rwsem_.*lock.*"; + blfn += "|.*mutex_.*lock.*"; + blfn += "|raw_.*"; + blfn += "|.*seq_.*lock.*"; + + // Experimental + blfn += "|.*apic.*|.*APIC.*"; + blfn += "|.*softirq.*"; + blfn += "|.*IRQ.*"; + blfn += "|.*_intr.*"; + blfn += "|__delay"; + blfn += "|.*kernel_text.*"; + blfn += "|get_current"; + blfn += "|current_.*"; + blfn += "|.*exception_tables.*"; + blfn += "|.*setup_rt_frame.*"; // PR 5759, CONFIG_PREEMPT kernels - blacklisted_probes.insert("add_preempt_count"); - blacklisted_probes.insert("preempt_schedule"); - blacklisted_probes.insert("sub_preempt_count"); + blfn += "|.*preempt_count.*"; + blfn += "|preempt_schedule"; // __switch_to changes "current" on x86_64 and i686, so return probes // would cause kernel panic, and it is marked as "__kprobes" on x86_64 if (sess.architecture == "x86_64") - blacklisted_probes.insert("__switch_to"); + blfn += "|__switch_to"; if (sess.architecture == "i686") - 
blacklisted_return_probes.insert("__switch_to"); + blfn_ret += "|__switch_to"; // These functions don't return, so return probes would never be recovered - blacklisted_return_probes.insert("do_exit"); - blacklisted_return_probes.insert("sys_exit"); - blacklisted_return_probes.insert("sys_exit_group"); + blfn_ret += "|do_exit"; + blfn_ret += "|sys_exit"; + blfn_ret += "|sys_exit_group"; + + blfn += ")$"; + blfn_ret += ")$"; + blfile += ")$"; + + int rc = regcomp (& blacklist_func, blfn.c_str(), REG_NOSUB|REG_EXTENDED); + if (rc) throw semantic_error ("blacklist_func regcomp failed"); + rc = regcomp (& blacklist_func_ret, blfn_ret.c_str(), REG_NOSUB|REG_EXTENDED); + if (rc) throw semantic_error ("blacklist_func_ret regcomp failed"); + rc = regcomp (& blacklist_file, blfile.c_str(), REG_NOSUB|REG_EXTENDED); + if (rc) throw semantic_error ("blacklist_file regcomp failed"); } @@ -2555,17 +2596,13 @@ dwarf_query::blacklisted_p(const string& funcname, return true; } - // Check probe point against blacklist. XXX: This has to be - // properly generalized, perhaps via a table populated from script - // files. A "noprobe kernel.function("...")" construct might do - // the trick. - if (blacklisted_probes.count(funcname) > 0 || - (has_return && blacklisted_return_probes.count(funcname) > 0) || - filename == "kernel/kprobes.c" || - 0 == fnmatch ("arch/*/kernel/kprobes.c", filename.c_str(), 0)) - // XXX: these tests (set lookup, fnmatch) could be combined into a - // single synthetic compiled regexp, which would allow blacklisted - // functions to be identified by wildcard instead of exact name. + // Check probe point against blacklist. + int goodfn = regexec (&blacklist_func, funcname.c_str(), 0, NULL, 0); + if (has_return) + goodfn = goodfn && regexec (&blacklist_func_ret, funcname.c_str(), 0, NULL, 0); + int goodfile = regexec (&blacklist_file, filename.c_str(), 0, NULL, 0); + + if (! 
(goodfn && goodfile)) { if (sess.verbose>1) clog << " skipping - blacklisted"; @@ -5289,7 +5326,8 @@ mark_derived_probe::mark_derived_probe (systemtap_session &s, const string& p_n, const string& p_s, probe* base, probe_point* loc): - derived_probe (base, loc), sess (s), probe_name (p_n), probe_sig (p_s), + derived_probe (base, new probe_point(*loc) /* .components soon rewritten */), + sess (s), probe_name (p_n), probe_sig (p_s), target_symbol_seen (false) { // create synthetic probe point name; preserve condition diff --git a/testsuite/ChangeLog b/testsuite/ChangeLog index 11af94f3..6811d912 100644 --- a/testsuite/ChangeLog +++ b/testsuite/ChangeLog @@ -1,3 +1,14 @@ +2008-02-19 Frank Ch. Eigler <fche@elastic.org> + + PR5766. + * semko/fortyfive.stp: New test. + +2008-02-19 Frank Ch. Eigler <fche@elastic.org> + + PR5771. + * transko/one.stp: Remove, or rather, move to this ... + * semko/zero.stp: new file. + 2008-02-12 Frank Ch. Eigler <fche@elastic.org> * systemtap.context/context.exp: Build temporary modules under diff --git a/testsuite/semko/fortyfive.stp b/testsuite/semko/fortyfive.stp new file mode 100755 index 00000000..d295bbde --- /dev/null +++ b/testsuite/semko/fortyfive.stp @@ -0,0 +1,3 @@ +#! stap -p2 + +probe kernel.function("*@kernel/kprobes.c") {} // blacklisted diff --git a/testsuite/transko/one.stp b/testsuite/semko/zero.stp index 508ce9d4..bb116be7 100755 --- a/testsuite/transko/one.stp +++ b/testsuite/semko/zero.stp @@ -1,4 +1,4 @@ -#! stap -p3 +#! stap -p2 probe begin { print (1 = a) |