diff options
Diffstat (limited to 'runtime/uprobes')
-rw-r--r-- | runtime/uprobes/uprobes_x86_64.c | 697 | ||||
-rw-r--r-- | runtime/uprobes/uprobes_x86_64.h | 84 |
2 files changed, 781 insertions, 0 deletions
diff --git a/runtime/uprobes/uprobes_x86_64.c b/runtime/uprobes/uprobes_x86_64.c new file mode 100644 index 00000000..23dcdadb --- /dev/null +++ b/runtime/uprobes/uprobes_x86_64.c @@ -0,0 +1,697 @@ +/* + * Userspace Probes (UProbes) + * arch/x86_64/kernel/uprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006-2007 + */ +/* + * In versions of uprobes built in the SystemTap runtime, this file + * is #included at the end of uprobes.c. 
+ */ + +/* Nonzero if tsk has the TIF_IA32 thread flag set; the rest of this file treats such a task as a 32-bit app. */ +#define is_32bit_app(tsk) (test_tsk_thread_flag(tsk, TIF_IA32)) + +/* Adapted from arch/x86_64/kprobes.c */ +/* + * W(row, b0..bf) packs sixteen one-bit flags (one per opcode in the range + * row..row+0xf) into an unsigned long and shifts them to bit position + * (row % 64). Four W() rows OR-ed together form one 64-bit word, and four + * such words form a 256-bit bitmap indexed by opcode byte via test_bit(). + */ +#undef W +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 64)) + +/* Bitmap of first opcode bytes that validate_insn_64bits() accepts (1 = probe allowed). */ +static const unsigned long good_insns_64[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 00 */ + W(0x10, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 10 */ + W(0x20, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 20 */ + W(0x30, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0), /* 30 */ + W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */ + W(0x60, 0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0)| /* 60 */ + W(0x70, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1)| /* 90 */ + W(0xa0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,1,1,0,1,1,1,1,1,0,0,1,1,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* Good-instruction tables for 32-bit apps -- copied from i386 uprobes */ + +/* Bitmap of first opcode bytes that validate_insn_32bits() accepts (1 = probe allowed). */ +static const unsigned long good_insns_32[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0)| /* 00 */ + W(0x10, 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0)| /* 10 */ + W(0x20, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1)| /* 20 */ + W(0x30, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1), /* 30 */ + W(0x40, 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */ + W(0x60, 1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0)| /* 60 */ + W(0x70, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1)| /* 90 */ + W(0xa0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,1,1,0,1,1,1,1,1,0,0,1,1,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* + * Bitmap of accepted second opcode bytes, consulted by the validate_insn_*() + * functions when the first opcode byte is 0x0f. + * Using this for both 64-bit and 32-bit apps + */ +static const unsigned long good_2byte_insns[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1)| /* 00 */ + W(0x10, 1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1)| /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* 20 */ + W(0x30, 0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 60 */ + W(0x70, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 90 */ + W(0xa0, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* c0 */ + W(0xd0, 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* e0 */ + W(0xf0, 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* + * opcodes we'll probably never support: + * 63 - arpl + * 6c-6d, e4-e5, ec-ed - in + * 6e-6f, e6-e7, ee-ef - out + * cc, cd - int3, int + * cf - iret + * d6 - 
illegal instruction + * f1 - int1/icebp + * f4 - hlt + * fa, fb - cli, sti + * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 + * + * invalid opcodes in 64-bit mode: + * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, c4-c5, d4-d5 + * + * opcodes we may need to refine support for: + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK + * + * others -- Do we need to support these? + * 0f - (floating-point?) prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e, 64, 65 - es:, cs:, ss:, ds:, fs:, gs: segment prefixes + * 67 - addr16 prefix + * 9b - wait/fwait + * ce - into + * f0 - lock prefix + */ + +/* + * TODO: + * - Where necessary, examine the modrm byte and allow only valid instructions + * in the different Groups and fpu instructions. + * - Note: If we go past the first byte, do we need to verify that + * subsequent bytes were actually there, rather than off the last page? + * - Be clearer about which instructions we'll never probe. + */ + +/* + * Return 1 if this is a legacy instruction prefix we support, -1 if + * it's one we don't support, or 0 if it's not a prefix at all. 
+ */ +static inline int check_legacy_prefix(u8 byte) +{ + switch (byte) { + case 0x26: + case 0x2e: + case 0x36: + case 0x3e: + case 0x64: + case 0x65: + case 0xf0: + return -1; + case 0x66: + case 0x67: + case 0xf2: + case 0xf3: + return 1; + default: + return 0; + } +} + +static void report_bad_1byte_opcode(int mode, uprobe_opcode_t op) +{ + printk(KERN_ERR "In %d-bit apps, " + "uprobes does not currently support probing " + "instructions whose first byte is 0x%2.2x\n", mode, op); +} + +static void report_bad_2byte_opcode(uprobe_opcode_t op) +{ + printk(KERN_ERR "uprobes does not currently support probing " + "instructions with the 2-byte opcode 0x0f 0x%2.2x\n", op); +} + +static int validate_insn_32bits(struct uprobe_probept *ppt) +{ + uprobe_opcode_t *insn = ppt->insn; + int pfx; + + /* Skip good instruction prefixes; reject "bad" ones. */ + while ((pfx = check_legacy_prefix(insn[0])) == 1) + insn++; + if (pfx < 0) { + report_bad_1byte_opcode(32, insn[0]); + return -EPERM; + } + if (test_bit(insn[0], good_insns_32)) + return 0; + if (insn[0] == 0x0f) { + if (test_bit(insn[1], good_2byte_insns)) + return 0; + report_bad_2byte_opcode(insn[1]); + } else + report_bad_1byte_opcode(32, insn[0]); + return -EPERM; +} + +static int validate_insn_64bits(struct uprobe_probept *ppt) +{ + uprobe_opcode_t *insn = ppt->insn; + int pfx; + + /* Skip good instruction prefixes; reject "bad" ones. */ + while ((pfx = check_legacy_prefix(insn[0])) == 1) + insn++; + if (pfx < 0) { + report_bad_1byte_opcode(64, insn[0]); + return -EPERM; + } + /* Skip REX prefix. 
*/ + if ((insn[0] & 0xf0) == 0x40) + insn++; + if (test_bit(insn[0], good_insns_64)) + return 0; + if (insn[0] == 0x0f) { + if (test_bit(insn[1], good_2byte_insns)) + return 0; + report_bad_2byte_opcode(insn[1]); + } else + report_bad_1byte_opcode(64, insn[0]); + return -EPERM; +} + +static int handle_riprel_insn(struct uprobe_probept *ppt); + +static +int arch_validate_probed_insn(struct uprobe_probept *ppt, + struct task_struct *tsk) +{ + int ret; + + ppt->arch_info.flags = 0x0; + ppt->arch_info.rip_target_address = 0x0; + + if (is_32bit_app(tsk)) + return validate_insn_32bits(ppt); + if ((ret = validate_insn_64bits(ppt)) != 0) + return ret; + (void) handle_riprel_insn(ppt); + return 0; +} + +/* + * Returns 0 if the indicated instruction has no immediate operand + * and/or can't use rip-relative addressing. Otherwise returns + * the size of the immediate operand in the instruction. (Note that + * for instructions such as "movq $7,xxxx(%rip)" the immediate-operand + * field is 4 bytes, even though 8 bytes are stored.) + */ +static int immediate_operand_size(u8 opcode1, u8 opcode2, u8 reg, + int operand_size_prefix) +{ + switch (opcode1) { + case 0x6b: /* imul immed,mem,reg */ + case 0x80: /* Group 1 */ + case 0x83: /* Group 1 */ + case 0xc0: /* Group 2 */ + case 0xc1: /* Group 2 */ + case 0xc6: /* Group 11 */ + return 1; + case 0x69: /* imul immed,mem,reg */ + case 0x81: /* Group 1 */ + case 0xc7: /* Group 11 */ + return (operand_size_prefix ? 2 : 4); + case 0xf6: /* Group 3, reg field == 0 or 1 */ + return (reg > 1 ? 0 : 1); + case 0xf7: /* Group 3, reg field == 0 or 1 */ + if (reg > 1) + return 0; + return (operand_size_prefix ? 2 : 4); + case 0x0f: + /* 2-byte opcodes */ + switch (opcode2) { + /* + * Note: 0x71-73 (Groups 12-14) have immediate operands, + * but not memory operands. 
+ */ + case 0x70: /* pshuf* immed,mem,reg */ + case 0xa4: /* shld immed,reg,mem */ + case 0xac: /* shrd immed,reg,mem */ + case 0xc2: /* cmpps or cmppd */ + case 0xc4: /* pinsrw */ + case 0xc5: /* pextrw */ + case 0xc6: /* shufps or shufpd */ + case 0x0f: /* 3DNow extensions */ + return 1; + default: + return 0; + } + } + return 0; +} + +/* + * TODO: These tables are common for kprobes and uprobes and can be moved + * to a common place. + */ +static const u64 onebyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ + W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ + W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ + W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ + W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ + W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ + W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ + W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ + W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ + W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ + W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ + W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ + W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ + W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u64 twobyte_has_modrm[256 / 64] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ + W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ + W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ + W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ + W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| 
/* 6f */ + W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ + W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ + W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ + W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ + W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ + W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* + * If pp->insn doesn't use rip-relative addressing, return 0. Otherwise, + * rewrite the instruction so that it accesses its memory operand + * indirectly through a scratch register. Set flags and rip_target_address + * in ppt->arch_info accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, and + * restored afterward.) Return 1. + * + * We do this because a rip-relative instruction can access only a + * relatively small area (+/- 2 GB from the instruction), and the SSOL + * area typically lies beyond that area. At least for instructions + * that store to memory, we can't single-step the original instruction + * and "fix things up" later, because the misdirected store could be + * disastrous. + * + * Some useful facts about rip-relative instructions: + * - There's always a modrm byte. + * - There's never a SIB byte. + * - The offset is always 4 bytes. + */ +static int handle_riprel_insn(struct uprobe_probept *ppt) +{ + u8 *insn = (u8*) ppt->insn; + u8 opcode1, opcode2, modrm, reg; + int need_modrm; + int operand_size_prefix = 0; + int immed_size, instruction_size; + + /* + * Skip legacy instruction prefixes. Some of these we don't + * support (yet), but here we pretend to support all of them. + * Skip the REX prefix, if any. 
+ */ + while (check_legacy_prefix(*insn)) { + if (*insn == 0x66) + operand_size_prefix = 1; + insn++; + } + if ((*insn & 0xf0) == 0x40) + insn++; + + opcode1 = *insn; + if (opcode1 == 0x0f) { /* Two-byte opcode. */ + opcode2 = *++insn; + need_modrm = test_bit(opcode2, twobyte_has_modrm); + } else { /* One-byte opcode. */ + opcode2 = 0x0; + need_modrm = test_bit(opcode1, onebyte_has_modrm); + } + + if (!need_modrm) + return 0; + + modrm = *++insn; + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + if ((modrm & 0xc7) != 0x5) + return 0; + + /* + * We have a rip-relative instruction. insn points at the + * modrm byte. The next 4 bytes are the offset. Beyond the + * offset, for some instructions, is the immediate operand. + */ + reg = (modrm >> 3) & 0x7; + immed_size = immediate_operand_size(opcode1, opcode2, reg, + operand_size_prefix); + instruction_size = + (insn - (u8*) ppt->insn) /* prefixes + opcodes */ + + 1 /* modrm byte */ + + 4 /* offset */ + + immed_size; /* immediate field */ +#undef DEBUG_UPROBES_RIP +#ifdef DEBUG_UPROBES_RIP +{ + int i; + BUG_ON(instruction_size > 15); + printk(KERN_INFO "Munging rip-relative insn:"); + for (i = 0; i < instruction_size; i++) + printk(" %2.2x", ppt->insn[i]); + printk("\n"); +} +#endif + + /* + * Convert from rip-relative addressing to indirect addressing + * via a scratch register. Change the r/m field from 0x5 (%rip) + * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + */ + if (reg == 0) { + /* + * The register operand (if any) is either the A register + * (%rax, %eax, etc.) or (if the 0x4 bit is set in the + * REX prefix) %r8. In any case, we know the C register + * is NOT the register operand, so we use %rcx (register + * #1) for the scratch register. + */ + ppt->arch_info.flags = UPFIX_RIP_RCX; + /* Change modrm from 00 000 101 to 00 000 001. */ + *insn = 0x1; + } else { + /* Use %rax (register #0) for the scratch register. 
*/ + ppt->arch_info.flags = UPFIX_RIP_RAX; + /* Change modrm from 00 xxx 101 to 00 xxx 000 */ + *insn = (reg << 3); + } + + /* Target address = address of next instruction + (signed) offset */ + insn++; + ppt->arch_info.rip_target_address = + (long) ppt->vaddr + instruction_size + *((s32*)insn); + if (immed_size) + memmove(insn, insn+4, immed_size); +#ifdef DEBUG_UPROBES_RIP +{ + int i; + printk(KERN_INFO "Munged rip-relative insn: "); + for (i = 0; i < instruction_size-4; i++) + printk(" %2.2x", ppt->insn[i]); + printk("\n"); + printk(KERN_INFO "Target address = %#lx\n", + ppt->arch_info.rip_target_address); +} +#endif + return 1; +} + +/* + * Get an instruction slot from the process's SSOL area, containing the + * instruction at ppt's probepoint. Point the rip at that slot, in + * preparation for single-stepping out of line. + * + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static +void uprobe_pre_ssout(struct uprobe_task *utask, struct uprobe_probept *ppt, + struct pt_regs *regs) +{ + struct uprobe_ssol_slot *slot; + + slot = uprobe_get_insn_slot(ppt); + if (!slot) { + utask->doomed = 1; + return; + } + + regs->rip = (long)slot->insn; + utask->singlestep_addr = regs->rip; + + if (ppt->arch_info.flags == UPFIX_RIP_RAX) { + utask->arch_info.saved_scratch_register = regs->rax; + regs->rax = ppt->arch_info.rip_target_address; + } else if (ppt->arch_info.flags == UPFIX_RIP_RCX) { + utask->arch_info.saved_scratch_register = regs->rcx; + regs->rcx = ppt->arch_info.rip_target_address; + } +} + +/* + * Called by uprobe_post_ssout() to adjust the return address + * pushed by a call instruction executed out of line. 
+ */ +static void adjust_ret_addr(unsigned long rsp, long correction, + struct uprobe_task *utask) +{ + unsigned long nleft; + if (is_32bit_app(current)) { + s32 ra; + nleft = copy_from_user(&ra, (const void __user *) rsp, 4); + if (unlikely(nleft != 0)) + goto fail; + ra += (s32) correction; + nleft = copy_to_user((void __user *) rsp, &ra, 4); + if (unlikely(nleft != 0)) + goto fail; + } else { + s64 ra; + nleft = copy_from_user(&ra, (const void __user *) rsp, 8); + if (unlikely(nleft != 0)) + goto fail; + ra += correction; + nleft = copy_to_user((void __user *) rsp, &ra, 8); + if (unlikely(nleft != 0)) + goto fail; + } + return; + +fail: + printk(KERN_ERR + "uprobes: Failed to adjust return address after" + " single-stepping call instruction;" + " pid=%d, rsp=%#lx\n", current->pid, rsp); + utask->doomed = 1; +} + +/* + * Called after single-stepping. ppt->vaddr is the address of the + * instruction whose first byte has been replaced by the "int3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is utask->singlestep_addr. + * + * This function prepares to return from the post-single-step + * trap. We have to fix things up as follows: + * + * 0) Typically, the new rip is relative to the copied instruction. We + * need to make it relative to the original instruction. Exceptions are + * return instructions and absolute or indirect jump or call instructions. + * + * 1) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + * + * 2) If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,(%rax)". 
+ * We need to restore the contents of the scratch register and adjust + * the rip, keeping in mind that the instruction we executed is 4 bytes + * shorter than the original instruction (since we squeezed out the offset + * field). + */ +static +void uprobe_post_ssout(struct uprobe_task *utask, struct uprobe_probept *ppt, + struct pt_regs *regs) +{ + unsigned long next_rip = 0; + unsigned long copy_rip = utask->singlestep_addr; + unsigned long orig_rip = ppt->vaddr; + long correction = (long) (orig_rip - copy_rip); + uprobe_opcode_t *insn = ppt->insn; + unsigned long flags = ppt->arch_info.flags; + + up_read(&ppt->slot->rwsem); + + if (flags & (UPFIX_RIP_RAX | UPFIX_RIP_RCX)) { + if (flags & UPFIX_RIP_RAX) + regs->rax = utask->arch_info.saved_scratch_register; + else + regs->rcx = utask->arch_info.saved_scratch_register; + regs->rip += (4 + correction); + return; + } + + /* + * TODO: Move all this instruction parsing to + * arch_validate_probed_insn(), and store what we learn in + * ppt->arch_info.flags. + * + * We don't bother skipping prefixes here because none of the + * non-rip-relative instructions that require special treatment + * involve prefixes. + */ + + switch (*insn) { + case 0xc3: /* ret/lret */ + case 0xcb: + case 0xc2: + case 0xca: + /* rip is correct */ + next_rip = regs->rip; + break; + case 0xe8: /* call relative - Fix return addr */ + adjust_ret_addr(regs->rsp, correction, utask); + break; + case 0xff: + if ((insn[1] & 0x30) == 0x10) { + /* call absolute, indirect */ + /* Fix return addr; rip is correct. */ + next_rip = regs->rip; + adjust_ret_addr(regs->rsp, correction, utask); + } else if ((insn[1] & 0x31) == 0x20 || /* jmp near, absolute indirect */ + (insn[1] & 0x31) == 0x21) { /* jmp far, absolute indirect */ + /* rip is correct. 
*/ + next_rip = regs->rip; + } + break; + case 0xea: /* jmp absolute -- rip is correct */ + next_rip = regs->rip; + break; + default: + break; + } + + if (next_rip) + regs->rip = next_rip; + else + regs->rip += correction; +} + +/* + * Replace the return address with the trampoline address. Returns + * the original return address. + */ +static +unsigned long arch_hijack_uret_addr(unsigned long trampoline_address, + struct pt_regs *regs, struct uprobe_task *utask) +{ + int nleft; + unsigned long orig_ret_addr = 0; /* clear high bits for 32-bit apps */ + size_t rasize; + + if (is_32bit_app(current)) + rasize = 4; + else + rasize = 8; + nleft = copy_from_user(&orig_ret_addr, (const void __user *) regs->rsp, + rasize); + if (unlikely(nleft != 0)) + return 0; + if (orig_ret_addr == trampoline_address) + /* + * There's another uretprobe on this function, and it was + * processed first, so the return address has already + * been hijacked. + */ + return orig_ret_addr; + + nleft = copy_to_user((void __user *) regs->rsp, &trampoline_address, + rasize); + if (unlikely(nleft != 0)) { + if (nleft != rasize) { + printk(KERN_ERR "uretprobe_entry_handler: " + "return address partially clobbered -- " + "pid=%d, %%rsp=%#lx, %%rip=%#lx\n", + current->pid, regs->rsp, regs->rip); + utask->doomed = 1; + } // else nothing written, so no harm + return 0; + } + return orig_ret_addr; +} diff --git a/runtime/uprobes/uprobes_x86_64.h b/runtime/uprobes/uprobes_x86_64.h new file mode 100644 index 00000000..c9345073 --- /dev/null +++ b/runtime/uprobes/uprobes_x86_64.h @@ -0,0 +1,84 @@ +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H +/* + * Userspace Probes (UProbes) + * include/asm-x86_64/uprobes.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + */ +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/sched.h> +#include <asm/thread_info.h> + +/* Normally defined in Kconfig */ +#define CONFIG_URETPROBES 1 +#define CONFIG_UPROBES_SSOL 1 + +typedef u8 uprobe_opcode_t; +#define BREAKPOINT_INSTRUCTION 0xcc +#define BP_INSN_SIZE 1 +#define MAX_UINSN_BYTES 16 + +// SLOT_IP should be 16 for 64-bit apps (include/asm-x86_64/elf.h) +// but 12 for 32-bit apps (arch/x86_64/ia32/ia32_binfmt.c) +#define SLOT_IP(tsk) (test_tsk_thread_flag(tsk, TIF_IA32) ? 12 : 16) + +#define BREAKPOINT_SIGNAL SIGTRAP +#define SSTEP_SIGNAL SIGTRAP + +/* Architecture specific switch for where the IP points after a bp hit */ +#define ARCH_BP_INST_PTR(inst_ptr) (inst_ptr - BP_INSN_SIZE) + +#define UPFIX_RIP_RAX 0x1 /* (%rip) insn rewritten to use (%rax) */ +#define UPFIX_RIP_RCX 0x2 /* (%rip) insn rewritten to use (%rcx) */ + +struct uprobe_probept_arch_info { + unsigned long flags; + unsigned long rip_target_address; +}; + +struct uprobe_task_arch_info { + unsigned long saved_scratch_register; +}; + +struct uprobe_probept; +struct uprobe_task; + +static int arch_validate_probed_insn(struct uprobe_probept *ppt, + struct task_struct *tsk); + +/* On x86_64, the int3 traps leaves rip pointing past the int3 instruction. 
*/ +static inline unsigned long arch_get_probept(struct pt_regs *regs) +{ + return (unsigned long) (regs->rip - BP_INSN_SIZE); +} + +static inline void arch_reset_ip_for_sstep(struct pt_regs *regs) +{ + regs->rip -= BP_INSN_SIZE; +} + +static inline void arch_restore_uret_addr(unsigned long ret_addr, + struct pt_regs *regs) +{ + regs->rip = ret_addr; +} + +static unsigned long arch_hijack_uret_addr(unsigned long trampoline_addr, + struct pt_regs*, struct uprobe_task*); +#endif /* _ASM_UPROBES_H */ |