/*
 * Userspace Probes (UProbes)
 * uprobes.c
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2006-2008
 */
/*
 * In versions of uprobes built in the SystemTap runtime, this file
 * is #included at the end of uprobes.c.
 */
/* The include target was lost in extraction; <linux/uaccess.h> is an
   assumption, based on the copy_{from,to}_user() calls below. */
#include <linux/uaccess.h>

#ifdef CONFIG_X86_32
#define is_32bit_app(tsk) 1
#else
#define is_32bit_app(tsk) (test_tsk_thread_flag(tsk, TIF_IA32))
#endif

/* Adapted from arch/x86_64/kprobes.c */
#undef W
#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
	(((b0##ULL<< 0x0)|(b1##ULL<< 0x1)|(b2##ULL<< 0x2)|(b3##ULL<< 0x3) |   \
	  (b4##ULL<< 0x4)|(b5##ULL<< 0x5)|(b6##ULL<< 0x6)|(b7##ULL<< 0x7) |   \
	  (b8##ULL<< 0x8)|(b9##ULL<< 0x9)|(ba##ULL<< 0xa)|(bb##ULL<< 0xb) |   \
	  (bc##ULL<< 0xc)|(bd##ULL<< 0xd)|(be##ULL<< 0xe)|(bf##ULL<< 0xf))    \
	 << (row % 64))

static const u64 good_insns_64[256 / 64] = {
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
	/*      -------------------------------         */
	W(0x00, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 00 */
	W(0x10, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 10 */
	W(0x20, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0)| /* 20 */
	W(0x30, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0), /* 30 */
	W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
	W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */
	W(0x60, 0,0,0,1,1,1,0,0,1,1,1,1,0,0,0,0)| /* 60 */
	W(0x70, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 70 */
	W(0x80, 1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
	W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 90 */
	W(0xa0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* a0 */
	W(0xb0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* b0 */
	W(0xc0, 1,1,1,1,0,0,1,1,1,1,1,1,0,0,0,0)| /* c0 */
	W(0xd0, 1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)| /* d0 */
	W(0xe0, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* e0 */
	W(0xf0, 0,0,1,1,0,1,1,1,1,1,0,0,1,1,1,1)  /* f0 */
	/*      -------------------------------         */
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
};

/* Good-instruction tables for 32-bit apps -- copied from i386 uprobes */
static const u64 good_insns_32[256 / 64] = {
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
	/*      -------------------------------         */
	W(0x00, 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0)| /* 00 */
	W(0x10, 1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0)| /* 10 */
	W(0x20, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1)| /* 20 */
	W(0x30, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1), /* 30 */
	W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
	W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */
	W(0x60, 1,1,1,0,1,1,0,0,1,1,1,1,0,0,0,0)| /* 60 */
	W(0x70, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 70 */
	W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
	W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 90 */
	W(0xa0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* a0 */
	W(0xb0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* b0 */
	W(0xc0, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0)| /* c0 */
	W(0xd0, 1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1)| /* d0 */
	W(0xe0, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* e0 */
	W(0xf0, 0,0,1,1,0,1,1,1,1,1,0,0,1,1,1,1)  /* f0 */
	/*      -------------------------------         */
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
};
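/*
 * Illustration (added sketch, not referenced by the code): each W() row
 * packs one may-probe bit per opcode into the u64 array, so each table
 * here is really a 256-bit bitmap and a lookup is a single bitmap test,
 * exactly as the validate_insn_*bits() routines below perform it:
 *
 *	u8 op = 0x50;					(push %rax)
 *	if (test_bit(op, (unsigned long *)good_insns_64))
 *		... opcode may be probed in 64-bit mode ...
 *
 * Opcode 0x50 lands in good_insns_64[1] -- the element built from rows
 * 0x40 through 0x70 -- at bit position 0x50 % 64 == 0x10.
 */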
/* Using this for both 64-bit and 32-bit apps */
static const u64 good_2byte_insns[256 / 64] = {
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
	/*      -------------------------------         */
	W(0x00, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1)| /* 00 */
	W(0x10, 1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1)| /* 10 */
	W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* 20 */
	W(0x30, 0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
	W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
	W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 50 */
	W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 60 */
	W(0x70, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
	W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
	W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 90 */
	W(0xa0, 1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,1)| /* a0 */
	W(0xb0, 1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1), /* b0 */
	W(0xc0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* c0 */
	W(0xd0, 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* d0 */
	W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* e0 */
	W(0xf0, 0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)  /* f0 */
	/*      -------------------------------         */
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
};

/*
 * opcodes we'll probably never support:
 * 6c-6d, e4-e5, ec-ed - in
 * 6e-6f, e6-e7, ee-ef - out
 * cc, cd - int3, int
 * cf - iret
 * d6 - illegal instruction
 * f1 - int1/icebp
 * f4 - hlt
 * fa, fb - cli, sti
 * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
 *
 * invalid opcodes in 64-bit mode:
 * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
 *
 * 63 - we support this opcode in x86_64 but not in i386.
 *
 * opcodes we may need to refine support for:
 * 0f - 2-byte instructions: For many of these instructions, the validity
 *	depends on the prefix and/or the reg field.  On such instructions, we
 *	just consider the opcode combination valid if it corresponds to any
 *	valid instruction.
 * 8f - Group 1 - only reg = 0 is OK
 * c6-c7 - Group 11 - only reg = 0 is OK
 * d9-df - fpu insns with some illegal encodings
 * f2, f3 - repnz, repz prefixes.  These are also the first byte for
 *	certain floating-point instructions, such as addsd.
 * fe - Group 4 - only reg = 0 or 1 is OK
 * ff - Group 5 - only reg = 0-6 is OK
 *
 * others -- Do we need to support these?
 * 0f - (floating-point?) prefetch instructions
 * 07, 17, 1f - pop es, pop ss, pop ds
 * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
 *	but 64 and 65 (fs: and gs:) seem to be used, so we support them
 * 67 - addr16 prefix
 * ce - into
 * f0 - lock prefix
 */

/*
 * TODO:
 * - Where necessary, examine the modrm byte and allow only valid instructions
 *   in the different Groups and fpu instructions.
 * - Note: If we go past the first byte, do we need to verify that
 *   subsequent bytes were actually there, rather than off the last page?
 * - Be clearer about which instructions we'll never probe.
 */
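/*
 * Example of how the prefix classification below plays out (byte
 * sequences chosen purely for illustration):
 *
 *	66 89 05 ...	0x66 (operand-size) is a supported prefix: it is
 *			skipped and validation proceeds with opcode 0x89.
 *	f0 ff 05 ...	0xf0 (lock) is an unsupported prefix: the probe
 *			is rejected with -EPERM.
 */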
/*
 * Return 1 if this is a legacy instruction prefix we support, -1 if
 * it's one we don't support, or 0 if it's not a prefix at all.
 */
static inline int check_legacy_prefix(u8 byte)
{
	switch (byte) {
	case 0x26:
	case 0x2e:
	case 0x36:
	case 0x3e:
	case 0xf0:
		return -1;
	case 0x64:
	case 0x65:
	case 0x66:
	case 0x67:
	case 0xf2:
	case 0xf3:
		return 1;
	default:
		return 0;
	}
}

static void report_bad_1byte_opcode(int mode, uprobe_opcode_t op)
{
	printk(KERN_ERR "In %d-bit apps, "
		"uprobes does not currently support probing "
		"instructions whose first byte is 0x%2.2x\n", mode, op);
}

static void report_bad_2byte_opcode(uprobe_opcode_t op)
{
	printk(KERN_ERR "uprobes does not currently support probing "
		"instructions with the 2-byte opcode 0x0f 0x%2.2x\n", op);
}

static int validate_insn_32bits(struct uprobe_probept *ppt)
{
	uprobe_opcode_t *insn = ppt->insn;
	int pfx;

	/* Skip good instruction prefixes; reject "bad" ones. */
	while ((pfx = check_legacy_prefix(insn[0])) == 1)
		insn++;
	if (pfx < 0) {
		report_bad_1byte_opcode(32, insn[0]);
		return -EPERM;
	}
	if (test_bit(insn[0], (unsigned long *)good_insns_32))
		return 0;
	if (insn[0] == 0x0f) {
		if (test_bit(insn[1], (unsigned long *)good_2byte_insns))
			return 0;
		report_bad_2byte_opcode(insn[1]);
	} else
		report_bad_1byte_opcode(32, insn[0]);
	return -EPERM;
}

static int validate_insn_64bits(struct uprobe_probept *ppt)
{
	uprobe_opcode_t *insn = ppt->insn;
	int pfx;

	/* Skip good instruction prefixes; reject "bad" ones. */
	while ((pfx = check_legacy_prefix(insn[0])) == 1)
		insn++;
	if (pfx < 0) {
		report_bad_1byte_opcode(64, insn[0]);
		return -EPERM;
	}
	/* Skip REX prefix. */
	if ((insn[0] & 0xf0) == 0x40)
		insn++;
	if (test_bit(insn[0], (unsigned long *)good_insns_64))
		return 0;
	if (insn[0] == 0x0f) {
		if (test_bit(insn[1], (unsigned long *)good_2byte_insns))
			return 0;
		report_bad_2byte_opcode(insn[1]);
	} else
		report_bad_1byte_opcode(64, insn[0]);
	return -EPERM;
}

#ifdef CONFIG_X86_64
static int handle_riprel_insn(struct uprobe_probept *ppt);
#endif

static int arch_validate_probed_insn(struct uprobe_probept *ppt,
						struct task_struct *tsk)
{
	int ret;

#ifdef CONFIG_X86_64
	ppt->arch_info.flags = 0x0;
	ppt->arch_info.rip_target_address = 0x0;
#endif
	if (is_32bit_app(tsk))
		return validate_insn_32bits(ppt);
	if ((ret = validate_insn_64bits(ppt)) != 0)
		return ret;
#ifdef CONFIG_X86_64
	(void) handle_riprel_insn(ppt);
#endif
	return 0;
}
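/*
 * Worked example for the 64-bit path above (illustration only):
 * probing "movq %rax,0x11223344(%rip)", encoded 48 89 05 44 33 22 11.
 * validate_insn_64bits() finds no legacy prefix, recognizes 0x48 as a
 * REX prefix ((0x48 & 0xf0) == 0x40) and skips it, then accepts opcode
 * 0x89 via good_insns_64.  handle_riprel_insn(), defined below, then
 * spots the rip-relative modrm byte (0x05) and rewrites the copied
 * instruction before it is ever single-stepped.
 */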
#ifdef CONFIG_X86_64
/*
 * Returns 0 if the indicated instruction has no immediate operand
 * and/or can't use rip-relative addressing.  Otherwise returns
 * the size of the immediate operand in the instruction.  (Note that
 * for instructions such as "movq $7,xxxx(%rip)" the immediate-operand
 * field is 4 bytes, even though 8 bytes are stored.)
 */
static int immediate_operand_size(u8 opcode1, u8 opcode2, u8 reg,
						int operand_size_prefix)
{
	switch (opcode1) {
	case 0x6b:		/* imul immed,mem,reg */
	case 0x80:		/* Group 1 */
	case 0x83:		/* Group 1 */
	case 0xc0:		/* Group 2 */
	case 0xc1:		/* Group 2 */
	case 0xc6:		/* Group 11 */
		return 1;
	case 0x69:		/* imul immed,mem,reg */
	case 0x81:		/* Group 1 */
	case 0xc7:		/* Group 11 */
		return (operand_size_prefix ? 2 : 4);
	case 0xf6:		/* Group 3, reg field == 0 or 1 */
		return (reg > 1 ? 0 : 1);
	case 0xf7:		/* Group 3, reg field == 0 or 1 */
		if (reg > 1)
			return 0;
		return (operand_size_prefix ? 2 : 4);
	case 0x0f:		/* 2-byte opcodes */
		switch (opcode2) {
		/*
		 * Note: 0x71-73 (Groups 12-14) have immediate operands,
		 * but not memory operands.
		 */
		case 0x70:	/* pshuf* immed,mem,reg */
		case 0xa4:	/* shld immed,reg,mem */
		case 0xac:	/* shrd immed,reg,mem */
		case 0xc2:	/* cmpps or cmppd */
		case 0xc4:	/* pinsrw */
		case 0xc5:	/* pextrw */
		case 0xc6:	/* shufps or shufpd */
		case 0x0f:	/* 3DNow extensions */
			return 1;
		default:
			return 0;
		}
	}
	return 0;
}

/*
 * TODO: These tables are common for kprobes and uprobes and can be moved
 * to a common place.
 */
static const u64 onebyte_has_modrm[256 / 64] = {
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
	/*      -------------------------------         */
	W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
	W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
	W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
	W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
	W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
	W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
	W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
	W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
	W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
	W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
	W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
	W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
	W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
	W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
	W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
	W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1)  /* f0 */
	/*      -------------------------------         */
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
};

static const u64 twobyte_has_modrm[256 / 64] = {
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
	/*      -------------------------------         */
	W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
	W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
	W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
	W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
	W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
	W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
	W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
	W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
	W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
	W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
	W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
	W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
	W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
	W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
	W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
	W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0)  /* ff */
	/*      -------------------------------         */
	/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */
};
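/*
 * Reminder on modrm decoding, which handle_riprel_insn() below
 * open-codes with masks:
 *
 *	mod = (modrm >> 6) & 0x3;	top 2 bits: addressing mode
 *	reg = (modrm >> 3) & 0x7;	middle 3 bits: register/opcode ext.
 *	rm  = modrm & 0x7;		bottom 3 bits: base register
 *
 * The test (modrm & 0xc7) == 0x5 is just "mod == 0 && rm == 5", the
 * unique encoding of rip-relative addressing in 64-bit mode.
 */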
/*
 * If ppt->insn doesn't use rip-relative addressing, return 0.  Otherwise,
 * rewrite the instruction so that it accesses its memory operand
 * indirectly through a scratch register.  Set flags and rip_target_address
 * in ppt->arch_info accordingly.  (The contents of the scratch register
 * will be saved before we single-step the modified instruction, and
 * restored afterward.)  Return 1.
 *
 * We do this because a rip-relative instruction can access only a
 * relatively small area (+/- 2 GB from the instruction), and the SSOL
 * area typically lies beyond that area.  At least for instructions
 * that store to memory, we can't single-step the original instruction
 * and "fix things up" later, because the misdirected store could be
 * disastrous.
 *
 * Some useful facts about rip-relative instructions:
 * - There's always a modrm byte.
 * - There's never a SIB byte.
 * - The offset is always 4 bytes.
 */
static int handle_riprel_insn(struct uprobe_probept *ppt)
{
	u8 *insn = (u8 *) ppt->insn;
	u8 opcode1, opcode2, modrm, reg;
	int need_modrm;
	int operand_size_prefix = 0;
	int immed_size, instruction_size;

	/*
	 * Skip legacy instruction prefixes.  Some of these we don't
	 * support (yet), but here we pretend to support all of them.
	 * Skip the REX prefix, if any.
	 */
	while (check_legacy_prefix(*insn)) {
		if (*insn == 0x66)
			operand_size_prefix = 1;
		insn++;
	}
	if ((*insn & 0xf0) == 0x40)
		insn++;

	opcode1 = *insn;
	if (opcode1 == 0x0f) {	/* Two-byte opcode. */
		opcode2 = *++insn;
		need_modrm = test_bit(opcode2,
					(unsigned long *)twobyte_has_modrm);
	} else {		/* One-byte opcode. */
		opcode2 = 0x0;
		need_modrm = test_bit(opcode1,
					(unsigned long *)onebyte_has_modrm);
	}

	if (!need_modrm)
		return 0;

	modrm = *++insn;
	/*
	 * For rip-relative instructions, the mod field (top 2 bits)
	 * is zero and the r/m field (bottom 3 bits) is 0x5.
	 */
	if ((modrm & 0xc7) != 0x5)
		return 0;

	/*
	 * We have a rip-relative instruction.  insn points at the
	 * modrm byte.  The next 4 bytes are the offset.  Beyond the
	 * offset, for some instructions, is the immediate operand.
	 */
	reg = (modrm >> 3) & 0x7;
	immed_size = immediate_operand_size(opcode1, opcode2, reg,
						operand_size_prefix);
	instruction_size =
		(insn - (u8 *) ppt->insn)	/* prefixes + opcodes */
		+ 1				/* modrm byte */
		+ 4				/* offset */
		+ immed_size;			/* immediate field */

#undef DEBUG_UPROBES_RIP
#ifdef DEBUG_UPROBES_RIP
	{
		int i;
		BUG_ON(instruction_size > 15);
		printk(KERN_INFO "Munging rip-relative insn:");
		for (i = 0; i < instruction_size; i++)
			printk(" %2.2x", ppt->insn[i]);
		printk("\n");
	}
#endif

	/*
	 * Convert from rip-relative addressing to indirect addressing
	 * via a scratch register.  Change the r/m field from 0x5 (%rip)
	 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
	 */
	if (reg == 0) {
		/*
		 * The register operand (if any) is either the A register
		 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
		 * REX prefix) %r8.  In any case, we know the C register
		 * is NOT the register operand, so we use %rcx (register
		 * #1) for the scratch register.
		 */
		ppt->arch_info.flags = UPFIX_RIP_RCX;
		/* Change modrm from 00 000 101 to 00 000 001. */
		*insn = 0x1;
	} else {
		/* Use %rax (register #0) for the scratch register. */
		ppt->arch_info.flags = UPFIX_RIP_RAX;
		/* Change modrm from 00 xxx 101 to 00 xxx 000 */
		*insn = (reg << 3);
	}

	/* Target address = address of next instruction + (signed) offset */
	insn++;
	ppt->arch_info.rip_target_address = (long) ppt->vaddr +
				instruction_size + *((s32 *)insn);
	if (immed_size)
		memmove(insn, insn + 4, immed_size);

#ifdef DEBUG_UPROBES_RIP
	{
		int i;
		printk(KERN_INFO "Munged rip-relative insn: ");
		for (i = 0; i < instruction_size - 4; i++)
			printk(" %2.2x", ppt->insn[i]);
		printk("\n");
		printk(KERN_INFO "Target address = %#lx\n",
					ppt->arch_info.rip_target_address);
	}
#endif
	return 1;
}
#endif
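/*
 * Worked example of the rewrite above (addresses are illustrative):
 * "movl %eax,0x11223344(%rip)" is encoded 89 05 44 33 22 11, giving
 * instruction_size = 6 and reg = 0.  handle_riprel_insn() changes the
 * modrm byte from 0x05 to 0x01, yielding "movl %eax,(%rcx)" (89 01),
 * sets flags = UPFIX_RIP_RCX, and records
 * rip_target_address = vaddr + 6 + 0x11223344.  uprobe_pre_ssout(),
 * below, loads that address into %rcx before single-stepping.
 */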
/*
 * Get an instruction slot from the process's SSOL area, containing the
 * instruction at ppt's probepoint.  Point the rip at that slot, in
 * preparation for single-stepping out of line.
 *
 * If we're emulating a rip-relative instruction, save the contents
 * of the scratch register and store the target address in that register.
 */
static void uprobe_pre_ssout(struct uprobe_task *utask,
		struct uprobe_probept *ppt, struct pt_regs *regs)
{
	struct uprobe_ssol_slot *slot;

	slot = uprobe_get_insn_slot(ppt);
	if (!slot) {
		utask->doomed = 1;
		return;
	}
	regs->ip = (long)slot->insn;
	utask->singlestep_addr = regs->ip;

#ifdef CONFIG_X86_64
	if (ppt->arch_info.flags == UPFIX_RIP_RAX) {
		utask->arch_info.saved_scratch_register = regs->ax;
		regs->ax = ppt->arch_info.rip_target_address;
	} else if (ppt->arch_info.flags == UPFIX_RIP_RCX) {
		utask->arch_info.saved_scratch_register = regs->cx;
		regs->cx = ppt->arch_info.rip_target_address;
	}
#endif
}

/*
 * Called by uprobe_post_ssout() to adjust the return address
 * pushed by a call instruction executed out of line.
 */
static void adjust_ret_addr(unsigned long rsp, long correction,
		struct uprobe_task *utask)
{
	unsigned long nleft;

	if (is_32bit_app(current)) {
		s32 ra;
		nleft = copy_from_user(&ra, (const void __user *) rsp, 4);
		if (unlikely(nleft != 0))
			goto fail;
		ra += (s32) correction;
		nleft = copy_to_user((void __user *) rsp, &ra, 4);
		if (unlikely(nleft != 0))
			goto fail;
	} else {
		s64 ra;
		nleft = copy_from_user(&ra, (const void __user *) rsp, 8);
		if (unlikely(nleft != 0))
			goto fail;
		ra += correction;
		nleft = copy_to_user((void __user *) rsp, &ra, 8);
		if (unlikely(nleft != 0))
			goto fail;
	}
	return;

fail:
	printk(KERN_ERR "uprobes: Failed to adjust return address after"
		" single-stepping call instruction;"
		" pid=%d, rsp=%#lx\n", current->pid, rsp);
	utask->doomed = 1;
}
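/*
 * Example of the correction applied by adjust_ret_addr() (addresses
 * made up for illustration): a 5-byte call probed at orig_ip = 0x400100
 * is single-stepped from a copy at copy_ip = 0x7f0000001000, so the CPU
 * pushes 0x7f0000001005 as the return address.  Adding
 * correction = orig_ip - copy_ip to the pushed value gives 0x400105,
 * the address following the original call, which is where the callee
 * must actually return.
 */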
/*
 * Called after single-stepping.  ppt->vaddr is the address of the
 * instruction whose first byte has been replaced by the "int3"
 * instruction.  To avoid the SMP problems that can occur when we
 * temporarily put back the original opcode to single-step, we
 * single-stepped a copy of the instruction.  The address of this
 * copy is utask->singlestep_addr.
 *
 * This function prepares to return from the post-single-step
 * trap.  We have to fix things up as follows:
 *
 * 0) Typically, the new rip is relative to the copied instruction.  We
 * need to make it relative to the original instruction.  Exceptions are
 * return instructions and absolute or indirect jump or call instructions.
 *
 * 1) If the single-stepped instruction was a call, the return address
 * that is atop the stack is the address following the copied instruction.
 * We need to make it the address following the original instruction.
 *
 * 2) If the original instruction was a rip-relative instruction such as
 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
 * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
 * We need to restore the contents of the scratch register and adjust
 * the rip, keeping in mind that the instruction we executed is 4 bytes
 * shorter than the original instruction (since we squeezed out the offset
 * field).
 */
static void uprobe_post_ssout(struct uprobe_task *utask,
		struct uprobe_probept *ppt, struct pt_regs *regs)
{
	unsigned long next_ip = 0;
	unsigned long copy_ip = utask->singlestep_addr;
	unsigned long orig_ip = ppt->vaddr;
	long correction = (long) (orig_ip - copy_ip);
	uprobe_opcode_t *insn = ppt->insn;
#ifdef CONFIG_X86_64
	unsigned long flags = ppt->arch_info.flags;
#endif

	up_read(&ppt->slot->rwsem);

#ifdef CONFIG_X86_64
	if (flags & (UPFIX_RIP_RAX | UPFIX_RIP_RCX)) {
		if (flags & UPFIX_RIP_RAX)
			regs->ax = utask->arch_info.saved_scratch_register;
		else
			regs->cx = utask->arch_info.saved_scratch_register;
		/*
		 * The original instruction includes a displacement, and so
		 * is 4 bytes longer than what we've just single-stepped.
		 * Fall through to handle stuff like "jmpq *...(%rip)" and
		 * "callq *...(%rip)".
		 */
		correction += 4;
	}
#endif
	/*
	 * TODO: Move all this instruction parsing to
	 * arch_validate_probed_insn(), and store what we learn in
	 * ppt->arch_info.flags.
	 *
	 * We don't bother skipping prefixes here because none of the
	 * instructions that require special treatment (other than
	 * rip-relative instructions, handled above) involve prefixes.
	 */
	switch (*insn) {
	case 0xc3:		/* ret/lret */
	case 0xcb:
	case 0xc2:
	case 0xca:
		/* rip is correct */
		next_ip = regs->ip;
		break;
	case 0xe8:		/* call relative - Fix return addr */
		adjust_ret_addr(regs->sp, correction, utask);
		break;
	case 0x9a:		/* call absolute - Fix return addr */
		adjust_ret_addr(regs->sp, correction, utask);
		next_ip = regs->ip;
		break;
	case 0xff:
		if ((insn[1] & 0x30) == 0x10) {
			/* call absolute, indirect */
			/* Fix return addr; rip is correct. */
			next_ip = regs->ip;
			adjust_ret_addr(regs->sp, correction, utask);
		} else if ((insn[1] & 0x31) == 0x20 ||	/* jmp near, absolute indirect */
			   (insn[1] & 0x31) == 0x21) {	/* jmp far, absolute indirect */
			/* rip is correct. */
			next_ip = regs->ip;
		}
		break;
	case 0xea:		/* jmp absolute -- rip is correct */
		next_ip = regs->ip;
		break;
	default:
		break;
	}

	if (next_ip)
		regs->ip = next_ip;
	else
		regs->ip += correction;
}

/*
 * Replace the return address with the trampoline address.  Returns
 * the original return address.
 */
static unsigned long arch_hijack_uret_addr(unsigned long trampoline_address,
		struct pt_regs *regs, struct uprobe_task *utask)
{
	int nleft;
	unsigned long orig_ret_addr = 0; /* clear high bits for 32-bit apps */
	size_t rasize;

	if (is_32bit_app(current))
		rasize = 4;
	else
		rasize = 8;
	nleft = copy_from_user(&orig_ret_addr,
			       (const void __user *) regs->sp, rasize);
	if (unlikely(nleft != 0))
		return 0;

	if (orig_ret_addr == trampoline_address)
		/*
		 * There's another uretprobe on this function, and it was
		 * processed first, so the return address has already
		 * been hijacked.
		 */
		return orig_ret_addr;

	nleft = copy_to_user((void __user *) regs->sp,
			     &trampoline_address, rasize);
	if (unlikely(nleft != 0)) {
		if (nleft != rasize) {
			printk(KERN_ERR "uretprobe_entry_handler: "
				"return address partially clobbered -- "
				"pid=%d, %%sp=%#lx, %%ip=%#lx\n",
				current->pid, regs->sp, regs->ip);
			utask->doomed = 1;
		}
		/* else nothing written, so no harm */
		return 0;
	}
	return orig_ret_addr;
}
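/*
 * Illustration of arch_hijack_uret_addr() (addresses made up): on entry
 * to a uretprobed function, the word atop the user stack is the real
 * return address, say 0x400105.  The function reads that word, writes
 * trampoline_address over it, and returns 0x400105 so the uretprobe
 * machinery can restore it when the trampoline is hit.  If the word
 * already equals trampoline_address, an earlier uretprobe on the same
 * function got there first and the stack is left untouched.
 */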