5 files changed, 272 insertions, 28 deletions
diff --git a/ChangeLog b/ChangeLog
index 434a2e9b..cc9dbd3d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2008-10-03  Jim Keniston  <jkenisto@us.ibm.com>
+
+	PR 6850
+	* runtime/uprobes/uprobes.c: When a probed process forks with
+	uretprobe_instances outstanding, create a uprobe_process and
+	uprobe_task for the child, and clone the uretprobe_instances.
+	This requires us to allow the SSOL vma to be copied on fork.
+	* testsuite/systemtap.base/bz6850.{exp,c,stp}: New test case.
+
 2008-09-30  Mark Wielaard  <mjw@redhat.com>
 
 	* tapsets.cxx (literal_stmt_for_local): Check if alternatives can be
diff --git a/runtime/uprobes/uprobes.c b/runtime/uprobes/uprobes.c
index f7d90add..0f273e93 100644
--- a/runtime/uprobes/uprobes.c
+++ b/runtime/uprobes/uprobes.c
@@ -57,7 +57,8 @@ static void uretprobe_handle_entry(struct uprobe *u, struct pt_regs *regs,
 	struct uprobe_task *utask);
 static void uretprobe_handle_return(struct pt_regs *regs,
 	struct uprobe_task *utask);
-static void uretprobe_set_trampoline(struct uprobe_process *uproc);
+static void uretprobe_set_trampoline(struct uprobe_process *uproc,
+	struct task_struct *tsk);
 static void zap_uretprobe_instances(struct uprobe *u,
 	struct uprobe_process *uproc);
 
@@ -1167,7 +1168,7 @@ static unsigned long find_next_possible_ssol_vma(unsigned long ceiling)
 	struct mm_struct *mm = current->mm;
 	struct rb_node *rb_node;
 	struct vm_area_struct *vma;
-	unsigned long good_flags = VM_EXEC | VM_DONTCOPY | VM_DONTEXPAND;
+	unsigned long good_flags = VM_EXEC | VM_DONTEXPAND;
 	unsigned long bad_flags = VM_WRITE | VM_GROWSDOWN | VM_GROWSUP;
 	unsigned long addr = 0;
 
@@ -1238,20 +1239,29 @@ static noinline unsigned long uprobe_setup_ssol_vma(unsigned long nbytes)
 
 	vma = find_vma(mm, addr);
 	BUG_ON(!vma);
-	/* avoid vma copy on fork() and don't expand when mremap() */
-	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+	/*
+	 * Don't expand vma on mremap().  Allow vma to be copied on
+	 * fork() -- see uprobe_fork_uproc().
+	 */
+	vma->vm_flags |= VM_DONTEXPAND;
 
 	up_write(&mm->mmap_sem);
 	return addr;
 }
 
-/*
- * Initialize per-process area for single stepping out-of-line.
- * Must be run by a thread in the probed process.  Returns with
- * area->insn_area pointing to the initialized area, or set to a
- * negative errno.
+/**
+ * uprobe_init_ssol -- initialize per-process area for single stepping
+ * out-of-line.
+ * @uproc:	probed process
+ * @tsk:	probed task: must be current if @insn_area is %NULL
+ * @insn_area:	address of the already-established SSOL vma (see
+ * uprobe_fork_uproc()), or %NULL to have a new vma set up here
+ *
+ * Returns with @uproc->ssol_area.insn_area pointing to the initialized
+ * area, or set to a negative errno.
  */
-static noinline void uprobe_init_ssol(struct uprobe_process *uproc)
+static void uprobe_init_ssol(struct uprobe_process *uproc,
+	struct task_struct *tsk, __user uprobe_opcode_t *insn_area)
 {
 	struct uprobe_ssol_area *area = &uproc->ssol_area;
 	struct uprobe_ssol_slot *slot;
@@ -1261,9 +1271,16 @@ static noinline void uprobe_init_ssol(struct uprobe_process *uproc)
 	/* Trampoline setup will either fail or succeed here. */
 	uproc->uretprobe_trampoline_addr = ERR_PTR(-ENOMEM);
 
-	area->insn_area = (uprobe_opcode_t *) uprobe_setup_ssol_vma(PAGE_SIZE);
-	if (IS_ERR(area->insn_area))
-		return;
+	if (insn_area) {
+		BUG_ON(IS_ERR(insn_area));
+		area->insn_area = insn_area;
+	} else {
+		BUG_ON(tsk != current);
+		area->insn_area =
+			(uprobe_opcode_t *) uprobe_setup_ssol_vma(PAGE_SIZE);
+		if (IS_ERR(area->insn_area))
+			return;
+	}
 
 	area->nfree = area->nslots = PAGE_SIZE / MAX_UINSN_BYTES;
 	if (area->nslots > MAX_SSOL_SLOTS)
@@ -1288,7 +1305,7 @@ static noinline void uprobe_init_ssol(struct uprobe_process *uproc)
 		slot->insn = (__user uprobe_opcode_t *) slot_addr;
 		slot_addr += MAX_UINSN_BYTES;
 	}
-	uretprobe_set_trampoline(uproc);
+	uretprobe_set_trampoline(uproc, tsk);
 }
 
 /*
@@ -1305,7 +1322,7 @@ static __user uprobe_opcode_t
 		mutex_lock(&uproc->ssol_area.setup_mutex);
 		if (likely(!area->initialized)) {
 			/* Nobody snuck in and set things up ahead of us. */
-			uprobe_init_ssol(uproc);
+			uprobe_init_ssol(uproc, current, NULL);
 			area->initialized = 1;
 		}
 		mutex_unlock(&uproc->ssol_area.setup_mutex);
@@ -2035,6 +2052,106 @@ static u32 uprobe_report_exit(struct utrace_attached_engine *engine,
 }
 
 /*
+ * Duplicate the FIFO of uretprobe_instances from parent_utask into
+ * child_utask.  Zap the uretprobe pointer, since all we care about is
+ * vectoring to the proper return address.  Where there are multiple
+ * uretprobe_instances for the same function instance, copy only the
+ * one that contains the real return address.
+ */
+static int uprobe_fork_uretprobe_instances(struct uprobe_task *parent_utask,
+					struct uprobe_task *child_utask)
+{
+	struct uprobe_process *child_uproc = child_utask->uproc;
+	struct hlist_head *src = &parent_utask->uretprobe_instances;
+	struct hlist_head *dst = &child_utask->uretprobe_instances;
+	unsigned long tramp =
+		(unsigned long) child_uproc->uretprobe_trampoline_addr;
+	struct hlist_node *pos, *last = NULL;	/* last: tail of dst */
+	struct uretprobe_instance *from, *to;
+
+	/* The child reuses the parent's SSOL address, so these must match. */
+	BUG_ON(tramp != (unsigned long)
+			parent_utask->uproc->uretprobe_trampoline_addr);
+	hlist_for_each_entry(from, pos, src, hlist) {
+		/* Skip entries that just point back at the trampoline. */
+		if (from->ret_addr == tramp)
+			continue;
+		to = kmalloc(sizeof(*to), GFP_USER);
+		if (!to)
+			return -ENOMEM;
+		to->ret_addr = from->ret_addr;
+		to->rp = NULL;		/* only the return address matters */
+		INIT_HLIST_NODE(&to->hlist);
+		/* Append by hand: hlist has no add-tail primitive. */
+		if (!last)
+			hlist_add_head(&to->hlist, dst);
+		else
+			hlist_add_after(last, &to->hlist);
+		last = &to->hlist;
+
+		/* Each clone holds a reference on child_uproc. */
+		uprobe_get_process(child_uproc);
+	}
+	BUG_ON(hlist_empty(dst));
+	return 0;
+}
+
+/*
+ * A probed process is forking, and at least one function in the
+ * call stack has a uretprobe on it.  Since the child inherits the
+ * call stack, it's possible that the child could attempt to return
+ * through the uretprobe trampoline.  Create a uprobe_process for
+ * the child, initialize its SSOL vma (which has been cloned from
+ * the parent), and clone the parent's list of uretprobe_instances.
+ *
+ * Called with uproc_table locked and parent_uproc->rwsem write-locked.
+ * Returns 0 on success or a negative errno on failure.
+ *
+ * (On architectures where it's easy to keep track of where in the stack the
+ * return addresses are stored, we could just poke the real return addresses
+ * back into the child's stack.  We use this more general solution.)
+ */
+static int uprobe_fork_uproc(struct uprobe_process *parent_uproc,
+				struct uprobe_task *parent_utask,
+				struct task_struct *child_tsk)
+{
+	int ret = 0;
+	struct uprobe_process *child_uproc;
+	struct uprobe_task *child_utask;
+
+	BUG_ON(!parent_uproc->uretprobe_trampoline_addr ||
+			IS_ERR(parent_uproc->uretprobe_trampoline_addr));
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENOSYS;
+	child_uproc = uprobe_mk_process(child_tsk);
+	if (IS_ERR(child_uproc)) {
+		ret = (int) PTR_ERR(child_uproc);
+		module_put(THIS_MODULE);
+		return ret;
+	}
+	/* child_uproc is write-locked and ref-counted at this point. */
+
+	mutex_lock(&child_uproc->ssol_area.setup_mutex);
+	uprobe_init_ssol(child_uproc, child_tsk,
+				parent_uproc->ssol_area.insn_area);
+	child_uproc->ssol_area.initialized = 1;
+	mutex_unlock(&child_uproc->ssol_area.setup_mutex);
+
+	child_utask = uprobe_find_utask(child_tsk);
+	BUG_ON(!child_utask);
+	/* If the child's trampoline couldn't be set, fail instead of BUG(). */
+	ret = IS_ERR(child_uproc->uretprobe_trampoline_addr) ? -ENOMEM :
+		uprobe_fork_uretprobe_instances(parent_utask, child_utask);
+	hlist_add_head(&child_uproc->hlist,
+		&uproc_table[hash_long(child_uproc->tgid, UPROBE_HASH_BITS)]);
+
+	up_write(&child_uproc->rwsem);
+	uprobe_decref_process(child_uproc);
+	return ret;
+}
+
+/*
  * Clone callback: The current task has spawned a thread/process.
  *
  * NOTE: For now, we don't pass on uprobes from the parent to the
@@ -2057,8 +2174,10 @@ static u32 uprobe_report_clone(struct utrace_attached_engine *engine,
 
 	/*
 	 * Lock uproc so no new uprobes can be installed 'til all
-	 * report_clone activities are completed
+	 * report_clone activities are completed.  Lock uproc_table
+	 * in case we have to run uprobe_fork_uproc().
 	 */
+	lock_uproc_table();
 	down_write(&uproc->rwsem);
 	get_task_struct(child);
 
@@ -2066,13 +2185,9 @@ static u32 uprobe_report_clone(struct utrace_attached_engine *engine,
 		/* New thread in the same process */
 		ctask = uprobe_add_task(child, uproc);
 		BUG_ON(!ctask);
-		if (IS_ERR(ctask)) {
-			put_task_struct(child);
-			up_write(&uproc->rwsem);
-			goto fail;
-		}
-		if (ctask)
-			uproc->nthreads++;
+		if (IS_ERR(ctask))
+			goto done;
+		uproc->nthreads++;
 		/*
 		 * FIXME: Handle the case where uproc is quiescing
 		 * (assuming it's possible to clone while quiescing).
@@ -2108,12 +2223,15 @@ static u32 uprobe_report_clone(struct utrace_attached_engine *engine,
 				}
 			}
 		}
+		
+		if (!hlist_empty(&ptask->uretprobe_instances))
+			(void) uprobe_fork_uproc(uproc, ptask, child);
 	}
 
+done:
 	put_task_struct(child);
 	up_write(&uproc->rwsem);
-
-fail:
+	unlock_uproc_table();
 	return UTRACE_ACTION_RESUME;
 }
 
@@ -2316,13 +2434,14 @@ EXPORT_SYMBOL_GPL(unregister_uretprobe);
  * uproc->ssol_area has been successfully set up.  Establish the
  * uretprobe trampoline in slot 0.
  */
-static void uretprobe_set_trampoline(struct uprobe_process *uproc)
+static void uretprobe_set_trampoline(struct uprobe_process *uproc,
+					struct task_struct *tsk)
 {
 	uprobe_opcode_t bp_insn = BREAKPOINT_INSTRUCTION;
 	struct uprobe_ssol_area *area = &uproc->ssol_area;
 	struct uprobe_ssol_slot *slot = &area->slots[0];
 
-	if (access_process_vm(current, (unsigned long) slot->insn,
+	if (access_process_vm(tsk, (unsigned long) slot->insn,
 			&bp_insn, BP_INSN_SIZE, 1) == BP_INSN_SIZE) {
 		uproc->uretprobe_trampoline_addr = slot->insn;
 		slot->state = SSOL_RESERVED;
@@ -2345,7 +2464,8 @@ static void uretprobe_handle_return(struct pt_regs *regs,
 	struct uprobe_task *utask)
 {
 }
-static void uretprobe_set_trampoline(struct uprobe_process *uproc)
+static void uretprobe_set_trampoline(struct uprobe_process *uproc,
+					struct task_struct *tsk)
 {
 }
 static void zap_uretprobe_instances(struct uprobe *u,
diff --git a/testsuite/systemtap.base/bz6850.c b/testsuite/systemtap.base/bz6850.c
new file mode 100644
index 00000000..a8b78110
--- /dev/null
+++ b/testsuite/systemtap.base/bz6850.c
@@ -0,0 +1,87 @@
+/* Regression test for bugzilla 6850 */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define PASS_MARKER "./bz6850_pass"
+
+/* All this in an attempt to defeat gcc's over-aggressive inlining... */
+typedef pid_t (*forker)(int);
+static forker call_chain[5];	/* C99 6.9.2: needs a complete type */
+
+/*
+ * Both parent and child return from fork2() and fork1().  Both
+ * processes will hit the uretprobe trampolines.  The handlers should
+ * run in the parent.  With the bug fix in place, the child will return
+ * correctly and do the exec (but won't run the handlers).
+ */
+static pid_t fork2(int ignored)
+{
+	return fork();	/* @ignored only matches the forker signature */
+}
+
+static pid_t fork1(int func_index)
+{
+	func_index += 1;	/* step to the next link of call_chain[] */
+	return (*call_chain[func_index])(func_index);	/* fork2() */
+}
+
+static pid_t fork_and_exec2(int func_index)
+{
+	char *touch_argv[] = { "/bin/touch", PASS_MARKER, NULL };
+	char *empty_env[] = { NULL };
+	pid_t pid;
+
+	func_index += 1;
+	pid = (*call_chain[func_index])(func_index);	/* fork1() */
+	if (pid != 0)
+		return pid;	/* parent, or fork() failed */
+	/* Child: exec touch(1) to create the marker file. */
+	execve(touch_argv[0], touch_argv, empty_env);
+	perror("execve");	/* only reached when the exec fails */
+	fprintf(stderr, "FAIL: child couldn't exec.\n");
+	exit(2);
+}
+
+static pid_t fork_and_exec1(int func_index)
+{
+	func_index += 1;	/* step to the next link of call_chain[] */
+	return (*call_chain[func_index])(func_index);	/* fork_and_exec2() */
+}
+
+static forker call_chain[] = {
+	&fork_and_exec1,	/* [0] entered from main() */
+	&fork_and_exec2,	/* [1] execs in the child */
+	&fork1,			/* [2] */
+	&fork2,			/* [3] calls fork() */
+	NULL			/* [4] unused sentinel */
+};
+
+int main(void)
+{
+	pid_t child, wait_child;
+	int status = 0;
+
+	(void) unlink(PASS_MARKER);
+	child = call_chain[0](0);	/* fork_and_exec1() */
+	if (child < 0) {
+		fprintf(stderr, "FAIL: fork_and_exec1() failed.\n");
+		exit(1);
+	}
+	wait_child = wait(&status);
+	if (wait_child != child) {
+		fprintf(stderr, "FAIL: waited for %d but got %d\n",
+						child, wait_child);
+		exit(1);
+	}
+	/* WEXITSTATUS is valid only if WIFEXITED; a signal death must fail. */
+	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+		fprintf(stderr, "FAIL: child died with status = %#x\n", status);
+		exit(1);
+	}
+	exit(0);
+}
diff --git a/testsuite/systemtap.base/bz6850.exp b/testsuite/systemtap.base/bz6850.exp
new file mode 100644
index 00000000..cd56ddce
--- /dev/null
+++ b/testsuite/systemtap.base/bz6850.exp
@@ -0,0 +1,21 @@
+set test bz6850
+# Build the probed program; any compiler output counts as a failure.
+catch {exec gcc -g -o bz6850 $srcdir/$subdir/bz6850.c} err
+if {$err == "" && [file exists bz6850]} then { pass "$test compile" } else { fail "$test compile" }
+# Check that the script gets through stap_run_batch against that binary.
+set rc [stap_run_batch $srcdir/$subdir/bz6850.stp]
+if {$rc == 0} then { pass "$test -p4" } else { fail "$test -p4" }
+# No -p5 run unless installtest_p; remove the binary we built before leaving.
+if {! [installtest_p]} { untested "$test -p5"; exec rm -f bz6850; return }
+# Run for real.  On timeout stap is still alive, so close before waiting.
+spawn sudo stap $srcdir/$subdir/bz6850.stp -c ./bz6850
+expect {
+	-timeout 60
+	-re {[^\r\n]*called\r\n} { exp_continue }
+	-re {[^\r\n]*returns\r\n} { exp_continue }
+	timeout { fail "$test (timeout)" }
+	eof { }
+}
+catch {close}; catch {wait}
+if {[file exists bz6850_pass]} then { pass "$test -p5" } else { fail "$test -p5" }
+exec rm -f bz6850_pass bz6850
diff --git a/testsuite/systemtap.base/bz6850.stp b/testsuite/systemtap.base/bz6850.stp
new file mode 100644
index 00000000..d6f41862
--- /dev/null
+++ b/testsuite/systemtap.base/bz6850.stp
@@ -0,0 +1,7 @@
+#! stap -p4
+probe process("./bz6850").function("*").call {	// entry of every function
+	printf("%s called\n", probefunc())	// bz6850.exp matches "called"
+}
+probe process("./bz6850").function("*").return {	// return of every function
+	printf("%s returns\n", probefunc())	// bz6850.exp matches "returns"
+}