18 files changed, 679 insertions, 182 deletions
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b7f8f4a87cc..10a642df014 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -162,7 +162,6 @@ static inline unsigned long test_perf_counter_pending(void)
 	return 0;
 }
 
-static inline void set_perf_counter_pending(void) {}
 static inline void clear_perf_counter_pending(void) {}
 #endif /* CONFIG_PERF_COUNTERS */
 
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index cc7c887705b..b398a84edce 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -10,6 +10,8 @@
  */
 #include <linux/types.h>
 
+#include <asm/hw_irq.h>
+
 #define MAX_HWCOUNTERS		8
 #define MAX_EVENT_ALTERNATIVES	8
 #define MAX_LIMITED_HWCOUNTERS	2
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bb202388170..e6dc1850191 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -913,6 +913,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	case PERF_TYPE_RAW:
 		ev = counter->attr.config;
 		break;
+	default:
+		return ERR_PTR(-EINVAL);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
@@ -1013,7 +1015,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	u64 period = counter->hw.sample_period;
 	s64 prev, delta, left;
 	int record = 0;
-	u64 addr, mmcra, sdsync;
+	u64 mmcra, sdsync;
 
 	/* we don't have to worry about interrupts here */
 	prev = atomic64_read(&counter->hw.prev_count);
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 876ed97147b..5fb33e160ea 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,11 +84,6 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-extern void set_perf_counter_pending(void);
-
-#define clear_perf_counter_pending()	do { } while (0)
-#define test_perf_counter_pending()	(0)
-
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(void);
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 31bd120cf2a..01fd9461d32 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -49,13 +49,17 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
 #endif
 
 #if defined(CONFIG_HIGHPTE)
+#define __KM_PTE			\
+	(in_nmi() ? KM_NMI_PTE : 	\
+	 in_irq() ? KM_IRQ_PTE :	\
+	 KM_PTE0)
 #define pte_offset_map(dir, address)					\
-	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) +		\
+	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) +		\
 	 pte_index((address)))
 #define pte_offset_map_nested(dir, address)				\
 	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) +		\
 	 pte_index((address)))
-#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
+#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
 #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
 #else
 #define pte_offset_map(dir, address)					\
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 275bc142cd5..e8c68a5091d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -19,6 +19,7 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 #include <linux/uaccess.h>
+#include <linux/highmem.h>
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
@@ -389,23 +390,23 @@ static u64 intel_pmu_raw_event(u64 event)
 	return event & CORE_EVNTSEL_MASK;
 }
 
-static const u64 amd_0f_hw_cache_event_ids
+static const u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
  [ C(L1D) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0042, /* Data Cache Refills from L2 */
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts  */
+		[ C(RESULT_MISS)   ] = 0x0167, /* Data Prefetcher :cancelled */
 	},
  },
  [ C(L1I ) ] = {
@@ -418,17 +419,17 @@ static const u64 amd_0f_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+		[ C(RESULT_MISS)   ] = 0x037E, /* L2 Cache Misses : IC+DC     */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback           */
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {
@@ -438,8 +439,8 @@ static const u64 amd_0f_hw_cache_event_ids
  },
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0,
-		[ C(RESULT_MISS)   ] = 0,
+		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
+		[ C(RESULT_MISS)   ] = 0x0046, /* L1 DTLB and L2 DLTB Miss   */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0,
@@ -1459,18 +1460,16 @@ static int intel_pmu_init(void)
 
 static int amd_pmu_init(void)
 {
+	/* Performance-monitoring supported from K7 and later: */
+	if (boot_cpu_data.x86 < 6)
+		return -ENODEV;
+
 	x86_pmu = amd_pmu;
 
-	switch (boot_cpu_data.x86) {
-	case 0x0f:
-	case 0x10:
-	case 0x11:
-		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
-		       sizeof(hw_cache_event_ids));
+	/* Events are common for all AMDs */
+	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
 
-		pr_cont("AMD Family 0f/10/11 events, ");
-		break;
-	}
 	return 0;
 }
 
@@ -1577,8 +1576,8 @@ static void backtrace_warning(void *data, char *msg)
 
 static int backtrace_stack(void *data, char *name)
 {
-	/* Don't bother with IRQ stacks for now */
-	return -1;
+	/* Process all stacks: */
+	return 0;
 }
 
 static void backtrace_address(void *data, unsigned long addr, int reliable)
@@ -1596,6 +1595,8 @@ static const struct stacktrace_ops backtrace_ops = {
 	.address		= backtrace_address,
 };
 
+#include "../dumpstack.h"
+
 static void
 perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
@@ -1603,40 +1604,62 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	char *stack;
 	int nr = entry->nr;
 
-	callchain_store(entry, instruction_pointer(regs));
+	callchain_store(entry, regs->ip);
 
 	stack = ((char *)regs + sizeof(struct pt_regs));
 #ifdef CONFIG_FRAME_POINTER
-	bp = frame_pointer(regs);
+	get_bp(bp);
 #else
 	bp = 0;
 #endif
 
-	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+	dump_trace(NULL, regs, (void *)&stack, bp, &backtrace_ops, entry);
 
 	entry->kernel = entry->nr - nr;
 }
 
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context;
+ * returns the bytes copied, which is short of n once a page lookup fails
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	int type = in_nmi() ? KM_NMI : KM_IRQ0;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
+	int ret;
 
-struct stack_frame {
-	const void __user	*next_fp;
-	unsigned long		return_address;
-};
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
+
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		map = kmap_atomic(page, type);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map, type);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+	} while (len < n);
+
+	return len;
+}
 
 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 {
-	int ret;
+	unsigned long bytes;
 
-	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
-		return 0;
-
-	ret = 1;
-	pagefault_disable();
-	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
-		ret = 0;
-	pagefault_enable();
+	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
 
-	return ret;
+	return bytes == sizeof(*frame);
 }
 
 static void
@@ -1646,23 +1669,25 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	const void __user *fp;
 	int nr = entry->nr;
 
-	regs = (struct pt_regs *)current->thread.sp0 - 1;
-	fp   = (void __user *)regs->bp;
+	if (!user_mode(regs))
+		regs = task_pt_regs(current);
+
+	fp = (void __user *)regs->bp;
 
 	callchain_store(entry, regs->ip);
 
 	while (entry->nr < MAX_STACK_DEPTH) {
-		frame.next_fp	     = NULL;
+		frame.next_frame	     = NULL;
 		frame.return_address = 0;
 
 		if (!copy_stack_frame(fp, &frame))
 			break;
 
-		if ((unsigned long)fp < user_stack_pointer(regs))
+		if ((unsigned long)fp < regs->sp)
 			break;
 
 		callchain_store(entry, frame.return_address);
-		fp = frame.next_fp;
+		fp = frame.next_frame;
 	}
 
 	entry->user = entry->nr - nr;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 6340cef6798..697d5727c11 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
 	return 1;
 }
 
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP; it may therefore return fewer than nr_pages.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	unsigned long flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					(void __user *)start, len)))
+		return 0;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the page and take a ref on it.
+	 */
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			break;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
 /**
  * get_user_pages_fast() - pin user pages in memory
  * @start:	starting user address
diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h
index 54e8b3d956b..eddbce0f9fb 100644
--- a/include/asm-generic/kmap_types.h
+++ b/include/asm-generic/kmap_types.h
@@ -24,7 +24,10 @@ D(12)	KM_SOFTIRQ1,
 D(13)	KM_SYNC_ICACHE,
 D(14)	KM_SYNC_DCACHE,
 D(15)	KM_UML_USERCOPY, /* UML specific, for copy_*_user - used in do_op_one_page */
-D(16)	KM_TYPE_NR
+D(16)	KM_IRQ_PTE,
+D(17)	KM_NMI,
+D(18)	KM_NMI_PTE,
+D(19)	KM_TYPE_NR
 };
 
 #undef D
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d88d6fc530a..cf260d848eb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -854,6 +854,12 @@ extern int mprotect_fixup(struct vm_area_struct *vma,
 			  unsigned long end, unsigned long newflags);
 
 /*
+ * doesn't attempt to fault and will return short.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages);
+
+/*
  * A callback you can register to apply pressure to ageable caches.
  *
  * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'.  It should
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1b3118a1023..eccae437fe3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -604,6 +604,7 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_free_task(struct task_struct *task);
+extern void set_perf_counter_pending(void);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void __perf_disable(void);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b685f551a..109a9572385 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1283,7 +1283,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 		if (!interrupts) {
 			perf_disable();
 			counter->pmu->disable(counter);
-			atomic_set(&hwc->period_left, 0);
+			atomic64_set(&hwc->period_left, 0);
 			counter->pmu->enable(counter);
 			perf_enable();
 		}
@@ -1553,7 +1553,7 @@ static int perf_release(struct inode *inode, struct file *file)
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[3];
+	u64 values[4];
 	int n;
 
 	/*
@@ -1620,22 +1620,6 @@ static void perf_counter_reset(struct perf_counter *counter)
 	perf_counter_update_userpage(counter);
 }
 
-static void perf_counter_for_each_sibling(struct perf_counter *counter,
-					  void (*func)(struct perf_counter *))
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_counter *sibling;
-
-	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
-	counter = counter->group_leader;
-
-	func(counter);
-	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
-		func(sibling);
-	mutex_unlock(&ctx->mutex);
-}
-
 /*
  * Holding the top-level counter's child_mutex means that any
  * descendant process that has inherited this counter will block
@@ -1658,14 +1642,18 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 static void perf_counter_for_each(struct perf_counter *counter,
 				  void (*func)(struct perf_counter *))
 {
-	struct perf_counter *child;
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
 
-	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	mutex_lock(&counter->child_mutex);
-	perf_counter_for_each_sibling(counter, func);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_for_each_sibling(child, func);
-	mutex_unlock(&counter->child_mutex);
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	counter = counter->group_leader;
+	/* Apply func to the group leader, then to each sibling of the group: */
+	perf_counter_for_each_child(counter, func);
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		perf_counter_for_each_child(sibling, func);
+	mutex_unlock(&ctx->mutex);
 }
 
 static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 0cbd5d6874e..e8346f95fbb 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -160,7 +160,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
-LDFLAGS = -lpthread -lrt -lelf
+LDFLAGS = -lpthread -lrt -lelf -lm
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
 STRIP ?= strip
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index b1ed5f766cb..94cea678fd7 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -25,6 +25,10 @@
 #define SHOW_USER	2
 #define SHOW_HV		4
 
+#define MIN_GREEN		0.5
+#define MIN_RED		5.0
+
+
 static char		const *input_name = "perf.data";
 static char		*vmlinux = "vmlinux";
 
@@ -39,6 +43,8 @@ static int		dump_trace = 0;
 
 static int		verbose;
 
+static int		print_line;
+
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
@@ -84,6 +90,13 @@ typedef union event_union {
 	struct period_event		period;
 } event_t;
 
+
+struct sym_ext {
+	struct rb_node	node;
+	double		percent;
+	char		*path;
+};
+
 static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
 static struct dso *vdso;
@@ -1030,10 +1043,30 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static char *get_color(double percent)
+{
+	char *color = PERF_COLOR_NORMAL;
+
+	/*
+	 * We color high-overhead entries in red, mid-overhead
+	 * entries in green - and keep the low overhead places
+	 * normal:
+	 */
+	if (percent >= MIN_RED)
+		color = PERF_COLOR_RED;
+	else {
+		if (percent > MIN_GREEN)
+			color = PERF_COLOR_GREEN;
+	}
+	return color;
+}
+
 static int
 parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 {
 	char *line = NULL, *tmp, *tmp2;
+	static const char *prev_line;
+	static const char *prev_color;
 	unsigned int offset;
 	size_t line_len;
 	__u64 line_ip;
@@ -1073,27 +1106,36 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 	}
 
 	if (line_ip != -1) {
+		const char *path = NULL;
 		unsigned int hits = 0;
 		double percent = 0.0;
-		char *color = PERF_COLOR_NORMAL;
+		char *color;
+		struct sym_ext *sym_ext = sym->priv;
 
 		offset = line_ip - start;
 		if (offset < len)
 			hits = sym->hist[offset];
 
-		if (sym->hist_sum)
+		if (offset < len && sym_ext) {
+			path = sym_ext[offset].path;
+			percent = sym_ext[offset].percent;
+		} else if (sym->hist_sum)
 			percent = 100.0 * hits / sym->hist_sum;
 
+		color = get_color(percent);
+
 		/*
-		 * We color high-overhead entries in red, mid-overhead
-		 * entries in green - and keep the low overhead places
-		 * normal:
+		 * Also color the filename and line if needed, with
+		 * the same color than the percentage. Don't print it
+		 * twice for close colored ip with the same filename:line
 		 */
-		if (percent >= 5.0)
-			color = PERF_COLOR_RED;
-		else {
-			if (percent > 0.5)
-				color = PERF_COLOR_GREEN;
+		if (path) {
+			if (!prev_line || strcmp(prev_line, path)
+				       || color != prev_color) {
+				color_fprintf(stdout, color, " %s", path);
+				prev_line = path;
+				prev_color = color;
+			}
 		}
 
 		color_fprintf(stdout, color, " %7.2f", percent);
@@ -1109,6 +1151,121 @@ parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 	return 0;
 }
 
+static struct rb_root root_sym_ext;
+
+static void insert_source_line(struct sym_ext *sym_ext)
+{
+	struct sym_ext *iter;
+	struct rb_node **p = &root_sym_ext.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_ext, node);
+		/* bigger percent goes left, so in-order walks run hottest first */
+		if (sym_ext->percent > iter->percent)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&sym_ext->node, parent, p);
+	rb_insert_color(&sym_ext->node, &root_sym_ext);
+}
+
+static void free_source_line(struct symbol *sym, int len)
+{
+	struct sym_ext *sym_ext = sym->priv;
+	int i;
+
+	if (!sym_ext)
+		return;
+
+	for (i = 0; i < len; i++)
+		free(sym_ext[i].path);
+	free(sym_ext);
+	/* the rb nodes were embedded in sym_ext[], so forget the tree as well */
+	sym->priv = NULL;
+	root_sym_ext = RB_ROOT;
+}
+
+/*
+ * Get the filename:line for the colored entries: run addr2line on every
+ * address of sym above MIN_GREEN percent and keep its first output line
+ * in the sym_ext array hung off sym->priv (released by free_source_line()).
+ */
+static void
+get_source_line(struct symbol *sym, __u64 start, int len, char *filename)
+{
+	int i;
+	char cmd[PATH_MAX * 2];
+	struct sym_ext *sym_ext;
+
+	if (!sym->hist_sum)
+		return;
+
+	sym->priv = sym_ext = calloc(len, sizeof(struct sym_ext));
+	if (!sym_ext)
+		return;
+
+	for (i = 0; i < len; i++) {
+		char *path = NULL;
+		size_t line_len = 0;
+		FILE *fp;
+
+		sym_ext[i].percent = 100.0 * sym->hist[i] / sym->hist_sum;
+		if (sym_ext[i].percent <= MIN_GREEN)
+			continue;
+
+		snprintf(cmd, sizeof(cmd), "addr2line -e %s %016llx",
+			 filename, (unsigned long long)(start + i));
+		fp = popen(cmd, "r");
+		if (!fp)
+			continue;
+
+		if (getline(&path, &line_len, fp) < 0)
+			goto next;
+
+		/* getline() malloc'ed path: hand it over instead of copying */
+		sym_ext[i].path = path;
+		path = NULL;
+		insert_source_line(&sym_ext[i]);
+
+	next:
+		free(path);	/* no-op once ownership has moved to sym_ext */
+		pclose(fp);
+	}
+}
+
+static void print_summary(char *filename)
+{
+	struct sym_ext *sym_ext;
+	struct rb_node *node;
+
+	printf("\nSorted summary for file %s\n", filename);
+	printf("----------------------------------------------\n\n");
+
+	if (RB_EMPTY_ROOT(&root_sym_ext)) {
+		printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
+		return;
+	}
+	/* in-order walk of the tree filled by insert_source_line() */
+	node = rb_first(&root_sym_ext);
+	while (node) {
+		double percent;
+		char *color;
+		char *path;
+
+		sym_ext = rb_entry(node, struct sym_ext, node);
+		percent = sym_ext->percent;
+		color = get_color(percent);
+		path = sym_ext->path;
+		/* no "\n" here: path presumably keeps getline()'s newline */
+		color_fprintf(stdout, color, " %7.2f %s", percent, path);
+		node = rb_next(node);
+	}
+}
+
 static void annotate_sym(struct dso *dso, struct symbol *sym)
 {
 	char *filename = dso->name;
@@ -1121,13 +1278,6 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	if (dso == kernel_dso)
 		filename = vmlinux;
 
-	printf("\n------------------------------------------------\n");
-	printf(" Percent |	Source code & Disassembly of %s\n", filename);
-	printf("------------------------------------------------\n");
-
-	if (verbose >= 2)
-		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
-
 	start = sym->obj_start;
 	if (!start)
 		start = sym->start;
@@ -1135,6 +1285,18 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	end = start + sym->end - sym->start + 1;
 	len = sym->end - sym->start;
 
+	if (print_line) {
+		get_source_line(sym, start, len, filename);
+		print_summary(filename);
+	}
+
+	printf("\n\n------------------------------------------------\n");
+	printf(" Percent |	Source code & Disassembly of %s\n", filename);
+	printf("------------------------------------------------\n");
+
+	if (verbose >= 2)
+		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
+
 	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
 
 	if (verbose >= 3)
@@ -1150,6 +1312,8 @@ static void annotate_sym(struct dso *dso, struct symbol *sym)
 	}
 
 	pclose(file);
+	if (print_line)
+		free_source_line(sym, len);
 }
 
 static void find_annotations(void)
@@ -1308,6 +1472,8 @@ static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_BOOLEAN('l', "print-line", &print_line,
+		    "print matching source lines (may be slow)"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0f5771f615d..e1dfef24887 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,7 @@ static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
 static int			append_file			= 0;
+static int			call_graph			= 0;
 static int			verbose				= 0;
 
 static long			samples;
@@ -201,8 +202,12 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	if (read(fd, bf, sizeof(bf)) < 0) {
 		fprintf(stderr, "couldn't read %s\n", filename);
@@ -272,8 +277,12 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 
 	fp = fopen(filename, "r");
 	if (fp == NULL) {
-		fprintf(stderr, "couldn't open %s\n", filename);
-		exit(EXIT_FAILURE);
+		/*
+		 * We raced with a task exiting - just return:
+		 */
+		if (verbose)
+			fprintf(stderr, "couldn't open %s\n", filename);
+		return;
 	}
 	while (1) {
 		char bf[BUFSIZ], *pbf = bf;
@@ -351,11 +360,16 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	int track = 1;
 
 	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+
 	if (freq) {
 		attr->sample_type	|= PERF_SAMPLE_PERIOD;
 		attr->freq		= 1;
 		attr->sample_freq	= freq;
 	}
+
+	if (call_graph)
+		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
+
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;
@@ -555,6 +569,8 @@ static const struct option options[] = {
 		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
 		    "number of mmap data pages"),
+	OPT_BOOLEAN('g', "call-graph", &call_graph,
+		    "do call-graph (stack chain/backtrace) recording"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 82fa93b4db9..f86bb07c0e8 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -36,18 +36,28 @@ static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
 #define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+#define cdprintf(x...)	do { if (dump_trace) color_fprintf(stdout, color, x); } while (0)
 
 static int		verbose;
 static int		full_paths;
+static int		collapse_syscalls;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
+struct ip_chain_event {
+	__u16 nr;
+	__u16 hv;
+	__u16 kernel;
+	__u16 user;
+	__u64 ips[];
+};
+
 struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
 	__u32 pid, tid;
-	__u64 period;
+	unsigned char __more_data[];
 };
 
 struct mmap_event {
@@ -944,9 +954,13 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	__u64 ip = event->ip.ip;
 	__u64 period = 1;
 	struct map *map = NULL;
+	void *more_data = event->ip.__more_data;
+	struct ip_chain_event *chain;
 
-	if (event->header.type & PERF_SAMPLE_PERIOD)
-		period = event->ip.period;
+	if (event->header.type & PERF_SAMPLE_PERIOD) {
+		period = *(__u64 *)more_data;
+		more_data += sizeof(__u64);
+	}
 
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
 		(void *)(offset + head),
@@ -956,6 +970,31 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 		(void *)(long)ip,
 		(long long)period);
 
+	if (event->header.type & PERF_SAMPLE_CALLCHAIN) {
+		int i;
+
+		chain = (void *)more_data;
+
+		if (dump_trace) {
+			dprintf("... chain: u:%d, k:%d, nr:%d\n",
+				chain->user,
+				chain->kernel,
+				chain->nr);
+
+			for (i = 0; i < chain->nr; i++)
+				dprintf("..... %2d: %016Lx\n", i, chain->ips[i]);
+		}
+		if (collapse_syscalls) {
+			/*
+			 * Find the all-but-last kernel entry
+			 * amongst the call-chains - to get
+			 * to the level of system calls:
+			 */
+			if (chain->kernel >= 2)
+				ip = chain->ips[chain->kernel-2];
+		}
+	}
+
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
 	if (thread == NULL) {
@@ -1095,9 +1134,47 @@ process_period_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+/* Dump a raw event as rows of 16 hex bytes plus ASCII, if dump_trace is set */
+static void trace_event(event_t *event)
+{
+	unsigned char *raw_event = (void *)event;
+	char *color = PERF_COLOR_BLUE;	/* picked up by cdprintf() */
+	int i, j;
+
+	if (!dump_trace)
+		return;
+
+	dprintf(".");
+	cdprintf("\n. ... raw event: size %d bytes\n", event->header.size);
+
+	for (i = 0; i < event->header.size; i++) {
+		if ((i & 15) == 0) {
+			dprintf(".");
+			cdprintf("  %04x: ", i);
+		}
+		cdprintf(" %02x", raw_event[i]);
+		if (((i & 15) == 15) || i == event->header.size-1) {
+			cdprintf("  ");
+			for (j = 0; j < 15-(i & 15); j++)
+				cdprintf("   ");
+			/* ASCII column: this row only, i & ~15 .. i inclusive */
+			for (j = i & ~15; j <= i; j++) {
+				if (isprint(raw_event[j]))
+					cdprintf("%c", raw_event[j]);
+				else
+					cdprintf(".");
+			}
+			cdprintf("\n");
+		}
+	}
+	dprintf(".\n");
+}
+
 static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
+	trace_event(event);
+
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
 		return process_overflow_event(event, offset, head);
 
@@ -1204,7 +1281,7 @@ more:
 
 	size = event->header.size;
 
-	dprintf("%p [%p]: event: %d\n",
+	dprintf("\n%p [%p]: event: %d\n",
 			(void *)(offset + head),
 			(void *)(long)event->header.size,
 			event->header.type);
@@ -1276,6 +1353,8 @@ static const struct option options[] = {
 		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
 	OPT_BOOLEAN('P', "full-paths", &full_paths,
 		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_BOOLEAN('S', "syscalls", &collapse_syscalls,
+		    "show per syscall summary overhead, using call graph"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c43e4a97dc4..e5b3c0ff03a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -43,6 +43,7 @@
 #include "util/parse-events.h"
 
 #include <sys/prctl.h>
+#include <math.h>
 
 static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
@@ -79,12 +80,34 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
-static __u64			event_res[MAX_COUNTERS][3];
-static __u64			event_scaled[MAX_COUNTERS];
+#define MAX_RUN 100
 
-static __u64			runtime_nsecs;
-static __u64			walltime_nsecs;
-static __u64			runtime_cycles;
+static int			run_count		=  1;
+static int			run_idx			=  0;
+
+static __u64			event_res[MAX_RUN][MAX_COUNTERS][3];
+static __u64			event_scaled[MAX_RUN][MAX_COUNTERS];
+
+//static __u64			event_hist[MAX_RUN][MAX_COUNTERS][3];
+
+
+static __u64			runtime_nsecs[MAX_RUN];
+static __u64			walltime_nsecs[MAX_RUN];
+static __u64			runtime_cycles[MAX_RUN];
+
+static __u64			event_res_avg[MAX_COUNTERS][3];
+static __u64			event_res_noise[MAX_COUNTERS][3];
+
+static __u64			event_scaled_avg[MAX_COUNTERS];
+
+static __u64			runtime_nsecs_avg;
+static __u64			runtime_nsecs_noise;
+
+static __u64			walltime_nsecs_avg;
+static __u64			walltime_nsecs_noise;
+
+static __u64			runtime_cycles_avg;
+static __u64			runtime_cycles_noise;
 
 static void create_perf_stat_counter(int counter)
 {
@@ -140,7 +163,7 @@ static void read_counter(int counter)
 	int cpu, nv;
 	int scaled;
 
-	count = event_res[counter];
+	count = event_res[run_idx][counter];
 
 	count[0] = count[1] = count[2] = 0;
 
@@ -151,6 +174,8 @@ static void read_counter(int counter)
 
 		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
 		assert(res == nv * sizeof(__u64));
+		close(fd[cpu][counter]);
+		fd[cpu][counter] = -1;
 
 		count[0] += single_count[0];
 		if (scale) {
@@ -162,13 +187,13 @@ static void read_counter(int counter)
 	scaled = 0;
 	if (scale) {
 		if (count[2] == 0) {
-			event_scaled[counter] = -1;
+			event_scaled[run_idx][counter] = -1;
 			count[0] = 0;
 			return;
 		}
 
 		if (count[2] < count[1]) {
-			event_scaled[counter] = 1;
+			event_scaled[run_idx][counter] = 1;
 			count[0] = (unsigned long long)
 				((double)count[0] * count[1] / count[2] + 0.5);
 		}
@@ -178,10 +203,94 @@ static void read_counter(int counter)
 	 */
 	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
 		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
-		runtime_nsecs = count[0];
+		runtime_nsecs[run_idx] = count[0];
 	if (attrs[counter].type == PERF_TYPE_HARDWARE &&
 		attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
-		runtime_cycles = count[0];
+		runtime_cycles[run_idx] = count[0];
+}
+
+/*
+ * One measured run: create and enable the counters, fork+exec argv, wait for
+ * it, then store wall-clock time and counter values in slot run_idx.
+ */
+static int run_perf_stat(int argc, const char **argv)
+{
+	unsigned long long t0, t1;
+	int counter, pid, status = 0;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perf_stat_counter(counter);
+
+	/* Enable counters and exec the command: */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	pid = fork();
+	if (pid < 0) {
+		/* nothing was run: do not go on and report stats for it */
+		perror("failed to fork");
+		exit(-1);
+	}
+	if (!pid) {
+		execvp(argv[0], (char **)argv);
+		perror(argv[0]);	/* execvp() returns only on failure */
+		exit(-1);
+	}
+	wait(&status);
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+	walltime_nsecs[run_idx] = t1 - t0;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		read_counter(counter);
+
+	return WEXITSTATUS(status);
+}
+
+static void print_noise(__u64 *count, __u64 *noise)
+{
+	double pct = (double)noise[0]/(count[0]+1)*100.0; /* +1 avoids /0 */
+	if (run_count > 1)	/* a single run has no spread to report */
+		fprintf(stderr, "   ( +- %7.3f%% )", pct);
+}
+
+/* Print a nanosecond counter in msecs; task-clock also gets a CPUs figure. */
+static void nsec_printout(int counter, __u64 *count, __u64 *noise)
+{
+	int is_task_clock = attrs[counter].type == PERF_TYPE_SOFTWARE &&
+			    attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK;
+
+	fprintf(stderr, " %14.6f  %-20s",
+		(double)count[0] / 1000000, event_name(counter));
+
+	/* the ratio is only printed for a non-zero averaged wall-clock time */
+	if (is_task_clock && walltime_nsecs_avg)
+		fprintf(stderr, " # %10.3f CPUs ",
+			(double)count[0] / (double)walltime_nsecs_avg);
+	print_noise(count, noise);
+}
+
+/* Print a plain event count, plus IPC (instructions) or an M/sec rate. */
+static void abs_printout(int counter, __u64 *count, __u64 *noise)
+{
+	/* "%Ld" is a glibc-ism, undefined in ISO C: use "%llu" and a cast */
+	fprintf(stderr, " %14llu  %-20s",
+		(unsigned long long)count[0], event_name(counter));
+
+	if (runtime_cycles_avg &&
+	    attrs[counter].type == PERF_TYPE_HARDWARE &&
+	    attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
+		fprintf(stderr, " # %10.3f IPC  ",
+			(double)count[0] / (double)runtime_cycles_avg);
+	} else if (runtime_nsecs_avg) {
+		fprintf(stderr, " # %10.3f M/sec",
+			(double)count[0]/runtime_nsecs_avg*1000.0);
+	}
+	print_noise(count, noise);
 }
 
 /*
@@ -189,11 +298,12 @@ static void read_counter(int counter)
  */
 static void print_counter(int counter)
 {
-	__u64 *count;
+	__u64 *count, *noise;
 	int scaled;
 
-	count = event_res[counter];
-	scaled = event_scaled[counter];
+	count = event_res_avg[counter];
+	noise = event_res_noise[counter];
+	scaled = event_scaled_avg[counter];
 
 	if (scaled == -1) {
 		fprintf(stderr, " %14s  %-20s\n",
@@ -201,75 +311,107 @@ static void print_counter(int counter)
 		return;
 	}
 
-	if (nsec_counter(counter)) {
-		double msecs = (double)count[0] / 1000000;
-
-		fprintf(stderr, " %14.6f  %-20s",
-			msecs, event_name(counter));
-		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-			attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
+	if (nsec_counter(counter))
+		nsec_printout(counter, count, noise);
+	else
+		abs_printout(counter, count, noise);
 
-			if (walltime_nsecs)
-				fprintf(stderr, " # %11.3f CPU utilization factor",
-					(double)count[0] / (double)walltime_nsecs);
-		}
-	} else {
-		fprintf(stderr, " %14Ld  %-20s",
-			count[0], event_name(counter));
-		if (runtime_nsecs)
-			fprintf(stderr, " # %11.3f M/sec",
-				(double)count[0]/runtime_nsecs*1000.0);
-		if (runtime_cycles &&
-			attrs[counter].type == PERF_TYPE_HARDWARE &&
-				attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
-
-			fprintf(stderr, " # %1.3f per cycle",
-				(double)count[0] / (double)runtime_cycles);
-		}
-	}
 	if (scaled)
 		fprintf(stderr, "  (scaled from %.2f%%)",
 			(double) count[2] / count[1] * 100);
+
 	fprintf(stderr, "\n");
 }
 
-static int do_perf_stat(int argc, const char **argv)
+/*
+ * Normalize noise values down to stddev: *val / (run_count * sqrt(run_count))
+ */
+static void normalize_noise(__u64 *val)
 {
-	unsigned long long t0, t1;
-	int counter;
-	int status;
-	int pid;
-	int i;
+	double res;
 
-	if (!system_wide)
-		nr_cpus = 1;
+	res = (double)*val / (run_count * sqrt((double)run_count));
 
-	for (counter = 0; counter < nr_counters; counter++)
-		create_perf_stat_counter(counter);
+	*val = (__u64)res;	/* stored back in place; fraction is dropped */
+}
 
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+static void update_avg(const char *name, int idx, __u64 *avg, __u64 *val)
+{
+	*avg += *val;	/* running sum only; no division happens here */
 
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
+	if (verbose > 1)	/* name and idx only label this debug line */
+		fprintf(stderr, "debug: %20s[%d]: %Ld\n", name, idx, *val);
+}
+/*
+ * Calculate the averages over all runs, then the noise: sum of |run - avg|
+ */
+static void calc_avg(void)
+{
+	int i, j;
+
+	if (verbose > 1)
+		fprintf(stderr, "\n");
+
+	for (i = 0; i < run_count; i++) {
+		update_avg("runtime", 0, &runtime_nsecs_avg, runtime_nsecs + i);
+		update_avg("walltime", 0, &walltime_nsecs_avg, walltime_nsecs + i);
+		update_avg("runtime_cycles", 0, &runtime_cycles_avg, runtime_cycles + i);
+
+		for (j = 0; j < nr_counters; j++) {
+			update_avg("counter/0", j,
+				event_res_avg[j]+0, event_res[i][j]+0);
+			update_avg("counter/1", j,
+				event_res_avg[j]+1, event_res[i][j]+1);
+			update_avg("counter/2", j,
+				event_res_avg[j]+2, event_res[i][j]+2);
+			update_avg("scaled", j,
+				event_scaled_avg + j, event_scaled[i]+j);
+		}
+	}
+	runtime_nsecs_avg /= run_count;
+	walltime_nsecs_avg /= run_count;
+	runtime_cycles_avg /= run_count;
+
+	for (j = 0; j < nr_counters; j++) {
+		event_res_avg[j][0] /= run_count;
+		event_res_avg[j][1] /= run_count;
+		event_res_avg[j][2] /= run_count;
+	}
 
-	if (!pid) {
-		if (execvp(argv[0], (char **)argv)) {
-			perror(argv[0]);
-			exit(-1);
+	for (i = 0; i < run_count; i++) {	/* llabs: deltas are 64-bit */
+		runtime_nsecs_noise +=
+			llabs((__s64)(runtime_nsecs[i] - runtime_nsecs_avg));
+		walltime_nsecs_noise +=
+			llabs((__s64)(walltime_nsecs[i] - walltime_nsecs_avg));
+		runtime_cycles_noise +=
+			llabs((__s64)(runtime_cycles[i] - runtime_cycles_avg));
+
+		for (j = 0; j < nr_counters; j++) {
+			event_res_noise[j][0] +=
+				llabs((__s64)(event_res[i][j][0] - event_res_avg[j][0]));
+			event_res_noise[j][1] +=
+				llabs((__s64)(event_res[i][j][1] - event_res_avg[j][1]));
+			event_res_noise[j][2] +=
+				llabs((__s64)(event_res[i][j][2] - event_res_avg[j][2]));
 		}
 	}
 
-	while (wait(&status) >= 0)
-		;
+	normalize_noise(&runtime_nsecs_noise);
+	normalize_noise(&walltime_nsecs_noise);
+	normalize_noise(&runtime_cycles_noise);
 
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
+	for (j = 0; j < nr_counters; j++) {
+		normalize_noise(&event_res_noise[j][0]);
+		normalize_noise(&event_res_noise[j][1]);
+		normalize_noise(&event_res_noise[j][2]);
+	}
+}
 
-	walltime_nsecs = t1 - t0;
+static void print_stat(int argc, const char **argv)
+{
+	int i, counter;
+
+	calc_avg();
 
 	fflush(stdout);
 
@@ -279,22 +421,19 @@ static int do_perf_stat(int argc, const char **argv)
 	for (i = 1; i < argc; i++)
 		fprintf(stderr, " %s", argv[i]);
 
-	fprintf(stderr, "\':\n");
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++)
-		read_counter(counter);
+	fprintf(stderr, "\'");
+	if (run_count > 1)
+		fprintf(stderr, " (%d runs)", run_count);
+	fprintf(stderr, ":\n\n");
 
 	for (counter = 0; counter < nr_counters; counter++)
 		print_counter(counter);
 
 
 	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
+	fprintf(stderr, " %14.9f  seconds time elapsed.\n",
+			(double)walltime_nsecs_avg/1e9);
 	fprintf(stderr, "\n");
-
-	return 0;
 }
 
 static volatile int signr = -1;
@@ -332,11 +471,15 @@ static const struct option options[] = {
 			    "scale/normalize counters"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
+	OPT_INTEGER('r', "repeat", &run_count,
+		    "repeat command and print average + stddev (max: 100)"),
 	OPT_END()
 };
 
 int cmd_stat(int argc, const char **argv, const char *prefix)
 {
+	int status;
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	memcpy(attrs, default_attrs, sizeof(attrs));
@@ -344,6 +487,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	argc = parse_options(argc, argv, options, stat_usage, 0);
 	if (!argc)
 		usage_with_options(stat_usage, options);
+	if (run_count <= 0 || run_count > MAX_RUN)
+		usage_with_options(stat_usage, options);
 
 	if (!nr_counters)
 		nr_counters = 8;
@@ -363,5 +508,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	signal(SIGALRM, skip_signal);
 	signal(SIGABRT, skip_signal);
 
-	return do_perf_stat(argc, argv);
+	status = 0;
+	for (run_idx = 0; run_idx < run_count; run_idx++) {
+		if (run_count != 1 && verbose)
+			fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx+1);
+		status = run_perf_stat(argc, argv);
+	}
+
+	print_stat(argc, argv);
+
+	return status;
 }
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5a72586e1df..f0c9f2627fe 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -63,8 +63,8 @@ static char *hw_event_names[] = {
 };
 
 static char *sw_event_names[] = {
-	"cpu-clock-ticks",
-	"task-clock-ticks",
+	"cpu-clock-msecs",
+	"task-clock-msecs",
 	"page-faults",
 	"context-switches",
 	"CPU-migrations",
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 0d1292bd827..5ad9b06c3f6 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -12,6 +12,7 @@ struct symbol {
 	__u64		obj_start;
 	__u64		hist_sum;
 	__u64		*hist;
+	void		*priv;
 	char		name[0];
 };