Diffstat (limited to 'kernel/trace')
-rw-r--r--  kernel/trace/Kconfig                 |  70
-rw-r--r--  kernel/trace/Makefile                |   5
-rw-r--r--  kernel/trace/blktrace.c              |  91
-rw-r--r--  kernel/trace/ftrace.c                |  79
-rw-r--r--  kernel/trace/ring_buffer.c           | 114
-rw-r--r--  kernel/trace/trace.c                 |  26
-rw-r--r--  kernel/trace/trace_events.c          |   4
-rw-r--r--  kernel/trace/trace_events_filter.c   |   6
-rw-r--r--  kernel/trace/trace_functions_graph.c |   6
-rw-r--r--  kernel/trace/trace_output.c          |  85
-rw-r--r--  kernel/trace/trace_output.h          |   4
-rw-r--r--  kernel/trace/trace_stack.c           |   2
-rw-r--r--  kernel/trace/trace_sysprof.c         |   3
13 files changed, 357 insertions(+), 138 deletions(-)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d7f01e6e8ba..ae048a2dbbe 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER
 	select MARKERS
 	bool
 
+# All tracer options should select GENERIC_TRACER. Options that are enabled
+# by all tracers (context switch and event tracer) select TRACING instead.
+# This allows those options to appear when no other tracer is selected, but
+# not when something else selects them. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies while
+# accomplishing the hiding of the automatic options.
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -66,6 +73,10 @@ config TRACING
 	select BINARY_PRINTF
 	select EVENT_TRACING
 
+config GENERIC_TRACER
+	bool
+	select TRACING
+
 #
 # Minimum requirements an architecture has to meet for us to
 # be able to offer generic tracing facilities:
@@ -95,7 +106,7 @@ config FUNCTION_TRACER
 	depends on HAVE_FUNCTION_TRACER
 	select FRAME_POINTER
 	select KALLSYMS
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  Enable the kernel to trace every kernel function. This is done
@@ -126,7 +137,7 @@ config IRQSOFF_TRACER
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
 	select TRACE_IRQFLAGS
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in irqs-off critical
@@ -136,7 +147,7 @@ config IRQSOFF_TRACER
 	  disabled by default and can be runtime (re-)started
 	  via:
 
-	      echo 0 > /debugfs/tracing/tracing_max_latency
+	      echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
 
 	  (Note that kernel size and overhead increases with this option
 	  enabled. This option and the preempt-off timing option can be
@@ -147,7 +158,7 @@ config PREEMPT_TRACER
 	default n
 	depends on GENERIC_TIME
 	depends on PREEMPT
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in preemption off critical
@@ -157,7 +168,7 @@ config PREEMPT_TRACER
 	  disabled by default and can be runtime (re-)started
 	  via:
 
-	      echo 0 > /debugfs/tracing/tracing_max_latency
+	      echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
 
 	  (Note that kernel size and overhead increases with this option
 	  enabled. This option and the irqs-off timing option can be
@@ -166,7 +177,7 @@ config PREEMPT_TRACER
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
@@ -174,44 +185,33 @@ config SYSPROF_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
-config ENABLE_CONTEXT_SWITCH_TRACER
-	bool "Trace process context switches"
-	select TRACING
-	select CONTEXT_SWITCH_TRACER
-	help
-	  This tracer gets called from the context switch and records
-	  all switching of tasks.
-
-config ENABLE_EVENT_TRACING
-	bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+	bool "Trace process context switches and events"
+	depends on !GENERIC_TRACER
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
 	  allowing the user to pick and choose which trace point they
-	  want to trace.
-
-	  Note, all tracers enable event tracing. This option is
-	  only a convenience to enable event tracing when no other
-	  tracers are selected.
+	  want to trace. It also includes the sched_switch tracer plugin.
 
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
-	select TRACING
+	select GENERIC_TRACER
 	select KALLSYMS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
 config BOOT_TRACER
 	bool "Trace boot initcalls"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer helps developers to optimize boot times: it records
@@ -228,7 +228,7 @@ config BOOT_TRACER
 
 config TRACE_BRANCH_PROFILING
 	bool
-	select TRACING
+	select GENERIC_TRACER
 
 choice
 	prompt "Branch Profiling"
@@ -261,7 +261,7 @@ config PROFILE_ANNOTATED_BRANCHES
 	  This tracer profiles all the likely and unlikely macros
 	  in the kernel. It will display the results in:
 
-	  /debugfs/tracing/profile_annotated_branch
+	  /sys/kernel/debug/tracing/profile_annotated_branch
 
 	  Note: this will add a significant overhead, only turn this on
 	  if you need to profile the system's use of these macros.
@@ -274,7 +274,7 @@ config PROFILE_ALL_BRANCHES
 	  taken in the kernel is recorded whether it hit or miss.
 	  The results will be displayed in:
 
-	  /debugfs/tracing/profile_branch
+	  /sys/kernel/debug/tracing/profile_branch
 
 	  This option also enables the likely/unlikely profiler.
 
@@ -308,7 +308,7 @@ config BRANCH_TRACER
 config POWER_TRACER
 	bool "Trace power consumption behavior"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer helps developers to analyze and optimize the kernel's
 	  power management decisions, specifically the C-state and P-state
@@ -344,7 +344,7 @@ config STACK_TRACER
 	select KALLSYMS
 	help
 	  This special tracer records the maximum stack footprint of the
-	  kernel and displays it in debugfs/tracing/stack_trace.
+	  kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
 
 	  This tracer works by hooking into every function call that the
 	  kernel executes, and keeping a maximum stack depth value and
@@ -363,14 +363,14 @@ config STACK_TRACER
 config HW_BRANCH_TRACER
 	depends on HAVE_HW_BRANCH_TRACER
 	bool "Trace hw branches"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer records all branches on the system in a circular
 	  buffer, giving access to the last N branches for each cpu.
 
config KMEMTRACE
 	bool "Trace SLAB allocations"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
@@ -390,7 +390,7 @@ config KMEMTRACE
 
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  The workqueue tracer provides some statistical information
 	  about each cpu workqueue thread, such as the number of the
@@ -406,7 +406,7 @@ config BLK_DEV_IO_TRACE
 	select RELAY
 	select DEBUG_FS
 	select TRACEPOINTS
-	select TRACING
+	select GENERIC_TRACER
 	select STACKTRACE
 	help
 	  Say Y here if you want to be able to trace the block layer actions
@@ -467,7 +467,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING
+	depends on GENERIC_TRACER
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
@@ -478,7 +478,7 @@ config FTRACE_STARTUP_TEST
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  debugging and reverse engineering. It is called from the ioremap
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 658aace8c41..ce3b1cd0273 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e3abf55bc8e..39af8af6fc3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
-#include <trace/block.h>
 #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
 #include "trace_output.h"
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
@@ -665,12 +669,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 
 	if (blk_pc_request(rq)) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
-				rq->cmd_len, rq->cmd);
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+				what, rq->errors, rq->cmd_len, rq->cmd);
 	} else {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-				rw, what, rq->errors, 0, NULL);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
+				what, rq->errors, 0, NULL);
 	}
 }
 
@@ -877,11 +881,11 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	if (blk_pc_request(rq))
-		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-				rq->errors, len, data);
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+				BLK_TA_DRV_DATA, rq->errors, len, data);
 	else
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+				BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
@@ -1658,3 +1662,72 @@ int blk_trace_init_sysfs(struct device *dev)
 	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
 }
 
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+	int i, end;
+	int len = rq->cmd_len;
+	unsigned char *cmd = rq->cmd;
+
+	if (!blk_pc_request(rq)) {
+		buf[0] = '\0';
+		return;
+	}
+
+	for (end = len - 1; end >= 0; end--)
+		if (cmd[end])
+			break;
+	end++;
+
+	for (i = 0; i < len; i++) {
+		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
"" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & 1 << BIO_RW_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & 1 << BIO_RW_AHEAD) + rwbs[i++] = 'A'; + if (rw & 1 << BIO_RW_BARRIER) + rwbs[i++] = 'B'; + if (rw & 1 << BIO_RW_SYNCIO) + rwbs[i++] = 'S'; + if (rw & 1 << BIO_RW_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +void blk_fill_rwbs_rq(char *rwbs, struct request *rq) +{ + int rw = rq->cmd_flags & 0x03; + int bytes; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + bytes = blk_rq_bytes(rq); + + blk_fill_rwbs(rwbs, rw, bytes); +} + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 140699a9a8a..bb60732ade0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,6 +32,7 @@ #include <trace/events/sched.h> #include <asm/ftrace.h> +#include <asm/setup.h> #include "trace_output.h" #include "trace_stat.h" @@ -598,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; rec = ftrace_find_profiled_func(stat, ip); @@ -629,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; calltime = trace->rettime - trace->calltime; @@ -723,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_sched. + */ unregister_ftrace_profiler(); } } @@ -2369,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) ftrace_set_regex(buf, len, reset, 0); } +/* + * command line interface to allow users to set filters on boot up. 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 140699a9a8a..bb60732ade0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,6 +32,7 @@
 #include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
+#include <asm/setup.h>
 
 #include "trace_output.h"
 #include "trace_stat.h"
@@ -598,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_save(flags);
 
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	rec = ftrace_find_profiled_func(stat, ip);
@@ -629,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 
 	local_irq_save(flags);
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	calltime = trace->rettime - trace->calltime;
@@ -723,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			ftrace_profile_enabled = 1;
 		} else {
 			ftrace_profile_enabled = 0;
+			/*
+			 * unregister_ftrace_profiler calls stop_machine
+			 * so this acts like a synchronize_sched.
+			 */
 			unregister_ftrace_profiler();
 		}
 	}
@@ -2369,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
 	ftrace_set_regex(buf, len, reset, 0);
 }
 
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+	char *func;
+
+	while (buf) {
+		func = strsep(&buf, ",");
+		ftrace_set_regex(func, strlen(func), 0, enable);
+	}
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+	if (ftrace_filter_buf[0])
+		set_ftrace_early_filter(ftrace_filter_buf, 1);
+	if (ftrace_notrace_buf[0])
+		set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
 static int
 ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
@@ -2829,6 +2873,8 @@ void __init ftrace_init(void)
 	if (ret)
 		pr_warning("Failed to register trace ftrace module notifier\n");
 
+	set_ftrace_early_filters();
+
 	return;
 failed:
 	ftrace_disabled = 1;
@@ -3172,12 +3218,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 		}
 
 		if (t->ret_stack == NULL) {
-			t->curr_ret_stack = -1;
-			/* Make sure IRQs see the -1 first: */
-			barrier();
-			t->ret_stack = ret_stack_list[start++];
 			atomic_set(&t->tracing_graph_pause, 0);
 			atomic_set(&t->trace_overrun, 0);
+			t->curr_ret_stack = -1;
+			/* Make sure the tasks see the -1 first: */
+			smp_wmb();
+			t->ret_stack = ret_stack_list[start++];
 		}
 	} while_each_thread(g, t);
 
@@ -3235,8 +3281,10 @@ static int start_graph_tracing(void)
 		return -ENOMEM;
 
 	/* The cpu_boot init_task->ret_stack will never be freed */
-	for_each_online_cpu(cpu)
-		ftrace_graph_init_task(idle_task(cpu));
+	for_each_online_cpu(cpu) {
+		if (!idle_task(cpu)->ret_stack)
+			ftrace_graph_init_task(idle_task(cpu));
+	}
 
 	do {
 		ret = alloc_retstack_tasklist(ret_stack_list);
@@ -3328,18 +3376,25 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
+	/* Make sure we do not use the parent ret_stack */
+	t->ret_stack = NULL;
+
 	if (ftrace_graph_active) {
-		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+		struct ftrace_ret_stack *ret_stack;
+
+		ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
-		if (!t->ret_stack)
+		if (!ret_stack)
 			return;
 		t->curr_ret_stack = -1;
 		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
 		t->ftrace_timestamp = 0;
-	} else
-		t->ret_stack = NULL;
+		/* make curr_ret_stack visible before we add the ret_stack */
+		smp_wmb();
+		t->ret_stack = ret_stack;
+	}
 }
 
 void ftrace_graph_exit_task(struct task_struct *t)
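set_ftrace_early_filter() above relies on strsep() consuming the buffer in place: each call peels off one comma-separated pattern and NULLs the list pointer once the input is exhausted. A standalone userspace model of that loop (the glob patterns are made up for illustration):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* e.g. booting with ftrace_filter=sched*,kfree,*readpage* */
	char buf[] = "sched*,kfree,*readpage*";
	char *list = buf;
	char *func;

	while (list) {
		func = strsep(&list, ",");
		/* the kernel passes each token to ftrace_set_regex() */
		printf("filter pattern: \"%s\"\n", func);
	}
	return 0;
}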
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 16b24d49604..dc4dc70171c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/kmemcheck.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -370,6 +371,9 @@ static inline int test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
@@ -423,6 +427,8 @@ struct ring_buffer {
 	atomic_t			record_disabled;
 	cpumask_var_t			cpumask;
 
+	struct lock_class_key		*reader_lock_key;
+
 	struct mutex			mutex;
 
 	struct ring_buffer_per_cpu	**buffers;
@@ -562,6 +568,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	cpu_buffer->cpu = cpu;
 	cpu_buffer->buffer = buffer;
 	spin_lock_init(&cpu_buffer->reader_lock);
+	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
 	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
@@ -632,7 +639,8 @@ static int rb_cpu_notify(struct notifier_block *self,
  * when the buffer wraps. If this flag is not set, the buffer will
  * drop data when the tail hits the head.
  */
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+					struct lock_class_key *key)
 {
 	struct ring_buffer *buffer;
 	int bsize;
@@ -655,6 +663,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 	buffer->flags = flags;
 	buffer->clock = trace_clock_local;
+	buffer->reader_lock_key = key;
 
 	/* need at least two pages */
 	if (buffer->pages == 1)
@@ -712,7 +721,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	kfree(buffer);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
 
 /**
  * ring_buffer_free - free a ring buffer.
@@ -1262,6 +1271,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	if (tail < BUF_PAGE_SIZE) {
 		/* Mark the rest of the page with padding */
 		event = __rb_page_index(tail_page, tail);
+		kmemcheck_annotate_bitfield(event, bitfield);
 		rb_event_set_padding(event);
 	}
 
@@ -1319,6 +1329,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		return NULL;
 
 	event = __rb_page_index(tail_page, tail);
+	kmemcheck_annotate_bitfield(event, bitfield);
 	rb_update_event(event, type, length);
 
 	/* The passed in type is zero for DATA */
@@ -1335,6 +1346,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event)
+{
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			return 1;
+	}
+
+	/* could not discard */
+	return 0;
+}
+
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
@@ -1377,16 +1420,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			event->array[0] = *delta >> TS_SHIFT;
 		} else {
 			cpu_buffer->commit_page->page->time_stamp = *ts;
-			event->time_delta = 0;
-			event->array[0] = 0;
+			/* try to discard, since we do not need this */
+			if (!rb_try_to_discard(cpu_buffer, event)) {
+				/* nope, just zero it */
+				event->time_delta = 0;
+				event->array[0] = 0;
+			}
 		}
 		cpu_buffer->write_stamp = *ts;
 		/* let the caller know this was the commit */
 		ret = 1;
 	} else {
-		/* Darn, this is just wasted space */
-		event->time_delta = 0;
-		event->array[0] = 0;
+		/* Try to discard the event */
+		if (!rb_try_to_discard(cpu_buffer, event)) {
+			/* Darn, this is just wasted space */
+			event->time_delta = 0;
+			event->array[0] = 0;
+		}
 		ret = 0;
 	}
 
@@ -1682,10 +1732,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 				struct ring_buffer_event *event)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long new_index, old_index;
-	struct buffer_page *bpage;
-	unsigned long index;
-	unsigned long addr;
 	int cpu;
 
 	/* The event is discarded regardless */
@@ -1701,24 +1747,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	cpu = smp_processor_id();
 	cpu_buffer = buffer->buffers[cpu];
 
-	new_index = rb_event_index(event);
-	old_index = new_index + rb_event_length(event);
-	addr = (unsigned long)event;
-	addr &= PAGE_MASK;
-
-	bpage = cpu_buffer->tail_page;
-
-	if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
-		/*
-		 * This is on the tail page. It is possible that
-		 * a write could come in and move the tail page
-		 * and write to the next page. That is fine
-		 * because we just shorten what is on this page.
-		 */
-		index = local_cmpxchg(&bpage->write, old_index, new_index);
-		if (index == old_index)
-			goto out;
-	}
+	if (!rb_try_to_discard(cpu_buffer, event))
+		goto out;
 
 	/*
	 * The commit is still visible to the reader, so we
@@ -2253,8 +2283,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * Check if we are at the end of the buffer.
 	 */
 	if (iter->head >= rb_page_size(iter->head_page)) {
-		if (RB_WARN_ON(buffer,
-			       iter->head_page == cpu_buffer->commit_page))
+		/* discarded commits can make the page empty */
+		if (iter->head_page == cpu_buffer->commit_page)
 			return;
 		rb_inc_iter(iter);
 		return;
@@ -2297,12 +2327,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	/*
 	 * We repeat when a timestamp is encountered. It is possible
 	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have. Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * as one timestamp is about to be written, or from discarded
+	 * commits. The most that we can have is the number on a single page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	reader = rb_get_reader_page(cpu_buffer);
@@ -2368,14 +2396,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 again:
 	/*
-	 * We repeat when a timestamp is encountered. It is possible
-	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have. Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * We repeat when a timestamp is encountered.
+	 * We can get multiple timestamps by nested interrupts or also
+	 * if filtering is on (discarding commits). Since discarding
+	 * commits can be frequent, we can get a lot of timestamps.
+	 * But we limit them by not adding timestamps if they begin
+	 * at the start of a page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	if (rb_per_cpu_empty(cpu_buffer))
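rb_try_to_discard() exploits the fact that the per-page write index only moves forward: if the event being discarded is still the last thing on the tail page, a single compare-and-swap can move the index back over it; if any later write has advanced the index, the CAS fails and the caller zeroes the event out instead. A userspace model of that idea using C11 atomics (not kernel code; names and offsets are illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_ulong write_index;	/* models bpage->write */

/* Roll the write index back over an event occupying [start, end),
 * but only if nothing was written after it. */
static bool try_to_discard(unsigned long start, unsigned long end)
{
	unsigned long expected = end;

	return atomic_compare_exchange_strong(&write_index, &expected, start);
}

int main(void)
{
	atomic_store(&write_index, 96);		/* event at bytes 64..96  */
	printf("%d\n", try_to_discard(64, 96));	/* 1: index rolled back   */
	printf("%d\n", try_to_discard(64, 96));	/* 0: index already moved */
	return 0;
}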
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3a8a87d7e9..c1878bfb2e1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -344,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
 /*
  * Copy the new maximum trace into the separate maximum-trace
  * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /debugfs/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
  */
 static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2414,21 +2414,20 @@ static const struct file_operations tracing_iter_fops = {
 
 static const char readme_msg[] =
 	"tracing mini-HOWTO:\n\n"
-	"# mkdir /debug\n"
-	"# mount -t debugfs nodev /debug\n\n"
-	"# cat /debug/tracing/available_tracers\n"
+	"# mount -t debugfs nodev /sys/kernel/debug\n\n"
+	"# cat /sys/kernel/debug/tracing/available_tracers\n"
 	"wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
-	"# cat /debug/tracing/current_tracer\n"
+	"# cat /sys/kernel/debug/tracing/current_tracer\n"
 	"nop\n"
-	"# echo sched_switch > /debug/tracing/current_tracer\n"
-	"# cat /debug/tracing/current_tracer\n"
+	"# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
+	"# cat /sys/kernel/debug/tracing/current_tracer\n"
 	"sched_switch\n"
-	"# cat /debug/tracing/trace_options\n"
+	"# cat /sys/kernel/debug/tracing/trace_options\n"
 	"noprint-parent nosym-offset nosym-addr noverbose\n"
-	"# echo print-parent > /debug/tracing/trace_options\n"
-	"# echo 1 > /debug/tracing/tracing_enabled\n"
-	"# cat /debug/tracing/trace > /tmp/trace.txt\n"
-	"echo 0 > /debug/tracing/tracing_enabled\n"
+	"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
+	"# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
+	"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
+	"# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
 ;
 
 static ssize_t
@@ -2826,6 +2825,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	/* trace pipe does not show start of buffer */
 	cpumask_setall(iter->started);
 
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
 	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
 	mutex_init(&iter->mutex);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6c81f9c2142..aa08be69a1b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod)
 	struct ftrace_event_call *call, *p;
 	bool found = false;
 
+	down_write(&trace_event_mutex);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
 			ftrace_event_enable_disable(call, 0);
 			if (call->event)
-				unregister_ftrace_event(call->event);
+				__unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
 			list_del(&call->list);
 			trace_destroy_fields(call);
@@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod)
 	 */
 	if (found)
 		tracing_reset_current_online_cpus();
+	up_write(&trace_event_mutex);
 }
 
 static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a7430b16d24..db6e54bdb59 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -478,12 +478,12 @@ enum {
 
 static int is_string_field(const char *type)
 {
+	if (strstr(type, "__data_loc") && strstr(type, "char"))
+		return FILTER_DYN_STRING;
+
 	if (strchr(type, '[') && strstr(type, "char"))
 		return FILTER_STATIC_STRING;
 
-	if (!strcmp(type, "__str_loc"))
-		return FILTER_DYN_STRING;
-
 	return 0;
 }
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 10f6ad7d85f..8b592418d8b 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	if (!current->ret_stack)
 		return -EBUSY;
 
+	/*
+	 * We must make sure the ret_stack is tested before we read
+	 * anything else.
+	 */
+	smp_rmb();
+
 	/* The return trace stack is full */
 	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
 		atomic_inc(&current->trace_overrun);
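The smp_rmb() added here pairs with the smp_wmb() in ftrace_graph_init_task() and alloc_retstack_tasklist() above: the writer initializes curr_ret_stack before publishing the ret_stack pointer, so a reader that sees a non-NULL pointer and then issues the read barrier is guaranteed to see the initialized index. A compact userspace model of the pattern using C11 acquire/release (illustrative names, not kernel API):

#include <stdatomic.h>
#include <stdlib.h>

struct ret_stack { int depth; };

struct task {
	int curr_ret_stack;
	_Atomic(struct ret_stack *) ret_stack;
};

/* writer: ftrace_graph_init_task() analogue */
static void publish(struct task *t)
{
	struct ret_stack *rs = calloc(1, sizeof(*rs));

	if (!rs)
		return;
	t->curr_ret_stack = -1;			/* initialize first ...  */
	atomic_store_explicit(&t->ret_stack, rs,
			      memory_order_release); /* ... publish last */
}

/* reader: ftrace_push_return_trace() analogue */
static int consume(struct task *t)
{
	struct ret_stack *rs = atomic_load_explicit(&t->ret_stack,
						    memory_order_acquire);

	if (!rs)
		return -1;		/* the kernel code returns -EBUSY */
	return t->curr_ret_stack;	/* sees at least the -1           */
}

int main(void)
{
	struct task t = { 0, NULL };

	publish(&t);
	return consume(&t) == -1 ? 0 : 1;
}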
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c12d95db2f5..7938f3ae93e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,9 +14,10 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DECLARE_RWSEM(trace_event_mutex);
+DECLARE_RWSEM(trace_event_mutex);
 
 DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
 
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
@@ -99,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(trace_seq_printf);
 
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
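trace_seq_vprintf() is the va_list twin of trace_seq_printf(), letting callers layer their own varargs front ends on top of a trace_seq. A hypothetical wrapper (my_seq_printf() is not part of this patch) would look like:

/* Hypothetical convenience wrapper, for illustration only. */
static int my_seq_printf(struct trace_seq *s, const char *fmt, ...)
{
	va_list ap;
	int ret;

	va_start(ap, fmt);
	ret = trace_seq_vprintf(s, fmt, ap);
	va_end(ap);

	return ret;
}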
@@ -222,10 +255,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 {
 	unsigned long mask;
 	const char *str;
+	const char *ret = p->buffer + p->len;
 	int i;
 
-	trace_seq_init(p);
-
 	for (i = 0; flag_array[i].name && flags; i++) {
 
 		mask = flag_array[i].mask;
@@ -248,16 +280,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
+EXPORT_SYMBOL(ftrace_print_flags_seq);
 
 const char *
 ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 			 const struct trace_print_flags *symbol_array)
 {
 	int i;
-
-	trace_seq_init(p);
+	const char *ret = p->buffer + p->len;
 
 	for (i = 0; symbol_array[i].name; i++) {
 
@@ -273,8 +305,9 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
@@ -386,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 
 		if (ip == ULONG_MAX || !ret)
 			break;
-		if (i && ret)
-			ret = trace_seq_puts(s, " <- ");
+		if (ret)
+			ret = trace_seq_puts(s, " => ");
 		if (!ip) {
 			if (ret)
 				ret = trace_seq_puts(s, "??");
+			if (ret)
+				ret = trace_seq_puts(s, "\n");
 			continue;
 		}
 		if (!ret)
 			break;
 		if (ret)
 			ret = seq_print_user_ip(s, mm, ip, sym_flags);
+		ret = trace_seq_puts(s, "\n");
 	}
 
 	if (mm)
@@ -666,6 +702,16 @@ int register_ftrace_event(struct trace_event *event)
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
 
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+	hlist_del(&event->node);
+	list_del(&event->list);
+	return 0;
+}
+
 /**
  * unregister_ftrace_event - remove a no longer used event
  * @event: the event to remove
@@ -673,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
 int unregister_ftrace_event(struct trace_event *event)
 {
 	down_write(&trace_event_mutex);
-	hlist_del(&event->node);
-	list_del(&event->list);
+	__unregister_ftrace_event(event);
 	up_write(&trace_event_mutex);
 
 	return 0;
@@ -972,16 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
+	if (!trace_seq_puts(s, "<stack trace>\n"))
+		goto partial;
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-		if (!field->caller[i])
+		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
 			break;
-		if (i) {
-			if (!trace_seq_puts(s, " <= "))
-				goto partial;
+		if (!trace_seq_puts(s, " => "))
+			goto partial;
 
-			if (!seq_print_ip_sym(s, field->caller[i], flags))
-				goto partial;
-		}
+		if (!seq_print_ip_sym(s, field->caller[i], flags))
+			goto partial;
 		if (!trace_seq_puts(s, "\n"))
 			goto partial;
 	}
@@ -1009,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!seq_print_userip_objs(field, s, flags))
+	if (!trace_seq_puts(s, "<user stack trace>\n"))
 		goto partial;
 
-	if (!trace_seq_putc(s, '\n'))
+	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
 
 	return TRACE_TYPE_HANDLED;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ac240e76eb0..d38bec4a9c3 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type);
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
 
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1796f00524e..2d7aebd71db 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
 		seq_printf(m, "        Depth    Size   Location"
 			   "    (%d entries)\n"
 			   "        -----    ----   --------\n",
-			   max_stack_trace.nr_entries);
+			   max_stack_trace.nr_entries - 1);
 
 	if (!stack_tracer_enabled && !max_stack_size)
 		print_disabled(m);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index e04b76cc238..f6693969287 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
 
-	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static void start_stack_timers(void)