summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cpu.c26
-rw-r--r--kernel/cpuset.c67
-rw-r--r--kernel/cred-internals.h21
-rw-r--r--kernel/cred.c3
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/module.c22
-rw-r--r--kernel/rcutorture.c2
-rw-r--r--kernel/sched.c726
-rw-r--r--kernel/sched_debug.c108
-rw-r--r--kernel/sched_fair.c350
-rw-r--r--kernel/sched_features.h55
-rw-r--r--kernel/sched_idletask.c8
-rw-r--r--kernel/sched_rt.c15
-rw-r--r--kernel/stop_machine.c534
-rw-r--r--kernel/time/tick-sched.c84
-rw-r--r--kernel/time/timer_list.c1
-rw-r--r--kernel/trace/blktrace.c138
-rw-r--r--kernel/trace/ftrace.c36
-rw-r--r--kernel/trace/kmemtrace.c70
-rw-r--r--kernel/trace/ring_buffer.c179
-rw-r--r--kernel/trace/ring_buffer_benchmark.c5
-rw-r--r--kernel/trace/trace.c136
-rw-r--r--kernel/trace/trace.h36
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_event_perf.c17
-rw-r--r--kernel/trace/trace_events.c139
-rw-r--r--kernel/trace/trace_events_filter.c28
-rw-r--r--kernel/trace/trace_export.c16
-rw-r--r--kernel/trace/trace_functions_graph.c176
-rw-r--r--kernel/trace/trace_irqsoff.c271
-rw-r--r--kernel/trace/trace_kprobe.c104
-rw-r--r--kernel/trace/trace_output.c139
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_sched_switch.c21
-rw-r--r--kernel/trace/trace_sched_wakeup.c29
-rw-r--r--kernel/trace/trace_selftest.c7
-rw-r--r--kernel/trace/trace_syscalls.c137
-rw-r--r--kernel/trace/trace_workqueue.c26
-rw-r--r--kernel/tracepoint.c91
-rw-r--r--kernel/user.c11
43 files changed, 2306 insertions, 1544 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b..149e18ef1ab 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b27..2f05303715a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <asm/uaccess.h>
-#include "cred-internals.h"
/*
* Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e13980..4a07d057a26 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3010,7 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
- remove_wait_queue_locked(event->wqh, &event->wait);
+ __remove_wait_queue(event->wqh, &event->wait);
spin_lock(&cgrp->event_list_lock);
list_del(&event->list);
spin_unlock(&cgrp->event_list_lock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be..54577757477 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
}
struct take_cpu_down_param {
+ struct task_struct *caller;
unsigned long mod;
void *hcpu;
};
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
+ unsigned int cpu = (unsigned long)param->hcpu;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
param->hcpu);
+ if (task_cpu(param->caller) == cpu)
+ move_task_off_dead_cpu(cpu, param->caller);
/* Force idle task to run as soon as we yield: it should
immediately notice cpu is offline and die quickly. */
sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
- cpumask_var_t old_allowed;
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
+ .caller = current,
.mod = mod,
.hcpu = hcpu,
};
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
- if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
- return -ENOMEM;
-
cpu_hotplug_begin();
set_cpu_active(cpu, false);
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
goto out_release;
}
- /* Ensure that we are not runnable on dying cpu */
- cpumask_copy(old_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current, cpu_active_mask);
-
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
hcpu) == NOTIFY_BAD)
BUG();
- goto out_allowed;
+ goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
check_for_tasks(cpu);
-out_allowed:
- set_cpus_allowed_ptr(current, old_allowed);
out_release:
cpu_hotplug_done();
if (!err) {
@@ -264,7 +259,6 @@ out_release:
hcpu) == NOTIFY_BAD)
BUG();
}
- free_cpumask_var(old_allowed);
return err;
}
@@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
{
int err;
- err = stop_machine_create();
- if (err)
- return err;
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
out:
cpu_maps_update_done();
- stop_machine_destroy();
return err;
}
EXPORT_SYMBOL(cpu_down);
@@ -367,9 +357,6 @@ int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error;
- error = stop_machine_create();
- if (error)
- return error;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
/*
@@ -400,7 +387,6 @@ int disable_nonboot_cpus(void)
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
cpu_maps_update_done();
- stop_machine_destroy();
return error;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec..9a50c5f6e72 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
mutex_lock(&callback_mutex);
- cpuset_cpus_allowed_locked(tsk, pmask);
+ task_lock(tsk);
+ guarantee_online_cpus(task_cs(tsk), pmask);
+ task_unlock(tsk);
mutex_unlock(&callback_mutex);
}
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- task_lock(tsk);
- guarantee_online_cpus(task_cs(tsk), pmask);
- task_unlock(tsk);
+ const struct cpuset *cs;
+ int cpu;
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+ if (cs)
+ cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+ rcu_read_unlock();
+
+ /*
+ * We own tsk->cpus_allowed, nobody can change it under us.
+ *
+ * But we used cs && cs->cpus_allowed lockless and thus can
+ * race with cgroup_attach_task() or update_cpumask() and get
+ * the wrong tsk->cpus_allowed. However, both cases imply the
+ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+ * which takes task_rq_lock().
+ *
+ * If we are called after it dropped the lock we must see all
+ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+ * set any mask even if it is not right from task_cs() pov,
+ * the pending set_cpus_allowed_ptr() will fix things.
+ */
+
+ cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+ if (cpu >= nr_cpu_ids) {
+ /*
+ * Either tsk->cpus_allowed is wrong (see above) or it
+ * is actually empty. The latter case is only possible
+ * if we are racing with remove_tasks_in_empty_cpuset().
+ * Like above we can temporary set any mask and rely on
+ * set_cpus_allowed_ptr() as synchronization point.
+ */
+ cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+ cpu = cpumask_any(cpu_active_mask);
+ }
+
+ return cpu;
}
void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
}
/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
- mutex_lock(&callback_mutex);
-}
-
-/**
* cpuset_unlock - release lock on cpuset changes
*
* Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf..00000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Internal credentials stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-/*
- * user.c
- */
-static inline void sched_switch_user(struct task_struct *p)
-{
-#ifdef CONFIG_USER_SCHED
- sched_move_task(p);
-#endif /* CONFIG_USER_SCHED */
-}
-
diff --git a/kernel/cred.c b/kernel/cred.c
index 62af1816c23..8f3672a58a1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,7 +17,6 @@
#include <linux/init_task.h>
#include <linux/security.h>
#include <linux/cn_proc.h>
-#include "cred-internals.h"
#if 0
#define kdebug(FMT, ...) \
@@ -560,8 +559,6 @@ int commit_creds(struct cred *new)
atomic_dec(&old->user->processes);
alter_cred_subscribers(old, -2);
- sched_switch_user(task);
-
/* send notifications */
if (new->uid != old->uid ||
new->euid != old->euid ||
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac..eabca5a73a8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-#include "cred-internals.h"
static void exit_mm(struct task_struct * tsk);
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026..e2564580f3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
#if 0
#define DEBUGP printk
#else
@@ -515,6 +513,9 @@ MODINFO_ATTR(srcversion);
static char last_unloaded_module[MODULE_NAME_LEN+1];
#ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
@@ -723,16 +724,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
return -EFAULT;
name[MODULE_NAME_LEN-1] = '\0';
- /* Create stop_machine threads since free_module relies on
- * a non-failing stop_machine call. */
- ret = stop_machine_create();
- if (ret)
- return ret;
-
- if (mutex_lock_interruptible(&module_mutex) != 0) {
- ret = -EINTR;
- goto out_stop;
- }
+ if (mutex_lock_interruptible(&module_mutex) != 0)
+ return -EINTR;
mod = find_module(name);
if (!mod) {
@@ -792,8 +785,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
out:
mutex_unlock(&module_mutex);
-out_stop:
- stop_machine_destroy();
return ret;
}
@@ -867,8 +858,7 @@ void module_put(struct module *module)
smp_wmb(); /* see comment in module_refcount */
__this_cpu_inc(module->refptr->decs);
- trace_module_put(module, _RET_IP_,
- __this_cpu_read(module->refptr->decs));
+ trace_module_put(module, _RET_IP_);
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83e..2b676f3a0f2 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
.sync = synchronize_sched_expedited,
.cb_barrier = NULL,
.fqs = rcu_sched_force_quiescent_state,
- .stats = rcu_expedited_torture_stats,
+ .stats = NULL,
.irq_capable = 1,
.name = "sched_expedited"
};
diff --git a/kernel/sched.c b/kernel/sched.c
index b11b80a3eed..78554dd0d1a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
-#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
@@ -503,8 +503,11 @@ struct rq {
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
+ u64 nohz_stamp;
unsigned char in_nohz_recently;
#endif
+ unsigned int skip_clock_update;
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -546,15 +549,13 @@ struct rq {
int post_schedule;
int active_balance;
int push_cpu;
+ struct cpu_stop_work active_balance_work;
/* cpu of this runqueue: */
int cpu;
int online;
unsigned long avg_load_per_task;
- struct task_struct *migration_thread;
- struct list_head migration_queue;
-
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
@@ -602,6 +603,13 @@ static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+
+ /*
+ * A queue event has occurred, and we're going to schedule. In
+ * this case, we can save a useless back to back clock update.
+ */
+ if (test_tsk_need_resched(p))
+ rq->skip_clock_update = 1;
}
static inline int cpu_of(struct rq *rq)
@@ -636,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
inline void update_rq_clock(struct rq *rq)
{
- rq->clock = sched_clock_cpu(cpu_of(rq));
+ if (!rq->skip_clock_update)
+ rq->clock = sched_clock_cpu(cpu_of(rq));
}
/*
@@ -914,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
- * Check whether the task is waking, we use this to synchronize against
- * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * against ttwu().
*/
static inline int task_is_waking(struct task_struct *p)
{
- return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+ return unlikely(p->state == TASK_WAKING);
}
/*
@@ -936,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
struct rq *rq;
for (;;) {
- while (task_is_waking(p))
- cpu_relax();
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock(&rq->lock);
}
@@ -957,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
struct rq *rq;
for (;;) {
- while (task_is_waking(p))
- cpu_relax();
local_irq_save(*flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ if (likely(rq == task_rq(p)))
return rq;
raw_spin_unlock_irqrestore(&rq->lock, *flags);
}
@@ -1239,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
if (!tsk_is_polling(rq->idle))
smp_send_reschedule(cpu);
}
+
+int nohz_ratelimit(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 diff = rq->clock - rq->nohz_stamp;
+
+ rq->nohz_stamp = rq->clock;
+
+ return diff < (NSEC_PER_SEC / HZ) >> 1;
+}
+
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -1781,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
}
}
- update_rq_clock(rq1);
- update_rq_clock(rq2);
}
/*
@@ -1813,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
}
#endif
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
@@ -1870,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}
-static void update_avg(u64 *avg, u64 sample)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
- s64 diff = sample - *avg;
- *avg += diff >> 3;
-}
-
-static void
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
-{
- if (wakeup)
- p->se.start_runtime = p->se.sum_exec_runtime;
-
+ update_rq_clock(rq);
sched_info_queued(p);
- p->sched_class->enqueue_task(rq, p, wakeup, head);
+ p->sched_class->enqueue_task(rq, p, flags);
p->se.on_rq = 1;
}
-static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
- if (sleep) {
- if (p->se.last_wakeup) {
- update_avg(&p->se.avg_overlap,
- p->se.sum_exec_runtime - p->se.last_wakeup);
- p->se.last_wakeup = 0;
- } else {
- update_avg(&p->se.avg_wakeup,
- sysctl_sched_wakeup_granularity);
- }
- }
-
+ update_rq_clock(rq);
sched_info_dequeued(p);
- p->sched_class->dequeue_task(rq, p, sleep);
+ p->sched_class->dequeue_task(rq, p, flags);
p->se.on_rq = 0;
}
/*
* activate_task - move a task to the runqueue.
*/
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
- enqueue_task(rq, p, wakeup, false);
+ enqueue_task(rq, p, flags);
inc_nr_running(rq);
}
/*
* deactivate_task - remove a task from the runqueue.
*/
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
- dequeue_task(rq, p, sleep);
+ dequeue_task(rq, p, flags);
dec_nr_running(rq);
}
@@ -2054,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
-struct migration_req {
- struct list_head list;
-
+struct migration_arg {
struct task_struct *task;
int dest_cpu;
-
- struct completion done;
};
+static int migration_cpu_stop(void *data);
+
/*
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static int
-migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
+static bool migrate_task(struct task_struct *p, int dest_cpu)
{
struct rq *rq = task_rq(p);
@@ -2076,15 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
*/
- if (!p->se.on_rq && !task_running(rq, p))
- return 0;
-
- init_completion(&req->done);
- req->task = p;
- req->dest_cpu = dest_cpu;
- list_add(&req->list, &rq->migration_queue);
-
- return 1;
+ return p->se.on_rq || task_running(rq, p);
}
/*
@@ -2142,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
- trace_sched_wait_task(rq, p);
+ trace_sched_wait_task(p);
running = task_running(rq, p);
on_rq = p->se.on_rq;
ncsw = 0;
@@ -2240,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
}
#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
int dest_cpu;
@@ -2256,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (dest_cpu >= nr_cpu_ids) {
- rcu_read_lock();
- cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
- rcu_read_unlock();
- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
-
+ if (unlikely(dest_cpu >= nr_cpu_ids)) {
+ dest_cpu = cpuset_cpus_allowed_fallback(p);
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
@@ -2278,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
/*
- * Gets called from 3 sites (exec, fork, wakeup), since it is called without
- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
- * by:
- *
- * exec: is unstable, retry loop
- * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2306,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
return cpu;
}
+
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
#endif
/***
@@ -2327,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
+ unsigned long en_flags = ENQUEUE_WAKEUP;
struct rq *rq;
- if (!sched_feat(SYNC_WAKEUPS))
- wake_flags &= ~WF_SYNC;
-
this_cpu = get_cpu();
smp_wmb();
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
if (!(p->state & state))
goto out;
@@ -2356,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
*
* First fix up the nr_uninterruptible count:
*/
- if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
+ if (task_contributes_to_load(p)) {
+ if (likely(cpu_online(orig_cpu)))
+ rq->nr_uninterruptible--;
+ else
+ this_rq()->nr_uninterruptible--;
+ }
p->state = TASK_WAKING;
- if (p->sched_class->task_waking)
+ if (p->sched_class->task_waking) {
p->sched_class->task_waking(rq, p);
+ en_flags |= ENQUEUE_WAKING;
+ }
- __task_rq_unlock(rq);
-
- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu) {
- /*
- * Since we migrate the task without holding any rq->lock,
- * we need to be careful with task_rq_lock(), since that
- * might end up locking an invalid rq.
- */
+ cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+ if (cpu != orig_cpu)
set_task_cpu(p, cpu);
- }
+ __task_rq_unlock(rq);
rq = cpu_rq(cpu);
raw_spin_lock(&rq->lock);
- update_rq_clock(rq);
/*
* We migrated the task without holding either rq->lock, however
@@ -2405,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
out_activate:
#endif /* CONFIG_SMP */
- schedstat_inc(p, se.nr_wakeups);
+ schedstat_inc(p, se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p, se.nr_wakeups_sync);
+ schedstat_inc(p, se.statistics.nr_wakeups_sync);
if (orig_cpu != cpu)
- schedstat_inc(p, se.nr_wakeups_migrate);
+ schedstat_inc(p, se.statistics.nr_wakeups_migrate);
if (cpu == this_cpu)
- schedstat_inc(p, se.nr_wakeups_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
else
- schedstat_inc(p, se.nr_wakeups_remote);
- activate_task(rq, p, 1);
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ activate_task(rq, p, en_flags);
success = 1;
- /*
- * Only attribute actual wakeups done by this task.
- */
- if (!in_interrupt()) {
- struct sched_entity *se = &current->se;
- u64 sample = se->sum_exec_runtime;
-
- if (se->last_wakeup)
- sample -= se->last_wakeup;
- else
- sample -= se->start_runtime;
- update_avg(&se->avg_wakeup, sample);
-
- se->last_wakeup = se->sum_exec_runtime;
- }
-
out_running:
- trace_sched_wakeup(rq, p, success);
+ trace_sched_wakeup(p, success);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -2494,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
- p->se.last_wakeup = 0;
- p->se.avg_overlap = 0;
- p->se.start_runtime = 0;
- p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.wait_max = 0;
- p->se.wait_count = 0;
- p->se.wait_sum = 0;
-
- p->se.sleep_start = 0;
- p->se.sleep_max = 0;
- p->se.sum_sleep_runtime = 0;
-
- p->se.block_start = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
-
- p->se.nr_migrations_cold = 0;
- p->se.nr_failed_migrations_affine = 0;
- p->se.nr_failed_migrations_running = 0;
- p->se.nr_failed_migrations_hot = 0;
- p->se.nr_forced_migrations = 0;
-
- p->se.nr_wakeups = 0;
- p->se.nr_wakeups_sync = 0;
- p->se.nr_wakeups_migrate = 0;
- p->se.nr_wakeups_local = 0;
- p->se.nr_wakeups_remote = 0;
- p->se.nr_wakeups_affine = 0;
- p->se.nr_wakeups_affine_attempts = 0;
- p->se.nr_wakeups_passive = 0;
- p->se.nr_wakeups_idle = 0;
-
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
INIT_LIST_HEAD(&p->rt.run_list);
@@ -2550,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
__sched_fork(p);
/*
- * We mark the process as waking here. This guarantees that
+ * We mark the process as running here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_WAKING;
+ p->state = TASK_RUNNING;
/*
* Revert to default priority/policy on fork if requested.
@@ -2621,31 +2547,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
int cpu __maybe_unused = get_cpu();
#ifdef CONFIG_SMP
+ rq = task_rq_lock(p, &flags);
+ p->state = TASK_WAKING;
+
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
*
- * We still have TASK_WAKING but PF_STARTING is gone now, meaning
- * ->cpus_allowed is stable, we have preemption disabled, meaning
- * cpu_online_mask is stable.
+ * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+ * without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+ cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
set_task_cpu(p, cpu);
-#endif
- /*
- * Since the task is not on the rq and we still have TASK_WAKING set
- * nobody else will migrate this task.
- */
- rq = cpu_rq(cpu);
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- BUG_ON(p->state != TASK_WAKING);
p->state = TASK_RUNNING;
- update_rq_clock(rq);
+ task_rq_unlock(rq, &flags);
+#endif
+
+ rq = task_rq_lock(p, &flags);
activate_task(rq, p, 0);
- trace_sched_wakeup_new(rq, p, 1);
+ trace_sched_wakeup_new(p, 1);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
@@ -2865,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_sched_switch(rq, prev, next);
+ trace_sched_switch(prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -2982,6 +2904,61 @@ static unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun);
+static long calc_load_fold_active(struct rq *this_rq)
+{
+ long nr_active, delta = 0;
+
+ nr_active = this_rq->nr_running;
+ nr_active += (long) this_rq->nr_uninterruptible;
+
+ if (nr_active != this_rq->calc_load_active) {
+ delta = nr_active - this_rq->calc_load_active;
+ this_rq->calc_load_active = nr_active;
+ }
+
+ return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+ long delta;
+
+ delta = calc_load_fold_active(this_rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+ long delta = 0;
+
+ /*
+ * Its got a race, we don't care...
+ */
+ if (atomic_long_read(&calc_load_tasks_idle))
+ delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+ return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+ return 0;
+}
+#endif
+
/**
* get_avenrun - get the load average array
* @loads: pointer to dest load array
@@ -3028,20 +3005,22 @@ void calc_global_load(void)
}
/*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
*/
static void calc_load_account_active(struct rq *this_rq)
{
- long nr_active, delta;
+ long delta;
- nr_active = this_rq->nr_running;
- nr_active += (long) this_rq->nr_uninterruptible;
+ if (time_before(jiffies, this_rq->calc_load_update))
+ return;
- if (nr_active != this_rq->calc_load_active) {
- delta = nr_active - this_rq->calc_load_active;
- this_rq->calc_load_active = nr_active;
+ delta = calc_load_fold_active(this_rq);
+ delta += calc_load_fold_idle();
+ if (delta)
atomic_long_add(delta, &calc_load_tasks);
- }
+
+ this_rq->calc_load_update += LOAD_FREQ;
}
/*
@@ -3073,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
- if (time_after_eq(jiffies, this_rq->calc_load_update)) {
- this_rq->calc_load_update += LOAD_FREQ;
- calc_load_account_active(this_rq);
- }
+ calc_load_account_active(this_rq);
}
#ifdef CONFIG_SMP
@@ -3088,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
void sched_exec(void)
{
struct task_struct *p = current;
- struct migration_req req;
- int dest_cpu, this_cpu;
unsigned long flags;
struct rq *rq;
-
-again:
- this_cpu = get_cpu();
- dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
- if (dest_cpu == this_cpu) {
- put_cpu();
- return;
- }
+ int dest_cpu;
rq = task_rq_lock(p, &flags);
- put_cpu();
+ dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ if (dest_cpu == smp_processor_id())
+ goto unlock;
/*
* select_task_rq() can race against ->cpus_allowed
*/
- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
- || unlikely(!cpu_active(dest_cpu))) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+ likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ struct migration_arg arg = { p, dest_cpu };
- /* force the process onto the specified CPU */
- if (migrate_task(p, dest_cpu, &req)) {
- /* Need to wait for migration thread (might exit: take ref). */
- struct task_struct *mt = rq->migration_thread;
-
- get_task_struct(mt);
task_rq_unlock(rq, &flags);
- wake_up_process(mt);
- put_task_struct(mt);
- wait_for_completion(&req.done);
-
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
return;
}
+unlock:
task_rq_unlock(rq, &flags);
}
@@ -3597,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->state == TASK_RUNNING) {
- u64 runtime = prev->se.sum_exec_runtime;
-
- runtime -= prev->se.prev_sum_exec_runtime;
- runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-
- /*
- * In order to avoid avg_overlap growing stale when we are
- * indeed overlapping and hence not getting put to sleep, grow
- * the avg_overlap on preemption.
- *
- * We use the average preemption runtime because that
- * correlates to the amount of cache footprint a task can
- * build up.
- */
- update_avg(&prev->se.avg_overlap, runtime);
- }
+ if (prev->se.on_rq)
+ update_rq_clock(rq);
+ rq->skip_clock_update = 0;
prev->sched_class->put_prev_task(rq, prev);
}
@@ -3676,14 +3621,13 @@ need_resched_nonpreemptible:
hrtick_clear(rq);
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
clear_tsk_need_resched(prev);
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely(signal_pending_state(prev->state, prev)))
prev->state = TASK_RUNNING;
else
- deactivate_task(rq, prev, 1);
+ deactivate_task(rq, prev, DEQUEUE_SLEEP);
switch_count = &prev->nvcsw;
}
@@ -4006,8 +3950,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(&x->wait, &wait);
+ __add_wait_queue_tail_exclusive(&x->wait, &wait);
do {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
@@ -4233,7 +4176,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
oldprio = p->prio;
prev_class = p->sched_class;
@@ -4254,7 +4196,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq) {
- enqueue_task(rq, p, 0, oldprio < prio);
+ enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
check_class_changed(rq, p, prev_class, oldprio, running);
}
@@ -4276,7 +4218,6 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
- update_rq_clock(rq);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -4298,7 +4239,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (on_rq) {
- enqueue_task(rq, p, 0, false);
+ enqueue_task(rq, p, 0);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4559,7 +4500,6 @@ recheck:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
- update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -5296,17 +5236,15 @@ static inline void sched_init_granularity(void)
/*
* This is how migration works:
*
- * 1) we queue a struct migration_req structure in the source CPU's
- * runqueue and wake up that CPU's migration thread.
- * 2) we down() the locked semaphore => thread blocks.
- * 3) migration thread wakes up (implicitly it forces the migrated
- * thread off the CPU)
- * 4) it gets the migration request and checks whether the migrated
- * task is still in the wrong runqueue.
- * 5) if it's in the wrong runqueue then the migration thread removes
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
* it and puts it into the right queue.
- * 6) migration thread up()s the semaphore.
- * 7) we wake up and the migration is done.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
*/
/*
@@ -5320,12 +5258,23 @@ static inline void sched_init_granularity(void)
*/
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
- struct migration_req req;
unsigned long flags;
struct rq *rq;
+ unsigned int dest_cpu;
int ret = 0;
+ /*
+ * Serialize against TASK_WAKING so that ttwu() and wunt() can
+ * drop the rq->lock and still rely on ->cpus_allowed.
+ */
+again:
+ while (task_is_waking(p))
+ cpu_relax();
rq = task_rq_lock(p, &flags);
+ if (task_is_waking(p)) {
+ task_rq_unlock(rq, &flags);
+ goto again;
+ }
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
@@ -5349,15 +5298,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (migrate_task(p, dest_cpu)) {
+ struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- struct task_struct *mt = rq->migration_thread;
-
- get_task_struct(mt);
task_rq_unlock(rq, &flags);
- wake_up_process(mt);
- put_task_struct(mt);
- wait_for_completion(&req.done);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
@@ -5415,98 +5361,49 @@ fail:
return ret;
}
-#define RCU_MIGRATION_IDLE 0
-#define RCU_MIGRATION_NEED_QS 1
-#define RCU_MIGRATION_GOT_QS 2
-#define RCU_MIGRATION_MUST_SYNC 3
-
/*
- * migration_thread - this is a highprio system thread that performs
- * thread migration by bumping thread off CPU then 'pushing' onto
- * another runqueue.
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
*/
-static int migration_thread(void *data)
+static int migration_cpu_stop(void *data)
{
- int badcpu;
- int cpu = (long)data;
- struct rq *rq;
-
- rq = cpu_rq(cpu);
- BUG_ON(rq->migration_thread != current);
-
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- struct migration_req *req;
- struct list_head *head;
-
- raw_spin_lock_irq(&rq->lock);
-
- if (cpu_is_offline(cpu)) {
- raw_spin_unlock_irq(&rq->lock);
- break;
- }
-
- if (rq->active_balance) {
- active_load_balance(rq, cpu);
- rq->active_balance = 0;
- }
-
- head = &rq->migration_queue;
-
- if (list_empty(head)) {
- raw_spin_unlock_irq(&rq->lock);
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
- continue;
- }
- req = list_entry(head->next, struct migration_req, list);
- list_del_init(head->next);
-
- if (req->task != NULL) {
- raw_spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
- } else if (likely(cpu == (badcpu = smp_processor_id()))) {
- req->dest_cpu = RCU_MIGRATION_GOT_QS;
- raw_spin_unlock(&rq->lock);
- } else {
- req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
- raw_spin_unlock(&rq->lock);
- WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
- }
- local_irq_enable();
-
- complete(&req->done);
- }
- __set_current_state(TASK_RUNNING);
-
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- int ret;
+ struct migration_arg *arg = data;
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
local_irq_disable();
- ret = __migrate_task(p, src_cpu, dest_cpu);
+ __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
local_irq_enable();
- return ret;
+ return 0;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* Figure out where task on dead CPU should go, use force if necessary.
*/
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
- int dest_cpu;
+ struct rq *rq = cpu_rq(dead_cpu);
+ int needs_cpu, uninitialized_var(dest_cpu);
+ unsigned long flags;
-again:
- dest_cpu = select_fallback_rq(dead_cpu, p);
+ local_irq_save(flags);
- /* It can have affinity changed while we were choosing. */
- if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
- goto again;
+ raw_spin_lock(&rq->lock);
+ needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+ if (needs_cpu)
+ dest_cpu = select_fallback_rq(dead_cpu, p);
+ raw_spin_unlock(&rq->lock);
+ /*
+ * It can only fail if we race with set_cpus_allowed(),
+ * in the racer should migrate the task anyway.
+ */
+ if (needs_cpu)
+ __migrate_task(p, dead_cpu, dest_cpu);
+ local_irq_restore(flags);
}
/*
@@ -5570,7 +5467,6 @@ void sched_idle_next(void)
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- update_rq_clock(rq);
activate_task(rq, p, 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5625,7 +5521,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
for ( ; ; ) {
if (!rq->nr_running)
break;
- update_rq_clock(rq);
next = pick_next_task(rq);
if (!next)
break;
@@ -5848,35 +5743,20 @@ static void set_rq_offline(struct rq *rq)
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
- struct task_struct *p;
int cpu = (long)hcpu;
unsigned long flags;
- struct rq *rq;
+ struct rq *rq = cpu_rq(cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
- if (IS_ERR(p))
- return NOTIFY_BAD;
- kthread_bind(p, cpu);
- /* Must be high prio: stop_machine expects to yield to it. */
- rq = task_rq_lock(p, &flags);
- __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
- task_rq_unlock(rq, &flags);
- get_task_struct(p);
- cpu_rq(cpu)->migration_thread = p;
rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- /* Strictly unnecessary, as first user will wake it. */
- wake_up_process(cpu_rq(cpu)->migration_thread);
-
/* Update our root-domain */
- rq = cpu_rq(cpu);
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5887,61 +5767,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!cpu_rq(cpu)->migration_thread)
- break;
- /* Unbind it from offline cpu so it can run. Fall thru. */
- kthread_bind(cpu_rq(cpu)->migration_thread,
- cpumask_any(cpu_online_mask));
- kthread_stop(cpu_rq(cpu)->migration_thread);
- put_task_struct(cpu_rq(cpu)->migration_thread);
- cpu_rq(cpu)->migration_thread = NULL;
- break;
-
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
migrate_live_tasks(cpu);
- rq = cpu_rq(cpu);
- kthread_stop(rq->migration_thread);
- put_task_struct(rq->migration_thread);
- rq->migration_thread = NULL;
/* Idle task back to normal (off runqueue, low prio) */
raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
deactivate_task(rq, rq->idle, 0);
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
rq->idle->sched_class = &idle_sched_class;
migrate_dead_tasks(cpu);
raw_spin_unlock_irq(&rq->lock);
- cpuset_unlock();
migrate_nr_uninterruptible(rq);
BUG_ON(rq->nr_running != 0);
calc_global_load_remove(rq);
- /*
- * No need to migrate the tasks: it was best-effort if
- * they didn't take sched_hotcpu_mutex. Just wake up
- * the requestors.
- */
- raw_spin_lock_irq(&rq->lock);
- while (!list_empty(&rq->migration_queue)) {
- struct migration_req *req;
-
- req = list_entry(rq->migration_queue.next,
- struct migration_req, list);
- list_del_init(&req->list);
- raw_spin_unlock_irq(&rq->lock);
- complete(&req->done);
- raw_spin_lock_irq(&rq->lock);
- }
- raw_spin_unlock_irq(&rq->lock);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
/* Update our root-domain */
- rq = cpu_rq(cpu);
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6272,6 +6115,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
+ for (tmp = sd; tmp; tmp = tmp->parent)
+ tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
+
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
@@ -7755,10 +7601,8 @@ void __init sched_init(void)
rq->push_cpu = 0;
rq->cpu = i;
rq->online = 0;
- rq->migration_thread = NULL;
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
- INIT_LIST_HEAD(&rq->migration_queue);
rq_attach_root(rq, &def_root_domain);
#endif
init_rq_hrtick(rq);
@@ -7859,7 +7703,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
{
int on_rq;
- update_rq_clock(rq);
on_rq = p->se.on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
@@ -7886,9 +7729,9 @@ void normalize_rt_tasks(void)
p->se.exec_start = 0;
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_start = 0;
- p->se.sleep_start = 0;
- p->se.block_start = 0;
+ p->se.statistics.wait_start = 0;
+ p->se.statistics.sleep_start = 0;
+ p->se.statistics.block_start = 0;
#endif
if (!rt_task(p)) {
@@ -8221,8 +8064,6 @@ void sched_move_task(struct task_struct *tsk)
rq = task_rq_lock(tsk, &flags);
- update_rq_clock(rq);
-
running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
@@ -8241,7 +8082,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (on_rq)
- enqueue_task(rq, tsk, 0, false);
+ enqueue_task(rq, tsk, 0);
task_rq_unlock(rq, &flags);
}
@@ -9055,43 +8896,32 @@ struct cgroup_subsys cpuacct_subsys = {
#ifndef CONFIG_SMP
-int rcu_expedited_torture_stats(char *page)
-{
- return 0;
-}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
void synchronize_sched_expedited(void)
{
+ barrier();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
#else /* #ifndef CONFIG_SMP */
-static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
-static DEFINE_MUTEX(rcu_sched_expedited_mutex);
-
-#define RCU_EXPEDITED_STATE_POST -2
-#define RCU_EXPEDITED_STATE_IDLE -1
-
-static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-int rcu_expedited_torture_stats(char *page)
+static int synchronize_sched_expedited_cpu_stop(void *data)
{
- int cnt = 0;
- int cpu;
-
- cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
- for_each_online_cpu(cpu) {
- cnt += sprintf(&page[cnt], " %d:%d",
- cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
- }
- cnt += sprintf(&page[cnt], "\n");
- return cnt;
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
-static long synchronize_sched_expedited_count;
/*
* Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9105,18 +8935,14 @@ static long synchronize_sched_expedited_count;
*/
void synchronize_sched_expedited(void)
{
- int cpu;
- unsigned long flags;
- bool need_full_sync = 0;
- struct rq *rq;
- struct migration_req *req;
- long snap;
- int trycount = 0;
+ int snap, trycount = 0;
smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+ snap = atomic_read(&synchronize_sched_expedited_count) + 1;
get_online_cpus();
- while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+ while (try_stop_cpus(cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
put_online_cpus();
if (trycount++ < 10)
udelay(trycount * num_online_cpus());
@@ -9124,41 +8950,15 @@ void synchronize_sched_expedited(void)
synchronize_sched();
return;
}
- if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+ if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
smp_mb(); /* ensure test happens before caller kfree */
return;
}
get_online_cpus();
}
- rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
- for_each_online_cpu(cpu) {
- rq = cpu_rq(cpu);
- req = &per_cpu(rcu_migration_req, cpu);
- init_completion(&req->done);
- req->task = NULL;
- req->dest_cpu = RCU_MIGRATION_NEED_QS;
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_add(&req->list, &rq->migration_queue);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- wake_up_process(rq->migration_thread);
- }
- for_each_online_cpu(cpu) {
- rcu_expedited_state = cpu;
- req = &per_cpu(rcu_migration_req, cpu);
- rq = cpu_rq(cpu);
- wait_for_completion(&req->done);
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
- need_full_sync = 1;
- req->dest_cpu = RCU_MIGRATION_IDLE;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
- rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
- synchronize_sched_expedited_count++;
- mutex_unlock(&rcu_sched_expedited_mutex);
+ atomic_inc(&synchronize_sched_expedited_count);
+ smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
put_online_cpus();
- if (need_full_sync)
- synchronize_sched();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9b49db14403..9cf1baf6616 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
- PN(se->wait_start);
- PN(se->sleep_start);
- PN(se->block_start);
- PN(se->sleep_max);
- PN(se->block_max);
- PN(se->exec_max);
- PN(se->slice_max);
- PN(se->wait_max);
- PN(se->wait_sum);
- P(se->wait_count);
+ PN(se->statistics.wait_start);
+ PN(se->statistics.sleep_start);
+ PN(se->statistics.block_start);
+ PN(se->statistics.sleep_max);
+ PN(se->statistics.block_max);
+ PN(se->statistics.exec_max);
+ PN(se->statistics.slice_max);
+ PN(se->statistics.wait_max);
+ PN(se->statistics.wait_sum);
+ P(se->statistics.wait_count);
#endif
P(se->load.weight);
#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(p->se.vruntime),
SPLIT_NS(p->se.sum_exec_runtime),
- SPLIT_NS(p->se.sum_sleep_runtime));
+ SPLIT_NS(p->se.statistics.sum_sleep_runtime));
#else
SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -173,11 +173,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
task_group_path(tg, path, sizeof(path));
SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
- {
- uid_t uid = cfs_rq->tg->uid;
- SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
- }
#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
#endif
@@ -407,40 +402,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_start);
PN(se.vruntime);
PN(se.sum_exec_runtime);
- PN(se.avg_overlap);
- PN(se.avg_wakeup);
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
- PN(se.wait_start);
- PN(se.sleep_start);
- PN(se.block_start);
- PN(se.sleep_max);
- PN(se.block_max);
- PN(se.exec_max);
- PN(se.slice_max);
- PN(se.wait_max);
- PN(se.wait_sum);
- P(se.wait_count);
- PN(se.iowait_sum);
- P(se.iowait_count);
+ PN(se.statistics.wait_start);
+ PN(se.statistics.sleep_start);
+ PN(se.statistics.block_start);
+ PN(se.statistics.sleep_max);
+ PN(se.statistics.block_max);
+ PN(se.statistics.exec_max);
+ PN(se.statistics.slice_max);
+ PN(se.statistics.wait_max);
+ PN(se.statistics.wait_sum);
+ P(se.statistics.wait_count);
+ PN(se.statistics.iowait_sum);
+ P(se.statistics.iowait_count);
P(sched_info.bkl_count);
P(se.nr_migrations);
- P(se.nr_migrations_cold);
- P(se.nr_failed_migrations_affine);
- P(se.nr_failed_migrations_running);
- P(se.nr_failed_migrations_hot);
- P(se.nr_forced_migrations);
- P(se.nr_wakeups);
- P(se.nr_wakeups_sync);
- P(se.nr_wakeups_migrate);
- P(se.nr_wakeups_local);
- P(se.nr_wakeups_remote);
- P(se.nr_wakeups_affine);
- P(se.nr_wakeups_affine_attempts);
- P(se.nr_wakeups_passive);
- P(se.nr_wakeups_idle);
+ P(se.statistics.nr_migrations_cold);
+ P(se.statistics.nr_failed_migrations_affine);
+ P(se.statistics.nr_failed_migrations_running);
+ P(se.statistics.nr_failed_migrations_hot);
+ P(se.statistics.nr_forced_migrations);
+ P(se.statistics.nr_wakeups);
+ P(se.statistics.nr_wakeups_sync);
+ P(se.statistics.nr_wakeups_migrate);
+ P(se.statistics.nr_wakeups_local);
+ P(se.statistics.nr_wakeups_remote);
+ P(se.statistics.nr_wakeups_affine);
+ P(se.statistics.nr_wakeups_affine_attempts);
+ P(se.statistics.nr_wakeups_passive);
+ P(se.statistics.nr_wakeups_idle);
{
u64 avg_atom, avg_per_cpu;
@@ -491,31 +484,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
void proc_sched_set_task(struct task_struct *p)
{
#ifdef CONFIG_SCHEDSTATS
- p->se.wait_max = 0;
- p->se.wait_sum = 0;
- p->se.wait_count = 0;
- p->se.iowait_sum = 0;
- p->se.iowait_count = 0;
- p->se.sleep_max = 0;
- p->se.sum_sleep_runtime = 0;
- p->se.block_max = 0;
- p->se.exec_max = 0;
- p->se.slice_max = 0;
- p->se.nr_migrations = 0;
- p->se.nr_migrations_cold = 0;
- p->se.nr_failed_migrations_affine = 0;
- p->se.nr_failed_migrations_running = 0;
- p->se.nr_failed_migrations_hot = 0;
- p->se.nr_forced_migrations = 0;
- p->se.nr_wakeups = 0;
- p->se.nr_wakeups_sync = 0;
- p->se.nr_wakeups_migrate = 0;
- p->se.nr_wakeups_local = 0;
- p->se.nr_wakeups_remote = 0;
- p->se.nr_wakeups_affine = 0;
- p->se.nr_wakeups_affine_attempts = 0;
- p->se.nr_wakeups_passive = 0;
- p->se.nr_wakeups_idle = 0;
- p->sched_info.bkl_count = 0;
+ memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924..217e4a9393e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
/*
* Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
/*
* After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
{
unsigned long delta_exec_weighted;
- schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+ schedstat_set(curr->statistics.exec_max,
+ max((u64)delta_exec, curr->statistics.exec_max));
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+ schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}
/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- schedstat_set(se->wait_max, max(se->wait_max,
- rq_of(cfs_rq)->clock - se->wait_start));
- schedstat_set(se->wait_count, se->wait_count + 1);
- schedstat_set(se->wait_sum, se->wait_sum +
- rq_of(cfs_rq)->clock - se->wait_start);
+ schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+ rq_of(cfs_rq)->clock - se->statistics.wait_start));
+ schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+ schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
trace_sched_stat_wait(task_of(se),
- rq_of(cfs_rq)->clock - se->wait_start);
+ rq_of(cfs_rq)->clock - se->statistics.wait_start);
}
#endif
- schedstat_set(se->wait_start, 0);
+ schedstat_set(se->statistics.wait_start, 0);
}
static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (entity_is_task(se))
tsk = task_of(se);
- if (se->sleep_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+ if (se->statistics.sleep_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->sleep_max))
- se->sleep_max = delta;
+ if (unlikely(delta > se->statistics.sleep_max))
+ se->statistics.sleep_max = delta;
- se->sleep_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.sleep_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
trace_sched_stat_sleep(tsk, delta);
}
}
- if (se->block_start) {
- u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+ if (se->statistics.block_start) {
+ u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
if ((s64)delta < 0)
delta = 0;
- if (unlikely(delta > se->block_max))
- se->block_max = delta;
+ if (unlikely(delta > se->statistics.block_max))
+ se->statistics.block_max = delta;
- se->block_start = 0;
- se->sum_sleep_runtime += delta;
+ se->statistics.block_start = 0;
+ se->statistics.sum_sleep_runtime += delta;
if (tsk) {
if (tsk->in_iowait) {
- se->iowait_sum += delta;
- se->iowait_count++;
+ se->statistics.iowait_sum += delta;
+ se->statistics.iowait_count++;
trace_sched_stat_iowait(tsk, delta);
}
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime += sched_vslice(cfs_rq, se);
/* sleeps up to a single latency don't count. */
- if (!initial && sched_feat(FAIR_SLEEPERS)) {
+ if (!initial) {
unsigned long thresh = sysctl_sched_latency;
/*
- * Convert the sleeper threshold into virtual time.
- * SCHED_IDLE is a special sub-class. We care about
- * fairness only relative to other SCHED_IDLE tasks,
- * all of which have the same weight.
- */
- if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
- task_of(se)->policy != SCHED_IDLE))
- thresh = calc_delta_fair(thresh, se);
-
- /*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
se->vruntime = vruntime;
}
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_MIGRATE 2
-
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update the normalized vruntime before updating min_vruntime
* through callig update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
se->vruntime += cfs_rq->min_vruntime;
/*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
update_curr(cfs_rq);
update_stats_dequeue(cfs_rq, se);
- if (sleep) {
+ if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- se->sleep_start = rq_of(cfs_rq)->clock;
+ se->statistics.sleep_start = rq_of(cfs_rq)->clock;
if (tsk->state & TASK_UNINTERRUPTIBLE)
- se->block_start = rq_of(cfs_rq)->clock;
+ se->statistics.block_start = rq_of(cfs_rq)->clock;
}
#endif
}
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
* update can refer to the ->curr item and we need to reflect this
* movement in our normalized position.
*/
- if (!sleep)
+ if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
}
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
- se->slice_max = max(se->slice_max,
+ se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
* then put the task into the rbtree:
*/
static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int flags = 0;
-
- if (wakeup)
- flags |= ENQUEUE_WAKEUP;
- if (p->state == TASK_WAKING)
- flags |= ENQUEUE_MIGRATE;
for_each_sched_entity(se) {
if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
* decreased. We remove the task from the rbtree and
* update the fair scheduling stats:
*/
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- dequeue_entity(cfs_rq, se, sleep);
+ dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
- sleep = 1;
+ flags |= DEQUEUE_SLEEP;
}
hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- struct task_struct *curr = current;
unsigned long this_load, load;
int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
- if (sync) {
- if (sched_feat(SYNC_LESS) &&
- (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
- } else {
- if (sched_feat(SYNC_MORE) &&
- (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost))
- sync = 1;
- }
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
if (sync && balanced)
return 1;
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
* there is no bad imbalance.
*/
schedstat_inc(sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
+ schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_sibling(struct task_struct *p, int target)
{
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
+ struct sched_domain *sd;
int i;
/*
- * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
- * test in select_task_rq_fair) and the prev_cpu is idle then that's
- * always a better target than the current cpu.
+ * If the task is going to be woken-up on this cpu and if it is
+ * already idle, then it is the right target.
*/
- if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+ if (target == cpu && idle_cpu(cpu))
+ return cpu;
+
+ /*
+ * If the task is going to be woken-up on the cpu where it previously
+ * ran and if it is currently idle, then it the right target.
+ */
+ if (target == prev_cpu && idle_cpu(prev_cpu))
return prev_cpu;
/*
- * Otherwise, iterate the domain and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an elegible idle cpu.
*/
- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
- if (!cpu_rq(i)->cfs.nr_running) {
- target = i;
+ for_each_domain(target, sd) {
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
+
+ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+ if (idle_cpu(i)) {
+ target = i;
+ break;
+ }
}
+
+ /*
+ * Lets stop looking for an idle sibling when we reached
+ * the domain that spans the current cpu and prev_cpu.
+ */
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ break;
}
return target;
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
*
* preempt must be disabled.
*/
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE) {
- if (sched_feat(AFFINE_WAKEUPS) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed))
want_affine = 1;
new_cpu = prev_cpu;
}
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
}
/*
- * While iterating the domains looking for a spanning
- * WAKE_AFFINE domain, adjust the affine target to any idle cpu
- * in cache sharing domains along the way.
+ * If both cpu and prev_cpu are part of this domain,
+ * cpu is a valid SD_WAKE_AFFINE target.
*/
- if (want_affine) {
- int target = -1;
-
- /*
- * If both cpu and prev_cpu are part of this domain,
- * cpu is a valid SD_WAKE_AFFINE target.
- */
- if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
- target = cpu;
-
- /*
- * If there's an idle sibling in this domain, make that
- * the wake_affine target instead of the current cpu.
- */
- if (tmp->flags & SD_SHARE_PKG_RESOURCES)
- target = select_idle_sibling(p, tmp, target);
-
- if (target >= 0) {
- if (tmp->flags & SD_WAKE_AFFINE) {
- affine_sd = tmp;
- want_affine = 0;
- }
- cpu = target;
- }
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+ affine_sd = tmp;
+ want_affine = 0;
}
if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
sd = tmp;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (sched_feat(LB_SHARES_UPDATE)) {
/*
* Pick the largest domain to update shares over
*/
tmp = sd;
- if (affine_sd && (!tmp ||
- cpumask_weight(sched_domain_span(affine_sd)) >
- cpumask_weight(sched_domain_span(sd))))
+ if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
tmp = affine_sd;
- if (tmp)
+ if (tmp) {
+ raw_spin_unlock(&rq->lock);
update_shares(tmp);
+ raw_spin_lock(&rq->lock);
+ }
}
+#endif
- if (affine_sd && wake_affine(affine_sd, p, sync))
- return cpu;
+ if (affine_sd) {
+ if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ return select_idle_sibling(p, cpu);
+ else
+ return select_idle_sibling(p, prev_cpu);
+ }
while (sd) {
int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
- weight = cpumask_weight(sched_domain_span(sd));
+ weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
- if (weight <= cpumask_weight(sched_domain_span(tmp)))
+ if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
}
#endif /* CONFIG_SMP */
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- * degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
- u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
- u64 gran = 0;
-
- if (this_run < expected_wakeup)
- gran = expected_wakeup - this_run;
-
- return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
- if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
- gran = adaptive_gran(curr, se);
-
/*
* Since its curr running now, convert the gran from real-time
* to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
*/
- if (sched_feat(ASYM_GRAN)) {
- /*
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
- } else {
- if (unlikely(curr->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, curr);
- }
+ if (unlikely(se->load.weight != NICE_0_LOAD))
+ gran = calc_delta_fair(gran, se);
return gran;
}
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int sync = wake_flags & WF_SYNC;
int scale = cfs_rq->nr_running >= sched_nr_latency;
if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(curr->policy == SCHED_IDLE))
goto preempt;
- if (sched_feat(WAKEUP_SYNC) && sync)
- goto preempt;
-
- if (sched_feat(WAKEUP_OVERLAP) &&
- se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost)
- goto preempt;
-
if (!sched_feat(WAKEUP_PREEMPT))
return;
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 3) are cache-hot on their current CPU.
*/
if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
- schedstat_inc(p, se.nr_failed_migrations_affine);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
*all_pinned = 0;
if (task_running(rq, p)) {
- schedstat_inc(p, se.nr_failed_migrations_running);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
#ifdef CONFIG_SCHEDSTATS
if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
- schedstat_inc(p, se.nr_forced_migrations);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
}
#endif
return 1;
}
if (tsk_cache_hot) {
- schedstat_inc(p, se.nr_failed_migrations_hot);
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
- unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long weight = sd->span_weight;
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
+static int active_load_balance_cpu_stop(void *data);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
if (need_active_balance(sd, sd_idle, idle)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
- /* don't kick the migration_thread, if the curr
- * task on busiest cpu can't be moved to this_cpu
+ /* don't kick the active_load_balance_cpu_stop,
+ * if the curr task on busiest cpu can't be
+ * moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
&busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
goto out_one_pinned;
}
+ /*
+ * ->active_balance synchronizes accesses to
+ * ->active_balance_work. Once set, it's cleared
+ * only after active load balance is finished.
+ */
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
if (active_balance)
- wake_up_process(busiest->migration_thread);
+ stop_one_cpu_nowait(cpu_of(busiest),
+ active_load_balance_cpu_stop, busiest,
+ &busiest->active_balance_work);
/*
* We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
}
/*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
*/
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
{
+ struct rq *busiest_rq = data;
+ int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
- struct rq *target_rq;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance))
+ goto out_unlock;
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
- return;
-
- target_rq = cpu_rq(target_cpu);
+ goto out_unlock;
/*
* This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
- update_rq_clock(busiest_rq);
- update_rq_clock(target_rq);
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
schedstat_inc(sd, alb_failed);
}
double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
}
#ifdef CONFIG_NO_HZ
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d..83c66e8ad3e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
/*
- * Disregards a certain amount of sleep time (sched_latency_ns) and
- * considers the task to be running during that period. This gives it
- * a service deficit on wakeup, allowing it to run sooner.
- */
-SCHED_FEAT(FAIR_SLEEPERS, 1)
-
-/*
* Only give sleepers 50% of their service deficit. This allows
* them to run sooner, but does not allow tons of sleepers to
* rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
/*
- * By not normalizing the sleep time, heavy tasks get an effective
- * longer period, and lighter task an effective shorter period they
- * are considered running.
- */
-SCHED_FEAT(NORMALIZED_SLEEPER, 0)
-
-/*
* Place new tasks ahead so that they do not starve already running
* tasks
*/
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
/*
- * Compute wakeup_gran based on task behaviour, clipped to
- * [0, sched_wakeup_gran_ns]
- */
-SCHED_FEAT(ADAPTIVE_GRAN, 1)
-
-/*
- * When converting the wakeup granularity to virtual time, do it such
- * that heavier tasks preempting a lighter task have an edge.
- */
-SCHED_FEAT(ASYM_GRAN, 1)
-
-/*
- * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
- */
-SCHED_FEAT(WAKEUP_SYNC, 0)
-
-/*
- * Wakeup preempt based on task behaviour. Tasks that do not overlap
- * don't get preempted.
- */
-SCHED_FEAT(WAKEUP_OVERLAP, 0)
-
-/*
- * Use the SYNC wakeup hint, pipes and the likes use this to indicate
- * the remote end is likely to consume the data we just wrote, and
- * therefore has cache benefit from being placed on the same cpu, see
- * also AFFINE_WAKEUPS.
- */
-SCHED_FEAT(SYNC_WAKEUPS, 1)
-
-/*
* Based on load and program behaviour, see if it makes sense to place
* a newly woken task on the same cpu as the task that woke it --
* improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
/*
- * Weaken SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_LESS, 1)
-
-/*
- * Add SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_MORE, 0)
-
-/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a5094..9fa0f402c87 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
*/
#ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
- /* adjust the active tasks as we might go into a long sleep */
- calc_load_account_active(rq);
+ calc_load_account_idle(rq);
return rq->idle;
}
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
* message if some code attempts to do it:
*/
static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
raw_spin_unlock_irq(&rq->lock);
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea..8afb953e31c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
- schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+ schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Adding/removing a task to/from a priority array:
*/
static void
-enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
- if (wakeup)
+ if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
- enqueue_rt_entity(rt_se, head);
+ enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
{
- struct rq *rq = task_rq(p);
-
if (sd_flag != SD_BALANCE_WAKE)
return smp_processor_id();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79..ef51d1fcf5e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,381 @@
-/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
- * GPL v2 and any later version.
+/*
+ * kernel/stop_machine.c
+ *
+ * Copyright (C) 2008, 2005 IBM Corporation.
+ * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
+ * Copyright (C) 2010 SUSE Linux Products GmbH
+ * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2 and any later version.
*/
+#include <linux/completion.h>
#include <linux/cpu.h>
-#include <linux/err.h>
+#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/module.h>
+#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/stop_machine.h>
-#include <linux/syscalls.h>
#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
#include <asm/atomic.h>
-#include <asm/uaccess.h>
+
+/*
+ * Structure to determine completion condition and record errors. May
+ * be shared by works on different cpus.
+ */
+struct cpu_stop_done {
+ atomic_t nr_todo; /* nr left to execute */
+ bool executed; /* actually executed? */
+ int ret; /* collected return value */
+ struct completion completion; /* fired if nr_todo reaches 0 */
+};
+
+/* the actual stopper, one per every possible cpu, enabled on online cpus */
+struct cpu_stopper {
+ spinlock_t lock;
+ struct list_head works; /* list of pending works */
+ struct task_struct *thread; /* stopper thread */
+ bool enabled; /* is this stopper enabled? */
+};
+
+static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+
+static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
+{
+ memset(done, 0, sizeof(*done));
+ atomic_set(&done->nr_todo, nr_todo);
+ init_completion(&done->completion);
+}
+
+/* signal completion unless @done is NULL */
+static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+{
+ if (done) {
+ if (executed)
+ done->executed = true;
+ if (atomic_dec_and_test(&done->nr_todo))
+ complete(&done->completion);
+ }
+}
+
+/* queue @work to @stopper. if offline, @work is completed immediately */
+static void cpu_stop_queue_work(struct cpu_stopper *stopper,
+ struct cpu_stop_work *work)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+
+ if (stopper->enabled) {
+ list_add_tail(&work->list, &stopper->works);
+ wake_up_process(stopper->thread);
+ } else
+ cpu_stop_signal_done(work->done, false);
+
+ spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+/**
+ * stop_one_cpu - stop a cpu
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on @cpu. @fn is run in a process context with
+ * the highest priority preempting any task on the cpu and
+ * monopolizing it. This function returns after the execution is
+ * complete.
+ *
+ * This function doesn't guarantee @cpu stays online till @fn
+ * completes. If @cpu goes down in the middle, execution may happen
+ * partially or fully on different cpus. @fn should either be ready
+ * for that or the caller should ensure that @cpu stays online until
+ * this function completes.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
+ * otherwise, the return value of @fn.
+ */
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_done done;
+ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+
+ cpu_stop_init_done(&done, 1);
+ cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+ wait_for_completion(&done.completion);
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_one_cpu_nowait - stop a cpu but don't wait for completion
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Similar to stop_one_cpu() but doesn't wait for completion. The
+ * caller is responsible for ensuring @work_buf is currently unused
+ * and will remain untouched until stopper starts executing @fn.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+ struct cpu_stop_work *work_buf)
+{
+ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+ cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+}
+
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
+
+int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ struct cpu_stop_work *work;
+ struct cpu_stop_done done;
+ unsigned int cpu;
+
+ /* initialize works and done */
+ for_each_cpu(cpu, cpumask) {
+ work = &per_cpu(stop_cpus_work, cpu);
+ work->fn = fn;
+ work->arg = arg;
+ work->done = &done;
+ }
+ cpu_stop_init_done(&done, cpumask_weight(cpumask));
+
+ /*
+ * Disable preemption while queueing to avoid getting
+ * preempted by a stopper which might wait for other stoppers
+ * to enter @fn which can lead to deadlock.
+ */
+ preempt_disable();
+ for_each_cpu(cpu, cpumask)
+ cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
+ &per_cpu(stop_cpus_work, cpu));
+ preempt_enable();
+
+ wait_for_completion(&done.completion);
+ return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_cpus - stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
+ * @fn is run in a process context with the highest priority
+ * preempting any task on the cpu and monopolizing it. This function
+ * returns after all executions are complete.
+ *
+ * This function doesn't guarantee the cpus in @cpumask stay online
+ * till @fn completes. If some cpus go down in the middle, execution
+ * on the cpu may happen partially or fully on different cpus. @fn
+ * should either be ready for that or the caller should ensure that
+ * the cpus stay online until this function completes.
+ *
+ * All stop_cpus() calls are serialized making it safe for @fn to wait
+ * for all cpus to start executing it.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed at all because all cpus in
+ * @cpumask were offline; otherwise, 0 if all executions of @fn
+ * returned 0, any non zero return value if any returned non zero.
+ */
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ int ret;
+
+ /* static works are used, process one request at a time */
+ mutex_lock(&stop_cpus_mutex);
+ ret = __stop_cpus(cpumask, fn, arg);
+ mutex_unlock(&stop_cpus_mutex);
+ return ret;
+}
+
+/**
+ * try_stop_cpus - try to stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Identical to stop_cpus() except that it fails with -EAGAIN if
+ * someone else is already using the facility.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -EAGAIN if someone else is already stopping cpus, -ENOENT if
+ * @fn(@arg) was not executed at all because all cpus in @cpumask were
+ * offline; otherwise, 0 if all executions of @fn returned 0, any non
+ * zero return value if any returned non zero.
+ */
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+ int ret;
+
+ /* static works are used, process one request at a time */
+ if (!mutex_trylock(&stop_cpus_mutex))
+ return -EAGAIN;
+ ret = __stop_cpus(cpumask, fn, arg);
+ mutex_unlock(&stop_cpus_mutex);
+ return ret;
+}
+
+static int cpu_stopper_thread(void *data)
+{
+ struct cpu_stopper *stopper = data;
+ struct cpu_stop_work *work;
+ int ret;
+
+repeat:
+ set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ work = NULL;
+ spin_lock_irq(&stopper->lock);
+ if (!list_empty(&stopper->works)) {
+ work = list_first_entry(&stopper->works,
+ struct cpu_stop_work, list);
+ list_del_init(&work->list);
+ }
+ spin_unlock_irq(&stopper->lock);
+
+ if (work) {
+ cpu_stop_fn_t fn = work->fn;
+ void *arg = work->arg;
+ struct cpu_stop_done *done = work->done;
+ char ksym_buf[KSYM_NAME_LEN];
+
+ __set_current_state(TASK_RUNNING);
+
+ /* cpu stop callbacks are not allowed to sleep */
+ preempt_disable();
+
+ ret = fn(arg);
+ if (ret)
+ done->ret = ret;
+
+ /* restore preemption and check it's still balanced */
+ preempt_enable();
+ WARN_ONCE(preempt_count(),
+ "cpu_stop: %s(%p) leaked preempt count\n",
+ kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
+ ksym_buf), arg);
+
+ cpu_stop_signal_done(done, true);
+ } else
+ schedule();
+
+ goto repeat;
+}
+
+/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
+static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+ unsigned int cpu = (unsigned long)hcpu;
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ struct cpu_stop_work *work;
+ struct task_struct *p;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ BUG_ON(stopper->thread || stopper->enabled ||
+ !list_empty(&stopper->works));
+ p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
+ cpu);
+ if (IS_ERR(p))
+ return NOTIFY_BAD;
+ sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+ get_task_struct(p);
+ stopper->thread = p;
+ break;
+
+ case CPU_ONLINE:
+ kthread_bind(stopper->thread, cpu);
+ /* strictly unnecessary, as first user will wake it */
+ wake_up_process(stopper->thread);
+ /* mark enabled */
+ spin_lock_irq(&stopper->lock);
+ stopper->enabled = true;
+ spin_unlock_irq(&stopper->lock);
+ break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ /* kill the stopper */
+ kthread_stop(stopper->thread);
+ /* drain remaining works */
+ spin_lock_irq(&stopper->lock);
+ list_for_each_entry(work, &stopper->works, list)
+ cpu_stop_signal_done(work->done, false);
+ stopper->enabled = false;
+ spin_unlock_irq(&stopper->lock);
+ /* release the stopper */
+ put_task_struct(stopper->thread);
+ stopper->thread = NULL;
+ break;
+#endif
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Give it a higher priority so that cpu stopper is available to other
+ * cpu notifiers. It currently shares the same priority as sched
+ * migration_notifier.
+ */
+static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
+ .notifier_call = cpu_stop_cpu_callback,
+ .priority = 10,
+};
+
+static int __init cpu_stop_init(void)
+{
+ void *bcpu = (void *)(long)smp_processor_id();
+ unsigned int cpu;
+ int err;
+
+ for_each_possible_cpu(cpu) {
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+ spin_lock_init(&stopper->lock);
+ INIT_LIST_HEAD(&stopper->works);
+ }
+
+ /* start one for the boot cpu */
+ err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
+ bcpu);
+ BUG_ON(err == NOTIFY_BAD);
+ cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
+ register_cpu_notifier(&cpu_stop_cpu_notifier);
+
+ return 0;
+}
+early_initcall(cpu_stop_init);
+
+#ifdef CONFIG_STOP_MACHINE
/* This controls the threads on each CPU. */
enum stopmachine_state {
@@ -26,174 +390,94 @@ enum stopmachine_state {
/* Exit */
STOPMACHINE_EXIT,
};
-static enum stopmachine_state state;
struct stop_machine_data {
- int (*fn)(void *);
- void *data;
- int fnret;
+ int (*fn)(void *);
+ void *data;
+ /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+ unsigned int num_threads;
+ const struct cpumask *active_cpus;
+
+ enum stopmachine_state state;
+ atomic_t thread_ack;
};
-/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
-static atomic_t thread_ack;
-static DEFINE_MUTEX(lock);
-/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
-static DEFINE_MUTEX(setup_lock);
-/* Users of stop_machine. */
-static int refcount;
-static struct workqueue_struct *stop_machine_wq;
-static struct stop_machine_data active, idle;
-static const struct cpumask *active_cpus;
-static void __percpu *stop_machine_work;
-
-static void set_state(enum stopmachine_state newstate)
+static void set_state(struct stop_machine_data *smdata,
+ enum stopmachine_state newstate)
{
/* Reset ack counter. */
- atomic_set(&thread_ack, num_threads);
+ atomic_set(&smdata->thread_ack, smdata->num_threads);
smp_wmb();
- state = newstate;
+ smdata->state = newstate;
}
/* Last one to ack a state moves to the next state. */
-static void ack_state(void)
+static void ack_state(struct stop_machine_data *smdata)
{
- if (atomic_dec_and_test(&thread_ack))
- set_state(state + 1);
+ if (atomic_dec_and_test(&smdata->thread_ack))
+ set_state(smdata, smdata->state + 1);
}
-/* This is the actual function which stops the CPU. It runs
- * in the context of a dedicated stopmachine workqueue. */
-static void stop_cpu(struct work_struct *unused)
+/* This is the cpu_stop function which stops the CPU. */
+static int stop_machine_cpu_stop(void *data)
{
+ struct stop_machine_data *smdata = data;
enum stopmachine_state curstate = STOPMACHINE_NONE;
- struct stop_machine_data *smdata = &idle;
- int cpu = smp_processor_id();
- int err;
+ int cpu = smp_processor_id(), err = 0;
+ bool is_active;
+
+ if (!smdata->active_cpus)
+ is_active = cpu == cpumask_first(cpu_online_mask);
+ else
+ is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
- if (!active_cpus) {
- if (cpu == cpumask_first(cpu_online_mask))
- smdata = &active;
- } else {
- if (cpumask_test_cpu(cpu, active_cpus))
- smdata = &active;
- }
/* Simple state machine */
do {
/* Chill out and ensure we re-read stopmachine_state. */
cpu_relax();
- if (state != curstate) {
- curstate = state;
+ if (smdata->state != curstate) {
+ curstate = smdata->state;
switch (curstate) {
case STOPMACHINE_DISABLE_IRQ:
local_irq_disable();
hard_irq_disable();
break;
case STOPMACHINE_RUN:
- /* On multiple CPUs only a single error code
- * is needed to tell that something failed. */
- err = smdata->fn(smdata->data);
- if (err)
- smdata->fnret = err;
+ if (is_active)
+ err = smdata->fn(smdata->data);
break;
default:
break;
}
- ack_state();
+ ack_state(smdata);
}
} while (curstate != STOPMACHINE_EXIT);
local_irq_enable();
+ return err;
}
-/* Callback for CPUs which aren't supposed to do anything. */
-static int chill(void *unused)
-{
- return 0;
-}
-
-int stop_machine_create(void)
-{
- mutex_lock(&setup_lock);
- if (refcount)
- goto done;
- stop_machine_wq = create_rt_workqueue("kstop");
- if (!stop_machine_wq)
- goto err_out;
- stop_machine_work = alloc_percpu(struct work_struct);
- if (!stop_machine_work)
- goto err_out;
-done:
- refcount++;
- mutex_unlock(&setup_lock);
- return 0;
-
-err_out:
- if (stop_machine_wq)
- destroy_workqueue(stop_machine_wq);
- mutex_unlock(&setup_lock);
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(stop_machine_create);
-
-void stop_machine_destroy(void)
-{
- mutex_lock(&setup_lock);
- refcount--;
- if (refcount)
- goto done;
- destroy_workqueue(stop_machine_wq);
- free_percpu(stop_machine_work);
-done:
- mutex_unlock(&setup_lock);
-}
-EXPORT_SYMBOL_GPL(stop_machine_destroy);
-
int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
- struct work_struct *sm_work;
- int i, ret;
-
- /* Set up initial state. */
- mutex_lock(&lock);
- num_threads = num_online_cpus();
- active_cpus = cpus;
- active.fn = fn;
- active.data = data;
- active.fnret = 0;
- idle.fn = chill;
- idle.data = NULL;
-
- set_state(STOPMACHINE_PREPARE);
-
- /* Schedule the stop_cpu work on all cpus: hold this CPU so one
- * doesn't hit this CPU until we're ready. */
- get_cpu();
- for_each_online_cpu(i) {
- sm_work = per_cpu_ptr(stop_machine_work, i);
- INIT_WORK(sm_work, stop_cpu);
- queue_work_on(i, stop_machine_wq, sm_work);
- }
- /* This will release the thread on our CPU. */
- put_cpu();
- flush_workqueue(stop_machine_wq);
- ret = active.fnret;
- mutex_unlock(&lock);
- return ret;
+ struct stop_machine_data smdata = { .fn = fn, .data = data,
+ .num_threads = num_online_cpus(),
+ .active_cpus = cpus };
+
+ /* Set the initial state and stop all online cpus. */
+ set_state(&smdata, STOPMACHINE_PREPARE);
+ return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
}
int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
{
int ret;
- ret = stop_machine_create();
- if (ret)
- return ret;
/* No CPUs can come up or down during this. */
get_online_cpus();
ret = __stop_machine(fn, data, cpus);
put_online_cpus();
- stop_machine_destroy();
return ret;
}
EXPORT_SYMBOL_GPL(stop_machine);
+
+#endif /* CONFIG_STOP_MACHINE */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f5..1d7b9bc1c03 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog();
}
+/*
+ * Updates the per cpu time idle statistics counters
+ */
+static void
+update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+{
+ ktime_t delta;
+
+ if (ts->idle_active) {
+ delta = ktime_sub(now, ts->idle_entrytime);
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ if (nr_iowait_cpu() > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ ts->idle_entrytime = now;
+ }
+
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+}
+
static void tick_nohz_stop_idle(int cpu, ktime_t now)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t delta;
- delta = ktime_sub(now, ts->idle_entrytime);
- ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ update_ts_time_stats(ts, now, NULL);
ts->idle_active = 0;
sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
{
- ktime_t now, delta;
+ ktime_t now;
now = ktime_get();
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- ts->idle_lastupdate = now;
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- }
+
+ update_ts_time_stats(ts, now, NULL);
+
ts->idle_entrytime = now;
ts->idle_active = 1;
sched_clock_idle_sleep_event();
return now;
}
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cummulative idle time (since boot) for a given
+ * CPU, in microseconds. The idle time returned includes
+ * the iowait time (unlike what "top" and co report).
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
if (!tick_nohz_enabled)
return -1;
- if (ts->idle_active)
- *last_update_time = ktime_to_us(ts->idle_lastupdate);
- else
- *last_update_time = ktime_to_us(ktime_get());
+ update_ts_time_stats(ts, ktime_get(), last_update_time);
return ktime_to_us(ts->idle_sleeptime);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
+/*
+ * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cummulative iowait time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+ if (!tick_nohz_enabled)
+ return -1;
+
+ update_ts_time_stats(ts, ktime_get(), last_update_time);
+
+ return ktime_to_us(ts->iowait_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
/**
* tick_nohz_stop_sched_tick - stop the idle tick from the idle task
*
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
goto end;
}
+ if (nohz_ratelimit(cpu))
+ goto end;
+
ts->idle_calls++;
/* Read jiffies and the time when jiffies were updated last */
do {
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd7877..ab8f5e33fa9 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
P_ns(idle_waketime);
P_ns(idle_exittime);
P_ns(idle_sleeptime);
+ P_ns(iowait_sleeptime);
P(last_jiffies);
P(next_jiffies);
P_ns(idle_expires);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f51..36ea2b65dcd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
}
}
-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_abort(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ABORT);
}
-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_insert(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_INSERT);
}
-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+static void blk_add_trace_rq_issue(void *ignore,
+ struct request_queue *q, struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
}
-static void blk_add_trace_rq_requeue(struct request_queue *q,
+static void blk_add_trace_rq_requeue(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(struct request_queue *q,
+static void blk_add_trace_rq_complete(void *ignore,
+ struct request_queue *q,
struct request *rq)
{
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
}
-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_bounce(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
}
-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_complete(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
}
-static void blk_add_trace_bio_backmerge(struct request_queue *q,
+static void blk_add_trace_bio_backmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
}
-static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+static void blk_add_trace_bio_frontmerge(void *ignore,
+ struct request_queue *q,
struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
}
-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+static void blk_add_trace_bio_queue(void *ignore,
+ struct request_queue *q, struct bio *bio)
{
blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
}
-static void blk_add_trace_getrq(struct request_queue *q,
+static void blk_add_trace_getrq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
}
-static void blk_add_trace_sleeprq(struct request_queue *q,
+static void blk_add_trace_sleeprq(void *ignore,
+ struct request_queue *q,
struct bio *bio, int rw)
{
if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
}
}
-static void blk_add_trace_plug(struct request_queue *q)
+static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
-static void blk_add_trace_unplug_io(struct request_queue *q)
+static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
}
}
-static void blk_add_trace_unplug_timer(struct request_queue *q)
+static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
}
}
-static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+static void blk_add_trace_split(void *ignore,
+ struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
struct blk_trace *bt = q->blk_trace;
@@ -839,8 +852,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
* it spans a stripe (or similar). Add a trace for that action.
*
**/
-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from)
+static void blk_add_trace_remap(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_io_trace_remap r;
@@ -869,7 +883,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
* Add a trace for that action.
*
**/
-static void blk_add_trace_rq_remap(struct request_queue *q,
+static void blk_add_trace_rq_remap(void *ignore,
+ struct request_queue *q,
struct request *rq, dev_t dev,
sector_t from)
{
@@ -921,64 +936,64 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+ ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+ ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+ ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+ ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+ ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+ ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+ ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+ ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+ ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
WARN_ON(ret);
- ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+ ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
WARN_ON(ret);
- ret = register_trace_block_getrq(blk_add_trace_getrq);
+ ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
- ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+ ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
WARN_ON(ret);
- ret = register_trace_block_plug(blk_add_trace_plug);
+ ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+ ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+ ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
WARN_ON(ret);
- ret = register_trace_block_split(blk_add_trace_split);
+ ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
- ret = register_trace_block_remap(blk_add_trace_remap);
+ ret = register_trace_block_remap(blk_add_trace_remap, NULL);
WARN_ON(ret);
- ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
+ ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
WARN_ON(ret);
}
static void blk_unregister_tracepoints(void)
{
- unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
- unregister_trace_block_remap(blk_add_trace_remap);
- unregister_trace_block_split(blk_add_trace_split);
- unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
- unregister_trace_block_plug(blk_add_trace_plug);
- unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
- unregister_trace_block_getrq(blk_add_trace_getrq);
- unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
- unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
- unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
- unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
- unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
- unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
- unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
- unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
- unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+ unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
+ unregister_trace_block_remap(blk_add_trace_remap, NULL);
+ unregister_trace_block_split(blk_add_trace_split, NULL);
+ unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+ unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
+ unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
+ unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
+ unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
+ unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
+ unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
+ unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
+ unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
+ unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
+ unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
+ unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
+ unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
+ unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
@@ -1321,7 +1336,7 @@ out:
}
static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return print_one_line(iter, false);
}
@@ -1343,7 +1358,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
}
static enum print_line_t
-blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return blk_trace_synthesize_old_trace(iter) ?
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1397,16 @@ static struct tracer blk_tracer __read_mostly = {
.set_flag = blk_tracer_set_flag,
};
-static struct trace_event trace_blk_event = {
- .type = TRACE_BLK,
+static struct trace_event_functions trace_blk_event_funcs = {
.trace = blk_trace_event_print,
.binary = blk_trace_event_print_binary,
};
+static struct trace_event trace_blk_event = {
+ .type = TRACE_BLK,
+ .funcs = &trace_blk_event_funcs,
+};
+
static int __init init_blk_tracer(void)
{
if (!register_ftrace_event(&trace_blk_event)) {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b309..6d2cb14f944 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
unsigned long counter;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
unsigned long long time;
+ unsigned long long time_squared;
#endif
};
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
{
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_printf(m, " Function "
- "Hit Time Avg\n"
+ "Hit Time Avg s^2\n"
" -------- "
- "--- ---- ---\n");
+ "--- ---- --- ---\n");
#else
seq_printf(m, " Function Hit\n"
" -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
static DEFINE_MUTEX(mutex);
static struct trace_seq s;
unsigned long long avg;
+ unsigned long long stddev;
#endif
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
avg = rec->time;
do_div(avg, rec->counter);
+ /* Sample standard deviation (s^2) */
+ if (rec->counter <= 1)
+ stddev = 0;
+ else {
+ stddev = rec->time_squared - rec->counter * avg * avg;
+ /*
+ * Divide only 1000 for ns^2 -> us^2 conversion.
+ * trace_print_graph_duration will divide 1000 again.
+ */
+ do_div(stddev, (rec->counter - 1) * 1000);
+ }
+
mutex_lock(&mutex);
trace_seq_init(&s);
trace_print_graph_duration(rec->time, &s);
trace_seq_puts(&s, " ");
trace_print_graph_duration(avg, &s);
+ trace_seq_puts(&s, " ");
+ trace_print_graph_duration(stddev, &s);
trace_print_seq(m, &s);
mutex_unlock(&mutex);
#endif
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
if (!stat->hash || !ftrace_profile_enabled)
goto out;
+ /* If the calltime was zero'd ignore it */
+ if (!trace->calltime)
+ goto out;
+
calltime = trace->rettime - trace->calltime;
if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
}
rec = ftrace_find_profiled_func(stat, trace->func);
- if (rec)
+ if (rec) {
rec->time += calltime;
+ rec->time_squared += calltime * calltime;
+ }
out:
local_irq_restore(flags);
@@ -3212,8 +3234,8 @@ free:
}
static void
-ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
- struct task_struct *next)
+ftrace_graph_probe_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
unsigned long long timestamp;
int index;
@@ -3267,7 +3289,7 @@ static int start_graph_tracing(void)
} while (ret == -EAGAIN);
if (!ret) {
- ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+ ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
if (ret)
pr_info("ftrace_graph: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -3339,11 +3361,11 @@ void unregister_ftrace_graph(void)
goto out;
ftrace_graph_active--;
- unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
+ unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
out:
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153..bbfc1bb1660 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
trace_wake_up();
}
-static void kmemtrace_kmalloc(unsigned long call_site,
+static void kmemtrace_kmalloc(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, -1);
}
-static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
+static void kmemtrace_kmem_cache_alloc(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, -1);
}
-static void kmemtrace_kmalloc_node(unsigned long call_site,
+static void kmemtrace_kmalloc_node(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, node);
}
-static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
+static void kmemtrace_kmem_cache_alloc_node(void *ignore,
+ unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
bytes_req, bytes_alloc, gfp_flags, node);
}
-static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
+static void
+kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
{
kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
}
-static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
+static void kmemtrace_kmem_cache_free(void *ignore,
+ unsigned long call_site, const void *ptr)
{
kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
{
int err;
- err = register_trace_kmalloc(kmemtrace_kmalloc);
+ err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
if (err)
return err;
- err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+ err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
if (err)
return err;
- err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
+ err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
if (err)
return err;
- err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+ err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
if (err)
return err;
- err = register_trace_kfree(kmemtrace_kfree);
+ err = register_trace_kfree(kmemtrace_kfree, NULL);
if (err)
return err;
- err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+ err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
return err;
}
static void kmemtrace_stop_probes(void)
{
- unregister_trace_kmalloc(kmemtrace_kmalloc);
- unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
- unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
- unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
- unregister_trace_kfree(kmemtrace_kfree);
- unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+ unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
+ unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
+ unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
+ unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
+ unregister_trace_kfree(kmemtrace_kfree, NULL);
+ unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
}
static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
};
static enum print_line_t
-kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
+kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
}
static enum print_line_t
-kmemtrace_print_free(struct trace_iterator *iter, int flags)
+kmemtrace_print_free(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
}
static enum print_line_t
-kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
+kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
}
static enum print_line_t
-kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
+kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
}
}
-static struct trace_event kmem_trace_alloc = {
- .type = TRACE_KMEM_ALLOC,
+static struct trace_event_functions kmem_trace_alloc_funcs = {
.trace = kmemtrace_print_alloc,
.binary = kmemtrace_print_alloc_user,
};
-static struct trace_event kmem_trace_free = {
- .type = TRACE_KMEM_FREE,
+static struct trace_event kmem_trace_alloc = {
+ .type = TRACE_KMEM_ALLOC,
+ .funcs = &kmem_trace_alloc_funcs,
+};
+
+static struct trace_event_functions kmem_trace_free_funcs = {
.trace = kmemtrace_print_free,
.binary = kmemtrace_print_free_user,
};
+static struct trace_event kmem_trace_free = {
+ .type = TRACE_KMEM_FREE,
+ .funcs = &kmem_trace_free_funcs,
+};
+
static struct tracer kmem_tracer __read_mostly = {
.name = "kmemtrace",
.init = kmem_trace_init,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41ca394feb2..7f6059c5aa9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
+/* Flag when events were overwritten */
+#define RB_MISSED_EVENTS (1 << 31)
+/* Missed count stored at end */
+#define RB_MISSED_STORED (1 << 30)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
local_t write; /* index for next write */
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
+ unsigned long real_end; /* real end of data */
struct buffer_data_page *page; /* Actual data page */
};
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
(unsigned int)sizeof(field.commit),
(unsigned int)is_signed_type(long));
+ ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
+
ret = trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
@@ -440,6 +452,8 @@ struct ring_buffer_per_cpu {
struct buffer_page *tail_page; /* write to tail */
struct buffer_page *commit_page; /* committed pages */
struct buffer_page *reader_page;
+ unsigned long lost_events;
+ unsigned long last_overrun;
local_t commit_overrun;
local_t overrun;
local_t entries;
@@ -1762,6 +1776,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
kmemcheck_annotate_bitfield(event, bitfield);
/*
+ * Save the original length to the meta data.
+ * This will be used by the reader to add lost event
+ * counter.
+ */
+ tail_page->real_end = tail;
+
+ /*
* If this event is bigger than the minimum size, then
* we need to be careful that we don't subtract the
* write counter enough to allow another writer to slip
@@ -1979,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
u64 *ts, u64 *delta)
{
struct ring_buffer_event *event;
- static int once;
int ret;
- if (unlikely(*delta > (1ULL << 59) && !once++)) {
- printk(KERN_WARNING "Delta way too big! %llu"
- " ts=%llu write stamp = %llu\n",
- (unsigned long long)*delta,
- (unsigned long long)*ts,
- (unsigned long long)cpu_buffer->write_stamp);
- WARN_ON(1);
- }
+ WARN_ONCE(*delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+ (unsigned long long)*delta,
+ (unsigned long long)*ts,
+ (unsigned long long)cpu_buffer->write_stamp);
/*
* The delta is too big, we to add a
@@ -2838,6 +2855,7 @@ static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
int ret;
@@ -2879,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
local_set(&cpu_buffer->reader_page->page->commit, 0);
+ cpu_buffer->reader_page->real_end = 0;
spin:
/*
@@ -2899,6 +2918,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
/*
+ * We want to make sure we read the overruns after we set up our
+ * pointers to the next object. The writer side does a
+ * cmpxchg to cross pages which acts as the mb on the writer
+ * side. Note, the reader will constantly fail the swap
+ * while the writer is updating the pointers, so this
+ * guarantees that the overwrite recorded here is the one we
+ * want to compare with the last_overrun.
+ */
+ smp_mb();
+ overwrite = local_read(&(cpu_buffer->overrun));
+
+ /*
* Here's the tricky part.
*
* We need to move the pointer past the header page.
@@ -2929,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page = reader;
rb_reset_reader_page(cpu_buffer);
+ if (overwrite != cpu_buffer->last_overrun) {
+ cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
+ cpu_buffer->last_overrun = overwrite;
+ }
+
goto again;
out:
@@ -3005,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
rb_advance_iter(iter);
}
+static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->lost_events;
+}
+
static struct ring_buffer_event *
-rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
+ unsigned long *lost_events)
{
struct ring_buffer_event *event;
struct buffer_page *reader;
@@ -3058,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
cpu_buffer->cpu, ts);
}
+ if (lost_events)
+ *lost_events = rb_lost_events(cpu_buffer);
return event;
default:
@@ -3168,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
* @buffer: The ring buffer to read
* @cpu: The cpu to peak at
* @ts: The timestamp counter of this event.
+ * @lost_events: a variable to store if events were lost (may be NULL)
*
* This will return the event that will be read next, but does
* not consume the data.
*/
struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -3188,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
local_irq_save(flags);
if (dolock)
spin_lock(&cpu_buffer->reader_lock);
- event = rb_buffer_peek(cpu_buffer, ts);
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
rb_advance_reader(cpu_buffer);
if (dolock)
@@ -3230,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
/**
* ring_buffer_consume - return an event and consume it
* @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store if events were lost (may be NULL)
*
* Returns the next event in the ring buffer, and that event is consumed.
* Meaning, that sequential reads will keep returning a different event,
* and eventually empty the ring buffer if the producer is slower.
*/
struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+ unsigned long *lost_events)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event = NULL;
@@ -3257,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
if (dolock)
spin_lock(&cpu_buffer->reader_lock);
- event = rb_buffer_peek(cpu_buffer, ts);
- if (event)
+ event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+ if (event) {
+ cpu_buffer->lost_events = 0;
rb_advance_reader(cpu_buffer);
+ }
if (dolock)
spin_unlock(&cpu_buffer->reader_lock);
@@ -3276,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
EXPORT_SYMBOL_GPL(ring_buffer_consume);
/**
- * ring_buffer_read_start - start a non consuming read of the buffer
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
*
- * This starts up an iteration through the buffer. It also disables
- * the recording to the buffer until the reading is finished.
- * This prevents the reading from being corrupted. This is not
- * a consuming read, so a producer is not expected.
+ * This performs the initial preparations necessary to iterate
+ * through the buffer. Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
*
- * Must be paired with ring_buffer_finish.
+ * Disabling buffer recordng prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * This overall must be paired with ring_buffer_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
- unsigned long flags;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
@@ -3306,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
iter->cpu_buffer = cpu_buffer;
atomic_inc(&cpu_buffer->record_disabled);
+
+ return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized. Afterwards, read_buffer_read_start
+ * calls on those iterators are allowed.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ if (!iter)
+ return;
+
+ cpu_buffer = iter->cpu_buffer;
spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -3408,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->write_stamp = 0;
cpu_buffer->read_stamp = 0;
+ cpu_buffer->lost_events = 0;
+ cpu_buffer->last_overrun = 0;
+
rb_head_page_activate(cpu_buffer);
}
@@ -3683,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
struct ring_buffer_event *event;
struct buffer_data_page *bpage;
struct buffer_page *reader;
+ unsigned long missed_events;
unsigned long flags;
unsigned int commit;
unsigned int read;
@@ -3719,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
read = reader->read;
commit = rb_page_commit(reader);
+ /* Check if any events were dropped */
+ missed_events = cpu_buffer->lost_events;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -3779,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
local_set(&reader->entries, 0);
reader->read = 0;
*data_page = bpage;
+
+ /*
+ * Use the real_end for the data size,
+ * This gives us a chance to store the lost events
+ * on the page.
+ */
+ if (reader->real_end)
+ local_set(&bpage->commit, reader->real_end);
}
ret = read;
+ cpu_buffer->lost_events = 0;
+ /*
+ * Set a flag in the commit field if we lost events
+ */
+ if (missed_events) {
+ commit = local_read(&bpage->commit);
+
+ /* If there is room at the end of the page to save the
+ * missed events, then record it there.
+ */
+ if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ memcpy(&bpage->data[commit], &missed_events,
+ sizeof(missed_events));
+ local_add(RB_MISSED_STORED, &bpage->commit);
+ }
+ local_add(RB_MISSED_EVENTS, &bpage->commit);
+ }
+
out_unlock:
spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c798225..302f8a61463 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
int *entry;
u64 ts;
- event = ring_buffer_consume(buffer, cpu, &ts);
+ event = ring_buffer_consume(buffer, cpu, &ts, NULL);
if (!event)
return EVENT_DROPPED;
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
if (ret >= 0) {
rpage = bpage;
- commit = local_read(&rpage->commit);
+ /* The commit may have missed event flags set, clear them */
+ commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !kill_test; i += inc) {
if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44f916a0406..ba0ec81158b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
*
* It is default off, but you can enable it with either specifying
* "ftrace_dump_on_oops" in the kernel command line, or setting
- * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ * /proc/sys/kernel/ftrace_dump_on_oops
+ * Set 1 if you want to dump buffers of all CPUs
+ * Set 2 if you want to dump the buffer of the CPU that triggered oops
*/
-int ftrace_dump_on_oops;
+
+enum ftrace_dump_mode ftrace_dump_on_oops;
static int tracing_set_tracer(const char *buf);
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
static int __init set_ftrace_dump_on_oops(char *str)
{
- ftrace_dump_on_oops = 1;
- return 1;
+ if (*str++ != '=' || !*str) {
+ ftrace_dump_on_oops = DUMP_ALL;
+ return 1;
+ }
+
+ if (!strcmp("orig_cpu", str)) {
+ ftrace_dump_on_oops = DUMP_ORIG;
+ return 1;
+ }
+
+ return 0;
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
}
static struct trace_entry *
-peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
+ unsigned long *lost_events)
{
struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
else
- event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+ event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+ lost_events);
ftrace_enable_cpu();
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
}
static struct trace_entry *
-__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
+ unsigned long *missing_events, u64 *ent_ts)
{
struct ring_buffer *buffer = iter->tr->buffer;
struct trace_entry *ent, *next = NULL;
+ unsigned long lost_events = 0, next_lost = 0;
int cpu_file = iter->cpu_file;
u64 next_ts = 0, ts;
int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
if (cpu_file > TRACE_PIPE_ALL_CPU) {
if (ring_buffer_empty_cpu(buffer, cpu_file))
return NULL;
- ent = peek_next_entry(iter, cpu_file, ent_ts);
+ ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
if (ent_cpu)
*ent_cpu = cpu_file;
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
if (ring_buffer_empty_cpu(buffer, cpu))
continue;
- ent = peek_next_entry(iter, cpu, &ts);
+ ent = peek_next_entry(iter, cpu, &ts, &lost_events);
/*
* Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
next = ent;
next_cpu = cpu;
next_ts = ts;
+ next_lost = lost_events;
}
}
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
if (ent_ts)
*ent_ts = next_ts;
+ if (missing_events)
+ *missing_events = next_lost;
+
return next;
}
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts)
{
- return __find_next_entry(iter, ent_cpu, ent_ts);
+ return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
}
/* Find the next real entry, and increment the iterator to the next entry */
static void *find_next_entry_inc(struct trace_iterator *iter)
{
- iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
+ iter->ent = __find_next_entry(iter, &iter->cpu,
+ &iter->lost_events, &iter->ts);
if (iter->ent)
trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
{
/* Don't allow ftrace to trace into the ring buffers */
ftrace_disable_cpu();
- ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+ &iter->lost_events);
ftrace_enable_cpu();
}
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
}
-static void
+void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1914,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
}
if (event)
- return event->trace(iter, sym_flags);
+ return event->funcs->trace(iter, sym_flags, event);
if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
goto partial;
@@ -1940,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event)
- return event->raw(iter, 0);
+ return event->funcs->raw(iter, 0, event);
if (!trace_seq_printf(s, "%d ?\n", entry->type))
goto partial;
@@ -1967,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
if (event) {
- enum print_line_t ret = event->hex(iter, 0);
+ enum print_line_t ret = event->funcs->hex(iter, 0, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -1992,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
}
event = ftrace_find_event(entry->type);
- return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
+ return event ? event->funcs->binary(iter, 0, event) :
+ TRACE_TYPE_HANDLED;
}
-static int trace_empty(struct trace_iterator *iter)
+int trace_empty(struct trace_iterator *iter)
{
int cpu;
@@ -2030,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
+ if (iter->lost_events)
+ trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events);
+
if (iter->trace && iter->trace->print_line) {
ret = iter->trace->print_line(iter);
if (ret != TRACE_TYPE_UNHANDLED)
@@ -2058,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
return print_trace_fmt(iter);
}
+void trace_default_header(struct seq_file *m)
+{
+ struct trace_iterator *iter = m->private;
+
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+ print_trace_header(m, iter);
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_lat_help_header(m);
+ } else {
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_func_help_header(m);
+ }
+}
+
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
@@ -2070,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
}
if (iter->trace && iter->trace->print_header)
iter->trace->print_header(m);
- else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
- /* print nothing if the buffers are empty */
- if (trace_empty(iter))
- return 0;
- print_trace_header(m, iter);
- if (!(trace_flags & TRACE_ITER_VERBOSE))
- print_lat_help_header(m);
- } else {
- if (!(trace_flags & TRACE_ITER_VERBOSE))
- print_func_help_header(m);
- }
+ else
+ trace_default_header(m);
+
} else if (iter->leftover) {
/*
* If we filled the seq_file buffer earlier, we
@@ -2166,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
for_each_tracing_cpu(cpu) {
-
iter->buffer_iter[cpu] =
- ring_buffer_read_start(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ }
+ ring_buffer_read_prepare_sync();
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_read_start(iter->buffer_iter[cpu]);
tracing_iter_reset(iter, cpu);
}
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_start(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare_sync();
+ ring_buffer_read_start(iter->buffer_iter[cpu]);
tracing_iter_reset(iter, cpu);
}
@@ -4324,7 +4365,7 @@ static int trace_panic_handler(struct notifier_block *this,
unsigned long event, void *unused)
{
if (ftrace_dump_on_oops)
- ftrace_dump();
+ ftrace_dump(ftrace_dump_on_oops);
return NOTIFY_OK;
}
@@ -4341,7 +4382,7 @@ static int trace_die_handler(struct notifier_block *self,
switch (val) {
case DIE_OOPS:
if (ftrace_dump_on_oops)
- ftrace_dump();
+ ftrace_dump(ftrace_dump_on_oops);
break;
default:
break;
@@ -4382,7 +4423,8 @@ trace_printk_seq(struct trace_seq *s)
trace_seq_init(s);
}
-static void __ftrace_dump(bool disable_tracing)
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
{
static arch_spinlock_t ftrace_dump_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4415,12 +4457,25 @@ static void __ftrace_dump(bool disable_tracing)
/* don't look at user memory in panic mode */
trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
- printk(KERN_TRACE "Dumping ftrace buffer:\n");
-
/* Simulate the iterator */
iter.tr = &global_trace;
iter.trace = current_trace;
- iter.cpu_file = TRACE_PIPE_ALL_CPU;
+
+ switch (oops_dump_mode) {
+ case DUMP_ALL:
+ iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ break;
+ case DUMP_ORIG:
+ iter.cpu_file = raw_smp_processor_id();
+ break;
+ case DUMP_NONE:
+ goto out_enable;
+ default:
+ printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+ iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ }
+
+ printk(KERN_TRACE "Dumping ftrace buffer:\n");
/*
* We need to stop all tracing on all CPUS to read the
@@ -4459,6 +4514,7 @@ static void __ftrace_dump(bool disable_tracing)
else
printk(KERN_TRACE "---------------------------------\n");
+ out_enable:
/* Re-enable tracing if requested */
if (!disable_tracing) {
trace_flags |= old_userobj;
@@ -4475,9 +4531,9 @@ static void __ftrace_dump(bool disable_tracing)
}
/* By default: disable tracing after the dump */
-void ftrace_dump(void)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
- __ftrace_dump(true);
+ __ftrace_dump(true, oops_dump_mode);
}
__init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3ebdb6bd236..2cd96399463 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -364,6 +364,9 @@ void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
+void trace_default_header(struct seq_file *m);
+void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
+int trace_empty(struct trace_iterator *iter);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -402,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_stack(struct trace_array *tr,
+static inline void ftrace_trace_stack(struct ring_buffer *buffer,
unsigned long flags, int skip, int pc)
{
}
-static inline void ftrace_trace_userstack(struct trace_array *tr,
+static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
unsigned long flags, int pc)
{
}
@@ -475,9 +478,29 @@ extern int trace_clock_id;
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+
+/* Flag options */
+#define TRACE_GRAPH_PRINT_OVERRUN 0x1
+#define TRACE_GRAPH_PRINT_CPU 0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
+#define TRACE_GRAPH_PRINT_PROC 0x8
+#define TRACE_GRAPH_PRINT_DURATION 0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
+
+extern enum print_line_t
+print_graph_function_flags(struct trace_iterator *iter, u32 flags);
+extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
extern enum print_line_t
trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
+extern void graph_trace_open(struct trace_iterator *iter);
+extern void graph_trace_close(struct trace_iterator *iter);
+extern int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned long flags, int pc);
+extern void __trace_graph_return(struct trace_array *tr,
+ struct ftrace_graph_ret *trace,
+ unsigned long flags, int pc);
+
#ifdef CONFIG_DYNAMIC_FTRACE
/* TODO: make this variable */
@@ -508,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
#endif /* CONFIG_DYNAMIC_FTRACE */
#else /* CONFIG_FUNCTION_GRAPH_TRACER */
static inline enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
return TRACE_TYPE_UNHANDLED;
}
@@ -755,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call);
+
static inline int
filter_check_discard(struct ftrace_event_call *call, void *rec,
struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
- if (unlikely(call->filter_active) &&
+ if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
!filter_match_preds(call->filter, rec)) {
ring_buffer_discard_commit(buffer, event);
return 1;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d47017..8d3538b4ea5 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
}
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct trace_branch *field;
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
" |\n");
}
+static struct trace_event_functions trace_branch_funcs = {
+ .trace = trace_branch_print,
+};
+
static struct trace_event trace_branch_event = {
.type = TRACE_BRANCH,
- .trace = trace_branch_print,
+ .funcs = &trace_branch_funcs,
};
static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566..0a47e8d6b49 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -49,7 +49,12 @@ static int perf_trace_event_enable(struct ftrace_event_call *event)
rcu_assign_pointer(perf_trace_buf_nmi, buf);
}
- ret = event->perf_event_enable(event);
+ if (event->class->reg)
+ ret = event->class->reg(event, TRACE_REG_PERF_REGISTER);
+ else
+ ret = tracepoint_probe_register(event->name,
+ event->class->perf_probe,
+ event);
if (!ret) {
total_ref_count++;
return 0;
@@ -75,7 +80,8 @@ int perf_trace_enable(int event_id)
mutex_lock(&event_mutex);
list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id && event->perf_event_enable &&
+ if (event->event.type == event_id &&
+ event->class && event->class->perf_probe &&
try_module_get(event->mod)) {
ret = perf_trace_event_enable(event);
break;
@@ -93,7 +99,10 @@ static void perf_trace_event_disable(struct ftrace_event_call *event)
if (--event->perf_refcount > 0)
return;
- event->perf_event_disable(event);
+ if (event->class->reg)
+ event->class->reg(event, TRACE_REG_PERF_UNREGISTER);
+ else
+ tracepoint_probe_unregister(event->name, event->class->perf_probe, event);
if (!--total_ref_count) {
buf = perf_trace_buf;
@@ -119,7 +128,7 @@ void perf_trace_disable(int event_id)
mutex_lock(&event_mutex);
list_for_each_entry(event, &ftrace_events, list) {
- if (event->id == event_id) {
+ if (event->event.type == event_id) {
perf_trace_event_disable(event);
module_put(event->mod);
break;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c704334..53cffc0b080 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
+struct list_head *
+trace_get_fields(struct ftrace_event_call *event_call)
+{
+ if (!event_call->class->get_fields)
+ return &event_call->class->fields;
+ return event_call->class->get_fields(event_call);
+}
+
int trace_define_field(struct ftrace_event_call *call, const char *type,
const char *name, int offset, int size, int is_signed,
int filter_type)
{
struct ftrace_event_field *field;
+ struct list_head *head;
+
+ if (WARN_ON(!call->class))
+ return 0;
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
@@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
field->size = size;
field->is_signed = is_signed;
- list_add(&field->link, &call->fields);
+ head = trace_get_fields(call);
+ list_add(&field->link, head);
return 0;
@@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
+ struct list_head *head;
- list_for_each_entry_safe(field, next, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry_safe(field, next, head, link) {
list_del(&field->link);
kfree(field->type);
kfree(field->name);
@@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
{
int id;
- id = register_ftrace_event(call->event);
+ id = register_ftrace_event(&call->event);
if (!id)
return -ENODEV;
- call->id = id;
- INIT_LIST_HEAD(&call->fields);
return 0;
}
@@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
switch (enable) {
case 0:
- if (call->enabled) {
- call->enabled = 0;
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
+ call->flags &= ~TRACE_EVENT_FL_ENABLED;
tracing_stop_cmdline_record();
- call->unregfunc(call);
+ if (call->class->reg)
+ call->class->reg(call, TRACE_REG_UNREGISTER);
+ else
+ tracepoint_probe_unregister(call->name,
+ call->class->probe,
+ call);
}
break;
case 1:
- if (!call->enabled) {
+ if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
tracing_start_cmdline_record();
- ret = call->regfunc(call);
+ if (call->class->reg)
+ ret = call->class->reg(call, TRACE_REG_REGISTER);
+ else
+ ret = tracepoint_probe_register(call->name,
+ call->class->probe,
+ call);
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
"%s\n", call->name);
break;
}
- call->enabled = 1;
+ call->flags |= TRACE_EVENT_FL_ENABLED;
}
break;
}
@@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class ||
+ (!call->class->probe && !call->class->reg))
continue;
if (match &&
strcmp(match, call->name) != 0 &&
- strcmp(match, call->system) != 0)
+ strcmp(match, call->class->system) != 0)
continue;
- if (sub && strcmp(sub, call->system) != 0)
+ if (sub && strcmp(sub, call->class->system) != 0)
continue;
if (event && strcmp(event, call->name) != 0)
@@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
- if (call->regfunc)
+ if (call->class && (call->class->probe || call->class->reg))
return call;
}
@@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
(*pos)++;
list_for_each_entry_continue(call, &ftrace_events, list) {
- if (call->enabled)
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
return call;
}
@@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
{
struct ftrace_event_call *call = v;
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- seq_printf(m, "%s:", call->system);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ seq_printf(m, "%s:", call->class->system);
seq_printf(m, "%s\n", call->name);
return 0;
@@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
struct ftrace_event_call *call = filp->private_data;
char *buf;
- if (call->enabled)
+ if (call->flags & TRACE_EVENT_FL_ENABLED)
buf = "1\n";
else
buf = "0\n";
@@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->name || !call->regfunc)
+ if (!call->name || !call->class ||
+ (!call->class->probe && !call->class->reg))
continue;
- if (system && strcmp(call->system, system) != 0)
+ if (system && strcmp(call->class->system, system) != 0)
continue;
/*
@@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!call->enabled);
+ set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
{
struct ftrace_event_call *call = filp->private_data;
struct ftrace_event_field *field;
+ struct list_head *head;
struct trace_seq *s;
int common_field_count = 5;
char *buf;
@@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
trace_seq_init(s);
trace_seq_printf(s, "name: %s\n", call->name);
- trace_seq_printf(s, "ID: %d\n", call->id);
+ trace_seq_printf(s, "ID: %d\n", call->event.type);
trace_seq_printf(s, "format:\n");
- list_for_each_entry_reverse(field, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry_reverse(field, head, link) {
/*
* Smartly shows the array type(except dynamic array).
* Normal:
@@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return -ENOMEM;
trace_seq_init(s);
- trace_seq_printf(s, "%d\n", call->id);
+ trace_seq_printf(s, "%d\n", call->event.type);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
@@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
const struct file_operations *filter,
const struct file_operations *format)
{
+ struct list_head *head;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
- if (strcmp(call->system, TRACE_SYSTEM) != 0)
- d_events = event_subsystem_dir(call->system, d_events);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
+ d_events = event_subsystem_dir(call->class->system, d_events);
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
@@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return -1;
}
- if (call->regfunc)
+ if (call->class->probe || call->class->reg)
trace_create_file("enable", 0644, call->dir, call,
enable);
- if (call->id && call->perf_event_enable)
+#ifdef CONFIG_PERF_EVENTS
+ if (call->event.type && (call->class->perf_probe || call->class->reg))
trace_create_file("id", 0444, call->dir, call,
id);
+#endif
- if (call->define_fields) {
- ret = trace_define_common_fields(call);
- if (!ret)
- ret = call->define_fields(call);
- if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
- return ret;
+ if (call->class->define_fields) {
+ /*
+ * Other events may have the same class. Only update
+ * the fields if they are not already defined.
+ */
+ head = trace_get_fields(call);
+ if (list_empty(head)) {
+ ret = trace_define_common_fields(call);
+ if (!ret)
+ ret = call->class->define_fields(call);
+ if (ret < 0) {
+ pr_warning("Could not initialize trace point"
+ " events/%s\n", call->name);
+ return ret;
+ }
}
trace_create_file("filter", 0644, call->dir, call,
filter);
@@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
if (!call->name)
return -EINVAL;
- if (call->raw_init) {
- ret = call->raw_init(call);
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
static void __trace_remove_event_call(struct ftrace_event_call *call)
{
ftrace_event_enable_disable(call, 0);
- if (call->event)
- __unregister_ftrace_event(call->event);
+ if (call->event.funcs)
+ __unregister_ftrace_event(&call->event);
debugfs_remove_recursive(call->dir);
list_del(&call->list);
trace_destroy_fields(call);
destroy_preds(call);
- remove_subsystem_dir(call->system);
+ remove_subsystem_dir(call->class->system);
}
/* Remove an event_call */
@@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
/* The linker may leave blanks */
if (!call->name)
continue;
- if (call->raw_init) {
- ret = call->raw_init(call);
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1286,8 +1323,8 @@ static __init int event_trace_init(void)
/* The linker may leave blanks */
if (!call->name)
continue;
- if (call->raw_init) {
- ret = call->raw_init(call);
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
if (ret < 0) {
if (ret != -ENOSYS)
pr_warning("Could not initialize trace "
@@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void)
list_for_each_entry(call, &ftrace_events, list) {
- /* Only test those that have a regfunc */
- if (!call->regfunc)
+ /* Only test those that have a probe */
+ if (!call->class || !call->class->probe)
continue;
/*
@@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void)
* syscalls as we test.
*/
#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
- if (call->system &&
- strcmp(call->system, "syscalls") == 0)
+ if (call->class->system &&
+ strcmp(call->class->system, "syscalls") == 0)
continue;
#endif
@@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void)
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (call->enabled) {
+ if (call->flags & TRACE_EVENT_FL_ENABLED) {
pr_warning("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 58092d844a1..57bb1bb3299 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -500,8 +500,10 @@ static struct ftrace_event_field *
find_event_field(struct ftrace_event_call *call, char *name)
{
struct ftrace_event_field *field;
+ struct list_head *head;
- list_for_each_entry(field, &call->fields, link) {
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
if (!strcmp(field->name, name))
return field;
}
@@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
struct event_filter *filter = call->filter;
int i;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
filter->n_preds = 0;
for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
{
__free_preds(call->filter);
call->filter = NULL;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
static struct event_filter *__alloc_preds(void)
@@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
if (call->filter)
return 0;
- call->filter_active = 0;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
call->filter = __alloc_preds();
if (IS_ERR(call->filter))
return PTR_ERR(call->filter);
@@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
int err;
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->define_fields)
+ if (!call->class || !call->class->define_fields)
continue;
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
err = init_preds(call);
@@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
struct ftrace_event_call *call;
list_for_each_entry(call, &ftrace_events, list) {
- if (!call->define_fields)
+ if (!call->class || !call->class->define_fields)
continue;
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
filter_disable_preds(call);
@@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
list_for_each_entry(call, &ftrace_events, list) {
struct event_filter *filter = call->filter;
- if (!call->define_fields)
+ if (!call->class || !call->class->define_fields)
continue;
- if (strcmp(call->system, system->name) != 0)
+ if (strcmp(call->class->system, system->name) != 0)
continue;
/* try to see if the filter can be applied */
@@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
if (err)
filter_disable_preds(call);
else {
- call->filter_active = 1;
+ call->flags |= TRACE_EVENT_FL_FILTERED;
replace_filter_string(filter, filter_string);
}
fail = false;
@@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
if (err)
append_filter_err(ps, call->filter);
else
- call->filter_active = 1;
+ call->flags |= TRACE_EVENT_FL_FILTERED;
out:
filter_opstack_clear(ps);
postfix_clear(ps);
@@ -1393,7 +1395,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
mutex_lock(&event_mutex);
list_for_each_entry(call, &ftrace_events, list) {
- if (call->id == event_id)
+ if (call->event.type == event_id)
break;
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6c..8536e2a6596 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
static int ftrace_raw_init_event(struct ftrace_event_call *call)
{
- INIT_LIST_HEAD(&call->fields);
+ INIT_LIST_HEAD(&call->class->fields);
return 0;
}
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
#define F_printk(fmt, args...) #fmt ", " __stringify(args)
#undef FTRACE_ENTRY
-#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
+#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
+ \
+struct ftrace_event_class event_class_ftrace_##call = { \
+ .system = __stringify(TRACE_SYSTEM), \
+ .define_fields = ftrace_define_fields_##call, \
+ .raw_init = ftrace_raw_init_event, \
+}; \
\
struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
- .id = type, \
- .system = __stringify(TRACE_SYSTEM), \
- .raw_init = ftrace_raw_init_event, \
+ .event.type = etype, \
+ .class = &event_class_ftrace_##call, \
.print_fmt = print, \
- .define_fields = ftrace_define_fields_##call, \
}; \
#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5cf55..79f4bac99a9 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
-#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
+#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
static struct tracer_opt trace_opts[] = {
/* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
return ret;
}
-static int __trace_graph_entry(struct trace_array *tr,
+int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned long flags,
int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
return trace_graph_entry(trace);
}
-static void __trace_graph_return(struct trace_array *tr,
+void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags,
int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
* We need to consume the current entry to see
* the next one.
*/
- ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+ ring_buffer_consume(iter->tr->buffer, iter->cpu,
+ NULL, NULL);
event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
- NULL);
+ NULL, NULL);
}
if (!event)
@@ -526,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
/* Signal a overhead of time execution to the output */
static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+print_graph_overhead(unsigned long long duration, struct trace_seq *s,
+ u32 flags)
{
/* If duration disappear, we don't need anything */
- if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
+ if (!(flags & TRACE_GRAPH_PRINT_DURATION))
return 1;
/* Non nested entry or return */
if (duration == -1)
return trace_seq_printf(s, " ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
/* Duration exceeded 100 msecs */
if (duration > 100000ULL)
return trace_seq_printf(s, "! ");
@@ -562,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
static enum print_line_t
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
- enum trace_type type, int cpu, pid_t pid)
+ enum trace_type type, int cpu, pid_t pid, u32 flags)
{
int ret;
struct trace_seq *s = &iter->seq;
@@ -572,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
return TRACE_TYPE_UNHANDLED;
/* Absolute time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+ if (flags & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+ if (flags & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
}
/* No overhead */
- ret = print_graph_overhead(-1, s);
+ ret = print_graph_overhead(-1, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -609,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
return TRACE_TYPE_PARTIAL_LINE;
/* Don't close the duration column if haven't one */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
trace_seq_printf(s, " |");
ret = trace_seq_printf(s, "\n");
@@ -679,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
static enum print_line_t
print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry,
- struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
+ struct ftrace_graph_ret_entry *ret_entry,
+ struct trace_seq *s, u32 flags)
{
struct fgraph_data *data = iter->private;
struct ftrace_graph_ret *graph_ret;
@@ -711,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
/* Overhead */
- ret = print_graph_overhead(duration, s);
+ ret = print_graph_overhead(duration, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
static enum print_line_t
print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry,
- struct trace_seq *s, int cpu)
+ struct trace_seq *s, int cpu, u32 flags)
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
@@ -759,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No overhead */
- ret = print_graph_overhead(-1, s);
+ ret = print_graph_overhead(-1, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
static enum print_line_t
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
- int type, unsigned long addr)
+ int type, unsigned long addr, u32 flags)
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
@@ -803,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
if (type) {
/* Interrupt */
- ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
+ ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Absolute time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+ if (flags & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+ if (flags & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
static enum print_line_t
print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
- struct trace_iterator *iter)
+ struct trace_iterator *iter, u32 flags)
{
struct fgraph_data *data = iter->private;
struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
static enum print_line_t ret;
int cpu = iter->cpu;
- if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
+ if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
return TRACE_TYPE_PARTIAL_LINE;
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
- ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
+ ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
else
- ret = print_graph_entry_nested(iter, field, s, cpu);
+ ret = print_graph_entry_nested(iter, field, s, cpu, flags);
if (data) {
/*
@@ -879,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
static enum print_line_t
print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
- struct trace_entry *ent, struct trace_iterator *iter)
+ struct trace_entry *ent, struct trace_iterator *iter,
+ u32 flags)
{
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
@@ -909,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
}
- if (print_graph_prologue(iter, s, 0, 0))
+ if (print_graph_prologue(iter, s, 0, 0, flags))
return TRACE_TYPE_PARTIAL_LINE;
/* Overhead */
- ret = print_graph_overhead(duration, s);
+ ret = print_graph_overhead(duration, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
/* Overrun */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+ if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
ret = trace_seq_printf(s, " (Overruns: %lu)\n",
trace->overrun);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
- ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
+ ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+ cpu, pid, flags);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@@ -963,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
static enum print_line_t
-print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
- struct trace_iterator *iter)
+print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
+ struct trace_iterator *iter, u32 flags)
{
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct fgraph_data *data = iter->private;
@@ -976,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (data)
depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
- if (print_graph_prologue(iter, s, 0, 0))
+ if (print_graph_prologue(iter, s, 0, 0, flags))
return TRACE_TYPE_PARTIAL_LINE;
/* No overhead */
- ret = print_graph_overhead(-1, s);
+ ret = print_graph_overhead(-1, s, flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+ if (flags & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@@ -1020,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (!event)
return TRACE_TYPE_UNHANDLED;
- ret = event->trace(iter, sym_flags);
+ ret = event->funcs->trace(iter, sym_flags, event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
@@ -1040,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
struct ftrace_graph_ent_entry *field;
struct fgraph_data *data = iter->private;
@@ -1061,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
if (data && data->failed) {
field = &data->ent;
iter->cpu = data->cpu;
- ret = print_graph_entry(field, s, iter);
+ ret = print_graph_entry(field, s, iter, flags);
if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
struct ftrace_graph_ent_entry saved;
trace_assign_type(field, entry);
saved = *field;
- return print_graph_entry(&saved, s, iter);
+ return print_graph_entry(&saved, s, iter, flags);
}
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
- return print_graph_return(&field->ret, s, entry, iter);
+ return print_graph_return(&field->ret, s, entry, iter, flags);
}
+ case TRACE_STACK:
+ case TRACE_FN:
+ /* dont trace stack and functions as comments */
+ return TRACE_TYPE_UNHANDLED;
+
default:
- return print_graph_comment(s, entry, iter);
+ return print_graph_comment(s, entry, iter, flags);
}
return TRACE_TYPE_HANDLED;
}
-static void print_lat_header(struct seq_file *s)
+static enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+ return print_graph_function_flags(iter, tracer_flags.val);
+}
+
+static enum print_line_t
+print_graph_function_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ return print_graph_function(iter);
+}
+
+static void print_lat_header(struct seq_file *s, u32 flags)
{
static const char spaces[] = " " /* 16 spaces */
" " /* 4 spaces */
" "; /* 17 spaces */
int size = 0;
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
size += 16;
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+ if (flags & TRACE_GRAPH_PRINT_CPU)
size += 4;
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+ if (flags & TRACE_GRAPH_PRINT_PROC)
size += 17;
seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1117,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
seq_printf(s, "#%.*s|||| / \n", size, spaces);
}
-static void print_graph_headers(struct seq_file *s)
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
{
int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
if (lat)
- print_lat_header(s);
+ print_lat_header(s, flags);
/* 1st line */
seq_printf(s, "#");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " TIME ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+ if (flags & TRACE_GRAPH_PRINT_CPU)
seq_printf(s, " CPU");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+ if (flags & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, " TASK/PID ");
if (lat)
seq_printf(s, "|||||");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " DURATION ");
seq_printf(s, " FUNCTION CALLS\n");
/* 2nd line */
seq_printf(s, "#");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " | ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+ if (flags & TRACE_GRAPH_PRINT_CPU)
seq_printf(s, " | ");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+ if (flags & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, " | | ");
if (lat)
seq_printf(s, "|||||");
- if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+ if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " | | ");
seq_printf(s, " | | | |\n");
}
-static void graph_trace_open(struct trace_iterator *iter)
+void print_graph_headers(struct seq_file *s)
+{
+ print_graph_headers_flags(s, tracer_flags.val);
+}
+
+void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
struct fgraph_data *data;
@@ -1188,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
pr_warning("function graph tracer: not enough memory\n");
}
-static void graph_trace_close(struct trace_iterator *iter)
+void graph_trace_close(struct trace_iterator *iter)
{
struct fgraph_data *data = iter->private;
@@ -1198,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
}
}
+static struct trace_event_functions graph_functions = {
+ .trace = print_graph_function_event,
+};
+
+static struct trace_event graph_trace_entry_event = {
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &graph_functions,
+};
+
+static struct trace_event graph_trace_ret_event = {
+ .type = TRACE_GRAPH_RET,
+ .funcs = &graph_functions
+};
+
static struct tracer graph_trace __read_mostly = {
.name = "function_graph",
.open = graph_trace_open,
@@ -1219,6 +1261,16 @@ static __init int init_graph_trace(void)
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
+ if (!register_ftrace_event(&graph_trace_entry_event)) {
+ pr_warning("Warning: could not register graph trace events\n");
+ return 1;
+ }
+
+ if (!register_ftrace_event(&graph_trace_ret_event)) {
+ pr_warning("Warning: could not register graph trace events\n");
+ return 1;
+ }
+
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c..6fd486e0cef 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
static int save_lat_flag;
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
+static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+
#ifdef CONFIG_PREEMPT_TRACER
static inline int
preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
# define irq_trace() (0)
#endif
+#define TRACE_DISPLAY_GRAPH 1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ /* display latency trace as call graph */
+ { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+ { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+ .val = 0,
+ .opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
/*
* Sequence count - we record it when starting a measurement and
* skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
};
#endif /* CONFIG_FUNCTION_TRACER */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+ int cpu;
+
+ if (!(bit & TRACE_DISPLAY_GRAPH))
+ return -EINVAL;
+
+ if (!(is_graph() ^ set))
+ return 0;
+
+ stop_irqsoff_tracer(irqsoff_trace, !set);
+
+ for_each_possible_cpu(cpu)
+ per_cpu(tracing_cpu, cpu) = 0;
+
+ tracing_max_latency = 0;
+ tracing_reset_online_cpus(irqsoff_trace);
+
+ return start_irqsoff_tracer(irqsoff_trace, set);
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int ret;
+ int cpu;
+ int pc;
+
+ cpu = raw_smp_processor_id();
+ if (likely(!per_cpu(tracing_cpu, cpu)))
+ return 0;
+
+ local_save_flags(flags);
+ /* slight chance to get a false positive on tracing_cpu */
+ if (!irqs_disabled_flags(flags))
+ return 0;
+
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ ret = __trace_graph_entry(tr, trace, flags, pc);
+ } else
+ ret = 0;
+
+ atomic_dec(&data->disabled);
+ return ret;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+{
+ struct trace_array *tr = irqsoff_trace;
+ struct trace_array_cpu *data;
+ unsigned long flags;
+ long disabled;
+ int cpu;
+ int pc;
+
+ cpu = raw_smp_processor_id();
+ if (likely(!per_cpu(tracing_cpu, cpu)))
+ return;
+
+ local_save_flags(flags);
+ /* slight chance to get a false positive on tracing_cpu */
+ if (!irqs_disabled_flags(flags))
+ return;
+
+ data = tr->data[cpu];
+ disabled = atomic_inc_return(&data->disabled);
+
+ if (likely(disabled == 1)) {
+ pc = preempt_count();
+ __trace_graph_return(tr, trace, flags, pc);
+ }
+
+ atomic_dec(&data->disabled);
+}
+
+static void irqsoff_trace_open(struct trace_iterator *iter)
+{
+ if (is_graph())
+ graph_trace_open(iter);
+
+}
+
+static void irqsoff_trace_close(struct trace_iterator *iter)
+{
+ if (iter->private)
+ graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
+ TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+ u32 flags = GRAPH_TRACER_FLAGS;
+
+ if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ flags |= TRACE_GRAPH_PRINT_DURATION;
+ else
+ flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+ /*
+ * In graph mode call the graph tracer output function,
+ * otherwise go with the TRACE_FN event handler
+ */
+ if (is_graph())
+ return print_graph_function_flags(iter, flags);
+
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_print_header(struct seq_file *s)
+{
+ if (is_graph()) {
+ struct trace_iterator *iter = s->private;
+ u32 flags = GRAPH_TRACER_FLAGS;
+
+ if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+
+ print_trace_header(s, iter);
+ flags |= TRACE_GRAPH_PRINT_DURATION;
+ } else
+ flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+ print_graph_headers_flags(s, flags);
+ } else
+ trace_default_header(s);
+}
+
+static void
+trace_graph_function(struct trace_array *tr,
+ unsigned long ip, unsigned long flags, int pc)
+{
+ u64 time = trace_clock_local();
+ struct ftrace_graph_ent ent = {
+ .func = ip,
+ .depth = 0,
+ };
+ struct ftrace_graph_ret ret = {
+ .func = ip,
+ .depth = 0,
+ .calltime = time,
+ .rettime = time,
+ };
+
+ __trace_graph_entry(tr, &ent, flags, pc);
+ __trace_graph_return(tr, &ret, flags, pc);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned long flags, int pc)
+{
+ if (!is_graph())
+ trace_function(tr, ip, parent_ip, flags, pc);
+ else {
+ trace_graph_function(tr, parent_ip, flags, pc);
+ trace_graph_function(tr, ip, flags, pc);
+ }
+}
+
+#else
+#define __trace_function trace_function
+
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+ return -EINVAL;
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+ return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
+static void irqsoff_print_header(struct seq_file *s) { }
+static void irqsoff_trace_open(struct trace_iterator *iter) { }
+static void irqsoff_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
/*
* Should this new latency be reported/recorded?
*/
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(delta))
goto out_unlock;
- trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
/* Skip 5 functions to get to the irq/preempt enable function */
__trace_stack(tr, flags, 5, pc);
@@ -172,7 +388,7 @@ out_unlock:
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+ __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
local_save_flags(flags);
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ __trace_function(tr, ip, parent_ip, flags, preempt_count());
per_cpu(tracing_cpu, cpu) = 1;
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
atomic_inc(&data->disabled);
local_save_flags(flags);
- trace_function(tr, ip, parent_ip, flags, preempt_count());
+ __trace_function(tr, ip, parent_ip, flags, preempt_count());
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
-static void start_irqsoff_tracer(struct trace_array *tr)
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
{
- register_ftrace_function(&trace_ops);
- if (tracing_is_enabled())
+ int ret = 0;
+
+ if (!graph)
+ ret = register_ftrace_function(&trace_ops);
+ else
+ ret = register_ftrace_graph(&irqsoff_graph_return,
+ &irqsoff_graph_entry);
+
+ if (!ret && tracing_is_enabled())
tracer_enabled = 1;
else
tracer_enabled = 0;
+
+ return ret;
}
-static void stop_irqsoff_tracer(struct trace_array *tr)
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
- unregister_ftrace_function(&trace_ops);
+
+ if (!graph)
+ unregister_ftrace_function(&trace_ops);
+ else
+ unregister_ftrace_graph();
}
static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
/* make sure that the tracer is visible */
smp_wmb();
tracing_reset_online_cpus(tr);
- start_irqsoff_tracer(tr);
+
+ if (start_irqsoff_tracer(tr, is_graph()))
+ printk(KERN_ERR "failed to start irqsoff tracer\n");
}
static void irqsoff_tracer_reset(struct trace_array *tr)
{
- stop_irqsoff_tracer(tr);
+ stop_irqsoff_tracer(tr, is_graph());
if (!save_lat_flag)
trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = 1,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
};
# define register_irqsoff(trace) register_tracer(&trace)
#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = 1,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
};
# define register_preemptoff(trace) register_tracer(&trace)
#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = 1,
+ .print_header = irqsoff_print_header,
+ .print_line = irqsoff_print_line,
+ .flags = &tracer_flags,
+ .set_flag = irqsoff_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
#endif
+ .open = irqsoff_trace_open,
+ .close = irqsoff_trace_close,
};
# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a7514326052..9a082bba953 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -324,8 +324,8 @@ struct trace_probe {
unsigned long nhit;
unsigned int flags; /* For TP_FLAG_* */
const char *symbol; /* symbol name */
+ struct ftrace_event_class class;
struct ftrace_event_call call;
- struct trace_event event;
ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
@@ -404,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
goto error;
}
+ tp->call.class = &tp->class;
tp->call.name = kstrdup(event, GFP_KERNEL);
if (!tp->call.name)
goto error;
@@ -413,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
goto error;
}
- tp->call.system = kstrdup(group, GFP_KERNEL);
- if (!tp->call.system)
+ tp->class.system = kstrdup(group, GFP_KERNEL);
+ if (!tp->class.system)
goto error;
INIT_LIST_HEAD(&tp->list);
@@ -443,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
for (i = 0; i < tp->nr_args; i++)
free_probe_arg(&tp->args[i]);
- kfree(tp->call.system);
+ kfree(tp->call.class->system);
kfree(tp->call.name);
kfree(tp->symbol);
kfree(tp);
@@ -456,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
list_for_each_entry(tp, &probe_list, list)
if (strcmp(tp->call.name, event) == 0 &&
- strcmp(tp->call.system, group) == 0)
+ strcmp(tp->call.class->system, group) == 0)
return tp;
return NULL;
}
@@ -481,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
mutex_lock(&probe_lock);
/* register as an event */
- old_tp = find_probe_event(tp->call.name, tp->call.system);
+ old_tp = find_probe_event(tp->call.name, tp->call.class->system);
if (old_tp) {
/* delete old event */
unregister_trace_probe(old_tp);
@@ -904,7 +905,7 @@ static int probes_seq_show(struct seq_file *m, void *v)
int i;
seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
- seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
+ seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
if (!tp->symbol)
seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -1061,8 +1062,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
size = sizeof(*entry) + tp->size;
- event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
- irq_flags, pc);
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
@@ -1094,8 +1095,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
size = sizeof(*entry) + tp->size;
- event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
- irq_flags, pc);
+ event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+ size, irq_flags, pc);
if (!event)
return;
@@ -1112,18 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
/* Event entry printers */
enum print_line_t
-print_kprobe_event(struct trace_iterator *iter, int flags)
+print_kprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct kprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
- struct trace_event *event;
struct trace_probe *tp;
u8 *data;
int i;
field = (struct kprobe_trace_entry_head *)iter->ent;
- event = ftrace_find_event(field->ent.type);
- tp = container_of(event, struct trace_probe, event);
+ tp = container_of(event, struct trace_probe, call.event);
if (!trace_seq_printf(s, "%s: (", tp->call.name))
goto partial;
@@ -1149,18 +1149,17 @@ partial:
}
enum print_line_t
-print_kretprobe_event(struct trace_iterator *iter, int flags)
+print_kretprobe_event(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct kretprobe_trace_entry_head *field;
struct trace_seq *s = &iter->seq;
- struct trace_event *event;
struct trace_probe *tp;
u8 *data;
int i;
field = (struct kretprobe_trace_entry_head *)iter->ent;
- event = ftrace_find_event(field->ent.type);
- tp = container_of(event, struct trace_probe, event);
+ tp = container_of(event, struct trace_probe, call.event);
if (!trace_seq_printf(s, "%s: (", tp->call.name))
goto partial;
@@ -1217,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
static int probe_event_raw_init(struct ftrace_event_call *event_call)
{
- INIT_LIST_HEAD(&event_call->fields);
-
return 0;
}
@@ -1353,7 +1350,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
"profile buffer not large enough"))
return;
- entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->event.type,
+ &rctx, &irq_flags);
if (!entry)
return;
@@ -1384,7 +1382,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
"profile buffer not large enough"))
return;
- entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
+ entry = perf_trace_buf_prepare(size, call->event.type,
+ &rctx, &irq_flags);
if (!entry)
return;
@@ -1425,6 +1424,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
}
#endif /* CONFIG_PERF_EVENTS */
+static __kprobes
+int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return probe_event_enable(event);
+ case TRACE_REG_UNREGISTER:
+ probe_event_disable(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return probe_perf_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ probe_perf_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
static __kprobes
int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1454,6 +1473,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
return 0; /* We don't tweek kernel, so just return 0 */
}
+static struct trace_event_functions kretprobe_funcs = {
+ .trace = print_kretprobe_event
+};
+
+static struct trace_event_functions kprobe_funcs = {
+ .trace = print_kprobe_event
+};
+
static int register_probe_event(struct trace_probe *tp)
{
struct ftrace_event_call *call = &tp->call;
@@ -1461,36 +1488,31 @@ static int register_probe_event(struct trace_probe *tp)
/* Initialize ftrace_event_call */
if (probe_is_return(tp)) {
- tp->event.trace = print_kretprobe_event;
- call->raw_init = probe_event_raw_init;
- call->define_fields = kretprobe_event_define_fields;
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &kretprobe_funcs;
+ call->class->raw_init = probe_event_raw_init;
+ call->class->define_fields = kretprobe_event_define_fields;
} else {
- tp->event.trace = print_kprobe_event;
- call->raw_init = probe_event_raw_init;
- call->define_fields = kprobe_event_define_fields;
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &kprobe_funcs;
+ call->class->raw_init = probe_event_raw_init;
+ call->class->define_fields = kprobe_event_define_fields;
}
if (set_print_fmt(tp) < 0)
return -ENOMEM;
- call->event = &tp->event;
- call->id = register_ftrace_event(&tp->event);
- if (!call->id) {
+ ret = register_ftrace_event(&call->event);
+ if (!ret) {
kfree(call->print_fmt);
return -ENODEV;
}
- call->enabled = 0;
- call->regfunc = probe_event_enable;
- call->unregfunc = probe_event_disable;
-
-#ifdef CONFIG_PERF_EVENTS
- call->perf_event_enable = probe_perf_enable;
- call->perf_event_disable = probe_perf_disable;
-#endif
+ call->flags = 0;
+ call->class->reg = kprobe_register;
call->data = tp;
ret = trace_add_event_call(call);
if (ret) {
pr_info("Failed to register kprobe event: %s\n", call->name);
kfree(call->print_fmt);
- unregister_ftrace_event(&tp->event);
+ unregister_ftrace_event(&call->event);
}
return ret;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cd..fc9d4dbb089 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
void *ret;
if (s->full)
- return 0;
+ return NULL;
if (len > ((PAGE_SIZE - 1) - s->len)) {
s->full = 1;
@@ -726,6 +726,9 @@ int register_ftrace_event(struct trace_event *event)
if (WARN_ON(!event))
goto out;
+ if (WARN_ON(!event->funcs))
+ goto out;
+
INIT_LIST_HEAD(&event->list);
if (!event->type) {
@@ -758,14 +761,14 @@ int register_ftrace_event(struct trace_event *event)
goto out;
}
- if (event->trace == NULL)
- event->trace = trace_nop_print;
- if (event->raw == NULL)
- event->raw = trace_nop_print;
- if (event->hex == NULL)
- event->hex = trace_nop_print;
- if (event->binary == NULL)
- event->binary = trace_nop_print;
+ if (event->funcs->trace == NULL)
+ event->funcs->trace = trace_nop_print;
+ if (event->funcs->raw == NULL)
+ event->funcs->raw = trace_nop_print;
+ if (event->funcs->hex == NULL)
+ event->funcs->hex = trace_nop_print;
+ if (event->funcs->binary == NULL)
+ event->funcs->binary = trace_nop_print;
key = event->type & (EVENT_HASHSIZE - 1);
@@ -807,13 +810,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
* Standard events
*/
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return TRACE_TYPE_HANDLED;
}
/* TRACE_FN */
-static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -840,7 +845,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
@@ -854,7 +860,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -867,7 +874,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
@@ -880,14 +888,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_fn_event = {
- .type = TRACE_FN,
+static struct trace_event_functions trace_fn_funcs = {
.trace = trace_fn_trace,
.raw = trace_fn_raw,
.hex = trace_fn_hex,
.binary = trace_fn_bin,
};
+static struct trace_event trace_fn_event = {
+ .type = TRACE_FN,
+ .funcs = &trace_fn_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -916,13 +928,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_print(iter, "==>");
}
static enum print_line_t trace_wake_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
return trace_ctxwake_print(iter, " +");
}
@@ -950,12 +963,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, 0);
}
-static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_raw(iter, '+');
}
@@ -984,18 +999,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
return TRACE_TYPE_HANDLED;
}
-static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, 0);
}
-static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
return trace_ctxwake_hex(iter, '+');
}
static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct ctx_switch_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1012,25 +1029,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_ctx_event = {
- .type = TRACE_CTX,
+static struct trace_event_functions trace_ctx_funcs = {
.trace = trace_ctx_print,
.raw = trace_ctx_raw,
.hex = trace_ctx_hex,
.binary = trace_ctxwake_bin,
};
-static struct trace_event trace_wake_event = {
- .type = TRACE_WAKE,
+static struct trace_event trace_ctx_event = {
+ .type = TRACE_CTX,
+ .funcs = &trace_ctx_funcs,
+};
+
+static struct trace_event_functions trace_wake_funcs = {
.trace = trace_wake_print,
.raw = trace_wake_raw,
.hex = trace_wake_hex,
.binary = trace_ctxwake_bin,
};
+static struct trace_event trace_wake_event = {
+ .type = TRACE_WAKE,
+ .funcs = &trace_wake_funcs,
+};
+
/* TRACE_SPECIAL */
static enum print_line_t trace_special_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct special_entry *field;
@@ -1046,7 +1071,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
}
static enum print_line_t trace_special_hex(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct special_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1061,7 +1086,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
}
static enum print_line_t trace_special_bin(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct special_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1075,18 +1100,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
return TRACE_TYPE_HANDLED;
}
-static struct trace_event trace_special_event = {
- .type = TRACE_SPECIAL,
+static struct trace_event_functions trace_special_funcs = {
.trace = trace_special_print,
.raw = trace_special_print,
.hex = trace_special_hex,
.binary = trace_special_bin,
};
+static struct trace_event trace_special_event = {
+ .type = TRACE_SPECIAL,
+ .funcs = &trace_special_funcs,
+};
+
/* TRACE_STACK */
static enum print_line_t trace_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct stack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1114,17 +1143,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_stack_event = {
- .type = TRACE_STACK,
+static struct trace_event_functions trace_stack_funcs = {
.trace = trace_stack_print,
.raw = trace_special_print,
.hex = trace_special_hex,
.binary = trace_special_bin,
};
+static struct trace_event trace_stack_event = {
+ .type = TRACE_STACK,
+ .funcs = &trace_stack_funcs,
+};
+
/* TRACE_USER_STACK */
static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct userstack_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1143,17 +1176,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_user_stack_event = {
- .type = TRACE_USER_STACK,
+static struct trace_event_functions trace_user_stack_funcs = {
.trace = trace_user_stack_print,
.raw = trace_special_print,
.hex = trace_special_hex,
.binary = trace_special_bin,
};
+static struct trace_event trace_user_stack_event = {
+ .type = TRACE_USER_STACK,
+ .funcs = &trace_user_stack_funcs,
+};
+
/* TRACE_BPRINT */
static enum print_line_t
-trace_bprint_print(struct trace_iterator *iter, int flags)
+trace_bprint_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
@@ -1178,7 +1216,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
static enum print_line_t
-trace_bprint_raw(struct trace_iterator *iter, int flags)
+trace_bprint_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct bprint_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1197,16 +1236,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
+static struct trace_event_functions trace_bprint_funcs = {
+ .trace = trace_bprint_print,
+ .raw = trace_bprint_raw,
+};
static struct trace_event trace_bprint_event = {
.type = TRACE_BPRINT,
- .trace = trace_bprint_print,
- .raw = trace_bprint_raw,
+ .funcs = &trace_bprint_funcs,
};
/* TRACE_PRINT */
static enum print_line_t trace_print_print(struct trace_iterator *iter,
- int flags)
+ int flags, struct trace_event *event)
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
@@ -1225,7 +1267,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
return TRACE_TYPE_PARTIAL_LINE;
}
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct print_entry *field;
@@ -1240,12 +1283,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
return TRACE_TYPE_PARTIAL_LINE;
}
-static struct trace_event trace_print_event = {
- .type = TRACE_PRINT,
+static struct trace_event_functions trace_print_funcs = {
.trace = trace_print_print,
.raw = trace_print_raw,
};
+static struct trace_event trace_print_event = {
+ .type = TRACE_PRINT,
+ .funcs = &trace_print_funcs,
+};
+
static struct trace_event *events[] __initdata = {
&trace_fn_event,
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38..c038eba0492 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
extern struct trace_event *ftrace_find_event(int type);
extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
- int flags);
+ int flags, struct trace_event *event);
extern int
trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde..8f758d070c4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
}
static void
-probe_sched_switch(struct rq *__rq, struct task_struct *prev,
- struct task_struct *next)
+probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
}
static void
-probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
{
struct trace_array_cpu *data;
unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
{
int ret;
- ret = register_trace_sched_wakeup(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return ret;
}
- ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_sched_switch);
+ ret = register_trace_sched_switch(probe_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
return ret;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
return ret;
}
static void tracing_sched_unregister(void)
{
- unregister_trace_sched_switch(probe_sched_switch);
- unregister_trace_sched_wakeup_new(probe_sched_wakeup);
- unregister_trace_sched_wakeup(probe_sched_wakeup);
+ unregister_trace_sched_switch(probe_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
static void tracing_start_sched_switch(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8..0e73bc2ef8c 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
return 1;
}
-static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
+static void
+probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
{
if (task != wakeup_task)
return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
}
static void notrace
-probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
+probe_wakeup_sched_switch(void *ignore,
+ struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
}
static void
-probe_wakeup(struct rq *rq, struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p, int success)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
{
int ret;
- ret = register_trace_sched_wakeup(probe_wakeup);
+ ret = register_trace_sched_wakeup(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup\n");
return;
}
- ret = register_trace_sched_wakeup_new(probe_wakeup);
+ ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_wakeup_new\n");
goto fail_deprobe;
}
- ret = register_trace_sched_switch(probe_wakeup_sched_switch);
+ ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
if (ret) {
pr_info("sched trace: Couldn't activate tracepoint"
" probe to kernel_sched_switch\n");
goto fail_deprobe_wake_new;
}
- ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
+ ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
if (ret) {
pr_info("wakeup trace: Couldn't activate tracepoint"
" probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
return;
fail_deprobe_wake_new:
- unregister_trace_sched_wakeup_new(probe_wakeup);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
fail_deprobe:
- unregister_trace_sched_wakeup(probe_wakeup);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
}
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
- unregister_trace_sched_switch(probe_wakeup_sched_switch);
- unregister_trace_sched_wakeup_new(probe_wakeup);
- unregister_trace_sched_wakeup(probe_wakeup);
- unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
+ unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
+ unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
+ unregister_trace_sched_wakeup(probe_wakeup, NULL);
+ unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
}
static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 1cc9858258b..250e7f9bd2f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
struct trace_entry *entry;
unsigned int loops = 0;
- while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+ while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
entry = ring_buffer_event_data(event);
/*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
-static void __ftrace_dump(bool disable_tracing);
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
if (ftrace_dump_on_oops)
- __ftrace_dump(false);
+ __ftrace_dump(false, DUMP_ALL);
return 0;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f..9d358301ae3 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,54 @@ static int sys_refcount_exit;
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type);
+
+static int syscall_enter_define_fields(struct ftrace_event_call *call);
+static int syscall_exit_define_fields(struct ftrace_event_call *call);
+
+static struct list_head *
+syscall_get_enter_fields(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ return &entry->enter_fields;
+}
+
+static struct list_head *
+syscall_get_exit_fields(struct ftrace_event_call *call)
+{
+ struct syscall_metadata *entry = call->data;
+
+ return &entry->exit_fields;
+}
+
+struct trace_event_functions enter_syscall_print_funcs = {
+ .trace = print_syscall_enter,
+};
+
+struct trace_event_functions exit_syscall_print_funcs = {
+ .trace = print_syscall_exit,
+};
+
+struct ftrace_event_class event_class_syscall_enter = {
+ .system = "syscalls",
+ .reg = syscall_enter_register,
+ .define_fields = syscall_enter_define_fields,
+ .get_fields = syscall_get_enter_fields,
+ .raw_init = init_syscall_trace,
+};
+
+struct ftrace_event_class event_class_syscall_exit = {
+ .system = "syscalls",
+ .reg = syscall_exit_register,
+ .define_fields = syscall_exit_define_fields,
+ .get_fields = syscall_get_exit_fields,
+ .raw_init = init_syscall_trace,
+};
+
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];
@@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
}
enum print_line_t
-print_syscall_enter(struct trace_iterator *iter, int flags)
+print_syscall_enter(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
if (!entry)
goto end;
- if (entry->enter_event->id != ent->type) {
+ if (entry->enter_event->event.type != ent->type) {
WARN_ON_ONCE(1);
goto end;
}
@@ -105,7 +154,8 @@ end:
}
enum print_line_t
-print_syscall_exit(struct trace_iterator *iter, int flags)
+print_syscall_exit(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
return TRACE_TYPE_HANDLED;
}
- if (entry->exit_event->id != ent->type) {
+ if (entry->exit_event->event.type != ent->type) {
WARN_ON_ONCE(1);
return TRACE_TYPE_UNHANDLED;
}
@@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
-int syscall_enter_define_fields(struct ftrace_event_call *call)
+static int syscall_enter_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
-int syscall_exit_define_fields(struct ftrace_event_call *call)
+static int syscall_exit_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
-void ftrace_syscall_enter(struct pt_regs *regs, long id)
+void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
@@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->enter_event->id, size, 0, 0);
+ sys_data->enter_event->event.type, size, 0, 0);
if (!event)
return;
@@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-void ftrace_syscall_exit(struct pt_regs *regs, long ret)
+void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
@@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
return;
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->exit_event->id, sizeof(*entry), 0, 0);
+ sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
if (!event)
return;
@@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_enter)
- ret = register_trace_sys_enter(ftrace_syscall_enter);
+ ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
if (!ret) {
set_bit(num, enabled_enter_syscalls);
sys_refcount_enter++;
@@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
sys_refcount_enter--;
clear_bit(num, enabled_enter_syscalls);
if (!sys_refcount_enter)
- unregister_trace_sys_enter(ftrace_syscall_enter);
+ unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
mutex_unlock(&syscall_trace_lock);
}
@@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_exit)
- ret = register_trace_sys_exit(ftrace_syscall_exit);
+ ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
if (!ret) {
set_bit(num, enabled_exit_syscalls);
sys_refcount_exit++;
@@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
sys_refcount_exit--;
clear_bit(num, enabled_exit_syscalls);
if (!sys_refcount_exit)
- unregister_trace_sys_exit(ftrace_syscall_exit);
+ unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
mutex_unlock(&syscall_trace_lock);
}
@@ -434,7 +484,7 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
-static void perf_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
@@ -461,7 +511,8 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
return;
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->id, &rctx, &flags);
+ sys_data->enter_event->event.type,
+ &rctx, &flags);
if (!rec)
return;
@@ -480,7 +531,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
mutex_lock(&syscall_trace_lock);
if (!sys_perf_refcount_enter)
- ret = register_trace_sys_enter(perf_syscall_enter);
+ ret = register_trace_sys_enter(perf_syscall_enter, NULL);
if (ret) {
pr_info("event trace: Could not activate"
"syscall entry trace point");
@@ -502,11 +553,11 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
sys_perf_refcount_enter--;
clear_bit(num, enabled_perf_enter_syscalls);
if (!sys_perf_refcount_enter)
- unregister_trace_sys_enter(perf_syscall_enter);
+ unregister_trace_sys_enter(perf_syscall_enter, NULL);
mutex_unlock(&syscall_trace_lock);
}
-static void perf_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
@@ -536,7 +587,8 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
return;
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
- sys_data->exit_event->id, &rctx, &flags);
+ sys_data->exit_event->event.type,
+ &rctx, &flags);
if (!rec)
return;
@@ -555,7 +607,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
mutex_lock(&syscall_trace_lock);
if (!sys_perf_refcount_exit)
- ret = register_trace_sys_exit(perf_syscall_exit);
+ ret = register_trace_sys_exit(perf_syscall_exit, NULL);
if (ret) {
pr_info("event trace: Could not activate"
"syscall exit trace point");
@@ -577,9 +629,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
sys_perf_refcount_exit--;
clear_bit(num, enabled_perf_exit_syscalls);
if (!sys_perf_refcount_exit)
- unregister_trace_sys_exit(perf_syscall_exit);
+ unregister_trace_sys_exit(perf_syscall_exit, NULL);
mutex_unlock(&syscall_trace_lock);
}
#endif /* CONFIG_PERF_EVENTS */
+static int syscall_enter_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_enter(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_enter(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysenter_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysenter_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
+
+static int syscall_exit_register(struct ftrace_event_call *event,
+ enum trace_reg type)
+{
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ return reg_event_syscall_exit(event);
+ case TRACE_REG_UNREGISTER:
+ unreg_event_syscall_exit(event);
+ return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ return perf_sysexit_enable(event);
+ case TRACE_REG_PERF_UNREGISTER:
+ perf_sysexit_disable(event);
+ return 0;
+#endif
+ }
+ return 0;
+}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9..a7cc3793baf 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
/* Insertion of a work */
static void
-probe_workqueue_insertion(struct task_struct *wq_thread,
+probe_workqueue_insertion(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
/* Execution of a work */
static void
-probe_workqueue_execution(struct task_struct *wq_thread,
+probe_workqueue_execution(void *ignore,
+ struct task_struct *wq_thread,
struct work_struct *work)
{
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
}
/* Creation of a cpu workqueue thread */
-static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+static void probe_workqueue_creation(void *ignore,
+ struct task_struct *wq_thread, int cpu)
{
struct cpu_workqueue_stats *cws;
unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
}
/* Destruction of a cpu workqueue thread */
-static void probe_workqueue_destruction(struct task_struct *wq_thread)
+static void
+probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
{
/* Workqueue only execute on one cpu */
int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
{
int ret, cpu;
- ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+ ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
if (ret)
goto out;
- ret = register_trace_workqueue_execution(probe_workqueue_execution);
+ ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
if (ret)
goto no_insertion;
- ret = register_trace_workqueue_creation(probe_workqueue_creation);
+ ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
if (ret)
goto no_execution;
- ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+ ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
if (ret)
goto no_creation;
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
return 0;
no_creation:
- unregister_trace_workqueue_creation(probe_workqueue_creation);
+ unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
no_execution:
- unregister_trace_workqueue_execution(probe_workqueue_execution);
+ unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
no_insertion:
- unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+ unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
out:
pr_warning("trace_workqueue: unable to trace workqueues\n");
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f..c77f3eceea2 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
*/
struct tracepoint_entry {
struct hlist_node hlist;
- void **funcs;
+ struct tracepoint_func *funcs;
int refcount; /* Number of times armed. 0 if disarmed. */
char name[0];
};
@@ -64,12 +64,12 @@ struct tp_probes {
struct rcu_head rcu;
struct list_head list;
} u;
- void *probes[0];
+ struct tracepoint_func probes[0];
};
static inline void *allocate_probes(int count)
{
- struct tp_probes *p = kmalloc(count * sizeof(void *)
+ struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
+ sizeof(struct tp_probes), GFP_KERNEL);
return p == NULL ? NULL : p->probes;
}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
kfree(container_of(head, struct tp_probes, u.rcu));
}
-static inline void release_probes(void *old)
+static inline void release_probes(struct tracepoint_func *old)
{
if (old) {
struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
if (!tracepoint_debug || !entry->funcs)
return;
- for (i = 0; entry->funcs[i]; i++)
- printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
+ for (i = 0; entry->funcs[i].func; i++)
+ printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
}
-static void *
-tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
+static struct tracepoint_func *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry,
+ void *probe, void *data)
{
int nr_probes = 0;
- void **old, **new;
+ struct tracepoint_func *old, *new;
WARN_ON(!probe);
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
old = entry->funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes]; nr_probes++)
- if (old[nr_probes] == probe)
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == probe &&
+ old[nr_probes].data == data)
return ERR_PTR(-EEXIST);
}
/* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old)
- memcpy(new, old, nr_probes * sizeof(void *));
- new[nr_probes] = probe;
- new[nr_probes + 1] = NULL;
+ memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+ new[nr_probes].func = probe;
+ new[nr_probes].data = data;
+ new[nr_probes + 1].func = NULL;
entry->refcount = nr_probes + 1;
entry->funcs = new;
debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
}
static void *
-tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
+ void *probe, void *data)
{
int nr_probes = 0, nr_del = 0, i;
- void **old, **new;
+ struct tracepoint_func *old, *new;
old = entry->funcs;
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
debug_print_probes(entry);
/* (N -> M), (N > 1, M >= 0) probes */
- for (nr_probes = 0; old[nr_probes]; nr_probes++) {
- if ((!probe || old[nr_probes] == probe))
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if (!probe ||
+ (old[nr_probes].func == probe &&
+ old[nr_probes].data == data))
nr_del++;
}
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
new = allocate_probes(nr_probes - nr_del + 1);
if (new == NULL)
return ERR_PTR(-ENOMEM);
- for (i = 0; old[i]; i++)
- if ((probe && old[i] != probe))
+ for (i = 0; old[i].func; i++)
+ if (probe &&
+ (old[i].func != probe || old[i].data != data))
new[j++] = old[i];
- new[nr_probes - nr_del] = NULL;
+ new[nr_probes - nr_del].func = NULL;
entry->refcount = nr_probes - nr_del;
entry->funcs = new;
}
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
module_update_tracepoints();
}
-static void *tracepoint_add_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_add_probe(const char *name, void *probe, void *data)
{
struct tracepoint_entry *entry;
- void *old;
+ struct tracepoint_func *old;
entry = get_tracepoint(name);
if (!entry) {
entry = add_tracepoint(name);
if (IS_ERR(entry))
- return entry;
+ return (struct tracepoint_func *)entry;
}
- old = tracepoint_entry_add_probe(entry, probe);
+ old = tracepoint_entry_add_probe(entry, probe, data);
if (IS_ERR(old) && !entry->refcount)
remove_tracepoint(entry);
return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
* Returns 0 if ok, error value on error.
* The probe address must at least be aligned on the architecture pointer size.
*/
-int tracepoint_probe_register(const char *name, void *probe)
+int tracepoint_probe_register(const char *name, void *probe, void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe);
+ old = tracepoint_add_probe(name, probe, data);
mutex_unlock(&tracepoints_mutex);
if (IS_ERR(old))
return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
-static void *tracepoint_remove_probe(const char *name, void *probe)
+static struct tracepoint_func *
+tracepoint_remove_probe(const char *name, void *probe, void *data)
{
struct tracepoint_entry *entry;
- void *old;
+ struct tracepoint_func *old;
entry = get_tracepoint(name);
if (!entry)
return ERR_PTR(-ENOENT);
- old = tracepoint_entry_remove_probe(entry, probe);
+ old = tracepoint_entry_remove_probe(entry, probe, data);
if (IS_ERR(old))
return old;
if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
* itself uses stop_machine(), which insures that every preempt disabled section
* have finished.
*/
-int tracepoint_probe_unregister(const char *name, void *probe)
+int tracepoint_probe_unregister(const char *name, void *probe, void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe);
+ old = tracepoint_remove_probe(name, probe, data);
mutex_unlock(&tracepoints_mutex);
if (IS_ERR(old))
return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
*
* caller must call tracepoint_probe_update_all()
*/
-int tracepoint_probe_register_noupdate(const char *name, void *probe)
+int tracepoint_probe_register_noupdate(const char *name, void *probe,
+ void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_add_probe(name, probe);
+ old = tracepoint_add_probe(name, probe, data);
if (IS_ERR(old)) {
mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
*
* caller must call tracepoint_probe_update_all()
*/
-int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
+ void *data)
{
- void *old;
+ struct tracepoint_func *old;
mutex_lock(&tracepoints_mutex);
- old = tracepoint_remove_probe(name, probe);
+ old = tracepoint_remove_probe(name, probe, data);
if (IS_ERR(old)) {
mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb..7e72614b736 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/user_namespace.h>
-#include "cred-internals.h"
struct user_namespace init_user_ns = {
.kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up, *new;
- /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
- * atomic.
- */
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- /* This case is not possible when CONFIG_USER_SCHED
- * is defined, since we serialize alloc_uid() using
- * uids_mutex. Hence no need to call
- * sched_destroy_user() or remove_user_sysfs_dir().
- */
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
return up;
- put_user_ns(new->user_ns);
- kmem_cache_free(uid_cachep, new);
out_unlock:
return NULL;
}