From 45c01e824991b2dd0a332e19efc4901acb31209f Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:20:41 +0200 Subject: sched: prioritize non-migratable tasks over migratable ones Dmitry Adamushko pointed out a known flaw in the rt-balancing algorithm that could allow suboptimal balancing if a non-migratable task gets queued behind a running migratable one. It is discussed in this thread: http://lkml.org/lkml/2008/4/22/296 This issue has been further exacerbated by a recent checkin to sched-devel (git-id 5eee63a5ebc19a870ac40055c0be49457f3a89a3). >From a pure priority standpoint, the run-queue is doing the "right" thing. Using Dmitry's nomenclature, if T0 is on cpu1 first, and T1 wakes up at equal or lower priority (affined only to cpu1) later, it *should* wait for T0 to finish. However, in reality that is likely suboptimal from a system perspective if there are other cores that could allow T0 and T1 to run concurrently. Since T1 can not migrate, the only choice for higher concurrency is to try to move T0. This is not something we addessed in the recent rt-balancing re-work. This patch tries to enhance the balancing algorithm by accomodating this scenario. It accomplishes this by incorporating the migratability of a task into its priority calculation. Within a numerical tsk->prio, a non-migratable task is logically higher than a migratable one. We maintain this by introducing a new per-priority queue (xqueue, or exclusive-queue) for holding non-migratable tasks. The scheduler will draw from the xqueue over the standard shared-queue (squeue) when available. There are several details for utilizing this properly. 1) During task-wake-up, we not only need to check if the priority preempts the current task, but we also need to check for this non-migratable condition. Therefore, if a non-migratable task wakes up and sees an equal priority migratable task already running, it will attempt to preempt it *if* there is a likelyhood that the current task will find an immediate home. 2) Tasks only get this non-migratable "priority boost" on wake-up. Any requeuing will result in the non-migratable task being queued to the end of the shared queue. This is an attempt to prevent the system from being completely unfair to migratable tasks during things like SCHED_RR timeslicing. I am sure this patch introduces potentially "odd" behavior if you concoct a scenario where a bunch of non-migratable threads could starve migratable ones given the right pattern. I am not yet convinced that this is a problem since we are talking about tasks of equal RT priority anyway, and there never is much in the way of guarantees against starvation under that scenario anyway. (e.g. you could come up with a similar scenario with a specific timing environment verses an affinity environment). I can be convinced otherwise, but for now I think this is "ok". Signed-off-by: Gregory Haskins CC: Dmitry Adamushko CC: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 6 +++-- kernel/sched_rt.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index bfb8ad8ed17..7178b8c2351 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -151,7 +151,8 @@ static inline int task_has_rt_policy(struct task_struct *p) */ struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; + struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */ + struct list_head squeue[MAX_RT_PRIO]; /* shared queue */ }; struct rt_bandwidth { @@ -7542,7 +7543,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) array = &rt_rq->active; for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); + INIT_LIST_HEAD(array->xqueue + i); + INIT_LIST_HEAD(array->squeue + i); __clear_bit(i, array->bitmap); } /* delimiter for bitsearch: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3432d573205..fefed39fafd 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -458,7 +458,13 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) if (group_rq && rt_rq_throttled(group_rq)) return; - list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + if (rt_se->nr_cpus_allowed == 1) + list_add_tail(&rt_se->run_list, + array->xqueue + rt_se_prio(rt_se)); + else + list_add_tail(&rt_se->run_list, + array->squeue + rt_se_prio(rt_se)); + __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); @@ -470,7 +476,8 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) + if (list_empty(array->squeue + rt_se_prio(rt_se)) + && list_empty(array->xqueue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); @@ -537,13 +544,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. + * + * Note: We always enqueue the task to the shared-queue, regardless of its + * previous position w.r.t. exclusive vs shared. This is so that exclusive RR + * tasks fairly round-robin with all tasks on the runqueue, not just other + * exclusive tasks. */ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; - list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + list_del_init(&rt_se->run_list); + list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se)); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -601,13 +614,46 @@ static int select_task_rq_rt(struct task_struct *p, int sync) } #endif /* CONFIG_SMP */ +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq); + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) { - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio) { resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * If: + * + * - the newly woken task is of equal priority to the current task + * - the newly woken task is non-migratable while current is migratable + * - current will be preempted on the next reschedule + * + * we should check to see if current can readily move to a different + * cpu. If so, we will reschedule to allow the push logic to try + * to move current somewhere else, making room for our non-migratable + * task. + */ + if((p->prio == rq->curr->prio) + && p->rt.nr_cpus_allowed == 1 + && rq->curr->rt.nr_cpus_allowed != 1 + && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) { + cpumask_t mask; + + if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) + /* + * There appears to be other cpus that can accept + * current, so lets reschedule to try and push it away + */ + resched_task(rq->curr); + } +#endif } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -621,8 +667,15 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); - queue = array->queue + idx; - next = list_entry(queue->next, struct sched_rt_entity, run_list); + queue = array->xqueue + idx; + if (!list_empty(queue)) + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + else { + queue = array->squeue + idx; + next = list_entry(queue->next, struct sched_rt_entity, + run_list); + } return next; } @@ -692,7 +745,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) continue; if (next && next->prio < idx) continue; - list_for_each_entry(rt_se, array->queue + idx, run_list) { + list_for_each_entry(rt_se, array->squeue + idx, run_list) { struct task_struct *p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; @@ -1146,6 +1199,14 @@ static void set_cpus_allowed_rt(struct task_struct *p, } update_rt_migration(rq); + + if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1)) + /* + * If either the new or old weight is a "1", we need + * to requeue to properly move between shared and + * exclusive queues. + */ + requeue_task_rt(rq, p); } p->cpus_allowed = *new_mask; -- cgit From f333fdc9098b71e2687e4e9b6349fcb352960d66 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 12 May 2008 21:20:55 +0200 Subject: sched: make !hrtick faster it is safe to ignore timers and flags when the feature is disabled. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7178b8c2351..aa960b84b88 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4134,7 +4134,7 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; - int cpu; + int cpu, hrtick = sched_feat(HRTICK); need_resched: preempt_disable(); @@ -4149,7 +4149,8 @@ need_resched_nonpreemptible: schedule_debug(prev); - hrtick_clear(rq); + if (hrtick) + hrtick_clear(rq); /* * Do the rq-clock update outside the rq lock: @@ -4197,7 +4198,8 @@ need_resched_nonpreemptible: } else spin_unlock_irq(&rq->lock); - hrtick_set(rq); + if (hrtick) + hrtick_set(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -- cgit From 6e0534f278199f1e3dd1049b9bc19a7a5b87ada1 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:21:01 +0200 Subject: sched: use a 2-d bitmap for searching lowest-pri CPU The current code use a linear algorithm which causes scaling issues on larger SMP machines. This patch replaces that algorithm with a 2-dimensional bitmap to reduce latencies in the wake-up path. Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/Makefile | 1 + kernel/sched.c | 7 ++ kernel/sched_cpupri.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_cpupri.h | 36 +++++++++++ kernel/sched_rt.c | 98 ++++++---------------------- 5 files changed, 239 insertions(+), 77 deletions(-) create mode 100644 kernel/sched_cpupri.c create mode 100644 kernel/sched_cpupri.h diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9..ecdd2d33563 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_SMP) += sched_cpupri.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/sched.c b/kernel/sched.c index aa960b84b88..8a1257b6556 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -74,6 +74,8 @@ #include #include +#include "sched_cpupri.h" + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -450,6 +452,9 @@ struct root_domain { */ cpumask_t rto_mask; atomic_t rto_count; +#ifdef CONFIG_SMP + struct cpupri cpupri; +#endif }; /* @@ -6392,6 +6397,8 @@ static void init_rootdomain(struct root_domain *rd) cpus_clear(rd->span); cpus_clear(rd->online); + + cpupri_init(&rd->cpupri); } static void init_defrootdomain(void) diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c new file mode 100644 index 00000000000..52154fefab7 --- /dev/null +++ b/kernel/sched_cpupri.c @@ -0,0 +1,174 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007-2008 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include "sched_cpupri.h" + +/* Convert between a 140 based task->prio, and our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ + idx < CPUPRI_NR_PRIORITIES; \ + idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1)) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs + * + * Note: This function returns the recommended CPUs as calculated during the + * current invokation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. + * + * Returns: (int)bool - CPUs were found + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + cpumask_t *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + cpumask_t mask; + + if (idx >= task_pri) + break; + + cpus_and(mask, p->cpus_allowed, vec->mask); + + if (cpus_empty(mask)) + continue; + + *lowest_mask = mask; + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @pri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * first need to unmap the old value + */ + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpu_clear(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + spin_lock_irqsave(&vec->lock, flags); + + cpu_set(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Returns: (void) + */ +void cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + spin_lock_init(&vec->lock); + vec->count = 0; + cpus_clear(vec->mask); + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; +} + + diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h new file mode 100644 index 00000000000..0b6a3d110fa --- /dev/null +++ b/kernel/sched_cpupri.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO +#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + spinlock_t lock; + int count; + cpumask_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, cpumask_t *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +void cpupri_init(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fefed39fafd..44b06d75416 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -391,8 +391,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - if (rt_se_prio(rt_se) < rt_rq->highest_prio) + if (rt_se_prio(rt_se) < rt_rq->highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { @@ -416,6 +419,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { +#ifdef CONFIG_SMP + int highest_prio = rt_rq->highest_prio; +#endif + WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; @@ -439,6 +446,11 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rq->rt.rt_nr_migratory--; } + if (rt_rq->highest_prio != highest_prio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + } + update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ #ifdef CONFIG_RT_GROUP_SCHED @@ -763,73 +775,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); -static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) -{ - int lowest_prio = -1; - int lowest_cpu = -1; - int count = 0; - int cpu; - - cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); - - /* - * Scan each rq for the lowest prio. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - struct rq *rq = cpu_rq(cpu); - - /* We look for lowest RT prio or non-rt CPU */ - if (rq->rt.highest_prio >= MAX_RT_PRIO) { - /* - * if we already found a low RT queue - * and now we found this non-rt queue - * clear the mask and set our bit. - * Otherwise just return the queue as is - * and the count==1 will cause the algorithm - * to use the first bit found. - */ - if (lowest_cpu != -1) { - cpus_clear(*lowest_mask); - cpu_set(rq->cpu, *lowest_mask); - } - return 1; - } - - /* no locking for now */ - if ((rq->rt.highest_prio > task->prio) - && (rq->rt.highest_prio >= lowest_prio)) { - if (rq->rt.highest_prio > lowest_prio) { - /* new low - clear old data */ - lowest_prio = rq->rt.highest_prio; - lowest_cpu = cpu; - count = 0; - } - count++; - } else - cpu_clear(cpu, *lowest_mask); - } - - /* - * Clear out all the set bits that represent - * runqueues that were of higher prio than - * the lowest_prio. - */ - if (lowest_cpu > 0) { - /* - * Perhaps we could add another cpumask op to - * zero out bits. Like cpu_zero_bits(cpumask, nrbits); - * Then that could be optimized to use memset and such. - */ - for_each_cpu_mask(cpu, *lowest_mask) { - if (cpu >= lowest_cpu) - break; - cpu_clear(cpu, *lowest_mask); - } - } - - return count; -} - static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) { int first; @@ -851,17 +796,12 @@ static int find_lowest_rq(struct task_struct *task) cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); - int count = find_lowest_cpus(task, lowest_mask); - if (!count) - return -1; /* No targets found */ + if (task->rt.nr_cpus_allowed == 1) + return -1; /* No other targets possible */ - /* - * There is no sense in performing an optimal search if only one - * target is found. - */ - if (count == 1) - return first_cpu(*lowest_mask); + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return -1; /* No targets found */ /* * At this point we have built a mask of cpus representing the @@ -1218,6 +1158,8 @@ static void join_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } /* Assumes rq->lock is held */ @@ -1225,6 +1167,8 @@ static void leave_domain_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } /* -- cgit From 6d299f1b53b84e2665f402d9bcc494800aba6386 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 12 May 2008 21:21:14 +0200 Subject: sched: fix SCHED_OTHER balance iterator to include all tasks The currently logic inadvertently skips the last task on the run-queue, resulting in missed balance opportunities. Signed-off-by: Gregory Haskins Signed-off-by: David Bahi CC: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched_fair.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 08ae848b71d..1fe4c65a817 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1275,23 +1275,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) struct task_struct *p = NULL; struct sched_entity *se; - if (next == &cfs_rq->tasks) - return NULL; - - /* Skip over entities that are not tasks */ - do { + while (next != &cfs_rq->tasks) { se = list_entry(next, struct sched_entity, group_node); next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - if (next == &cfs_rq->tasks) - return NULL; + /* Skip over entities that are not tasks */ + if (entity_is_task(se)) { + p = task_of(se); + break; + } + } cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); - return p; } -- cgit From d07355f5def74d060333563b36ab51b89fd44cdd Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 12 May 2008 21:21:15 +0200 Subject: sched: check for SD_SERIALIZE atomically in rebalance_domains() Nothing really serious here, mainly just a matter of nit-picking :-/ From: Dmitry Adamushko For CONFIG_SCHED_DEBUG && CONFIG_SYSCT configs, sd->flags can be altered while being manipulated in rebalance_domains(). Let's do an atomic check. We rely here on the atomicity of read/write accesses for aligned words. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8a1257b6556..90329f1f894 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3668,6 +3668,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + int need_serialize; cpumask_t tmp; for_each_domain(cpu, sd) { @@ -3685,8 +3686,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (interval > HZ*NR_CPUS/10) interval = HZ*NR_CPUS/10; + need_serialize = sd->flags & SD_SERIALIZE; - if (sd->flags & SD_SERIALIZE) { + if (need_serialize) { if (!spin_trylock(&balancing)) goto out; } @@ -3702,7 +3704,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } sd->last_balance = jiffies; } - if (sd->flags & SD_SERIALIZE) + if (need_serialize) spin_unlock(&balancing); out: if (time_after(next_balance, sd->last_balance + interval)) { -- cgit From f7dcd80bbc8e7032443e6539ea1b830364f82200 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 23:20:38 +0200 Subject: namespacecheck: fixes in kernel/sched.c Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 90329f1f894..02a5eeedcb9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1633,7 +1633,7 @@ inline int task_curr(const struct task_struct *p) } /* Used instead of source_load when we know the type == 0 */ -unsigned long weighted_cpuload(const int cpu) +static unsigned long weighted_cpuload(const int cpu) { return cpu_rq(cpu)->load.weight; } -- cgit From 554ec22f075d46e4363520a407d2b7eeb5dfdd43 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:03 +0200 Subject: namespacecheck: more sched.c fixes [ Stephen Rothwell : build fix ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ae0be3c6237..dc36c3aea01 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -134,7 +134,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; @@ -823,23 +822,6 @@ extern int arch_reinit_sched_domains(void); #endif /* CONFIG_SMP */ -/* - * A runqueue laden with a single nice 0 task scores a weighted_cpuload of - * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a - * task of nice 0 or enough lower priority tasks to bring up the - * weighted_cpuload - */ -static inline int above_background_load(void) -{ - unsigned long cpu; - - for_each_online_cpu(cpu) { - if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) - return 1; - } - return 0; -} - struct io_context; /* See blkdev.h */ #define NGROUPS_SMALL 32 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -- cgit From 81d41d7ece23a1c3b4bcd1604026d3a06cc4dc79 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Sun, 11 May 2008 05:55:33 +0530 Subject: sched: fix defined-but-unused warning Fix this warning, which appears with !CONFIG_SMP: kernel/sched.c:1216: warning: `init_hrtick' defined but not used Signed-off-by: Rabin Vincent Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 02a5eeedcb9..f3faec52c5a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1130,6 +1130,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } +#ifdef CONFIG_SMP static void hotplug_hrtick_disable(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -1185,6 +1186,7 @@ static void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } +#endif /* CONFIG_SMP */ static void init_rq_hrtick(struct rq *rq) { -- cgit From c7aceaba042702538b23cf4e0de1b2891ad8e671 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 15 May 2008 12:09:15 +0100 Subject: sched: reorder task_struct to reduce padding on 64bit builds This patch removes 24 bytes of padding and allows 1 extra object per slab on my fedora based config. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index dc36c3aea01..ea2857b9959 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,7 @@ struct task_struct { #endif int prio, static_prio, normal_prio; + unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -1104,7 +1105,6 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - unsigned int rt_priority; cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; cputime_t prev_utime, prev_stime; @@ -1123,12 +1123,12 @@ struct task_struct { gid_t gid,egid,sgid,fsgid; struct group_info *group_info; kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - unsigned securebits; struct user_struct *user; + unsigned securebits; #ifdef CONFIG_KEYS + unsigned char jit_keyring; /* default keyring to attach requested keys to */ struct key *request_key_auth; /* assumed request_key authority */ struct key *thread_keyring; /* keyring private to this thread */ - unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -1215,8 +1215,8 @@ struct task_struct { # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; - struct held_lock held_locks[MAX_LOCK_DEPTH]; unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif /* journalling filesystem info */ @@ -1244,10 +1244,6 @@ struct task_struct { u64 acct_vm_mem1; /* accumulated virtual memory usage */ cputime_t acct_stimexpd;/* stime since last update */ #endif -#ifdef CONFIG_NUMA - struct mempolicy *mempolicy; - short il_next; -#endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; int cpuset_mems_generation; @@ -1266,6 +1262,10 @@ struct task_struct { #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *mempolicy; + short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; -- cgit From e21f5b153b9b4a6775d7d41964e372e13a9178ab Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 23 May 2008 09:05:58 -0700 Subject: sched: print module list in the "scheduling while atomic" warning For the normal WARN_ON() etc we added a print-the-modules-list already, which is very useful to figure out candidates for certain types of bugs. This patch adds the same print to the "scheduling while atomic" BUG warning, for the same reason: when we get here it's very useful to see which modules are loaded, to narrow down the candidate code list. Signed-off-by: Arjan van de Ven Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched.c b/kernel/sched.c index f3faec52c5a..84a360670b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4070,6 +4070,7 @@ static noinline void __schedule_bug(struct task_struct *prev) prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); + print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -- cgit From 6d6bc0ad867c46896d0994bb039e7550ecb9b51d Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Fri, 30 May 2008 14:23:45 +0200 Subject: sched: add comments for ifdefs in sched.c make sched.c easier to read. Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/sched.c | 76 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 84a360670b9..ef4e25604bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -292,15 +292,15 @@ struct task_group root_task_group; static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; -#endif -#else +#endif /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ #define root_task_group init_task_group -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -310,9 +310,9 @@ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else +#else /* !CONFIG_USER_SCHED */ # define INIT_TASK_GROUP_LOAD NICE_0_LOAD -#endif +#endif /* CONFIG_USER_SCHED */ /* * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. @@ -1316,15 +1316,15 @@ void wake_up_idle_cpu(int cpu) if (!tsk_is_polling(rq->idle)) smp_send_reschedule(cpu); } -#endif +#endif /* CONFIG_NO_HZ */ -#else +#else /* !CONFIG_SMP */ static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); set_tsk_thread_flag(p, tif_bit); } -#endif +#endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) @@ -2129,7 +2129,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } } } -#endif +#endif /* CONFIG_SCHEDSTATS */ out_activate: #endif /* CONFIG_SMP */ @@ -2329,7 +2329,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, notifier->ops->sched_out(notifier, next); } -#else +#else /* !CONFIG_PREEMPT_NOTIFIERS */ static void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -2341,7 +2341,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif +#endif /* CONFIG_PREEMPT_NOTIFIERS */ /** * prepare_task_switch - prepare to switch tasks @@ -6300,9 +6300,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } kfree(groupmask); } -#else +#else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) -#endif +#endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) { @@ -6598,7 +6598,7 @@ static void sched_domain_node_span(int node, cpumask_t *span) cpus_or(*span, *span, *nodemask); } } -#endif +#endif /* CONFIG_NUMA */ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; @@ -6617,7 +6617,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, *sg = &per_cpu(sched_group_cpus, cpu); return cpu; } -#endif +#endif /* CONFIG_SCHED_SMT */ /* * multi-core sched-domains: @@ -6625,7 +6625,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_core); -#endif +#endif /* CONFIG_SCHED_MC */ #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int @@ -6727,7 +6727,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) sg = sg->next; } while (sg != group_head); } -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ @@ -6764,11 +6764,11 @@ next_sg: sched_group_nodes_bycpu[cpu] = NULL; } } -#else +#else /* !CONFIG_NUMA */ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } -#endif +#endif /* CONFIG_NUMA */ /* * Initialize sched groups cpu_power. @@ -7459,7 +7459,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) #endif return err; } -#endif +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ /* * Force a reinitialization of the sched domains hierarchy. The domains @@ -7677,8 +7677,8 @@ void __init sched_init(void) root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -7692,8 +7692,8 @@ void __init sched_init(void) root_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ } #ifdef CONFIG_SMP @@ -7709,8 +7709,8 @@ void __init sched_init(void) #ifdef CONFIG_USER_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), RUNTIME_INF); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); @@ -7720,8 +7720,8 @@ void __init sched_init(void) INIT_LIST_HEAD(&root_task_group.children); init_task_group.parent = &root_task_group; list_add(&init_task_group.siblings, &root_task_group.children); -#endif -#endif +#endif /* CONFIG_USER_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ for_each_possible_cpu(i) { struct rq *rq; @@ -8040,7 +8040,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); } -#else +#else /* !CONFG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } @@ -8058,7 +8058,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu) static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static void free_rt_sched_group(struct task_group *tg) @@ -8129,7 +8129,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) { list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { } @@ -8147,7 +8147,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu) static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) { } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) @@ -8258,7 +8258,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } -#endif +#endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -8508,7 +8508,7 @@ static int sched_rt_global_constraints(void) return ret; } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static int sched_rt_global_constraints(void) { unsigned long flags; @@ -8526,7 +8526,7 @@ static int sched_rt_global_constraints(void) return 0; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, @@ -8634,7 +8634,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, @@ -8658,7 +8658,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) { return sched_group_rt_period(cgroup_tg(cgrp)); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ static struct cftype cpu_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit From e29c98d12b3f02d6ac711c60b4a5f8a46d1cf19b Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 29 May 2008 12:36:18 +0530 Subject: sched: update the sched-domains debug documentation SCHED_DOMAIN_DEBUG mentioned in the Documentation for sched-domains for enabling sched-domains debugging doesn't exist anymore. Update the documentation to reflect the correct way of enabling sched-domain debugging. Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-domains.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt index a9e990ab980..373ceacc367 100644 --- a/Documentation/scheduler/sched-domains.txt +++ b/Documentation/scheduler/sched-domains.txt @@ -61,10 +61,7 @@ builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your arch_init_sched_domains function. This function will attach domains to all CPUs using cpu_attach_domain. -Implementors should change the line -#undef SCHED_DOMAIN_DEBUG -to -#define SCHED_DOMAIN_DEBUG -in kernel/sched.c as this enables an error checking parse of the sched domains +The sched-domains debugging infrastructure can be enabled by enabling +CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains which should catch most possible errors (described above). It also prints out the domain structure in a visual format. -- cgit From 099f98c8a1f13501a98afbfff4756395a610581c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Thu, 29 May 2008 20:56:32 +0530 Subject: sched: print the sd->level in sched_domain_debug code While printing out the visual representation of the sched-domains, print the level (MC, SMT, CPU, NODE, ... ) of each of the sched_domains. Credit: Peter Zijlstra Signed-off-by: Gautham R Shenoy Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index ef4e25604bb..dc0be113f41 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6197,6 +6197,28 @@ void __init migration_init(void) #ifdef CONFIG_SCHED_DEBUG +static inline const char *sd_level_to_string(enum sched_domain_level lvl) +{ + switch (lvl) { + case SD_LV_NONE: + return "NONE"; + case SD_LV_SIBLING: + return "SIBLING"; + case SD_LV_MC: + return "MC"; + case SD_LV_CPU: + return "CPU"; + case SD_LV_NODE: + return "NODE"; + case SD_LV_ALLNODES: + return "ALLNODES"; + case SD_LV_MAX: + return "MAX"; + + } + return "MAX"; +} + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6216,7 +6238,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s\n", str); + printk(KERN_CONT "span %s level %s\n", + str, sd_level_to_string(sd->level)); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " -- cgit From 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:05 -0400 Subject: sched: fix cpupri hotplug support The RT folks over at RedHat found an issue w.r.t. hotplug support which was traced to problems with the cpupri infrastructure in the scheduler: https://bugzilla.redhat.com/show_bug.cgi?id=449676 This bug affects 23-rt12+, 24-rtX, 25-rtX, and sched-devel. This patch applies to 25.4-rt4, though it should trivially apply to most cpupri enabled kernels mentioned above. It turned out that the issue was that offline cpus could get inadvertently registered with cpupri so that they were erroneously selected during migration decisions. The end result would be an OOPS as the offline cpu had tasks routed to it. This patch generalizes the old join/leave domain interface into an online/offline interface, and adjusts the root-domain/hotplug code to utilize it. I was able to easily reproduce the issue prior to this patch, and am no longer able to reproduce it after this patch. I can offline cpus indefinately and everything seems to be in working order. Thanks to Arnaldo (acme), Thomas, and Peter for doing the legwork to point me in the right direction. Also thank you to Peter for reviewing the early iterations of this patch. Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++------------- kernel/sched_rt.c | 24 +++++++++++++++++------ 3 files changed, 60 insertions(+), 22 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ea2857b9959..d25acf600a3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -903,8 +903,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, const cpumask_t *newmask); - void (*join_domain)(struct rq *rq); - void (*leave_domain)(struct rq *rq); + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); void (*switched_from) (struct rq *this_rq, struct task_struct *task, int running); diff --git a/kernel/sched.c b/kernel/sched.c index dc0be113f41..f0ed81b7128 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -529,6 +529,7 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; struct task_struct *migration_thread; struct list_head migration_queue; @@ -1498,6 +1499,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) static inline void inc_load(struct rq *rq, const struct task_struct *p) { @@ -6065,6 +6068,36 @@ static void unregister_sched_domain_sysctl(void) } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -6102,7 +6135,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); + + set_rq_online(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6163,7 +6197,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_clear(cpu, rq->rd->online); + set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6385,20 +6419,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; - const struct sched_class *class; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) { - if (class->leave_domain) - class->leave_domain(rq); - } + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); cpu_clear(rq->cpu, old_rd->span); - cpu_clear(rq->cpu, old_rd->online); if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); @@ -6409,12 +6439,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpu_set(rq->cpu, rd->span); if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); - - for (class = sched_class_highest; class; class = class->next) { - if (class->join_domain) - class->join_domain(rq); - } + set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -7824,6 +7849,7 @@ void __init sched_init(void) rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 44b06d75416..e4821593d4d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq) static inline void rt_set_overload(struct rq *rq) { + if (!rq->online) + return; + cpu_set(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set @@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq) static inline void rt_clear_overload(struct rq *rq) { + if (!rq->online) + return; + /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); cpu_clear(rq->cpu, rq->rd->rto_mask); @@ -394,7 +400,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_se_prio(rt_se)); } #endif #ifdef CONFIG_SMP @@ -448,7 +457,10 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_rq->highest_prio != highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_rq->highest_prio); } update_rt_migration(rq_of_rt_rq(rt_rq)); @@ -1154,7 +1166,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, } /* Assumes rq->lock is held */ -static void join_domain_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -1163,7 +1175,7 @@ static void join_domain_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static void leave_domain_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); @@ -1331,8 +1343,8 @@ static const struct sched_class rt_sched_class = { .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, - .join_domain = join_domain_rt, - .leave_domain = leave_domain_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, -- cgit From 709d4b0c60f990bccf3e10ba7c6da407ad65c97f Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:10 -0400 Subject: sched: fix cpupri priocount A rounding error was pointed out by Peter Zijlstra which would result in the structure holding priorities to be off by one. Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Signed-off-by: Thomas Gleixner --- kernel/sched_cpupri.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 0b6a3d110fa..6b38355e267 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -4,7 +4,7 @@ #include #define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO -#define CPUPRI_NR_PRI_WORDS CPUPRI_NR_PRIORITIES/BITS_PER_LONG +#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 -- cgit From e539d8fcd11af811db70707d47ea436d5621d0da Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Jun 2008 10:28:00 +0200 Subject: sched: fix the cpuprio count really Peter pointed out that the last version of the "fix" was still one off under certain circumstances. Use BITS_TO_LONG instead to get an accurate result. Signed-off-by: Thomas Gleixner --- kernel/sched_cpupri.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 6b38355e267..f25811b0f93 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -3,8 +3,8 @@ #include -#define CPUPRI_NR_PRIORITIES 2+MAX_RT_PRIO -#define CPUPRI_NR_PRI_WORDS (CPUPRI_NR_PRIORITIES + BITS_PER_LONG/2)/BITS_PER_LONG +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 -- cgit From 1100ac91b6af02d8639d518fad5b434b1bf44ed6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Jun 2008 12:25:37 +0200 Subject: sched: fix cpuprio build bug this patch was not built on !SMP: kernel/sched_rt.c: In function 'inc_rt_tasks': kernel/sched_rt.c:404: error: 'struct rq' has no member named 'online' Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e4821593d4d..eaa606071d5 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -399,16 +399,19 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - rt_rq->highest_prio = rt_se_prio(rt_se); + rt_rq->highest_prio = rt_se_prio(rt_se); +#ifdef CONFIG_SMP if (rq->online) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); +#endif } #endif #ifdef CONFIG_SMP if (rt_se->nr_cpus_allowed > 1) { struct rq *rq = rq_of_rt_rq(rt_rq); + rq->rt.rt_nr_migratory++; } -- cgit From 5c8e1ed1d204a6770ca2854cd3b3597070fe7e5a Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:01 -0700 Subject: sched: CPU hotplug events must not destroy scheduler domains created by the cpusets First issue is not related to the cpusets. We're simply leaking doms_cur. It's allocated in arch_init_sched_domains() which is called for every hotplug event. So we just keep reallocation doms_cur without freeing it. I introduced free_sched_domains() function that cleans things up. Second issue is that sched domains created by the cpusets are completely destroyed by the CPU hotplug events. For all CPU hotplug events scheduler attaches all CPUs to the NULL domain and then puts them all into the single domain thereby destroying domains created by the cpusets (partition_sched_domains). The solution is simple, when cpusets are enabled scheduler should not create default domain and instead let cpusets do that. Which is exactly what the patch does. Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/cpuset.c | 6 ++++++ kernel/sched.c | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 86ea9e34e32..6090d18b58a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1886,6 +1886,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings. + */ + rebuild_sched_domains(); + cgroup_unlock(); } diff --git a/kernel/sched.c b/kernel/sched.c index f0ed81b7128..1ddb0a8c797 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7292,6 +7292,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void) { } +/* + * Free current domain masks. + * Called after all cpus are attached to NULL domain. + */ +static void free_sched_domains(void) +{ + ndoms_cur = 0; + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = &fallback_doms; +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -7439,6 +7451,7 @@ int arch_reinit_sched_domains(void) get_online_cpus(); mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); + free_sched_domains(); err = arch_init_sched_domains(&cpu_online_map); mutex_unlock(&sched_domains_mutex); put_online_cpus(); @@ -7524,6 +7537,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); + free_sched_domains(); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -7542,8 +7556,16 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_DONE; } +#ifndef CONFIG_CPUSETS + /* + * Create default domain partitioning if cpusets are disabled. + * Otherwise we let cpusets rebuild the domains based on the + * current setup. + */ + /* The hotplug lock is already held by cpu_up/cpu_down */ arch_init_sched_domains(&cpu_online_map); +#endif return NOTIFY_OK; } -- cgit From 68f4f1ec08e3d95730a2693b99df8260aa0d06ae Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:17:02 -0700 Subject: sched: Move cpu masks from kernel/sched.c into kernel/cpu.c kernel/cpu.c seems a more logical place for those maps since they do not really have much to do with the scheduler these days. kernel/cpu.c is now built for the UP kernel too, but it does not affect the size the kernel sections. $ size vmlinux before text data bss dec hex filename 3313797 307060 310352 3931209 3bfc49 vmlinux after text data bss dec hex filename 3313797 307060 310352 3931209 3bfc49 vmlinux Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: menage@google.com Cc: rostedt@goodmis.org Cc: mingo@elte.hu Acked-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/Makefile | 4 ++-- kernel/cpu.c | 24 ++++++++++++++++++++++++ kernel/sched.c | 18 ------------------ 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index ecdd2d33563..6c55301112e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,7 +3,7 @@ # obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ - exit.o itimer.o time.o softirq.o resource.o \ + cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ @@ -27,7 +27,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff --git a/kernel/cpu.c b/kernel/cpu.c index c77bc3a1c72..b11f06dc149 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -15,6 +15,28 @@ #include #include +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. + */ +cpumask_t cpu_present_map __read_mostly; +EXPORT_SYMBOL(cpu_present_map); + +#ifndef CONFIG_SMP + +/* + * Represents all cpu's that are currently online. + */ +cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + +cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); + +#else /* CONFIG_SMP */ + /* Serializes the updates to cpu_online_map, cpu_present_map */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -403,3 +425,5 @@ out: cpu_maps_update_done(); } #endif /* CONFIG_PM_SLEEP_SMP */ + +#endif /* CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 1ddb0a8c797..f36f549e574 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5080,24 +5080,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, return sched_setaffinity(pid, &new_mask); } -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ - -cpumask_t cpu_present_map __read_mostly; -EXPORT_SYMBOL(cpu_present_map); - -#ifndef CONFIG_SMP -cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_online_map); - -cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; -EXPORT_SYMBOL(cpu_possible_map); -#endif - long sched_getaffinity(pid_t pid, cpumask_t *mask) { struct task_struct *p; -- cgit From e958b3600484533ff801920290468adc8135f89d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Jun 2008 23:22:32 +0200 Subject: sched: move weighted_cpuload into #ifdef CONFIG_SMP section weighted_cpuload is only used on SMP. move it into the CONFIG_SMP section. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f36f549e574..727bdef7616 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1637,12 +1637,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { set_task_rq(p, cpu); @@ -1671,6 +1665,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + /* * Is this task likely cache-hot: */ -- cgit From 7def2be1dc679984f4c4fb3ef19a8a081b2454ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2008 14:49:58 +0200 Subject: sched: fix hotplug cpus on ia64 Cliff Wickman wrote: > I built an ia64 kernel from Andrew's tree (2.6.26-rc2-mm1) > and get a very predictable hotplug cpu problem. > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > > The script that disables the cpu always hangs (unkillable) > on the 3rd attempt. > > And a bit further: > The kstopmachine thread always sits on the run queue (real time) for about > 30 minutes before running. this fix solves some (but not all) issues between CPU hotplug and RT bandwidth throttling. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 ++++++-- kernel/sched_rt.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 115 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 727bdef7616..e9c24a12865 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7513,21 +7513,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (int)(long)hcpu; + switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); free_sched_domains(); return NOTIFY_OK; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index eaa606071d5..8ae3416e0bb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -286,6 +286,9 @@ static int balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + if (iter->rt_runtime == RUNTIME_INF) + goto next; + diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { do_div(diff, weight); @@ -299,12 +302,105 @@ static int balance_runtime(struct rt_rq *rt_rq) break; } } +next: spin_unlock(&iter->rt_runtime_lock); } spin_unlock(&rt_b->rt_runtime_lock); return more; } + +static void __disable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + s64 want; + int i; + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_runtime == RUNTIME_INF || + rt_rq->rt_runtime == rt_b->rt_runtime) + goto balanced; + spin_unlock(&rt_rq->rt_runtime_lock); + + want = rt_b->rt_runtime - rt_rq->rt_runtime; + + for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + if (want > 0) { + diff = min_t(s64, iter->rt_runtime, want); + iter->rt_runtime -= diff; + want -= diff; + } else { + iter->rt_runtime -= want; + want -= want; + } + spin_unlock(&iter->rt_runtime_lock); + + if (!want) + break; + } + + spin_lock(&rt_rq->rt_runtime_lock); + BUG_ON(want); +balanced: + rt_rq->rt_runtime = RUNTIME_INF; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __disable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void __enable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_b->rt_runtime; + rt_rq->rt_time = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __enable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -334,14 +430,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) #ifdef CONFIG_SMP if (rt_rq->rt_time > runtime) { - int more; - spin_unlock(&rt_rq->rt_runtime_lock); - more = balance_runtime(rt_rq); + balance_runtime(rt_rq); spin_lock(&rt_rq->rt_runtime_lock); - if (more) - runtime = sched_rt_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; } #endif @@ -1174,6 +1269,8 @@ static void rq_online_rt(struct rq *rq) if (rq->rt.overloaded) rt_set_overload(rq); + __enable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } @@ -1183,6 +1280,8 @@ static void rq_offline_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); + __disable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); } -- cgit From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Jun 2008 12:57:11 -0700 Subject: sched: prevent bound kthreads from changing cpus_allowed Kthreads that have called kthread_bind() are bound to specific cpus, so other tasks should not be able to change their cpus_allowed from under them. Otherwise, it is possible to move kthreads, such as the migration or software watchdog threads, so they are not allowed access to the cpu they work on. Cc: Peter Zijlstra Cc: Paul Menage Cc: Paul Jackson Signed-off-by: David Rientjes Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/cpuset.c | 14 +++++++++++++- kernel/kthread.c | 1 + kernel/sched.c | 6 ++++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d25acf600a3..2db1485f865 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1486,6 +1486,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6090d18b58a..b84354f4de3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; + if (tsk->flags & PF_THREAD_BOUND) { + cpumask_t mask; + + mutex_lock(&callback_mutex); + mask = cs->cpus_allowed; + mutex_unlock(&callback_mutex); + if (!cpus_equal(tsk->cpus_allowed, mask)) + return -EINVAL; + } return security_task_setscheduler(tsk, 0, NULL); } @@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + int err; mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); + if (err) + return; from = oldcs->mems_allowed; to = cs->mems_allowed; diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e..97747cdd37c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; + k->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/sched.c b/kernel/sched.c index e9c24a12865..164fe7fe0d8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5563,6 +5563,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) goto out; } + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpus_equal(p->cpus_allowed, *new_mask))) { + ret = -EINVAL; + goto out; + } + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { -- cgit From 6492c7f83e88a3a9521793b6934d882b97afe287 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 8 Jun 2008 09:27:13 +0200 Subject: sched: trivial sched_features cleanup Remove unused debug/tuning features. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 1c7283cb958..62b39ca92eb 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -6,5 +6,3 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(NORMALIZED_SLEEPER, 1) -SCHED_FEAT(DEADLINE, 1) -- cgit From e9886ca3a93d7d041d3de8e5acebe213da777d59 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 9 Jun 2008 17:12:24 +0900 Subject: sched: kill off dead cfs_rq_set_shares() Building with CONFIG_FAIR_GROUP_SCHED=y on UP results in an unused cfs_rq_set_shares() reference. As nothing is using this dummy function in the first place, just kill it off. Signed-off-by: Paul Mundt Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 164fe7fe0d8..07d5472dee9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,16 +1480,8 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -#else /* CONFIG_SMP */ - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -} #endif -#endif /* CONFIG_SMP */ - #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" -- cgit From f7d62364b2cef85cbcd4feffdd3632ef7c3b61c2 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 10 Jun 2008 20:29:19 -0700 Subject: sched: fix typo in Documentation/scheduler/sched-rt-group.txt Fix minor typos. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-rt-group.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 14f901f639e..3ef339f491e 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -51,9 +51,9 @@ needs only about 3% CPU time to do so, it can do with a 0.03 * 0.005s = 0.00015s. So this group can be scheduled with a period of 0.005s and a run time of 0.00015s. -The remaining CPU time will be used for user input and other tass. Because +The remaining CPU time will be used for user input and other tasks. Because realtime tasks have explicitly allocated the CPU time they need to perform -their tasks, buffer underruns in the graphocs or audio can be eliminated. +their tasks, buffer underruns in the graphics or audio can be eliminated. NOTE: the above example is not fully implemented as of yet (2.6.25). We still lack an EDF scheduler to make non-uniform periods usable. -- cgit From 20b6331bfed1f07ba1e5006889a5d64adc53615e Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 11 Jun 2008 00:58:30 +0200 Subject: sched: rework of "prioritize non-migratable tasks over migratable ones" regarding this commit: 45c01e824991b2dd0a332e19efc4901acb31209f I think we can do it simpler. Please take a look at the patch below. Instead of having 2 separate arrays (which is + ~800 bytes on x86_32 and twice so on x86_64), let's add "exclusive" (the ones that are bound to this CPU) tasks to the head of the queue and "shared" ones -- to the end. In case of a few newly woken up "exclusive" tasks, they are 'stacked' (not queued as now), meaning that a task {i+1} is being placed in front of the previously woken up task {i}. But I don't think that this behavior may cause any realistic problems. There are a couple of changes on top of this one. (1) in check_preempt_curr_rt() I don't think there is a need for the "pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt" check. enqueue_task_rt(p) and check_preempt_curr_rt() are always called one after another with rq->lock being held so the following check "p->rt.nr_cpus_allowed == 1 && rq->curr->rt.nr_cpus_allowed != 1" should be enough (well, just its left part) to guarantee that 'p' has been queued in front of the 'curr'. (2) in set_cpus_allowed_rt() I don't thinks there is a need for requeue_task_rt() here. Perhaps, the only case when 'requeue' (+ reschedule) might be useful is as follows: i) weight == 1 && cpu_isset(task_cpu(p), *new_mask) i.e. a task is being bound to this CPU); ii) 'p' != rq->curr but here, 'p' has already been on this CPU for a while and was not migrated. i.e. it's possible that 'rq->curr' would not have high chances to be migrated right at this particular moment (although, has chance in a bit longer term), should we allow it to be preempted. Anyway, I think we should not perhaps make it more complex trying to address some rare corner cases. For instance, that's why a single queue approach would be preferable. Unless I'm missing something obvious, this approach gives us similar functionality at lower cost. Verified only compilation-wise. (Almost)-Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++---- kernel/sched_rt.c | 44 +++++++++----------------------------------- 2 files changed, 11 insertions(+), 39 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 554de400980..cc1d558406f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -153,8 +153,7 @@ static inline int task_has_rt_policy(struct task_struct *p) */ struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head xqueue[MAX_RT_PRIO]; /* exclusive queue */ - struct list_head squeue[MAX_RT_PRIO]; /* shared queue */ + struct list_head queue[MAX_RT_PRIO]; }; struct rt_bandwidth { @@ -7620,8 +7619,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) array = &rt_rq->active; for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->xqueue + i); - INIT_LIST_HEAD(array->squeue + i); + INIT_LIST_HEAD(array->queue + i); __clear_bit(i, array->bitmap); } /* delimiter for bitsearch: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8ae3416e0bb..f721b52acd8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -576,16 +576,15 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); + struct list_head *queue = array->queue + rt_se_prio(rt_se); if (group_rq && rt_rq_throttled(group_rq)) return; if (rt_se->nr_cpus_allowed == 1) - list_add_tail(&rt_se->run_list, - array->xqueue + rt_se_prio(rt_se)); + list_add(&rt_se->run_list, queue); else - list_add_tail(&rt_se->run_list, - array->squeue + rt_se_prio(rt_se)); + list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); @@ -598,8 +597,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - if (list_empty(array->squeue + rt_se_prio(rt_se)) - && list_empty(array->xqueue + rt_se_prio(rt_se))) + if (list_empty(array->queue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); @@ -666,11 +664,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) /* * Put task to the end of the run list without the overhead of dequeue * followed by enqueue. - * - * Note: We always enqueue the task to the shared-queue, regardless of its - * previous position w.r.t. exclusive vs shared. This is so that exclusive RR - * tasks fairly round-robin with all tasks on the runqueue, not just other - * exclusive tasks. */ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) @@ -678,7 +671,7 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; list_del_init(&rt_se->run_list); - list_add_tail(&rt_se->run_list, array->squeue + rt_se_prio(rt_se)); + list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -736,9 +729,6 @@ static int select_task_rq_rt(struct task_struct *p, int sync) } #endif /* CONFIG_SMP */ -static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, - struct rt_rq *rt_rq); - /* * Preempt the current task with a newly woken task if needed: */ @@ -764,8 +754,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) */ if((p->prio == rq->curr->prio) && p->rt.nr_cpus_allowed == 1 - && rq->curr->rt.nr_cpus_allowed != 1 - && pick_next_rt_entity(rq, &rq->rt) != &rq->curr->rt) { + && rq->curr->rt.nr_cpus_allowed != 1) { cpumask_t mask; if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) @@ -789,15 +778,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); - queue = array->xqueue + idx; - if (!list_empty(queue)) - next = list_entry(queue->next, struct sched_rt_entity, - run_list); - else { - queue = array->squeue + idx; - next = list_entry(queue->next, struct sched_rt_entity, - run_list); - } + queue = array->queue + idx; + next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; } @@ -867,7 +849,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) continue; if (next && next->prio < idx) continue; - list_for_each_entry(rt_se, array->squeue + idx, run_list) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; @@ -1249,14 +1231,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, } update_rt_migration(rq); - - if (unlikely(weight == 1 || p->rt.nr_cpus_allowed == 1)) - /* - * If either the new or old weight is a "1", we need - * to requeue to properly move between shared and - * exclusive queues. - */ - requeue_task_rt(rq, p); } p->cpus_allowed = *new_mask; -- cgit From ada18de2eb76961a4d4847f63291744c9e7beec4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:24 +0200 Subject: sched: debug: add some rt debug output Signed-off-by: Peter Zijlstra Cc: "Daniel K." Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/sched_rt.c | 14 ++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8bb713040ac..8e077b9c91c 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#else +#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; @@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cgroup_path(cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#else + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); #endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", @@ -169,6 +169,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->nr_spread_over); } +void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) +{ +#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) + char path[128] = ""; + struct cgroup *cgroup = NULL; + struct task_group *tg = rt_rq->tg; + + if (tg) + cgroup = tg->css.cgroup; + + if (cgroup) + cgroup_path(cgroup, path, sizeof(path)); + + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); +#else + SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif + + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + P(rt_nr_running); + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); + +#undef PN +#undef P +} + static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = &per_cpu(runqueues, cpu); @@ -208,6 +241,7 @@ static void print_cpu(struct seq_file *m, int cpu) #undef PN print_cfs_stats(m, cpu); + print_rt_stats(m, cpu); print_rq(m, rq, cpu); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index fee5fa7c72d..2e0ccdcf046 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1444,3 +1444,17 @@ static const struct sched_class rt_sched_class = { .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, }; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); + +static void print_rt_stats(struct seq_file *m, int cpu) +{ + struct rt_rq *rt_rq; + + rcu_read_lock(); + for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) + print_rt_rq(m, cpu, rt_rq); + rcu_read_unlock(); +} +#endif -- cgit From b79f3833d81d54fc71d98c8064dc45f33a755a8a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:25 +0200 Subject: sched: rt: fix SMP bandwidth balancing for throttled groups Now we exceed the runtime and get throttled - the period rollover tick will subtract the cpu quota from the runtime and check if we're below quota. However with this cpu having a very small portion of the runtime it will not refresh as fast as it should. Therefore, also rebalance the runtime when we're throttled. Signed-off-by: Peter Zijlstra Cc: "Daniel K." Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2e0ccdcf046..87b2e3bf947 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -228,6 +228,28 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif +#ifdef CONFIG_SMP +static int do_balance_runtime(struct rt_rq *rt_rq); + +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} +#endif + static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { int i, idle = 1; @@ -247,6 +269,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) u64 runtime; spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) + balance_runtime(rt_rq); runtime = rt_rq->rt_runtime; rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { @@ -267,7 +291,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } #ifdef CONFIG_SMP -static int balance_runtime(struct rt_rq *rt_rq) +static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -428,17 +452,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) return 0; -#ifdef CONFIG_SMP - if (rt_rq->rt_time > runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); - balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); - - runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - } -#endif + balance_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; if (rt_rq->rt_time > runtime) { rt_rq->rt_throttled = 1; -- cgit From eff6549b957d15d1ad168d90b8c1eb643b9c163f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:26 +0200 Subject: sched: rt: move some code around Signed-off-by: Peter Zijlstra Cc: "Daniel K." Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 119 ++++++++++++++++++++++++++---------------------------- 1 file changed, 57 insertions(+), 62 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 87b2e3bf947..61d52112289 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -228,68 +228,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif -#ifdef CONFIG_SMP -static int do_balance_runtime(struct rt_rq *rt_rq); - -static int balance_runtime(struct rt_rq *rt_rq) -{ - int more = 0; - - if (rt_rq->rt_time > rt_rq->rt_runtime) { - spin_unlock(&rt_rq->rt_runtime_lock); - more = do_balance_runtime(rt_rq); - spin_lock(&rt_rq->rt_runtime_lock); - } - - return more; -} -#else -static inline int balance_runtime(struct rt_rq *rt_rq) -{ - return 0; -} -#endif - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) -{ - int i, idle = 1; - cpumask_t span; - - if (rt_b->rt_runtime == RUNTIME_INF) - return 1; - - span = sched_rt_period_mask(); - for_each_cpu_mask(i, span) { - int enqueue = 0; - struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); - struct rq *rq = rq_of_rt_rq(rt_rq); - - spin_lock(&rq->lock); - if (rt_rq->rt_time) { - u64 runtime; - - spin_lock(&rt_rq->rt_runtime_lock); - if (rt_rq->rt_throttled) - balance_runtime(rt_rq); - runtime = rt_rq->rt_runtime; - rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - enqueue = 1; - } - if (rt_rq->rt_time || rt_rq->rt_nr_running) - idle = 0; - spin_unlock(&rt_rq->rt_runtime_lock); - } - - if (enqueue) - sched_rt_rq_enqueue(rt_rq); - spin_unlock(&rq->lock); - } - - return idle; -} - #ifdef CONFIG_SMP static int do_balance_runtime(struct rt_rq *rt_rq) { @@ -425,8 +363,65 @@ static void enable_runtime(struct rq *rq) spin_unlock_irqrestore(&rq->lock, flags); } +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} #endif +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime; + + spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) + balance_runtime(rt_rq); + runtime = rt_rq->rt_runtime; + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED -- cgit From 10b612f440a22a294e87ec7e8f03f9eea3338628 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:27 +0200 Subject: sched: rt: fix the bandwidth contraint computations Signed-off-by: Peter Zijlstra Cc: "Daniel K." Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5b307da827e..1f711a58a2b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8419,7 +8419,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_CGROUP_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_group *tgi, *parent = tg ? tg->parent : NULL; + struct task_group *tgi, *parent = tg->parent; unsigned long total = 0; if (!parent) { @@ -8443,7 +8443,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) } rcu_read_unlock(); - return total + to_ratio(period, runtime) < + return total + to_ratio(period, runtime) <= to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), parent->rt_bandwidth.rt_runtime); } @@ -8560,10 +8560,15 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { + struct task_group *tg = &root_task_group; + u64 rt_runtime, rt_period; int ret = 0; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = tg->rt_bandwidth.rt_runtime; + mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(NULL, 1, 0)) + if (!__rt_schedulable(tg, rt_period, rt_runtime)) ret = -EINVAL; mutex_unlock(&rt_constraints_mutex); -- cgit From 6c3df25511c2c51f2dd36cc52a8d22363d731793 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Jun 2008 14:22:28 +0200 Subject: sched: rt: dont stop the period timer when there are tasks wanting to run So if the group ever gets throttled, it will never wake up again. Signed-off-by: Peter Zijlstra Cc: "Daniel K." Cc: Peter Zijlstra Reported-by: "Daniel K." --- kernel/sched_rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 61d52112289..bd90c8bb073 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -412,7 +412,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; spin_unlock(&rt_rq->rt_runtime_lock); - } + } else if (rt_rq->rt_nr_running) + idle = 0; if (enqueue) sched_rt_rq_enqueue(rt_rq); -- cgit From bf647b62fdb948e757a7b4d18d4f16e3c763b1d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:10 +0200 Subject: sched: clean up some unused variables In file included from /mnt/build/linux-2.6/kernel/sched.c:1496: /mnt/build/linux-2.6/kernel/sched_rt.c: In function '__enable_runtime': /mnt/build/linux-2.6/kernel/sched_rt.c:339: warning: unused variable 'rd' /mnt/build/linux-2.6/kernel/sched_rt.c: In function 'requeue_rt_entity': /mnt/build/linux-2.6/kernel/sched_rt.c:692: warning: unused variable 'queue' Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bd90c8bb073..6b4a6b5a416 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -336,7 +336,6 @@ static void disable_runtime(struct rq *rq) static void __enable_runtime(struct rq *rq) { - struct root_domain *rd = rq->rd; struct rt_rq *rt_rq; if (unlikely(!scheduler_running)) @@ -689,7 +688,6 @@ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; - struct list_head *queue = array->queue + rt_se_prio(rt_se); if (on_rt_rq(rt_se)) { list_del_init(&rt_se->run_list); -- cgit From a7be37ac8e1565e00880531f4e2aff421a21c803 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:11 +0200 Subject: sched: revert the revert of: weight calculations Try again.. initial commit: 8f1bc385cfbab474db6c27b5af1e439614f3025c revert: f9305d4a0968201b2818dbed0dc8cb0d4ee7aeb3 Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++--- kernel/sched_fair.c | 105 +++++++++++++++++++++++++++++++++--------------- kernel/sched_features.h | 1 + 3 files changed, 76 insertions(+), 39 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index c51d9fae8cd..f653af684fb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1342,6 +1342,9 @@ static void __resched_task(struct task_struct *p, int tif_bit) */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +/* + * delta *= weight / lw + */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) @@ -1369,12 +1372,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } -static inline unsigned long -calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) -{ - return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); -} - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1fe4c65a817..496500988ce 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, } #endif +/* + * delta *= w / rw + */ +static inline unsigned long +calc_delta_weight(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + se->load.weight, &cfs_rq_of(se)->load); + } + + return delta; +} + +/* + * delta *= rw / w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, &se->load); + } + + return delta; +} + /* * The idea is to set a period in which each task runs once. * @@ -362,47 +390,54 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - } - - - return slice; + return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s/w = p/rw + * vs = s*rw/w = p */ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long nr_running = cfs_rq->nr_running; - unsigned long weight; - u64 vslice; if (!se->on_rq) nr_running++; - vslice = __sched_period(nr_running); + return __sched_period(nr_running); +} + +/* + * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in + * that it favours >=0 over <0. + * + * -20 | + * | + * 0 --------+------- + * .' + * 19 .' + * + */ +static unsigned long +calc_delta_asym(unsigned long delta, struct sched_entity *se) +{ + struct load_weight lw = { + .weight = NICE_0_LOAD, + .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) + }; for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); + struct load_weight *se_lw = &se->load; - weight = cfs_rq->load.weight; - if (!se->on_rq) - weight += se->load.weight; + if (se->load.weight < NICE_0_LOAD) + se_lw = &lw; - vslice *= NICE_0_LOAD; - do_div(vslice, weight); + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, se_lw); } - return vslice; + return delta; } /* @@ -419,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = delta_exec; - if (unlikely(curr->load.weight != NICE_0_LOAD)) { - delta_exec_weighted = calc_delta_fair(delta_exec_weighted, - &curr->load); - } + delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; } @@ -609,8 +640,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; + + /* + * convert the sleeper threshold into virtual time + */ + if (sched_feat(NORMALIZED_SLEEPER)) + thresh = calc_delta_fair(thresh, se); + + vruntime -= thresh; + } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); @@ -1111,11 +1151,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) unsigned long gran = sysctl_sched_wakeup_granularity; /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. + * More easily preempt - nice tasks, while not making it harder for + * + nice tasks. */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); return gran; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 62b39ca92eb..afa549166d8 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,4 +1,5 @@ SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +SCHED_FEAT(NORMALIZED_SLEEPER, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) -- cgit From c9c294a630e28eec5f2865f028ecfc58d45c0a5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:12 +0200 Subject: sched: fix calc_delta_asym() calc_delta_asym() is supposed to do the same as calc_delta_fair() except linearly shrink the result for negative nice processes - this causes them to have a smaller preemption threshold so that they are more easily preempted. The problem is that for task groups se->load.weight is the per cpu share of the actual task group weight; take that into account. Also provide a debug switch to disable the asymmetry (which I still don't like - but it does greatly benefit some workloads) This would explain the interactivity issues reported against group scheduling. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++++++++++++++++- kernel/sched_features.h | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 496500988ce..2268e634812 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -430,6 +430,29 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *se_lw = &se->load; +#ifdef CONFIG_FAIR_SCHED_GROUP + struct cfs_rq *cfs_rq = se->my_q; + struct task_group *tg = NULL + + if (cfs_rq) + tg = cfs_rq->tg; + + if (tg && tg->shares < NICE_0_LOAD) { + /* + * scale shares to what it would have been had + * tg->weight been NICE_0_LOAD: + * + * weight = 1024 * shares / tg->weight + */ + lw.weight *= se->load.weight; + lw.weight /= tg->shares; + + lw.inv_weight = 0; + + se_lw = &lw; + } else +#endif + if (se->load.weight < NICE_0_LOAD) se_lw = &lw; @@ -1154,7 +1177,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) * More easily preempt - nice tasks, while not making it harder for * + nice tasks. */ - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + if (sched_feat(ASYM_GRAN)) + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); + else + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); return gran; } diff --git a/kernel/sched_features.h b/kernel/sched_features.h index afa549166d8..04123af2e67 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -7,3 +7,4 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(ASYM_GRAN, 1) -- cgit From ced8aa16e1db55c33c507174c1b1f9e107445865 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:13 +0200 Subject: sched: fix calc_delta_asym, #2 Ok, so why are we in this mess, it was: 1/w but now we mixed that rw in the mix like: rw/w rw being \Sum w suggests: fiddling w, we should also fiddle rw, humm? Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2268e634812..2e197b8e43f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -429,6 +429,7 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *se_lw = &se->load; + unsigned long rw = cfs_rq_of(se)->load.weight; #ifdef CONFIG_FAIR_SCHED_GROUP struct cfs_rq *cfs_rq = se->my_q; @@ -450,14 +451,16 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se) lw.inv_weight = 0; se_lw = &lw; + rw += lw.weight - se->load.weight; } else #endif - if (se->load.weight < NICE_0_LOAD) + if (se->load.weight < NICE_0_LOAD) { se_lw = &lw; + rw += NICE_0_LOAD - se->load.weight; + } - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, se_lw); + delta = calc_delta_mine(delta, rw, se_lw); } return delta; -- cgit From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:14 +0200 Subject: sched: revert revert of: fair-group: SMP-nice for group scheduling Try again.. Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 430 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched_debug.c | 5 + kernel/sched_fair.c | 124 +++++++++------ kernel/sched_rt.c | 4 + 5 files changed, 489 insertions(+), 75 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072db..97a58b622ee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,6 +765,7 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ + int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index f653af684fb..874b6da1543 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -403,6 +403,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. + * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. + */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. + */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + shares = tg->shares; + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. + */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. + */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. + */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. + */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. + */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + #endif #include "sched_stats.h" @@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; + int unlock_aggregate; cpus_setall(*cpus); + unlock_aggregate = get_aggregate(sd); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3383,8 +3731,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3399,8 +3748,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; + sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7757,6 +8113,7 @@ void __init sched_init(void) } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif @@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); - on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); +} - spin_unlock_irq(&rq->lock); +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); + } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8e077b9c91c..04394ccac88 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); +#endif +#endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e197b8e43f..183388c4dea 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; list_add(&se->group_node, &cfs_rq->tasks); @@ -580,6 +597,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; list_del_init(&se->group_node); @@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); + struct rq_iterator cfs_rq_iterator; - p = task_of(curr); + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; + + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) + continue; + + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; + + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + imbalance = (busiest_weight - this_weight) / 2; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + move_group_shares(tg, sd, busiest_cpu, this_cpu); - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6b4a6b5a416..765932d0399 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) rt_se->timeout = 0; enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + + dec_cpu_load(rq, p->se.load.weight); } /* -- cgit From 76a2a6ee8a0660a29127f05989ac59ae1ce865fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:15 +0200 Subject: sched: sched_clock_cpu() based cpu_clock() with sched_clock_cpu() being reasonably in sync between cpus (max 1 jiffy difference) use this to provide cpu_clock(). Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 76 ---------------------------------------------------- kernel/sched_clock.c | 12 +++++++++ 2 files changed, 12 insertions(+), 76 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 874b6da1543..eb3454c410f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -818,82 +818,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -unsigned long long time_sync_thresh = 100000; - -static DEFINE_PER_CPU(unsigned long long, time_offset); -static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); - -/* - * Global lock which we take every now and then to synchronize - * the CPUs time. This method is not warp-safe, but it's good - * enough to synchronize slowly diverging time sources and thus - * it's good enough for tracing: - */ -static DEFINE_SPINLOCK(time_sync_lock); -static unsigned long long prev_global_time; - -static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) -{ - /* - * We want this inlined, to not get tracer function calls - * in this critical section: - */ - spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); - __raw_spin_lock(&time_sync_lock.raw_lock); - - if (time < prev_global_time) { - per_cpu(time_offset, cpu) += prev_global_time - time; - time = prev_global_time; - } else { - prev_global_time = time; - } - - __raw_spin_unlock(&time_sync_lock.raw_lock); - spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); - - return time; -} - -static unsigned long long __cpu_clock(int cpu) -{ - unsigned long long now; - - /* - * Only call sched_clock() if the scheduler has already been - * initialized (some code might call cpu_clock() very early): - */ - if (unlikely(!scheduler_running)) - return 0; - - now = sched_clock_cpu(cpu); - - return now; -} - -/* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): - */ -unsigned long long cpu_clock(int cpu) -{ - unsigned long long prev_cpu_time, time, delta_time; - unsigned long flags; - - local_irq_save(flags); - prev_cpu_time = per_cpu(prev_cpu_time, cpu); - time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); - delta_time = time-prev_cpu_time; - - if (unlikely(delta_time > time_sync_thresh)) { - time = __sync_cpu_clock(time, cpu); - per_cpu(prev_cpu_time, cpu) = time; - } - local_irq_restore(flags); - - return time; -} -EXPORT_SYMBOL_GPL(cpu_clock); - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219a..3c696db5945 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -244,3 +244,15 @@ unsigned long long __attribute__((weak)) sched_clock(void) { return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); } + +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + raw_local_irq_save(flags); + clock = sched_clock_cpu(cpu); + raw_local_irq_restore(flags); + + return clock; +} -- cgit From 103638d95ba5b0c53c8d9c0cb581156ccc8513ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:16 +0200 Subject: sched: fix wakeup granularity and buddy granularity Uncouple buddy selection from wakeup granularity. The initial idea was that buddies could run ahead as far as a normal task can - do this by measuring a pair 'slice' just as we do for a normal task. This means we can drop the wakeup_granularity back to 5ms. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_fair.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index eb3454c410f..7d282c52bd4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -375,6 +375,7 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; + u64 pair_start; struct rb_root tasks_timeline; struct rb_node *rb_leftmost; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 183388c4dea..509092af033 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -813,17 +813,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!cfs_rq->next) - return se; + struct rq *rq = rq_of(cfs_rq); + u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (wakeup_preempt_entity(cfs_rq->next, se) != 0) + if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + cfs_rq->pair_start = rq->clock; return se; + } return cfs_rq->next; } -- cgit From 32df2ee86a580f70f2dbb90cf81f413aa655f838 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:17 +0200 Subject: sched: add full schedstats to /proc/sched_debug show all the schedstats in /debug/sched_debug as well. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 04394ccac88..bbe6b31c3c5 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -162,8 +162,23 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, " .%-30s: %d\n", "bkl_count", - rq->bkl_count); +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); + + P(yld_exp_empty); + P(yld_act_empty); + P(yld_both_empty); + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); + + P(ttwu_count); + P(ttwu_local); + + P(bkl_count); + +#undef P #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); -- cgit From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:18 +0200 Subject: sched: fix sched_domain aggregation Keeping the aggregate on the first cpu of the sched domain has two problems: - it could collide between different sched domains on different cpus - it could slow things down because of the remote accesses Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 113 ++++++++++++++++++++++++-------------------------- kernel/sched_fair.c | 12 +++--- 3 files changed, 60 insertions(+), 66 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 97a58b622ee..eaf821072db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,7 +765,6 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ - int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index 7d282c52bd4..160d3c209b8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); */ static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) +aggregate(struct task_group *tg, int cpu) { - return &tg->cfs_rq[sd->first_cpu]->aggregate; + return &tg->cfs_rq[cpu]->aggregate; } -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); +typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when @@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); */ static void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) + int cpu, struct sched_domain *sd) { struct task_group *parent, *child; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, sd); + (*down)(parent, cpu, sd); list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1508,7 +1508,7 @@ down: up: continue; } - (*up)(parent, sd); + (*up)(parent, cpu, sd); child = parent; parent = parent->parent; @@ -1520,8 +1520,8 @@ up: /* * Calculate the aggregate runqueue weight. */ -static -void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; unsigned long task_weight = 0; @@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) task_weight += tg->cfs_rq[i]->task_weight; } - aggregate(tg, sd)->rq_weight = rq_weight; - aggregate(tg, sd)->task_weight = task_weight; + aggregate(tg, cpu)->rq_weight = rq_weight; + aggregate(tg, cpu)->task_weight = task_weight; } /* * Compute the weight of this group on the given cpus. */ -static -void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long shares = 0; int i; @@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) for_each_cpu_mask(i, sd->span) shares += tg->cfs_rq[i]->shares; - if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; - aggregate(tg, sd)->shares = shares; + aggregate(tg, cpu)->shares = shares; } /* * Compute the load fraction assigned to this group, relies on the aggregate * weight and this group's parent's load, i.e. top-down. */ -static -void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long load; @@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) load += cpu_rq(i)->load.weight; } else { - load = aggregate(tg->parent, sd)->load; + load = aggregate(tg->parent, cpu)->load; /* * shares is our weight in the parent's rq so * shares/parent->rq_weight gives our fraction of the load */ - load *= aggregate(tg, sd)->shares; - load /= aggregate(tg->parent, sd)->rq_weight + 1; + load *= aggregate(tg, cpu)->shares; + load /= aggregate(tg->parent, cpu)->rq_weight + 1; } - aggregate(tg, sd)->load = load; + aggregate(tg, cpu)->load = load; } static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, - int tcpu) +__update_group_shares_cpu(struct task_group *tg, int cpu, + struct sched_domain *sd, int tcpu) { int boost = 0; unsigned long shares; @@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * \Sum rq_weight * */ - shares = aggregate(tg, sd)->shares * rq_weight; - shares /= aggregate(tg, sd)->rq_weight + 1; + shares = aggregate(tg, cpu)->shares * rq_weight; + shares /= aggregate(tg, cpu)->rq_weight + 1; /* * record the actual number of shares, not the boosted amount. @@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * task went to. */ static void -__move_group_shares(struct task_group *tg, struct sched_domain *sd, +__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { unsigned long shares; shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, sd, scpu); - __update_group_shares_cpu(tg, sd, dcpu); + __update_group_shares_cpu(tg, cpu, sd, scpu); + __update_group_shares_cpu(tg, cpu, sd, dcpu); /* * ensure we never loose shares due to rounding errors in the @@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd, * we need to walk up the tree and change all shares until we hit the root. */ static void -move_group_shares(struct task_group *tg, struct sched_domain *sd, +move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { while (tg) { - __move_group_shares(tg, sd, scpu, dcpu); + __move_group_shares(tg, cpu, sd, scpu, dcpu); tg = tg->parent; } } -static -void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, sd)->shares; + unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, sd, i); + __update_group_shares_cpu(tg, cpu, sd, i); spin_unlock_irqrestore(&rq->lock, flags); } - aggregate_group_shares(tg, sd); + aggregate_group_shares(tg, cpu, sd); /* * ensure we never loose shares due to rounding errors in the * above redistribution. */ - shares -= aggregate(tg, sd)->shares; + shares -= aggregate(tg, cpu)->shares; if (shares) { - tg->cfs_rq[sd->first_cpu]->shares += shares; - aggregate(tg, sd)->shares += shares; + tg->cfs_rq[cpu]->shares += shares; + aggregate(tg, cpu)->shares += shares; } } @@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) * Calculate the accumulative weight and recursive load of each task group * while walking down the tree. */ -static -void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, sd); - aggregate_group_shares(tg, sd); - aggregate_group_load(tg, sd); + aggregate_group_weight(tg, cpu, sd); + aggregate_group_shares(tg, cpu, sd); + aggregate_group_load(tg, cpu, sd); } /* * Rebalance the cpu shares while walking back up the tree. */ -static -void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_set_shares(tg, sd); + aggregate_group_set_shares(tg, cpu, sd); } static DEFINE_PER_CPU(spinlock_t, aggregate_lock); @@ -1731,18 +1731,18 @@ static void __init init_aggregate(void) spin_lock_init(&per_cpu(aggregate_lock, i)); } -static int get_aggregate(struct sched_domain *sd) +static int get_aggregate(int cpu, struct sched_domain *sd) { - if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) return 0; - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); return 1; } -static void put_aggregate(struct sched_domain *sd) +static void put_aggregate(int cpu, struct sched_domain *sd) { - spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); + spin_unlock(&per_cpu(aggregate_lock, cpu)); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) @@ -1756,12 +1756,12 @@ static inline void init_aggregate(void) { } -static inline int get_aggregate(struct sched_domain *sd) +static inline int get_aggregate(int cpu, struct sched_domain *sd) { return 0; } -static inline void put_aggregate(struct sched_domain *sd) +static inline void put_aggregate(int cpu, struct sched_domain *sd) { } #endif @@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpus_setall(*cpus); - unlock_aggregate = get_aggregate(sd); + unlock_aggregate = get_aggregate(this_cpu, sd); /* * When power savings policy is enabled for the parent domain, idle @@ -3678,7 +3678,7 @@ out_one_pinned: ld_moved = 0; out: if (unlock_aggregate) - put_aggregate(sd); + put_aggregate(this_cpu, sd); return ld_moved; } @@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; - sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 509092af033..40cf24ab4de 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, sd)->task_weight) + if (!aggregate(tg, this_cpu)->task_weight) continue; - rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; + rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; + rem_load /= aggregate(tg, this_cpu)->load + 1; this_weight = tg->cfs_rq[this_cpu]->task_weight; busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; @@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); + move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + moved_load *= aggregate(tg, this_cpu)->load; + moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit From 4d8d595dfa69e1c807bf928f364668a7f30da5dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:19 +0200 Subject: sched: update aggregate when holding the RQs It was observed that in __update_group_shares_cpu() rq_weight > aggregate()->rq_weight This is caused by forks/wakeups in between the initial aggregate pass and locking of the RQs for load balance. To avoid this situation partially re-do the aggregation once we have the RQs locked (which avoids new tasks from appearing). Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 160d3c209b8..dae20199dc9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1721,6 +1721,11 @@ aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) aggregate_group_set_shares(tg, cpu, sd); } +static void +aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd) +{ +} + static DEFINE_PER_CPU(spinlock_t, aggregate_lock); static void __init init_aggregate(void) @@ -1740,6 +1745,11 @@ static int get_aggregate(int cpu, struct sched_domain *sd) return 1; } +static void update_aggregate(int cpu, struct sched_domain *sd) +{ + aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd); +} + static void put_aggregate(int cpu, struct sched_domain *sd) { spin_unlock(&per_cpu(aggregate_lock, cpu)); @@ -1761,6 +1771,10 @@ static inline int get_aggregate(int cpu, struct sched_domain *sd) return 0; } +static inline void update_aggregate(int cpu, struct sched_domain *sd) +{ +} + static inline void put_aggregate(int cpu, struct sched_domain *sd) { } @@ -2192,6 +2206,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + /* + * now that we have both rqs locked the rq weight won't change + * anymore - so update the stats. + */ + update_aggregate(this_cpu, sd); + do { unsigned long load, avg_load; int local_group; -- cgit From 53fecd8ae1900fb571086f54f664051004665b55 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 27 Jun 2008 13:41:20 +0200 Subject: sched: kill task_group balancing The idea was to balance groups until we've reached the global goal, however Vatsa rightly pointed out that we might never reach that goal this way - hence take out this logic. [ the initial rationale for this 'feature' was to promote max concurrency within a group - it does not however affect fairness ] Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 40cf24ab4de..b10c0d61a2a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1422,9 +1422,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); list_for_each_entry(tg, &task_groups, list) { - long imbalance; - unsigned long this_weight, busiest_weight; - long rem_load, max_load, moved_load; + long rem_load, moved_load; /* * empty group @@ -1435,17 +1433,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; rem_load /= aggregate(tg, this_cpu)->load + 1; - this_weight = tg->cfs_rq[this_cpu]->task_weight; - busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - - imbalance = (busiest_weight - this_weight) / 2; - - if (imbalance < 0) - imbalance = busiest_weight; - - max_load = max(rem_load, imbalance); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, - max_load, sd, idle, all_pinned, this_best_prio, + rem_load, sd, idle, all_pinned, this_best_prio, tg->cfs_rq[busiest_cpu]); if (!moved_load) -- cgit From d3f40dbab954d83383b6a516582d5c09cc216dcc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:21 +0200 Subject: sched: dont micro manage share losses We used to try and contain the loss of 'shares' by playing arithmetic games. Replace that by noticing that at the top sched_domain we'll always have the full weight in shares to distribute. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index dae20199dc9..28229c5d498 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1551,6 +1551,9 @@ aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; + aggregate(tg, cpu)->shares = shares; } @@ -1642,20 +1645,8 @@ static void __move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { - unsigned long shares; - - shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, cpu, sd, scpu); __update_group_shares_cpu(tg, cpu, sd, dcpu); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - if (shares) - tg->cfs_rq[dcpu]->shares += shares; } /* @@ -1675,7 +1666,6 @@ move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, static void aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1688,16 +1678,6 @@ aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain * } aggregate_group_shares(tg, cpu, sd); - - /* - * ensure we never loose shares due to rounding errors in the - * above redistribution. - */ - shares -= aggregate(tg, cpu)->shares; - if (shares) { - tg->cfs_rq[cpu]->shares += shares; - aggregate(tg, cpu)->shares += shares; - } } /* -- cgit From a25b5aca8740ea99d5e18dfc71235a52b685dcf7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:22 +0200 Subject: sched: no need to aggregate task_weight We only need to know the task_weight of the busiest rq - nothing to do if there are no tasks there. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 +--------------- kernel/sched_fair.c | 2 +- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 28229c5d498..716cfc8e099 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -433,12 +433,6 @@ struct cfs_rq { * The sum of all runqueue weights within this span. */ unsigned long rq_weight; - - /* - * Weight contributed by tasks; this is the part we can - * influence by moving tasks around. - */ - unsigned long task_weight; } aggregate; #endif #endif @@ -1473,10 +1467,6 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); * rq_weight: * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while * B would get 2. - * - * task_weight: - * Part of the rq_weight contributed by tasks; all groups except B would - * get 1, B gets 2. */ static inline struct aggregate_struct * @@ -1524,16 +1514,12 @@ static void aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; - unsigned long task_weight = 0; int i; - for_each_cpu_mask(i, sd->span) { + for_each_cpu_mask(i, sd->span) rq_weight += tg->cfs_rq[i]->load.weight; - task_weight += tg->cfs_rq[i]->task_weight; - } aggregate(tg, cpu)->rq_weight = rq_weight; - aggregate(tg, cpu)->task_weight = task_weight; } /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b10c0d61a2a..03b9fbd9d64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1427,7 +1427,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, this_cpu)->task_weight) + if (!tg->cfs_rq[busiest_cpu]->task_weight) continue; rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; -- cgit From c8cba857b4997d5b00451d01474638f6a153f713 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:23 +0200 Subject: sched: simplify the group load balancer While thinking about the previous patch - I realized that using per domain aggregate load values in load_balance_fair() is wrong. We should use the load value for that CPU. By not needing per domain hierarchical load values we don't need to store per domain aggregate shares, which greatly simplifies all the math. It basically falls apart in two separate computations: - per domain update of the shares - per CPU update of the hierarchical load Also get rid of the move_group_shares() stuff - just re-compute the shares again after a successful load balance. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 286 ++++++++++++---------------------------------------- kernel/sched_fair.c | 15 +-- 2 files changed, 72 insertions(+), 229 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 716cfc8e099..f864b751fd1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -406,34 +406,23 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP - unsigned long task_weight; - unsigned long shares; /* - * We need space to build a sched_domain wide view of the full task - * group tree, in order to avoid depending on dynamic memory allocation - * during the load balancing we place this in the per cpu task group - * hierarchy. This limits the load balancing to one instance per cpu, - * but more should not be needed anyway. + * the part of load.weight contributed by tasks */ - struct aggregate_struct { - /* - * load = weight(cpus) * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long load; + unsigned long task_weight; - /* - * part of the group weight distributed to this span. - */ - unsigned long shares; + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; - /* - * The sum of all runqueue weights within this span. - */ - unsigned long rq_weight; - } aggregate; + /* + * this cpu's part of tg->shares + */ + unsigned long shares; #endif #endif }; @@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Group load balancing. - * - * We calculate a few balance domain wide aggregate numbers; load and weight. - * Given the pictures below, and assuming each item has equal weight: - * - * root 1 - thread - * / | \ A - group - * A 1 B - * /|\ / \ - * C 2 D 3 4 - * | | - * 5 6 - * - * load: - * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, - * which equals 1/9-th of the total load. - * - * shares: - * The weight of this group on the selected cpus. - * - * rq_weight: - * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while - * B would get 2. - */ - -static inline struct aggregate_struct * -aggregate(struct task_group *tg, int cpu) -{ - return &tg->cfs_rq[cpu]->aggregate; -} - -typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); +typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static -void aggregate_walk_tree(aggregate_func down, aggregate_func up, - int cpu, struct sched_domain *sd) +static void +walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) { struct task_group *parent, *child; @@ -1507,72 +1463,6 @@ up: rcu_read_unlock(); } -/* - * Calculate the aggregate runqueue weight. - */ -static void -aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long rq_weight = 0; - int i; - - for_each_cpu_mask(i, sd->span) - rq_weight += tg->cfs_rq[i]->load.weight; - - aggregate(tg, cpu)->rq_weight = rq_weight; -} - -/* - * Compute the weight of this group on the given cpus. - */ -static void -aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long shares = 0; - int i; - - for_each_cpu_mask(i, sd->span) - shares += tg->cfs_rq[i]->shares; - - if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - aggregate(tg, cpu)->shares = shares; -} - -/* - * Compute the load fraction assigned to this group, relies on the aggregate - * weight and this group's parent's load, i.e. top-down. - */ -static void -aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - unsigned long load; - - if (!tg->parent) { - int i; - - load = 0; - for_each_cpu_mask(i, sd->span) - load += cpu_rq(i)->load.weight; - - } else { - load = aggregate(tg->parent, cpu)->load; - - /* - * shares is our weight in the parent's rq so - * shares/parent->rq_weight gives our fraction of the load - */ - load *= aggregate(tg, cpu)->shares; - load /= aggregate(tg->parent, cpu)->rq_weight + 1; - } - - aggregate(tg, cpu)->load = load; -} - static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); */ static void __update_group_shares_cpu(struct task_group *tg, int cpu, - struct sched_domain *sd, int tcpu) + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; unsigned long rq_weight; - if (!tg->se[tcpu]) + if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[tcpu]->load.weight; + rq_weight = tg->cfs_rq[cpu]->load.weight; /* * If there are currently no tasks on the cpu pretend there is one of @@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, rq_weight = NICE_0_LOAD; } + if (unlikely(rq_weight > sd_rq_weight)) + rq_weight = sd_rq_weight; + /* * \Sum shares * rq_weight * shares = ----------------------- * \Sum rq_weight * */ - shares = aggregate(tg, cpu)->shares * rq_weight; - shares /= aggregate(tg, cpu)->rq_weight + 1; + shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); /* * record the actual number of shares, not the boosted amount. */ - tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; if (shares < MIN_SHARES) shares = MIN_SHARES; else if (shares > MAX_SHARES) shares = MAX_SHARES; - __set_se_shares(tg->se[tcpu], shares); + __set_se_shares(tg->se[cpu], shares); } /* - * Re-adjust the weights on the cpu the task came from and on the cpu the - * task went to. + * Re-compute the task group their per cpu shares over the given domain. + * This needs to be done in a bottom-up fashion because the rq weight of a + * parent group depends on the shares of its child groups. */ static void -__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, - int scpu, int dcpu) +tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) { - __update_group_shares_cpu(tg, cpu, sd, scpu); - __update_group_shares_cpu(tg, cpu, sd, dcpu); -} + unsigned long rq_weight = 0; + unsigned long shares = 0; + int i; -/* - * Because changing a group's shares changes the weight of the super-group - * we need to walk up the tree and change all shares until we hit the root. - */ -static void -move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, - int scpu, int dcpu) -{ - while (tg) { - __move_group_shares(tg, cpu, sd, scpu, dcpu); - tg = tg->parent; + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + shares += tg->cfs_rq[i]->shares; } -} -static void -aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - int i; + if ((!shares && rq_weight) || shares > tg->shares) + shares = tg->shares; + + if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) + shares = tg->shares; for_each_cpu_mask(i, sd->span) { struct rq *rq = cpu_rq(i); unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, cpu, sd, i); + __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } - - aggregate_group_shares(tg, cpu, sd); } /* - * Calculate the accumulative weight and recursive load of each task group - * while walking down the tree. + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. */ static void -aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) +tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, cpu, sd); - aggregate_group_shares(tg, cpu, sd); - aggregate_group_load(tg, cpu, sd); -} - -/* - * Rebalance the cpu shares while walking back up the tree. - */ -static void -aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) -{ - aggregate_group_set_shares(tg, cpu, sd); -} - -static void -aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ -} - -static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + unsigned long load; -static void __init init_aggregate(void) -{ - int i; + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->cfs_rq[cpu]->shares; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } - for_each_possible_cpu(i) - spin_lock_init(&per_cpu(aggregate_lock, i)); + tg->cfs_rq[cpu]->h_load = load; } -static int get_aggregate(int cpu, struct sched_domain *sd) +static void +tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) { - if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) - return 0; - - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); - return 1; } -static void update_aggregate(int cpu, struct sched_domain *sd) +static void update_shares(struct sched_domain *sd) { - aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd); + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); } -static void put_aggregate(int cpu, struct sched_domain *sd) +static void update_h_load(int cpu) { - spin_unlock(&per_cpu(aggregate_lock, cpu)); + walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) @@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #else -static inline void init_aggregate(void) -{ -} - -static inline int get_aggregate(int cpu, struct sched_domain *sd) -{ - return 0; -} - -static inline void update_aggregate(int cpu, struct sched_domain *sd) +static inline void update_shares(struct sched_domain *sd) { } -static inline void put_aggregate(int cpu, struct sched_domain *sd) -{ -} #endif #endif @@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; - /* - * now that we have both rqs locked the rq weight won't change - * anymore - so update the stats. - */ - update_aggregate(this_cpu, sd); - do { unsigned long load, avg_load; int local_group; @@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; - int unlock_aggregate; cpus_setall(*cpus); - unlock_aggregate = get_aggregate(this_cpu, sd); - /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: + update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3663,8 +3506,8 @@ out_one_pinned: else ld_moved = 0; out: - if (unlock_aggregate) - put_aggregate(this_cpu, sd); + if (ld_moved) + update_shares(sd); return ld_moved; } @@ -8019,7 +7862,6 @@ void __init sched_init(void) } #ifdef CONFIG_SMP - init_aggregate(); init_defrootdomain(); #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 03b9fbd9d64..7b8d664d6f2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct task_group *tg; rcu_read_lock(); + update_h_load(busiest_cpu); + list_for_each_entry(tg, &task_groups, list) { + struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; long rem_load, moved_load; /* * empty group */ - if (!tg->cfs_rq[busiest_cpu]->task_weight) + if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; - rem_load /= aggregate(tg, this_cpu)->load + 1; + rem_load = rem_load_move * busiest_cfs_rq->load.weight; + rem_load /= busiest_cfs_rq->h_load + 1; moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - - moved_load *= aggregate(tg, this_cpu)->load; - moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; + moved_load *= busiest_cfs_rq->h_load; + moved_load /= busiest_cfs_rq->load.weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit From 3e5459b4bea3ca2618cc02d56d12639f2cba531d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:24 +0200 Subject: sched: fix newidle smp group balancing Re-compute the shares on newidle - so we can make a decision based on recent data. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index f864b751fd1..cdd09462fc9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1579,6 +1579,13 @@ static void update_shares(struct sched_domain *sd) walk_tg_tree(tg_nop, tg_shares_up, 0, sd); } +static void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ + spin_unlock(&rq->lock); + update_shares(sd); + spin_lock(&rq->lock); +} + static void update_h_load(int cpu) { walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); @@ -1595,6 +1602,10 @@ static inline void update_shares(struct sched_domain *sd) { } +static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) +{ +} + #endif #endif @@ -3543,6 +3554,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: + update_shares_locked(this_rq, sd); group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, cpus, NULL); if (!group) { @@ -3586,6 +3598,7 @@ redo: } else sd->nr_balance_failed = 0; + update_shares_locked(this_rq, sd); return ld_moved; out_balanced: -- cgit From 039a1c41b3a489e34593ea1e1687f6fdad6b13ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:25 +0200 Subject: sched: fix sched_balance_self() smp group balancing Finding the least idle cpu is more accurate when done with updated shares. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index cdd09462fc9..39d5495540d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2128,6 +2128,9 @@ static int sched_balance_self(int cpu, int flag) sd = tmp; } + if (sd) + update_shares(sd); + while (sd) { cpumask_t span, tmpmask; struct sched_group *group; -- cgit From a8a51d5e59561aa5b4d66e19eca819b537783e8f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:26 +0200 Subject: sched: persistent average load per task Remove the fall-back to SCHED_LOAD_SCALE by remembering the previous value of cpu_avg_load_per_task() - this is useful because of the hierarchical group model in which task weight can be much smaller. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 39d5495540d..6a6b0139eb3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -554,6 +554,8 @@ struct rq { int cpu; int online; + unsigned long avg_load_per_task; + struct task_struct *migration_thread; struct list_head migration_queue; #endif @@ -1427,9 +1429,18 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + #ifdef CONFIG_FAIR_GROUP_SCHED typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); @@ -2010,18 +2021,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -/* - * Return the average load per task on the cpu's run queue - */ -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - unsigned long n = rq->nr_running; - - return n ? total / n : SCHED_LOAD_SCALE; -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. -- cgit From bb3469ac9b50f14ad6eba129ca0ad4fd033097a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:27 +0200 Subject: sched: hierarchical load vs affine wakeups With hierarchical grouping we can't just compare task weight to rq weight - we need to scale the weight appropriately. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7b8d664d6f2..865cb53a7cc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1073,6 +1073,25 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; +#ifdef CONFIG_FAIR_GROUP_SCHED +static unsigned long task_h_load(struct task_struct *p) +{ + unsigned long h_load = p->se.load.weight; + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + + update_h_load(task_cpu(p)); + + h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load); + + return h_load; +} +#else +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.load.weight; +} +#endif + static int wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, @@ -1093,9 +1112,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * of the current CPU: */ if (sync) - tl -= current->se.load.weight; + tl -= task_h_load(current); - balanced = 100*(tl + p->se.load.weight) <= imbalance*load; + balanced = 100*(tl + task_h_load(p)) <= imbalance*load; /* * If the currently running task will sleep within -- cgit From 408ed066b11cf9ee4536573b4269ee3613bd735e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:28 +0200 Subject: sched: hierarchical load vs find_busiest_group find_busiest_group() has some assumptions about task weight being in the NICE_0_LOAD range. Hierarchical task groups break this assumption - fix this by replacing it with the average task weight, which will adapt the situation. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 6a6b0139eb3..5e2aa394a81 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3050,6 +3050,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; + if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; else if (idle == CPU_NEWLY_IDLE) @@ -3064,6 +3065,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; local_group = cpu_isset(this_cpu, group->cpumask); @@ -3072,6 +3075,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + sum_avg_load_per_task = avg_load_per_task = 0; + max_cpu_load = 0; min_cpu_load = ~0UL; @@ -3105,6 +3110,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load += load; sum_nr_running += rq->nr_running; sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); } /* @@ -3126,7 +3133,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); - if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) __group_imb = 1; group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; @@ -3267,9 +3287,9 @@ small_imbalance: if (busiest_load_per_task > this_load_per_task) imbn = 1; } else - this_load_per_task = SCHED_LOAD_SCALE; + this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= + if (max_load - this_load + 2*busiest_load_per_task >= busiest_load_per_task * imbn) { *imbalance = busiest_load_per_task; return busiest; -- cgit From 42a3ac7d5cee89849448b41b86faeb86f98e92f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:29 +0200 Subject: sched: fix load scaling in group balancing doing the load balance will change cfs_rq->load.weight (that's the whole point) but since that's part of the scale factor, we'll scale back with a different amount. Weight getting smaller would result in an inflated moved_load which causes it to stop balancing too soon. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 865cb53a7cc..734e4c556fc 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1444,6 +1444,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, list_for_each_entry(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; long rem_load, moved_load; /* @@ -1452,8 +1454,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * busiest_cfs_rq->load.weight; - rem_load /= busiest_cfs_rq->h_load + 1; + rem_load = rem_load_move * busiest_weight; + rem_load /= busiest_h_load + 1; moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1462,8 +1464,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - moved_load *= busiest_cfs_rq->h_load; - moved_load /= busiest_cfs_rq->load.weight + 1; + moved_load *= busiest_h_load; + moved_load /= busiest_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit From 4be9daaa1b33701f011f4117f22dc1e45a3e6e34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:30 +0200 Subject: sched: fix task_h_load() Currently task_h_load() computes the load of a task and uses that to either subtract it from the total, or add to it. However, removing or adding a task need not have any effect on the total load at all. Imagine adding a task to a group that is local to one cpu - in that case the total load of that cpu is unaffected. So properly compute addition/removal: s_i = S * rw_i / \Sum_j rw_j s'_i = S * (rw_i + wl) / (\Sum_j rw_j + wg) then s'_i - s_i gives the change in load. Where s_i is the shares for cpu i, S the group weight, rw_i the runqueue weight for that cpu, wl the weight we add (subtract) and wg the weight contribution to the runqueue. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 49 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 734e4c556fc..a1694441f8b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,22 +1074,53 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long task_h_load(struct task_struct *p) +static unsigned long effective_load(struct task_group *tg, long wl, int cpu) { - unsigned long h_load = p->se.load.weight; - struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + struct sched_entity *se = tg->se[cpu]; + long wg = wl; - update_h_load(task_cpu(p)); + for_each_sched_entity(se) { +#define D(n) (likely(n) ? (n) : 1) + + long S, Srw, rw, s, sn; + + S = se->my_q->tg->shares; + s = se->my_q->shares; + rw = se->my_q->load.weight; - h_load = calc_delta_mine(h_load, cfs_rq->h_load, &cfs_rq->load); + Srw = S * rw / D(s); + sn = S * (rw + wl) / D(Srw + wg); + + wl = sn - s; + wg = 0; +#undef D + } - return h_load; + return wl; } + +static unsigned long task_load_sub(struct task_struct *p) +{ + return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p)); +} + +static unsigned long task_load_add(struct task_struct *p, int cpu) +{ + return effective_load(task_group(p), p->se.load.weight, cpu); +} + #else -static unsigned long task_h_load(struct task_struct *p) + +static unsigned long task_load_sub(struct task_struct *p) +{ + return -p->se.load.weight; +} + +static unsigned long task_load_add(struct task_struct *p, int cpu) { return p->se.load.weight; } + #endif static int @@ -1112,9 +1143,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * of the current CPU: */ if (sync) - tl -= task_h_load(current); + tl += task_load_sub(current); - balanced = 100*(tl + task_h_load(p)) <= imbalance*load; + balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load; /* * If the currently running task will sleep within -- cgit From 051c67640e771fd6ad1b676fc0c16c379b3c6f80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:31 +0200 Subject: sched: remove prio preference from balance decisions Priority looses much of its meaning in a hierarchical context. So don't use it in balance decisions. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5e2aa394a81..10d43f5bf0f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2896,7 +2896,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct rq_iterator *iterator) { - int loops = 0, pulled = 0, pinned = 0, skip_for_load; + int loops = 0, pulled = 0, pinned = 0; struct task_struct *p; long rem_load_move = max_load_move; @@ -2912,14 +2912,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, next: if (!p || loops++ > sysctl_sched_nr_migrate) goto out; - /* - * To help distribute high priority tasks across CPUs we don't - * skip a task if it will be the highest priority task (i.e. smallest - * prio value) on its new queue regardless of its load weight - */ - skip_for_load = (p->se.load.weight >> 1) > rem_load_move + - SCHED_LOAD_SCALE_FUZZ; - if ((skip_for_load && p->prio >= *this_best_prio) || + + if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { p = iterator->next(iterator->arg); goto next; -- cgit From cb5ef42a03a13f95a9ea94e6cda4f7a47497871f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:32 +0200 Subject: sched: optimize effective_load() s_i = S * rw_i / \Sum_j rw_j -> \Sum_j rw_j = S * rw_i / s_i -> s'_i = S * (rw_i + w) / (\Sum_j rw_j + w) delta s = s' - s = S * (rw + w) / ((S * rw / s) + w) = s * (S * (rw + w) / (S * rw + s * w) - 1) a = S*(rw+w), b = S*rw + s*w delta s = s * (a-b) / b IOW, trade one divide for two multiplies Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a1694441f8b..0d197be3e3e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1082,16 +1082,16 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) - long S, Srw, rw, s, sn; + long S, rw, s, a, b; S = se->my_q->tg->shares; s = se->my_q->shares; rw = se->my_q->load.weight; - Srw = S * rw / D(s); - sn = S * (rw + wl) / D(Srw + wg); + a = S*(rw + wl); + b = S*rw + s*wg; - wl = sn - s; + wl = s*(a-b)/D(b); wg = 0; #undef D } -- cgit From 93b75217df39e6d75889cc6f8050343286aff4a5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:33 +0200 Subject: sched: disable source/target_load bias The bias given by source/target_load functions can be very large, disable it by default to get faster convergence. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- kernel/sched_features.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 10d43f5bf0f..6c5eb3bc37e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2000,7 +2000,7 @@ static unsigned long source_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return min(rq->cpu_load[type-1], total); @@ -2015,7 +2015,7 @@ static unsigned long target_load(int cpu, int type) struct rq *rq = cpu_rq(cpu); unsigned long total = weighted_cpuload(cpu); - if (type == 0) + if (type == 0 || !sched_feat(LB_BIAS)) return total; return max(rq->cpu_load[type-1], total); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 04123af2e67..d56e3053e74 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,3 +8,4 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) +SCHED_FEAT(LB_BIAS, 0) \ No newline at end of file -- cgit From cd80917e4ff465ea77106f8e4fb631eedc4cf426 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:34 +0200 Subject: sched: fix shares boost logic In case the domain is empty, pretend there is a single task on each cpu, so that together with the boost logic we end up giving 1/n shares to each cpu. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 6c5eb3bc37e..1cff969f664 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1549,6 +1549,9 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; + if (!rq_weight) + rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; + for_each_cpu_mask(i, sd->span) { struct rq *rq = cpu_rq(i); unsigned long flags; -- cgit From 2398f2c6d34b43025f274fc42eaca34d23ec2320 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:35 +0200 Subject: sched: update shares on wakeup We found that the affine wakeup code needs rather accurate load figures to be effective. The trouble is that updating the load figures is fairly expensive with group scheduling. Therefore ratelimit the updating. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 30 +++++++++++++++++++++++++++++- kernel/sched_features.h | 3 ++- kernel/sysctl.c | 8 ++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072db..835b6c6fcc5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -783,6 +783,8 @@ struct sched_domain { unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ + u64 last_update; + #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -1605,6 +1607,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_shares_ratelimit; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 1cff969f664..62db0891025 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -777,6 +777,12 @@ late_initcall(sched_init_debug); */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * ratelimit for updating the group shares. + * default: 0.5ms + */ +const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; + /* * period over which we measure -rt task cpu usage in us. * default: 1s @@ -1590,7 +1596,13 @@ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) static void update_shares(struct sched_domain *sd) { - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + u64 now = cpu_clock(raw_smp_processor_id()); + s64 elapsed = now - sd->last_update; + + if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { + sd->last_update = now; + walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + } } static void update_shares_locked(struct rq *rq, struct sched_domain *sd) @@ -2199,6 +2211,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; +#ifdef CONFIG_SMP + if (sched_feat(LB_WAKEUP_UPDATE)) { + struct sched_domain *sd; + + this_cpu = raw_smp_processor_id(); + cpu = task_cpu(p); + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + update_shares(sd); + break; + } + } + } +#endif + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d56e3053e74..7d616d2a2a3 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,4 +8,5 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) \ No newline at end of file +SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_WAKEUP_UPDATE, 1) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca..fe8cdc80ff0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_ratelimit", + .data = &sysctl_sched_shares_ratelimit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", -- cgit From 243e0e7b7d3b54749ece2e879ecd7e2a11874443 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Fri, 27 Jun 2008 13:41:36 +0200 Subject: sched: fix mult overflow It was observed these mults can overflow. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0d197be3e3e..26ebe180cde 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1477,7 +1477,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; - long rem_load, moved_load; + u64 rem_load, moved_load; /* * empty group @@ -1485,8 +1485,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * busiest_weight; - rem_load /= busiest_h_load + 1; + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1496,7 +1496,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, continue; moved_load *= busiest_h_load; - moved_load /= busiest_weight + 1; + moved_load = div_u64(moved_load, busiest_weight + 1); rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit From 83378269a5fad98f562ebc0f09c349575e6cbfe1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:37 +0200 Subject: sched: correct wakeup weight calculations rw_i = {2, 4, 1, 0} s_i = {2/7, 4/7, 1/7, 0} wakeup on cpu0, weight=1 rw'_i = {3, 4, 1, 0} s'_i = {3/8, 4/8, 1/8, 0} s_0 = S * rw_0 / \Sum rw_j -> \Sum rw_j = S*rw_0/s_0 = 1*2*7/2 = 7 (correct) s'_0 = S * (rw_0 + 1) / (\Sum rw_j + 1) = 1 * (2+1) / (7+1) = 3/8 (correct so we find that adding 1 to cpu0 gains 5/56 in weight if say the other cpu were, cpu1, we'd also have to calculate its 4/56 loss Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ kernel/sched_fair.c | 48 ++++++++++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 62db0891025..01d3e51b711 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -365,6 +365,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} #endif /* CONFIG_GROUP_SCHED */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 26ebe180cde..bed2f71e63d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,10 +1074,10 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long effective_load(struct task_group *tg, long wl, int cpu) +static unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { struct sched_entity *se = tg->se[cpu]; - long wg = wl; for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) @@ -1092,6 +1092,13 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) b = S*rw + s*wg; wl = s*(a-b)/D(b); + /* + * Assume the group is already running and will + * thus already be accounted for in the weight. + * + * That is, moving shares between CPUs, does not + * alter the group weight. + */ wg = 0; #undef D } @@ -1099,26 +1106,12 @@ static unsigned long effective_load(struct task_group *tg, long wl, int cpu) return wl; } -static unsigned long task_load_sub(struct task_struct *p) -{ - return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p)); -} - -static unsigned long task_load_add(struct task_struct *p, int cpu) -{ - return effective_load(task_group(p), p->se.load.weight, cpu); -} - #else -static unsigned long task_load_sub(struct task_struct *p) +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) { - return -p->se.load.weight; -} - -static unsigned long task_load_add(struct task_struct *p, int cpu) -{ - return p->se.load.weight; + return wl; } #endif @@ -1130,8 +1123,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, unsigned int imbalance) { struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) @@ -1142,10 +1137,19 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * effect of the currently running task from the load * of the current CPU: */ - if (sync) - tl += task_load_sub(current); + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; + + tl += effective_load(tg, this_cpu, -weight, -weight); + load += effective_load(tg, prev_cpu, 0, -weight); + } + + tg = task_group(p); + weight = p->se.load.weight; - balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load; + balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* * If the currently running task will sleep within -- cgit From f1d239f73200a5803a89e5929fb3abc1596b7589 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:38 +0200 Subject: sched: incremental effective_load() Increase the accuracy of the effective_load values. Not only consider the current increment (as per the attempted wakeup), but also consider the delta between when we last adjusted the shares and the current situation. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ kernel/sched_fair.c | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 01d3e51b711..7613f69f097 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -427,6 +427,11 @@ struct cfs_rq { * this cpu's part of tg->shares */ unsigned long shares; + + /* + * load.weight at the time we set shares + */ + unsigned long rq_weight; #endif #endif }; @@ -1527,6 +1532,7 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * record the actual number of shares, not the boosted amount. */ tg->cfs_rq[cpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; if (shares < MIN_SHARES) shares = MIN_SHARES; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bed2f71e63d..e87f1a52f62 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,10 +1074,22 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED -static unsigned long effective_load(struct task_group *tg, int cpu, - unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, + long wl, long wg) { struct sched_entity *se = tg->se[cpu]; + long more_w; + + if (!tg->parent) + return wl; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; for_each_sched_entity(se) { #define D(n) (likely(n) ? (n) : 1) @@ -1086,7 +1098,7 @@ static unsigned long effective_load(struct task_group *tg, int cpu, S = se->my_q->tg->shares; s = se->my_q->shares; - rw = se->my_q->load.weight; + rw = se->my_q->rq_weight; a = S*(rw + wl); b = S*rw + s*wg; -- cgit From f5bfb7d9ff73d72ee4f2f4830a6f0c9088d00f92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:39 +0200 Subject: sched: bias effective_load() error towards failing wake_affine(). Measurement shows that the difference between cgroup:/ and cgroup:/foo wake_affine() results is that the latter succeeds significantly more. Therefore bias the calculations towards failing the test. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 ++++++++++++++++++++++++++++ kernel/sched_features.h | 1 + 2 files changed, 29 insertions(+) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e87f1a52f62..9bcc0030a58 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1074,6 +1074,27 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. + * + * The problem is that perfectly aligning the shares is rather expensive, hence + * we try to avoid doing that too often - see update_shares(), which ratelimits + * this change. + * + * We compensate this by not only taking the current delta into account, but + * also considering the delta between when the shares were last adjusted and + * now. + * + * We still saw a performance dip, some tracing learned us that between + * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased + * significantly. Therefore try to bias the error in direction of failing + * the affine wakeup. + * + */ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { @@ -1083,6 +1104,13 @@ static long effective_load(struct task_group *tg, int cpu, if (!tg->parent) return wl; + /* + * By not taking the decrease of shares on the other cpu into + * account our error leans towards reducing the affine wakeups. + */ + if (!wl && sched_feat(ASYM_EFF_LOAD)) + return wl; + /* * Instead of using this increment, also add the difference * between when the shares were last updated and now. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 7d616d2a2a3..862b06bd560 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -10,3 +10,4 @@ SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 0) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(ASYM_EFF_LOAD, 1) -- cgit From 55e12e5e7b1d7e7c05a4be10cb5fd092c039aa78 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Tue, 24 Jun 2008 23:39:43 +0530 Subject: sched: make sched_{rt,fair}.c ifdefs more readable Signed-off-by: Dhaval Giani Cc: Srivatsa Vaddagiri Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- kernel/sched_rt.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9bcc0030a58..2e43d4a748c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -921,7 +921,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta, requeue); } } -#else +#else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -1062,7 +1062,7 @@ static int wake_idle(int cpu, struct task_struct *p) } return cpu; } -#else +#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ static inline int wake_idle(int cpu, struct task_struct *p) { return cpu; @@ -1586,7 +1586,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -#endif +#endif /* CONFIG_SMP */ /* * scheduler tick hitting a task of our scheduling class: diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 765932d0399..47ceac9e855 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -161,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else +#else /* !CONFIG_RT_GROUP_SCHED */ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { @@ -226,7 +226,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &def_rt_bandwidth; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP static int do_balance_runtime(struct rt_rq *rt_rq) @@ -374,12 +374,12 @@ static int balance_runtime(struct rt_rq *rt_rq) return more; } -#else +#else /* !CONFIG_SMP */ static inline int balance_runtime(struct rt_rq *rt_rq) { return 0; } -#endif +#endif /* CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -1472,4 +1472,4 @@ static void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif +#endif /* CONFIG_SCHED_DEBUG */ -- cgit From 4c9fe8ad813b257a2b9ddf0f752105a75a7dae63 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 14:49:35 +0200 Subject: sched: export cpu_clock the rcutorture module relies on cpu_clock. Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 3c696db5945..ed5a8c41504 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -256,3 +256,4 @@ unsigned long long cpu_clock(int cpu) return clock; } +EXPORT_SYMBOL_GPL(cpu_clock); -- cgit From 2d452c9b10caeec455eb5e56a0ef4ed485178213 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Jun 2008 15:01:59 +0200 Subject: sched: sched_clock_cpu() based cpu_clock(), lockdep fix Vegard Nossum reported: > WARNING: at kernel/lockdep.c:2738 check_flags+0x142/0x160() which happens due to: unsigned long long cpu_clock(int cpu) { unsigned long long clock; unsigned long flags; raw_local_irq_save(flags); as lower level functions can take locks, we must not do that, use proper lockdep-annotated irq save/restore. Reported-by: Vegard Nossum Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ed5a8c41504..60094e257a9 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -250,9 +250,9 @@ unsigned long long cpu_clock(int cpu) unsigned long long clock; unsigned long flags; - raw_local_irq_save(flags); + local_irq_save(flags); clock = sched_clock_cpu(cpu); - raw_local_irq_restore(flags); + local_irq_restore(flags); return clock; } -- cgit From 34e83e850f5e5ee2a18cd77a5d70d31972a632e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 15:42:36 +0200 Subject: sched: build fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: kernel/sched.c: In function ‘sched_group_set_shares': kernel/sched.c:8635: error: implicit declaration of function ‘cfs_rq_set_shares' Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7613f69f097..058250a63b6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1627,11 +1627,6 @@ static void update_h_load(int cpu) walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); } -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ - cfs_rq->shares = shares; -} - #else static inline void update_shares(struct sched_domain *sd) @@ -1646,6 +1641,13 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + cfs_rq->shares = shares; +#endif +} + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" -- cgit From 30432094a7f506ad24997a3ba6aed913ab61c01d Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 Jun 2008 21:35:50 +0200 Subject: sched: fix warning This patch fixes the following warning: kernel/sched.c:1667: warning: 'cfs_rq_set_shares' defined but not used This seems the correct way to fix this; cfs_rq_set_shares() is only used in a single place, which is also inside #ifdef CONFIG_FAIR_GROUP_SCHED. Signed-off-by: Vegard Nossum Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 058250a63b6..677c80b9a6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1641,12 +1641,14 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +#ifdef CONFIG_FAIR_GROUP_SCHED static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) { -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +#ifdef CONFIG_SMP cfs_rq->shares = shares; #endif } +#endif #include "sched_stats.h" #include "sched_idletask.c" -- cgit From c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 27 Jun 2008 14:29:55 -0600 Subject: sched: terminate newidle balancing once at least one task has moved over Inspired by Peter Zijlstra. Signed-off-by: Gregory Haskins Cc: npiggin@suse.de Cc: rostedt@goodmis.org Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 677c80b9a6b..d99aeabeb72 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3013,6 +3013,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, max_load_move - total_load_moved, sd, idle, all_pinned, &this_best_prio); class = class->next; + + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + } while (class && max_load_move > total_load_moved); return total_load_moved > 0; -- cgit From 2087a1ad822cd3a68b73338457047fcc54da726b Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Fri, 27 Jun 2008 14:30:00 -0600 Subject: sched: add avg-overlap support to RT tasks We have the notion of tracking process-coupling (a.k.a. buddy-wake) via the p->se.last_wake / p->se.avg_overlap facilities, but it is only used for cfs to cfs interactions. There is no reason why an rt to cfs interaction cannot share in establishing a relationhip in a similar manner. Because PREEMPT_RT runs many kernel threads as FIFO priority, we often times have heavy interaction between RT threads waking CFS applications. This patch offers a substantial boost (50-60%+) in perfomance under those circumstances. Signed-off-by: Gregory Haskins Cc: npiggin@suse.de Cc: rostedt@goodmis.org Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++++ kernel/sched_fair.c | 21 ++------------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index d99aeabeb72..bbc40c3a065 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1693,6 +1693,12 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { sched_info_queued(p); @@ -1702,6 +1708,12 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { + if (sleep && p->se.last_wakeup) { + update_avg(&p->se.avg_overlap, + p->se.sum_exec_runtime - p->se.last_wakeup); + p->se.last_wakeup = 0; + } + p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } @@ -2313,6 +2325,8 @@ out_running: p->sched_class->task_wake_up(rq, p); #endif out: + current->se.last_wakeup = current->se.sum_exec_runtime; + task_rq_unlock(rq, &flags); return success; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e43d4a748c..f2aa987027d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (!se->last_wakeup) - return; - - update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); - se->last_wakeup = 0; -} - static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { - update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -1196,9 +1180,9 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced && curr->sched_class == &fair_sched_class) { + if (sync && balanced) { if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) + p->se.avg_overlap < sysctl_sched_migration_cost) return 1; } @@ -1359,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - se->last_wakeup = se->sum_exec_runtime; if (unlikely(se == pse)) return; -- cgit From 46ac22bab42cc868b9c1d0e915ddbc8e8065a44d Mon Sep 17 00:00:00 2001 From: Ankita Garg Date: Tue, 1 Jul 2008 14:30:06 +0530 Subject: sched: fix accounting in task delay accounting & migration On Thu, Jun 19, 2008 at 12:27:14PM +0200, Peter Zijlstra wrote: > On Thu, 2008-06-05 at 10:50 +0530, Ankita Garg wrote: > > > Thanks Peter for the explanation... > > > > I agree with the above and that is the reason why I did not see weird > > values with cpu_time. But, run_delay still would suffer skews as the end > > points for delta could be taken on different cpus due to migration (more > > so on RT kernel due to the push-pull operations). With the below patch, > > I could not reproduce the issue I had seen earlier. After every dequeue, > > we take the delta and start wait measurements from zero when moved to a > > different rq. > > OK, so task delay delay accounting is broken because it doesn't take > migration into account. > > What you've done is make it symmetric wrt enqueue, and account it like > > cpu0 cpu1 > > enqueue > > dequeue > enqueue > > run > > Where you add both d1 and d2 to the run_delay,.. right? > Thanks for reviewing the patch. The above is exactly what I have done. > This seems like a good fix, however it looks like the patch will break > compilation in !CONFIG_SCHEDSTATS && !CONFIG_TASK_DELAY_ACCT, of it > failing to provide a stub for sched_info_dequeue() in that case. Fixed. Pl. find the new patch below. Signed-off-by: Ankita Garg Acked-by: Peter Zijlstra Cc: Gregory Haskins Cc: rostedt@goodmis.org Cc: suresh.b.siddha@intel.com Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dhaval@linux.vnet.ibm.com Cc: vatsa@linux.vnet.ibm.com Cc: David Bahi Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + kernel/sched_stats.h | 42 +++++++++++++++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index bbc40c3a065..996bc15196a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1714,6 +1714,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) p->se.last_wakeup = 0; } + sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 80179ef7450..8385d43987e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.cpu_time += delta; } + +static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{ + if (rq) + rq->rq_sched_info.run_delay += delta; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) # define schedstat_set(var, val) do { var = (val); } while (0) @@ -126,6 +133,9 @@ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) {} static inline void +rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) +{} +static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} # define schedstat_inc(rq, field) do { } while (0) @@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) #endif #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +static inline void sched_info_reset_dequeued(struct task_struct *t) +{ + t->sched_info.last_queued = 0; +} + /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) * active queue, thus delaying tasks in the expired queue from running; * see scheduler_tick()). * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. + * Though we are interested in knowing how long it was from the *first* time a + * task was queued to the time that it finally hit a cpu, we call this routine + * from dequeue_task() to account for possible rq->clock skew across cpus. The + * delta taken on each cpu would annul the skew. */ static inline void sched_info_dequeued(struct task_struct *t) { - t->sched_info.last_queued = 0; + unsigned long long now = task_rq(t)->clock, delta = 0; + + if (unlikely(sched_info_on())) + if (t->sched_info.last_queued) + delta = now - t->sched_info.last_queued; + sched_info_reset_dequeued(t); + t->sched_info.run_delay += delta; + + rq_sched_info_dequeued(task_rq(t), delta); } /* @@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t) if (t->sched_info.last_queued) delta = now - t->sched_info.last_queued; - sched_info_dequeued(t); + sched_info_reset_dequeued(t); t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; @@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) __sched_info_switch(prev, next); } #else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) +#define sched_info_queued(t) do { } while (0) +#define sched_info_reset_dequeued(t) do { } while (0) +#define sched_info_dequeued(t) do { } while (0) +#define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ -- cgit From 62c43dd9864dbd52ff158922d1d08c75f20335af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:50 -0400 Subject: sched_clock: record from last tick The sched_clock code tries to keep within the gtod time by one tick (jiffy). The current code mistakenly keeps track of the delta jiffies between updates of the clock, where the the delta is used to compare with the number of jiffies that have past since an update of the gtod. The gtod is updated at each schedule tick not each sched_clock update. After one jiffy passes the clock is updated fine. But the delta is taken from the last update so if the next update happens before the next tick the delta jiffies used will be incorrect. This patch changes the code to check the delta of jiffies between ticks and not updates to match the comparison of the updates with the gtod. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219a..e383bc7df6d 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -40,7 +40,7 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long prev_jiffies; + unsigned long tick_jiffies; u64 prev_raw; u64 tick_raw; u64 tick_gtod; @@ -71,7 +71,7 @@ void sched_clock_init(void) struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->prev_jiffies = now_jiffies; + scd->tick_jiffies = now_jiffies; scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; @@ -90,7 +90,7 @@ void sched_clock_init(void) static void __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->prev_jiffies; + long delta_jiffies = now_jiffies - scd->tick_jiffies; u64 clock = scd->clock; u64 min_clock, max_clock; s64 delta = now - scd->prev_raw; @@ -119,7 +119,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = min_clock; scd->prev_raw = now; - scd->prev_jiffies = now_jiffies; scd->clock = clock; } @@ -179,6 +178,7 @@ u64 sched_clock_cpu(int cpu) void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); + unsigned long now_jiffies = jiffies; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -196,6 +196,7 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ + scd->tick_jiffies = now_jiffies; scd->tick_raw = now; scd->tick_gtod = now_gtod; __raw_spin_unlock(&scd->lock); -- cgit From f7cce27f5605b9e137b829a47949cb2d3c7e1cab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:51 -0400 Subject: sched_clock: widen the max and min time With keeping the max and min sched time within one jiffy of the gtod clock was too tight. Just before a schedule tick the max could easily be hit, as well as just after a schedule_tick the min could be hit. This caused the clock to jump around by a jiffy. This patch widens the minimum to last gtod + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSECS and the maximum to last gtod + (2 + delta_jiffies) * TICK_NSECS This keeps the minum to gtod or if one jiffy less than delta jiffies and the maxim 2 jiffies ahead of gtod. This may cause unstable TSCs to be a bit more sporadic, but it helps keep a clock with a stable TSC working well. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e383bc7df6d..42b81fa38cb 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -96,14 +96,21 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) s64 delta = now - scd->prev_raw; WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + min_clock = scd->tick_gtod + + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; if (unlikely(delta < 0)) { clock++; goto out; } - max_clock = min_clock + TICK_NSEC; + /* + * The clock must stay within a jiffie of the gtod. + * But since we may be at the start of a jiffy or the end of one + * we add another jiffy buffer. + */ + max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; if (unlikely(clock + delta > max_clock)) { if (clock < max_clock) -- cgit From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:52 -0400 Subject: sched_clock: stop maximum check on NO HZ Working with ftrace I would get large jumps of 11 millisecs or more with the clock tracer. This killed the latencing timings of ftrace and also caused the irqoff self tests to fail. What was happening is with NO_HZ the idle would stop the jiffy counter and before the jiffy counter was updated the sched_clock would have a bad delta jiffies to compare with the gtod with the maximum. The jiffies would stop and the last sched_tick would record the last gtod. On wakeup, the sched clock update would compare the gtod + delta jiffies (which would be zero) and compare it to the TSC. The TSC would have correctly (with a stable TSC) moved forward several jiffies. But because the jiffies has not been updated yet the clock would be prevented from moving forward because it would appear that the TSC jumped too far ahead. The clock would then virtually stop, until the jiffies are updated. Then the next sched clock update would see that the clock was very much behind since the delta jiffies is now correct. This would then jump the clock forward by several jiffies. This caused ftrace to report several milliseconds of interrupts off latency at every resume from NO_HZ idle. This patch adds hooks into the nohz code to disable the checking of the maximum clock update when nohz is in effect. It resumes the max check when nohz has updated the jiffies again. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 ++++++++++++++++- kernel/sched_clock.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/time/tick-sched.c | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8..33a8f42041f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1573,13 +1573,28 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } -#else + +#ifdef CONFIG_NO_HZ +static inline void sched_clock_tick_stop(int cpu) +{ +} + +static inline void sched_clock_tick_start(int cpu) +{ +} +#endif + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NO_HZ +extern void sched_clock_tick_stop(int cpu); +extern void sched_clock_tick_start(int cpu); #endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 42b81fa38cb..97159e225a7 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -45,6 +45,9 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; +#ifdef CONFIG_NO_HZ + int check_max; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -76,11 +79,45 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; +#ifdef CONFIG_NO_HZ + scd->check_max = 1; +#endif } sched_clock_running = 1; } +#ifdef CONFIG_NO_HZ +/* + * The dynamic ticks makes the delta jiffies inaccurate. This + * prevents us from checking the maximum time update. + * Disable the maximum check during stopped ticks. + */ +void sched_clock_tick_stop(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 0; +} + +void sched_clock_tick_start(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 1; +} + +static int check_max(struct sched_clock_data *scd) +{ + return scd->check_max; +} +#else +static int check_max(struct sched_clock_data *scd) +{ + return 1; +} +#endif /* CONFIG_NO_HZ */ + /* * update the percpu scd from the raw @now value * @@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - if (unlikely(clock + delta > max_clock)) { + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591..d63008b09a4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); + sched_clock_tick_stop(cpu); } /* @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); + sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit From 2b8a0cf4890d7537a77b51caa8f508e4a05a0e67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 19:49:41 -0400 Subject: sched_clock: fix calculation of other CPU The algorithm to calculate the 'now' of another CPU is not correct. At each scheduler tick, each CPU records the last sched_clock and gtod (tick_raw and tick_gtod respectively). If the TSC is somewhat the same in speed between two clocks the algorithm would be: tick_gtod1 + (now1 - tick_raw1) = tick_gtod2 + (now2 - tick_raw2) To calculate now2 we would have: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw2 - tick_raw1) + now1 Currently the algorithm is: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw1 - tick_raw2) + now1 This solves most of the rest of the issues I've had with timestamps in ftace. Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: john stultz Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 97159e225a7..55fca1e9e12 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -203,8 +203,8 @@ u64 sched_clock_cpu(int cpu) now -= my_scd->tick_raw; now += scd->tick_raw; - now -= my_scd->tick_gtod; - now += scd->tick_gtod; + now += my_scd->tick_gtod; + now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); } else { -- cgit From c0c87734f125d2fa8ebc70310f3257fa6209f2b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:31 -0400 Subject: sched_clock: only update deltas with local reads. Reading the CPU clock should try to stay accurate within the CPU. By reading the CPU clock from another CPU and updating the deltas can cause unneeded jumps when reading from the local CPU. This patch changes the code to update the last read TSC only when read from the local CPU. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 55fca1e9e12..ee7cce5029c 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static int check_max(struct sched_clock_data *scd) * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -162,8 +162,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(clock < min_clock)) clock = min_clock; - scd->prev_raw = now; - scd->clock = clock; + if (time) + *time = clock; + else { + scd->prev_raw = now; + scd->clock = clock; + } } static void lock_double_clock(struct sched_clock_data *data1, @@ -207,15 +211,18 @@ u64 sched_clock_cpu(int cpu) now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); + + __update_sched_clock(scd, now, &clock); + + __raw_spin_unlock(&scd->lock); + } else { __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now, NULL); + clock = scd->clock; + __raw_spin_unlock(&scd->lock); } - __update_sched_clock(scd, now); - clock = scd->clock; - - __raw_spin_unlock(&scd->lock); - return clock; } @@ -234,7 +241,7 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now); + __update_sched_clock(scd, now, NULL); /* * update tick_gtod after __update_sched_clock() because that will * already observe 1 new jiffy; adding a new tick_gtod to that would -- cgit From a83bc47c33ab182f1e48977fd5a04024d713c75e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:32 -0400 Subject: sched_clock: record TSC after gtod To read the gtod we need to grab the xtime lock for read. Reading the gtod before the TSC can cause a bigger gab if the xtime lock is contended. This patch simply reverses the order to read the TSC after the gtod. The locking in the reading of the gtod handles any barriers one might think is needed. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ee7cce5029c..28ff6bf5e02 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -237,8 +237,8 @@ void sched_clock_tick(void) WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); now_gtod = ktime_to_ns(ktime_get()); + now = sched_clock(); __raw_spin_lock(&scd->lock); __update_sched_clock(scd, now, NULL); -- cgit From c300ba252829e9325e08f0af60687add94445b25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:33 -0400 Subject: sched_clock: and multiplier for TSC to gtod drift The sched_clock code currently tries to keep all CPU clocks of all CPUS somewhat in sync. At every clock tick it records the gtod clock and uses that and jiffies and the TSC to calculate a CPU clock that tries to stay in sync with all the other CPUs. ftrace depends heavily on this timer and it detects when this timer "jumps". One problem is that the TSC and the gtod also drift. When the TSC is 0.1% faster or slower than the gtod it is very noticeable in ftrace. To help compensate for this, I've added a multiplier that tries to keep the CPU clock updating at the same rate as the gtod. I've tried various ways to get it to be in sync and this ended up being the most reliable. At every scheduler tick we calculate the new multiplier: multi = delta_gtod / delta_TSC This means we perform a 64 bit divide at the tick (once a HZ). A shift is used to handle the accuracy. Other methods that failed due to dynamic HZ are: (not used) multi += (gtod - tsc) / delta_gtod (not used) multi += (gtod - (last_tsc + delta_tsc)) / delta_gtod as well as other variants. This code still allows for a slight drift between TSC and gtod, but it keeps the damage down to a minimum. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 28ff6bf5e02..8affbfd0cdb 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -3,6 +3,9 @@ * * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * + * Updates and enhancements: + * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt + * * Based on code by: * Ingo Molnar * Guillaume Chazarain @@ -32,6 +35,11 @@ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +#define MULTI_SHIFT 15 +/* Max is double, Min is 1/2 */ +#define MAX_MULTI (2LL << MULTI_SHIFT) +#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) + struct sched_clock_data { /* * Raw spinlock - this is a special case: this might be called @@ -45,6 +53,7 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; + s64 multi; #ifdef CONFIG_NO_HZ int check_max; #endif @@ -79,6 +88,7 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; + scd->multi = 1 << MULTI_SHIFT; #ifdef CONFIG_NO_HZ scd->check_max = 1; #endif @@ -134,8 +144,13 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + - (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; + /* + * At schedule tick the clock can be just under the gtod. We don't + * want to push it too prematurely. + */ + min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); + if (min_clock > TICK_NSEC) + min_clock -= TICK_NSEC / 2; if (unlikely(delta < 0)) { clock++; @@ -149,6 +164,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; + delta *= scd->multi; + delta >>= MULTI_SHIFT; + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; @@ -230,6 +248,7 @@ void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); unsigned long now_jiffies = jiffies; + s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -247,9 +266,23 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ - scd->tick_jiffies = now_jiffies; + delta_gtod = now_gtod - scd->tick_gtod; + delta_raw = now - scd->tick_raw; + + if ((long)delta_raw > 0) { + mult = delta_gtod << MULTI_SHIFT; + do_div(mult, delta_raw); + scd->multi = mult; + if (scd->multi > MAX_MULTI) + scd->multi = MAX_MULTI; + else if (scd->multi < MIN_MULTI) + scd->multi = MIN_MULTI; + } else + scd->multi = 1 << MULTI_SHIFT; + scd->tick_raw = now; scd->tick_gtod = now_gtod; + scd->tick_jiffies = now_jiffies; __raw_spin_unlock(&scd->lock); } @@ -279,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) __raw_spin_lock(&scd->lock); scd->prev_raw = now; scd->clock += delta_ns; + scd->multi = 1 << MULTI_SHIFT; __raw_spin_unlock(&scd->lock); touch_softlockup_watchdog(); -- cgit