diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 306 |
1 files changed, 190 insertions, 116 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 24ab80c2876..dd1a1466c1e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -209,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static inline int rt_bandwidth_enabled(void) @@ -361,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; #ifdef CONFIG_USER_SCHED - tg = p->user->tg; + rcu_read_lock(); + tg = __task_cred(p)->user->tg; + rcu_read_unlock(); #elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); @@ -614,6 +615,8 @@ struct rq { #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; @@ -1147,7 +1150,6 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1912,6 +1914,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; + trace_sched_migrate_task(p, task_cpu(p), new_cpu); + #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) p->se.wait_start -= clock_offset; @@ -2318,6 +2322,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@ -2375,12 +2380,11 @@ out_activate: schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; out_running: - trace_sched_wakeup(rq, p); + trace_sched_wakeup(rq, p, success); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@ -2515,7 +2519,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_sched_wakeup_new(rq, p); + trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2900,7 +2904,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; - trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -3777,7 +3780,7 @@ redo: * don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ - if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { double_unlock_balance(this_rq, busiest); all_pinned = 1; return ld_moved; @@ -3790,8 +3793,13 @@ redo: } double_unlock_balance(this_rq, busiest); + /* + * Should not call ttwu while holding a rq->lock + */ + spin_unlock(&this_rq->lock); if (active_balance) wake_up_process(busiest->migration_thread); + spin_lock(&this_rq->lock); } else sd->nr_balance_failed = 0; @@ -4212,13 +4220,17 @@ unsigned long long task_delta_exec(struct task_struct *p) * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; + /* Add user time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); /* Add user time to cpustat. */ @@ -4235,51 +4247,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { cputime64_t tmp; struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; tmp = cputime_to_cputime64(cputime); + /* Add guest time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); + /* Add guest time to cpustat. */ cpustat->user = cputime64_add(cpustat->user, tmp); cpustat->guest = cputime64_add(cpustat->guest, tmp); } /* - * Account scaled user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->utimescaled = cputime_add(p->utimescaled, cputime); -} - -/* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) + cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime); + account_guest_time(p, cputime, cputime_scaled); return; } + /* Add system time to process. */ p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -4288,48 +4297,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else - cpustat->idle = cputime64_add(cpustat->idle, tmp); + cpustat->system = cputime64_add(cpustat->system, tmp); + /* Account for system time used */ acct_update_integrals(p); } /* - * Account scaled system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update + * Account for involuntary wait time. + * @steal: the cpu time spent in involuntary wait */ -void account_system_time_scaled(struct task_struct *p, cputime_t cputime) +void account_steal_time(cputime_t cputime) { - p->stimescaled = cputime_add(p->stimescaled, cputime); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); } /* - * Account for involuntary wait time. - * @p: the process from which the cpu time has been stolen - * @steal: the cpu time spent in involuntary wait + * Account for idle time. + * @cputime: the cpu time spent in idle wait */ -void account_steal_time(struct task_struct *p, cputime_t steal) +void account_idle_time(cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp = cputime_to_cputime64(steal); + cputime64_t cputime64 = cputime_to_cputime64(cputime); struct rq *rq = this_rq(); - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); } +#endif + /* * Use precise platform statistics if available: */ @@ -5252,6 +5297,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + rcu_read_unlock(); + return match; +} + static int __sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param, bool user) { @@ -5311,8 +5372,7 @@ recheck: return -EPERM; /* can't change other user's priorities */ - if ((current->euid != p->euid) && - (current->euid != p->uid)) + if (!check_same_owner(p)) return -EPERM; } @@ -5551,8 +5611,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, 0, NULL); @@ -6268,9 +6327,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - /* FIXME: Use cpumask_of_node here. */ - cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu)); - const struct cpumask *nodemask = &_nodemask; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: /* Look for allowed, online CPU in same node. */ @@ -6970,7 +7027,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) spin_unlock_irqrestore(&rq->lock, flags); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) { memset(rd, 0, sizeof(*rd)); @@ -6983,7 +7040,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem) } if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) - goto free_rd; + goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) @@ -6999,8 +7056,7 @@ free_online: free_cpumask_var(rd->online); free_span: free_cpumask_var(rd->span); -free_rd: - kfree(rd); +out: return -ENOMEM; } @@ -7181,21 +7237,18 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) static void sched_domain_node_span(int node, struct cpumask *span) { nodemask_t used_nodes; - /* FIXME: use cpumask_of_node() */ - node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(*span); + cpumask_clear(span); nodes_clear(used_nodes); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(node)); node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, &used_nodes); - node_to_cpumask_ptr_next(nodemask, next_node); - cpus_or(*span, *span, *nodemask); + cpumask_or(span, span, cpumask_of_node(next_node)); } } #endif /* CONFIG_NUMA */ @@ -7275,9 +7328,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, { int group; #ifdef CONFIG_SCHED_MC - /* FIXME: Use cpu_coregroup_mask. */ - *mask = cpu_coregroup_map(cpu); - cpus_and(*mask, *mask, *cpu_map); + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); @@ -7307,10 +7358,8 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, struct cpumask *nodemask) { int group; - /* FIXME: use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu)); - cpumask_and(nodemask, pnodemask, cpu_map); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); group = cpumask_first(nodemask); if (sg) @@ -7361,10 +7410,8 @@ static void free_sched_groups(const struct cpumask *cpu_map, for (i = 0; i < nr_node_ids; i++) { struct sched_group *oldsg, *sg = sched_group_nodes[i]; - /* FIXME: Use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, i); - cpus_and(*nodemask, *pnodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) continue; @@ -7573,9 +7620,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - /* FIXME: use cpumask_of_node */ - *nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > @@ -7616,9 +7661,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(core_domains, i).sd; SD_INIT(sd, MC); set_domain_attribute(sd, attr); - *sched_domain_span(sd) = cpu_coregroup_map(i); - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); + cpumask_and(sched_domain_span(sd), cpu_map, + cpu_coregroup_mask(i)); sd->parent = p; p->child = sd; cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); @@ -7654,9 +7698,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - /* FIXME: Use cpu_coregroup_mask */ - *this_core_map = cpu_coregroup_map(i); - cpus_and(*this_core_map, *this_core_map, *cpu_map); + cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); if (i != cpumask_first(this_core_map)) continue; @@ -7668,9 +7710,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - /* FIXME: Use cpumask_of_node */ - *nodemask = node_to_cpumask(i); - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) continue; @@ -7692,11 +7732,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_group *sg, *prev; int j; - /* FIXME: Use cpumask_of_node */ - *nodemask = node_to_cpumask(i); cpumask_clear(covered); - - cpus_and(*nodemask, *nodemask, *cpu_map); + cpumask_and(nodemask, cpumask_of_node(i), cpu_map); if (cpumask_empty(nodemask)) { sched_group_nodes[i] = NULL; continue; @@ -7727,8 +7764,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; - /* FIXME: Use cpumask_of_node */ - node_to_cpumask_ptr(pnodemask, n); cpumask_complement(notcovered, covered); cpumask_and(tmpmask, notcovered, cpu_map); @@ -7736,7 +7771,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (cpumask_empty(tmpmask)) break; - cpumask_and(tmpmask, tmpmask, pnodemask); + cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); if (cpumask_empty(tmpmask)) continue; @@ -8021,7 +8056,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static void arch_reinit_sched_domains(void) { get_online_cpus(); @@ -8030,13 +8065,10 @@ int arch_reinit_sched_domains(void) rebuild_sched_domains(); put_online_cpus(); - - return 0; } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) { - int ret; unsigned int level = 0; if (sscanf(buf, "%u", &level) != 1) @@ -8057,9 +8089,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) else sched_mc_power_savings = level; - ret = arch_reinit_sched_domains(); + arch_reinit_sched_domains(); - return ret ? ret : count; + return count; } #ifdef CONFIG_SCHED_MC @@ -8094,7 +8126,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_store); #endif -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) { int err = 0; @@ -9490,6 +9522,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@ -9497,17 +9564,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. - */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@ -9524,23 +9582,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) |