From 3691c5199e8a4be1c7a91b5ab925db5feb866e19 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 31 Mar 2006 02:30:30 -0800
Subject: [PATCH] kill __init_timer_base in favor of boot_tvec_bases

Commit a4a6198b80cf82eb8160603c98da218d1bd5e104:
[PATCH] tvec_bases too large for per-cpu data

introduced "struct tvec_t_base_s boot_tvec_bases" which is visible at
compile time. This means we can kill __init_timer_base and move
timer_base_s's content into tvec_t_base_s.

Signed-off-by: Oleg Nesterov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 84 ++++++++++++++++++++++++----------------------------------
 1 file changed, 35 insertions(+), 49 deletions(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index ab189dd187c..b04dc03b593 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(jiffies_64);
 /*
  * per-CPU timer vector definitions:
  */
-
 #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
 #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
 #define TVN_SIZE (1 << TVN_BITS)
@@ -62,11 +61,6 @@ EXPORT_SYMBOL(jiffies_64);
 #define TVN_MASK (TVN_SIZE - 1)
 #define TVR_MASK (TVR_SIZE - 1)
 
-struct timer_base_s {
-	spinlock_t lock;
-	struct timer_list *running_timer;
-};
-
 typedef struct tvec_s {
 	struct list_head vec[TVN_SIZE];
 } tvec_t;
@@ -76,7 +70,8 @@ typedef struct tvec_root_s {
 } tvec_root_t;
 
 struct tvec_t_base_s {
-	struct timer_base_s t_base;
+	spinlock_t lock;
+	struct timer_list *running_timer;
 	unsigned long timer_jiffies;
 	tvec_root_t tv1;
 	tvec_t tv2;
@@ -87,13 +82,14 @@ struct tvec_t_base_s {
 
 typedef struct tvec_t_base_s tvec_base_t;
 static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
-static tvec_base_t boot_tvec_bases;
+tvec_base_t boot_tvec_bases;
+EXPORT_SYMBOL(boot_tvec_bases);
 
 static inline void set_running_timer(tvec_base_t *base,
				     struct timer_list *timer)
 {
 #ifdef CONFIG_SMP
-	base->t_base.running_timer = timer;
+	base->running_timer = timer;
 #endif
 }
@@ -139,15 +135,6 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 	list_add_tail(&timer->entry, vec);
 }
 
-typedef struct timer_base_s timer_base_t;
-/*
- * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
- * at compile time, and we need timer->base to lock the timer.
- */
-timer_base_t __init_timer_base
-	____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
-EXPORT_SYMBOL(__init_timer_base);
-
 /***
  * init_timer - initialize a timer.
  * @timer: the timer to be initialized
@@ -158,7 +145,7 @@ EXPORT_SYMBOL(__init_timer_base);
 void fastcall init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
-	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id())->t_base;
+	timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
 }
 EXPORT_SYMBOL(init_timer);
@@ -174,7 +161,7 @@ static inline void detach_timer(struct timer_list *timer,
 }
 
 /*
- * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
+ * We are using hashed locking: holding per_cpu(tvec_bases).lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
 *
@@ -185,10 +172,10 @@ static inline void detach_timer(struct timer_list *timer,
  * possible to set timer->base = NULL and drop the lock: the timer remains
  * locked.
  */
-static timer_base_t *lock_timer_base(struct timer_list *timer,
+static tvec_base_t *lock_timer_base(struct timer_list *timer,
					unsigned long *flags)
 {
-	timer_base_t *base;
+	tvec_base_t *base;
 
 	for (;;) {
 		base = timer->base;
@@ -205,8 +192,7 @@ static timer_base_t *lock_timer_base(struct timer_list *timer,
 
 int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	timer_base_t *base;
-	tvec_base_t *new_base;
+	tvec_base_t *base, *new_base;
 	unsigned long flags;
 	int ret = 0;
@@ -221,7 +207,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 
 	new_base = __get_cpu_var(tvec_bases);
 
-	if (base != &new_base->t_base) {
+	if (base != new_base) {
 		/*
 		 * We are trying to schedule the timer on the local CPU.
 		 * However we can't change timer's base while it is running,
@@ -231,19 +217,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 		 */
 		if (unlikely(base->running_timer == timer)) {
 			/* The timer remains on a former base */
-			new_base = container_of(base, tvec_base_t, t_base);
+			new_base = base;
 		} else {
 			/* See the comment in lock_timer_base() */
 			timer->base = NULL;
 			spin_unlock(&base->lock);
-			spin_lock(&new_base->t_base.lock);
-			timer->base = &new_base->t_base;
+			spin_lock(&new_base->lock);
+			timer->base = new_base;
 		}
 	}
 
 	timer->expires = expires;
 	internal_add_timer(new_base, timer);
-	spin_unlock_irqrestore(&new_base->t_base.lock, flags);
+	spin_unlock_irqrestore(&new_base->lock, flags);
 
 	return ret;
 }
@@ -263,10 +249,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
 	unsigned long flags;
 
 	BUG_ON(timer_pending(timer) || !timer->function);
-	spin_lock_irqsave(&base->t_base.lock, flags);
-	timer->base = &base->t_base;
+	spin_lock_irqsave(&base->lock, flags);
+	timer->base = base;
 	internal_add_timer(base, timer);
-	spin_unlock_irqrestore(&base->t_base.lock, flags);
+	spin_unlock_irqrestore(&base->lock, flags);
 }
@@ -319,7 +305,7 @@ EXPORT_SYMBOL(mod_timer);
  */
 int del_timer(struct timer_list *timer)
 {
-	timer_base_t *base;
+	tvec_base_t *base;
 	unsigned long flags;
 	int ret = 0;
@@ -346,7 +332,7 @@ EXPORT_SYMBOL(del_timer);
  */
 int try_to_del_timer_sync(struct timer_list *timer)
 {
-	timer_base_t *base;
+	tvec_base_t *base;
 	unsigned long flags;
 	int ret = -1;
@@ -410,7 +396,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
 		struct timer_list *tmp;
 
 		tmp = list_entry(curr, struct timer_list, entry);
-		BUG_ON(tmp->base != &base->t_base);
+		BUG_ON(tmp->base != base);
 		curr = curr->next;
 		internal_add_timer(base, tmp);
 	}
@@ -432,7 +418,7 @@ static inline void __run_timers(tvec_base_t *base)
 {
 	struct timer_list *timer;
 
-	spin_lock_irq(&base->t_base.lock);
+	spin_lock_irq(&base->lock);
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
 		struct list_head work_list = LIST_HEAD_INIT(work_list);
 		struct list_head *head = &work_list;
@@ -458,7 +444,7 @@ static inline void __run_timers(tvec_base_t *base)
 
 			set_running_timer(base, timer);
 			detach_timer(timer, 1);
-			spin_unlock_irq(&base->t_base.lock);
+			spin_unlock_irq(&base->lock);
 			{
				int preempt_count = preempt_count();
				fn(data);
@@ -471,11 +457,11 @@ static inline void __run_timers(tvec_base_t *base)
					BUG();
				}
			}
-			spin_lock_irq(&base->t_base.lock);
+			spin_lock_irq(&base->lock);
 		}
 	}
 	set_running_timer(base, NULL);
-	spin_unlock_irq(&base->t_base.lock);
+	spin_unlock_irq(&base->lock);
 }
 
 #ifdef CONFIG_NO_IDLE_HZ
@@ -506,7 +492,7 @@ unsigned long next_timer_interrupt(void)
 	hr_expires += jiffies;
 
 	base = __get_cpu_var(tvec_bases);
-	spin_lock(&base->t_base.lock);
+	spin_lock(&base->lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
 	list = NULL;
@@ -554,7 +540,7 @@ found:
			expires = nte->expires;
		}
	}
-	spin_unlock(&base->t_base.lock);
+	spin_unlock(&base->lock);
 
 	if (time_before(hr_expires, expires))
 		return hr_expires;
@@ -1262,7 +1248,7 @@ static int __devinit init_timers_cpu(int cpu)
		}
		per_cpu(tvec_bases, cpu) = base;
	}
-	spin_lock_init(&base->t_base.lock);
+	spin_lock_init(&base->lock);
 	for (j = 0; j < TVN_SIZE; j++) {
		INIT_LIST_HEAD(base->tv5.vec + j);
		INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1284,7 +1270,7 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
 	while (!list_empty(head)) {
		timer = list_entry(head->next, struct timer_list, entry);
		detach_timer(timer, 0);
-		timer->base = &new_base->t_base;
+		timer->base = new_base;
		internal_add_timer(new_base, timer);
	}
 }
@@ -1300,11 +1286,11 @@ static void __devinit migrate_timers(int cpu)
 	new_base = get_cpu_var(tvec_bases);
 
 	local_irq_disable();
-	spin_lock(&new_base->t_base.lock);
-	spin_lock(&old_base->t_base.lock);
+	spin_lock(&new_base->lock);
+	spin_lock(&old_base->lock);
+
+	BUG_ON(old_base->running_timer);
 
-	if (old_base->t_base.running_timer)
-		BUG();
 	for (i = 0; i < TVR_SIZE; i++)
		migrate_timer_list(new_base, old_base->tv1.vec + i);
 	for (i = 0; i < TVN_SIZE; i++) {
@@ -1314,8 +1300,8 @@ static void __devinit migrate_timers(int cpu)
		migrate_timer_list(new_base, old_base->tv5.vec + i);
	}
 
-	spin_unlock(&old_base->t_base.lock);
-	spin_unlock(&new_base->t_base.lock);
+	spin_unlock(&old_base->lock);
+	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(tvec_bases);
 }
--
cgit

From a2c348fe0117adced11e374329a5ea3f7c43cb41 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 31 Mar 2006 02:30:31 -0800
Subject: [PATCH] __mod_timer: simplify ->base changing

Since base and new_base are of the same type now, we can save one 'if'
branch and simplify the code a bit.

Signed-off-by: Oleg Nesterov
Acked-by: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index b04dc03b593..9062a82ee8e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -215,21 +215,19 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
		 * handler yet has not finished. This also guarantees that
		 * the timer is serialized wrt itself.
		 */
-		if (unlikely(base->running_timer == timer)) {
-			/* The timer remains on a former base */
-			new_base = base;
-		} else {
+		if (likely(base->running_timer != timer)) {
			/* See the comment in lock_timer_base() */
			timer->base = NULL;
			spin_unlock(&base->lock);
-			spin_lock(&new_base->lock);
-			timer->base = new_base;
+			base = new_base;
+			spin_lock(&base->lock);
+			timer->base = base;
		}
	}
 
 	timer->expires = expires;
-	internal_add_timer(new_base, timer);
-	spin_unlock_irqrestore(&new_base->lock, flags);
+	internal_add_timer(base, timer);
+	spin_unlock_irqrestore(&base->lock, flags);
 
 	return ret;
 }
--
cgit
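
For reference, the locking protocol the two patches above leave behind can be
modelled in a few lines. This is a condensed, self-contained user-space sketch
using pthreads, not the verbatim kernel code: "struct base" stands in for
tvec_base_t, the mutex stands in for the irq-safe spinlock, and the NULL
handshake mirrors how __mod_timer() publishes a base change while a concurrent
lock_timer_base() spins. (The real kernel also relies on its own memory model;
this sketch glosses over that.)

/* Sketch of the lock_timer_base() retry pattern -- simplified model. */
#include <pthread.h>
#include <stddef.h>

struct base {
	pthread_mutex_t lock;
};

struct timer {
	struct base *base;	/* set to NULL while the base is switched */
};

/* Spin until timer->base is stable and locked, as lock_timer_base() does. */
static struct base *lock_timer_base(struct timer *timer)
{
	for (;;) {
		struct base *base = timer->base;
		if (base != NULL) {
			pthread_mutex_lock(&base->lock);
			if (base == timer->base)
				return base;	/* base did not change under us */
			pthread_mutex_unlock(&base->lock);	/* raced: retry */
		}
		/* timer->base == NULL: a base switch is in flight, retry */
	}
}

int main(void)
{
	struct base b;
	struct timer t = { .base = &b };

	pthread_mutex_init(&b.lock, NULL);
	pthread_mutex_unlock(&lock_timer_base(&t)->lock);	/* lock, then drop */
	return 0;
}

The key point the second patch exploits: once timer->base and the per-CPU base
share one type, the "switch bases" branch can simply rebind base = new_base and
fall through to a single unlock at the end.
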
From db1b1fefc2cecbff2e4214062fa8c680cb6e7b7d Mon Sep 17 00:00:00 2001
From: Jack Steiner
Date: Fri, 31 Mar 2006 02:31:21 -0800
Subject: [PATCH] sched: reduce overhead of calc_load

Currently, count_active_tasks() calls both nr_running() &
nr_uninterruptible(). Each of these functions does a "for_each_cpu" & reads
values from the runqueue of each cpu. Although this is not a lot of
instructions, each runqueue may be located on a different node. Depending on
the architecture, a unique TLB entry may be required to access each runqueue.

Since there may be more runqueues than cpu TLB entries, a scan of all
runqueues can thrash the TLB. Each memory reference incurs a TLB miss &
refill.

In addition, the runqueue cacheline that contains nr_running &
nr_uninterruptible may be evicted from the cache between the two passes. This
causes unnecessary cache misses.

Combining nr_running() & nr_uninterruptible() into a single function
substantially reduces the TLB & cache misses on large systems. This should
have no measurable effect on smaller systems.

On a 128p IA64 system running a memory stress workload, the new function
reduced the overhead of calc_load() from 605 usec/call to 324 usec/call.

Signed-off-by: Jack Steiner
Acked-by: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index 9062a82ee8e..6b812c04737 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -825,7 +825,7 @@ void update_process_times(int user_tick)
  */
 static unsigned long count_active_tasks(void)
 {
-	return (nr_running() + nr_uninterruptible()) * FIXED_1;
+	return nr_active() * FIXED_1;
 }
 
 /*
--
cgit

From 9f31252cb61d5bc641cdef8a069a2ca5a77855f2 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn
Date: Sun, 2 Apr 2006 13:45:55 +0200
Subject: BUG_ON() Conversion in kernel/signal.c

This changes if () BUG(); constructs to BUG_ON(), which is cleaner,
contains unlikely() and can be better optimized away.

Signed-off-by: Eric Sesterhenn
Signed-off-by: Adrian Bunk
---
 kernel/timer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index 6b812c04737..c3a874f1393 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1479,8 +1479,7 @@ register_time_interpolator(struct time_interpolator *ti)
 	unsigned long flags;
 
 	/* Sanity check */
-	if (ti->frequency == 0 || ti->mask == 0)
-		BUG();
+	BUG_ON(ti->frequency == 0 || ti->mask == 0);
 
 	ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
 	spin_lock(&time_interpolator_lock);
--
cgit
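
To make the conversion above concrete, here is a minimal self-contained sketch
of the two styles. The macros are user-space stand-ins, not the kernel's own
definitions; the point is that BUG_ON() folds the test into one line and marks
the failure path unlikely(), so the compiler can lay it out as cold code.

/* Minimal model of if () BUG(); vs. BUG_ON() -- not kernel code. */
#include <stdio.h>
#include <stdlib.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define BUG()		do { fprintf(stderr, "BUG at %s:%d\n", \
				__FILE__, __LINE__); abort(); } while (0)
#define BUG_ON(cond)	do { if (unlikely(cond)) BUG(); } while (0)

static void check_interpolator(unsigned long frequency, unsigned long mask)
{
	/* Old style: an open-coded branch with no branch-prediction hint. */
	if (frequency == 0 || mask == 0)
		BUG();

	/* New style: one line, and the condition is annotated unlikely(). */
	BUG_ON(frequency == 0 || mask == 0);
}

int main(void)
{
	check_interpolator(32768, ~0UL);	/* sane inputs: both pass */
	return 0;
}
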
From b20367a6c2a0cd937cb1f0a8cf848f1402fef99c Mon Sep 17 00:00:00 2001
From: Jordan Hargrave
Date: Fri, 7 Apr 2006 19:50:18 +0200
Subject: [PATCH] x86_64: Fix drift with HPET timer enabled

If the HPET timer is enabled, the clock can drift by ~3 seconds a day. This
is due to the HPET timer not being initialized with the correct setting
(still using PIT count). If HZ changes, this drift can become even more
pronounced.

The patch initializes tick_nsec with the correct setting for the HPET timer.

Vojtech comments: "It's not entirely correct (it assumes the HPET ticks
totally exactly), but it's significantly better than assuming the PIT error
there."

Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index c3a874f1393..471ab8710b8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1455,7 +1455,7 @@ static void time_interpolator_update(long delta_nsec)
	 */
 	if (jiffies % INTERPOLATOR_ADJUST == 0)
 	{
-		if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
+		if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec)
			time_interpolator->nsec_per_cyc--;
 		if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
			time_interpolator->nsec_per_cyc++;
--
cgit

From ba6edfcd1708da2e665f14eee76e87f39448ec40 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 10 Apr 2006 22:53:58 -0700
Subject: [PATCH] timer initialisation fix

We need the boot CPU's tvec_bases[] entry to be initialised super-early in
boot, for early_serial_setup(). That runs within setup_arch(), before even
per-cpu areas are initialised.

The patch changes tvec_bases to use compile-time initialisation, and adds a
separate array `tvec_base_done' to keep track of which CPU has had its
tvec_bases[] entry initialised (because we can no longer use the zeroness of
that tvec_bases[] entry to determine whether it has been initialised).

Thanks to Eugene Surovegin for diagnosing this.

Cc: Eugene Surovegin
Cc: Jan Beulich
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'kernel/timer.c')

diff --git a/kernel/timer.c b/kernel/timer.c
index 471ab8710b8..88377378883 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -81,9 +81,10 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases);
+
 tvec_base_t boot_tvec_bases;
 EXPORT_SYMBOL(boot_tvec_bases);
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
 
 static inline void set_running_timer(tvec_base_t *base,
				     struct timer_list *timer)
@@ -1224,28 +1225,36 @@ static int __devinit init_timers_cpu(int cpu)
 {
 	int j;
 	tvec_base_t *base;
+	static char __devinitdata tvec_base_done[NR_CPUS];
 
-	base = per_cpu(tvec_bases, cpu);
-	if (!base) {
+	if (!tvec_base_done[cpu]) {
 		static char boot_done;
 
-		/*
-		 * Cannot do allocation in init_timers as that runs before the
-		 * allocator initializes (and would waste memory if there are
-		 * more possible CPUs than will ever be installed/brought up).
-		 */
 		if (boot_done) {
+			/*
+			 * The APs use this path later in boot
+			 */
			base = kmalloc_node(sizeof(*base), GFP_KERNEL,
						cpu_to_node(cpu));
			if (!base)
				return -ENOMEM;
			memset(base, 0, sizeof(*base));
+			per_cpu(tvec_bases, cpu) = base;
 		} else {
-			base = &boot_tvec_bases;
+			/*
+			 * This is for the boot CPU - we use compile-time
+			 * static initialisation because per-cpu memory isn't
+			 * ready yet and because the memory allocators are not
+			 * initialised either.
+			 */
			boot_done = 1;
+			base = &boot_tvec_bases;
 		}
-		per_cpu(tvec_bases, cpu) = base;
+		tvec_base_done[cpu] = 1;
+	} else {
+		base = per_cpu(tvec_bases, cpu);
 	}
+
 	spin_lock_init(&base->lock);
 	for (j = 0; j < TVN_SIZE; j++) {
		INIT_LIST_HEAD(base->tv5.vec + j);
--
cgit
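
The initialisation scheme that last patch arrives at can be modelled compactly.
The sketch below is a self-contained user-space analogue, not the kernel code:
"struct base", boot_base, bases[], base_done[] and init_base_cpu() are
hypothetical stand-ins for tvec_base_t, boot_tvec_bases, the per-CPU pointer,
tvec_base_done[] and init_timers_cpu(). The idea it demonstrates is the one in
the commit message: slot 0 is pre-wired to a compile-time instance so it works
before any allocator runs, and a separate done[] flag records initialisation,
since a slot can no longer be recognised as fresh by being NULL.

/* Model of compile-time boot base + lazy per-CPU allocation -- a sketch. */
#include <stdlib.h>

#define NR_CPUS 4

struct base { unsigned long timer_jiffies; };

static struct base boot_base;				/* compile-time instance */
static struct base *bases[NR_CPUS] = { &boot_base };	/* boot CPU pre-wired */
static char base_done[NR_CPUS];				/* which slots are set up */

static int init_base_cpu(int cpu)
{
	struct base *base;

	if (!base_done[cpu]) {
		static char boot_done;

		if (boot_done) {
			/* later CPUs: allocators are up, allocate normally */
			base = calloc(1, sizeof(*base));
			if (!base)
				return -1;
			bases[cpu] = base;
		} else {
			/* boot CPU: use the static instance, pre-allocator */
			boot_done = 1;
			base = &boot_base;
		}
		base_done[cpu] = 1;
	} else {
		base = bases[cpu];
	}
	base->timer_jiffies = 0;	/* per-init setup goes here */
	return 0;
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (init_base_cpu(cpu))
			return 1;
	return 0;
}
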