Merge with /home/shaggy/git/linus-clean/

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
author: Dave Kleikamp <shaggy@austin.ibm.com> 2005-06-02 12:12:57 -0500
committer: Dave Kleikamp <shaggy@austin.ibm.com> 2005-06-02 12:12:57 -0500
commit: 7078253c085c037c070ca4e8bc9e9e7f18aa1e84 (patch)
tree: eaf56c1a77b0de6ee82e23cee4433b2c4a47e67e /kernel
parent: 259692bd5a2b2c2d351dd90748ba4126bc2a21b9 (diff)
parent: 1e86d1c648508fd50e6c9960576b87906a7906ad (diff)
download: kernel-crypto-7078253c085c037c070ca4e8bc9e9e7f18aa1e84.tar.gz
kernel-crypto-7078253c085c037c070ca4e8bc9e9e7f18aa1e84.tar.xz
kernel-crypto-7078253c085c037c070ca4e8bc9e9e7f18aa1e84.zip
14 files changed, 240 insertions, 73 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eb88b446c2c..b01d26fe8db 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 
-ifneq ($(CONFIG_IA64),y)
+ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
 # me.  I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 961d74044de..00e8f257551 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL;
  * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
  * (usually) grab cpuset_sem.  These are the two most performance
  * critical pieces of code here.  The exception occurs on exit(),
- * if the last task using a cpuset exits, and the cpuset was marked
- * notify_on_release.  In that case, the cpuset_sem is taken, the
- * path to the released cpuset calculated, and a usermode call made
+ * when a task in a notify_on_release cpuset exits.  Then cpuset_sem
+ * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
  *
@@ -1404,6 +1403,18 @@ void cpuset_fork(struct task_struct *tsk)
  *
  * Description: Detach cpuset from @tsk and release it.
  *
+ * Note that cpusets marked notify_on_release force every task
+ * in them to take the global cpuset_sem semaphore when exiting.
+ * This could impact scaling on very large systems.  Be reluctant
+ * to use notify_on_release cpusets where very high task exit
+ * scaling is required on large systems.
+ *
+ * Don't even think about derefencing 'cs' after the cpuset use
+ * count goes to zero, except inside a critical section guarded
+ * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
+ * then a zero cpuset use count is a license to any other task to
+ * nuke the cpuset immediately.
+ *
  **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1415,10 +1426,13 @@ void cpuset_exit(struct task_struct *tsk)
 	tsk->cpuset = NULL;
 	task_unlock(tsk);
 
-	if (atomic_dec_and_test(&cs->count)) {
+	if (notify_on_release(cs)) {
 		down(&cpuset_sem);
-		check_for_release(cs);
+		if (atomic_dec_and_test(&cs->count))
+			check_for_release(cs);
 		up(&cpuset_sem);
+	} else {
+		atomic_dec(&cs->count);
 	}
 }
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 2fb0e46e11f..436c7d93c00 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,6 +30,7 @@
  */
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
+		.status = IRQ_DISABLED,
 		.handler = &no_irq_type,
 		.lock = SPIN_LOCK_UNLOCKED
 	}
@@ -118,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 		 */
 		desc->handler->ack(irq);
 		action_ret = handle_IRQ_event(irq, regs, desc->action);
-		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret);
 		desc->handler->end(irq);
 		return 1;
 	}
diff --git a/kernel/itimer.c b/kernel/itimer.c
index e9a40e947e0..1dc988e0d2c 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -123,7 +123,11 @@ static inline void it_real_arm(struct task_struct *p, unsigned long interval)
 		return;
 	if (interval > (unsigned long) LONG_MAX)
 		interval = LONG_MAX;
-	p->signal->real_timer.expires = jiffies + interval;
+	/* the "+ 1" below makes sure that the timer doesn't go off before
+	 * the interval requested. This could happen if
+	 * time requested % (usecs per jiffy) is more than the usecs left
+	 * in the current jiffy */
+	p->signal->real_timer.expires = jiffies + interval + 1;
 	add_timer(&p->signal->real_timer);
 }
 
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 1627f8d6e0c..13bcec151b5 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -46,6 +46,14 @@ static inline int is_kernel_inittext(unsigned long addr)
 	return 0;
 }
 
+static inline int is_kernel_extratext(unsigned long addr)
+{
+	if (addr >= (unsigned long)_sextratext
+	    && addr <= (unsigned long)_eextratext)
+		return 1;
+	return 0;
+}
+
 static inline int is_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext)
@@ -169,8 +177,9 @@ const char *kallsyms_lookup(unsigned long addr,
 	namebuf[0] = 0;
 
 	if ((all_var && is_kernel(addr)) ||
-	    (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr)))) {
-		unsigned long symbol_end=0;
+	    (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) ||
+				is_kernel_extratext(addr)))) {
+		unsigned long symbol_end = 0;
 
 		/* do a binary search on the sorted kallsyms_addresses array */
 		low = 0;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1d5dd1337bd..037142b72a4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 
 unsigned int kprobe_cpu = NR_CPUS;
 static DEFINE_SPINLOCK(kprobe_lock);
+static struct kprobe *curr_kprobe;
 
 /* Locks kprobe: irqs must be disabled */
 void lock_kprobes(void)
@@ -73,22 +74,139 @@ struct kprobe *get_kprobe(void *addr)
 	return NULL;
 }
 
+/*
+ * Aggregate handlers for multiple kprobes support - these handlers
+ * take care of invoking the individual kprobe handlers on p->list
+ */
+int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe *kp;
+
+	list_for_each_entry(kp, &p->list, list) {
+		if (kp->pre_handler) {
+			curr_kprobe = kp;
+			kp->pre_handler(kp, regs);
+			curr_kprobe = NULL;
+		}
+	}
+	return 0;
+}
+
+void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+		unsigned long flags)
+{
+	struct kprobe *kp;
+
+	list_for_each_entry(kp, &p->list, list) {
+		if (kp->post_handler) {
+			curr_kprobe = kp;
+			kp->post_handler(kp, regs, flags);
+			curr_kprobe = NULL;
+		}
+	}
+	return;
+}
+
+int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
+{
+	/*
+	 * if we faulted "during" the execution of a user specified
+	 * probe handler, invoke just that probe's fault handler
+	 */
+	if (curr_kprobe && curr_kprobe->fault_handler) {
+		if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Fill in the required fields of the "manager kprobe". Replace the
+ * earlier kprobe in the hlist with the manager kprobe
+ */
+static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+	ap->addr = p->addr;
+	ap->opcode = p->opcode;
+	memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
+
+	ap->pre_handler = aggr_pre_handler;
+	ap->post_handler = aggr_post_handler;
+	ap->fault_handler = aggr_fault_handler;
+
+	INIT_LIST_HEAD(&ap->list);
+	list_add(&p->list, &ap->list);
+
+	INIT_HLIST_NODE(&ap->hlist);
+	hlist_del(&p->hlist);
+	hlist_add_head(&ap->hlist,
+		&kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
+}
+
+/*
+ * This is the second or subsequent kprobe at the address - handle
+ * the intricacies
+ * TODO: Move kcalloc outside the spinlock
+ */
+static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+	int ret = 0;
+	struct kprobe *ap;
+
+	if (old_p->break_handler || p->break_handler) {
+		ret = -EEXIST;	/* kprobe and jprobe can't (yet) coexist */
+	} else if (old_p->pre_handler == aggr_pre_handler) {
+		list_add(&p->list, &old_p->list);
+	} else {
+		ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
+		if (!ap)
+			return -ENOMEM;
+		add_aggr_kprobe(ap, old_p);
+		list_add(&p->list, &ap->list);
+	}
+	return ret;
+}
+
+/* kprobe removal house-keeping routines */
+static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
+{
+	*p->addr = p->opcode;
+	hlist_del(&p->hlist);
+	flush_icache_range((unsigned long) p->addr,
+		   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+	spin_unlock_irqrestore(&kprobe_lock, flags);
+	arch_remove_kprobe(p);
+}
+
+static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
+		struct kprobe *p, unsigned long flags)
+{
+	list_del(&p->list);
+	if (list_empty(&old_p->list)) {
+		cleanup_kprobe(old_p, flags);
+		kfree(old_p);
+	} else
+		spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
 int register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
 	unsigned long flags = 0;
+	struct kprobe *old_p;
 
 	if ((ret = arch_prepare_kprobe(p)) != 0) {
 		goto rm_kprobe;
 	}
 	spin_lock_irqsave(&kprobe_lock, flags);
-	INIT_HLIST_NODE(&p->hlist);
-	if (get_kprobe(p->addr)) {
-		ret = -EEXIST;
+	old_p = get_kprobe(p->addr);
+	if (old_p) {
+		ret = register_aggr_kprobe(old_p, p);
 		goto out;
 	}
-	arch_copy_kprobe(p);
 
+	arch_copy_kprobe(p);
+	INIT_HLIST_NODE(&p->hlist);
 	hlist_add_head(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
@@ -107,13 +225,17 @@ rm_kprobe:
 void unregister_kprobe(struct kprobe *p)
 {
 	unsigned long flags;
-	arch_remove_kprobe(p);
+	struct kprobe *old_p;
+
 	spin_lock_irqsave(&kprobe_lock, flags);
-	*p->addr = p->opcode;
-	hlist_del(&p->hlist);
-	flush_icache_range((unsigned long) p->addr,
-			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
-	spin_unlock_irqrestore(&kprobe_lock, flags);
+	old_p = get_kprobe(p->addr);
+	if (old_p) {
+		if (old_p->pre_handler == aggr_pre_handler)
+			cleanup_aggr_kprobe(old_p, p, flags);
+		else
+			cleanup_kprobe(p, flags);
+	} else
+		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
 static struct notifier_block kprobe_exceptions_nb = {
diff --git a/kernel/module.c b/kernel/module.c
index 5734ab09d3f..83b3d376708 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1758,6 +1758,7 @@ sys_init_module(void __user *umod,
 		const char __user *uargs)
 {
 	struct module *mod;
+	mm_segment_t old_fs = get_fs();
 	int ret = 0;
 
 	/* Must have permission */
@@ -1775,6 +1776,9 @@ sys_init_module(void __user *umod,
 		return PTR_ERR(mod);
 	}
 
+	/* flush the icache in correct context */
+	set_fs(KERNEL_DS);
+
 	/* Flush the instruction cache, since we've played with text */
 	if (mod->module_init)
 		flush_icache_range((unsigned long)mod->module_init,
@@ -1783,6 +1787,8 @@ sys_init_module(void __user *umod,
 	flush_icache_range((unsigned long)mod->module_core,
 			   (unsigned long)mod->module_core + mod->core_size);
 
+	set_fs(old_fs);
+
 	/* Now sew it into the lists.  They won't access us, since
            strong_try_module_get() will fail. */
 	stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7960ddf04a5..4cdebc972ff 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -156,14 +156,14 @@ static int enter_state(suspend_state_t state)
 		goto Unlock;
 	}
 
-	pr_debug("PM: Preparing system for suspend\n");
+	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
 	if ((error = suspend_prepare(state)))
 		goto Unlock;
 
-	pr_debug("PM: Entering state.\n");
+	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
 	error = suspend_enter(state);
 
-	pr_debug("PM: Finishing up.\n");
+	pr_debug("PM: Finishing wakeup.\n");
 	suspend_finish(state);
  Unlock:
 	up(&pm_sem);
diff --git a/kernel/printk.c b/kernel/printk.c
index 290a07ce2c8..01b58d7d17f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -160,42 +160,6 @@ static int __init console_setup(char *str)
 
 __setup("console=", console_setup);
 
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init.  Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int __init add_preferred_console(char *name, int idx, char *options)
-{
-	struct console_cmdline *c;
-	int i;
-
-	/*
-	 *	See if this tty is not yet registered, and
-	 *	if we have a slot free.
-	 */
-	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
-		if (strcmp(console_cmdline[i].name, name) == 0 &&
-			  console_cmdline[i].index == idx) {
-				selected_console = i;
-				return 0;
-		}
-	if (i == MAX_CMDLINECONSOLES)
-		return -E2BIG;
-	selected_console = i;
-	c = &console_cmdline[i];
-	memcpy(c->name, name, sizeof(c->name));
-	c->name[sizeof(c->name) - 1] = 0;
-	c->options = options;
-	c->index = idx;
-	return 0;
-}
-
 static int __init log_buf_len_setup(char *str)
 {
 	unsigned long size = memparse(str, &str);
@@ -671,6 +635,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {}
 #endif
 
 /**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init.  Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+	struct console_cmdline *c;
+	int i;
+
+	/*
+	 *	See if this tty is not yet registered, and
+	 *	if we have a slot free.
+	 */
+	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+		if (strcmp(console_cmdline[i].name, name) == 0 &&
+			  console_cmdline[i].index == idx) {
+				selected_console = i;
+				return 0;
+		}
+	if (i == MAX_CMDLINECONSOLES)
+		return -E2BIG;
+	selected_console = i;
+	c = &console_cmdline[i];
+	memcpy(c->name, name, sizeof(c->name));
+	c->name[sizeof(c->name) - 1] = 0;
+	c->options = options;
+	c->index = idx;
+	return 0;
+}
+
+/**
  * acquire_console_sem - lock the console system for exclusive use.
  *
  * Acquires a semaphore which guarantees that the caller has
diff --git a/kernel/profile.c b/kernel/profile.c
index 0221a50ca86..ad8cbb75ffa 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex);
 
 static int __init profile_setup(char * str)
 {
+	static char __initdata schedstr[] = "schedule";
 	int par;
 
-	if (!strncmp(str, "schedule", 8)) {
+	if (!strncmp(str, schedstr, strlen(schedstr))) {
 		prof_on = SCHED_PROFILING;
-		printk(KERN_INFO "kernel schedule profiling enabled\n");
-		if (str[7] == ',')
-			str += 8;
-	}
-	if (get_option(&str,&par)) {
+		if (str[strlen(schedstr)] == ',')
+			str += strlen(schedstr) + 1;
+		if (get_option(&str, &par))
+			prof_shift = par;
+		printk(KERN_INFO
+			"kernel schedule profiling enabled (shift: %ld)\n",
+			prof_shift);
+	} else if (get_option(&str, &par)) {
 		prof_shift = par;
 		prof_on = CPU_PROFILING;
 		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc3158667a..66b2ed78482 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4243,7 +4243,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
 
 	/* No more Mr. Nice Guy. */
 	if (dest_cpu == NR_CPUS) {
-		tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+		cpus_setall(tsk->cpus_allowed);
 		dest_cpu = any_online_cpu(tsk->cpus_allowed);
 
 		/*
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc77c5..b3c24c732c5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -522,7 +522,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 {
 	int sig = 0;
 
-	sig = next_signal(pending, mask);
+	/* SIGKILL must have priority, otherwise it is quite easy
+	 * to create an unkillable process, sending sig < SIGKILL
+	 * to self */
+	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+		if (!sigismember(mask, SIGKILL))
+			sig = SIGKILL;
+	}
+
+	if (likely(!sig))
+		sig = next_signal(pending, mask);
 	if (sig) {
 		if (current->notifier) {
 			if (sigismember(current->notifier_mask, sig)) {
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index e15ed17863f..0c3f9d8bbe1 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq);
 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
 {
 	_raw_spin_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_spin_unlock_bh);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq);
 void __lockfunc _read_unlock_bh(rwlock_t *lock)
 {
 	_raw_read_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_read_unlock_bh);
@@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq);
 void __lockfunc _write_unlock_bh(rwlock_t *lock)
 {
 	_raw_write_unlock(lock);
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
 EXPORT_SYMBOL(_write_unlock_bh);
@@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
 	if (_raw_spin_trylock(lock))
 		return 1;
 
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 	return 0;
 }
diff --git a/kernel/sys.c b/kernel/sys.c
index f64e97cabe2..f006632c2ba 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1195,7 +1195,7 @@ static int groups_from_user(struct group_info *group_info,
 	return 0;
 }
 
-/* a simple shell-metzner sort */
+/* a simple Shell sort */
 static void groups_sort(struct group_info *group_info)
 {
 	int base, max, stride;
author	Dave Kleikamp <shaggy@austin.ibm.com>	2005-06-02 12:12:57 -0500
committer	Dave Kleikamp <shaggy@austin.ibm.com>	2005-06-02 12:12:57 -0500
commit	7078253c085c037c070ca4e8bc9e9e7f18aa1e84 (patch)
tree	eaf56c1a77b0de6ee82e23cee4433b2c4a47e67e /kernel
parent	259692bd5a2b2c2d351dd90748ba4126bc2a21b9 (diff)
parent	1e86d1c648508fd50e6c9960576b87906a7906ad (diff)
download	kernel-crypto-7078253c085c037c070ca4e8bc9e9e7f18aa1e84.tar.gz kernel-crypto-7078253c085c037c070ca4e8bc9e9e7f18aa1e84.tar.xz kernel-crypto-7078253c085c037c070ca4e8bc9e9e7f18aa1e84.zip