Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c            |   2
-rw-r--r-- | kernel/auditsc.c         |   1
-rw-r--r-- | kernel/compat.c          |  23
-rw-r--r-- | kernel/cpuset.c          |  16
-rw-r--r-- | kernel/exit.c            |   5
-rw-r--r-- | kernel/fork.c            |   5
-rw-r--r-- | kernel/futex.c           |   8
-rw-r--r-- | kernel/irq/handle.c      |   5
-rw-r--r-- | kernel/irq/migration.c   |   4
-rw-r--r-- | kernel/irq/proc.c        |   3
-rw-r--r-- | kernel/irq/spurious.c    |  12
-rw-r--r-- | kernel/kexec.c           |   6
-rw-r--r-- | kernel/ksysfs.c          |  19
-rw-r--r-- | kernel/power/main.c      |   2
-rw-r--r-- | kernel/power/power.h     |   6
-rw-r--r-- | kernel/power/snapshot.c  | 260
-rw-r--r-- | kernel/power/swsusp.c    |  32
-rw-r--r-- | kernel/rcupdate.c        |  13
-rw-r--r-- | kernel/sched.c           |  12
-rw-r--r-- | kernel/sys.c             |  70
-rw-r--r-- | kernel/sys_ni.c          |   2
-rw-r--r-- | kernel/sysctl.c          |  11
-rw-r--r-- | kernel/timer.c           |  30
-rw-r--r-- | kernel/user.c            |   2
-rw-r--r-- | kernel/workqueue.c       |   4
25 files changed, 378 insertions, 175 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index b327f4d2010..6802020e0ce 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b097ccb4eb7..9ebd96fda29 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1558,6 +1558,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) + * @ipcp: in-kernel IPC permissions * * Returns 0 for success or NULL context or < 0 on error. */ diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d..2f672332430 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include <linux/unistd.h> #include <linux/security.h> #include <linux/timex.h> +#include <linux/migrate.h> #include <asm/uaccess.h> @@ -934,3 +935,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) return ret; } + +#ifdef CONFIG_NUMA +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, + compat_uptr_t __user *pages32, + const int __user *nodes, + int __user *status, + int flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, pages32 + i) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572..b602f73fb38 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -41,6 +41,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/security.h> #include <linux/slab.h> #include <linux/smp_lock.h> #include <linux/spinlock.h> @@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, return 0; } -static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super); + return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); } static struct file_system_type cpuset_fs_type = { @@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; + int retval; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + retval = security_task_setscheduler(tsk, 0, NULL); + if (retval) { + put_task_struct(tsk); + return retval; + } + mutex_lock(&callback_mutex); task_lock(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index e06d0c10a24..a3baf92462b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -579,7 +579,7 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - if (mm != tsk->active_mm) BUG(); + BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; @@ -1530,8 +1530,7 @@ check_continued: if (options & 
__WNOTHREAD) break; tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); + BUG_ON(tsk->signal != current->signal); } while (tsk != current); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088..49adc0e8d47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { + might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); @@ -623,6 +625,7 @@ out: /* * Allocate a new files structure and copy contents from the * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { @@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) int open_files, size, i, expand; struct fdtable *old_fdt, *new_fdt; + *errorp = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * break this. */ tsk->files = NULL; - error = -ENOMEM; newf = dup_fd(oldf, &error); if (!newf) goto out; diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057..e1a380c77a5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, (unsigned long)uaddr2, val2, val3); } -static struct super_block * -futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); } static struct file_system_type futex_fs_type = { diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37d..0f653011710 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /* * Have got an event to handle: */ -fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) { - int ret, retval = 0, status = 0; + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e3..a12d00eb5e7 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -30,7 +30,7 @@ void move_native_irq(int irq) desc->move_irq = 0; - if (likely(cpus_empty(pending_irq_cpumask[irq]))) + if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) return; if (!desc->handler->set_affinity) @@ -49,7 +49,7 @@ void move_native_irq(int irq) * cause some ioapics to mal-function. * Being paranoid i guess! */ - if (unlikely(!cpus_empty(tmp))) { + if (likely(!cpus_empty(tmp))) { if (likely(!(desc->status & IRQ_DISABLED))) desc->handler->disable(irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce..afacd6f585f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); + /* * Save these away for later use. 
Re-progam when the * interrupt is pending @@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) #else void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec8..b2fb3c18d06 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -11,7 +11,7 @@ #include <linux/kallsyms.h> #include <linux/interrupt.h> -static int irqfixup; +static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. @@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, struct pt_regs *regs) { - if (action_ret != IRQ_HANDLED) { + if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; - if (action_ret != IRQ_NONE) + if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } @@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, } desc->irq_count++; - if (desc->irq_count < 100000) + if (likely(desc->irq_count < 100000)) return; desc->irq_count = 0; - if (desc->irqs_unhandled > 99900) { + if (unlikely(desc->irqs_unhandled > 99900)) { /* * The interrupt is stuck */ @@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, desc->irqs_unhandled = 0; } -int noirqdebug; +int noirqdebug __read_mostly; int __init noirqdebug_setup(char *str) { diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0..58f0f382597 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; /* * A home grown binary mutex. 
* Nothing can wait so this mutex is safe to use * in interrupt context :) */ -static int kexec_lock = 0; +static int kexec_lock; asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67..9e28478a17a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include <linux/sysfs.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/kexec.h> #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); +#endif /* CONFIG_KEXEC */ + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, +#endif NULL }; diff --git a/kernel/power/main.c b/kernel/power/main.c index 0a907f0dc56..cdf0f07af92 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,7 +15,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/pm.h> - +#include <linux/console.h> #include "power.h" diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f2176..98c41423f3b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -55,7 +55,7 @@ struct snapshot_handle { unsigned int page; unsigned int page_offset; unsigned int prev; - struct pbe *pbe; + struct pbe *pbe, *last_pbe; void *buffer; unsigned int buf_offset; }; @@ -105,6 +105,10 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +extern unsigned int count_special_pages(void); +extern int save_special_mem(void); +extern int restore_special_mem(void); + extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b7..3d9284100b2 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,8 +39,90 @@ static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; static unsigned long *buffer; +struct arch_saveable_page { + unsigned long start; + unsigned long end; + char *data; + struct arch_saveable_page *next; +}; +static struct arch_saveable_page *arch_pages; + +int swsusp_add_arch_pages(unsigned long start, unsigned long end) +{ + struct arch_saveable_page *tmp; + + while (start < end) { + tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + tmp->start = start; + tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT; + if (tmp->end > end) + tmp->end = end; + tmp->next = arch_pages; + start = tmp->end; + arch_pages = tmp; + } + return 0; +} + +static unsigned int count_arch_pages(void) +{ + unsigned int count = 0; + struct arch_saveable_page *tmp = arch_pages; + while (tmp) { + count++; + tmp = tmp->next; + } + return count; 
+} + +static int save_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = arch_pages; + int offset; + + pr_debug("swsusp: Saving arch specific memory"); + while (tmp) { + tmp->data = (char *)__get_free_page(GFP_ATOMIC); + if (!tmp->data) + return -ENOMEM; + offset = tmp->start - (tmp->start & PAGE_MASK); + /* arch pages might haven't a 'struct page' */ + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(tmp->data + offset, kaddr + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + + tmp = tmp->next; + } + return 0; +} + +static int restore_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = arch_pages; + int offset; + + while (tmp) { + if (!tmp->data) + continue; + offset = tmp->start - (tmp->start & PAGE_MASK); + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(kaddr + offset, tmp->data + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + free_page((long)tmp->data); + tmp->data = NULL; + tmp = tmp->next; + } + return 0; +} + #ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void) +static unsigned int count_highmem_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -117,7 +199,7 @@ static int save_highmem_zone(struct zone *zone) return 0; } -int save_highmem(void) +static int save_highmem(void) { struct zone *zone; int res = 0; @@ -134,7 +216,7 @@ int save_highmem(void) return 0; } -int restore_highmem(void) +static int restore_highmem(void) { printk("swsusp: Restoring Highmem\n"); while (highmem_copy) { @@ -150,8 +232,35 @@ int restore_highmem(void) } return 0; } +#else +static inline unsigned int count_highmem_pages(void) {return 0;} +static inline int save_highmem(void) {return 0;} +static inline int restore_highmem(void) {return 0;} #endif +unsigned int count_special_pages(void) +{ + return count_arch_pages() + count_highmem_pages(); +} + +int save_special_mem(void) +{ + int ret; + ret = save_arch_mem(); + if (!ret) + ret = save_highmem(); + return ret; +} + +int restore_special_mem(void) +{ + int ret; + ret = restore_arch_mem(); + if (!ret) + ret = restore_highmem(); + return ret; +} + static int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; @@ -177,7 +286,6 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; if (PageReserved(page) && pfn_is_nosave(pfn)) @@ -293,62 +401,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } -/** - * On resume it is necessary to trace and eventually free the unsafe - * pages that have been allocated, because they are needed for I/O - * (on x86-64 we likely will "eat" these pages once again while - * creating the temporary page translation tables) - */ - -struct eaten_page { - struct eaten_page *next; - char padding[PAGE_SIZE - sizeof(void *)]; -}; - -static struct eaten_page *eaten_pages = NULL; - -static void release_eaten_pages(void) -{ - struct eaten_page *p, *q; - - p = eaten_pages; - while (p) { - q = p->next; - /* We don't want swsusp_free() to free this page again */ - ClearPageNosave(virt_to_page(p)); - free_page((unsigned long)p); - p = q; - } - eaten_pages = NULL; -} +static unsigned int unsafe_pages; /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * which had been used before suspend. 
+ * used before suspend. * * The unsafe pages are marked with the PG_nosave_free flag - * - * Allocated but unusable (ie eaten) memory pages should be marked - * so that swsusp_free() can release them + * and we count them using unsafe_pages */ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { void *res; + res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - do { + while (res && PageNosaveFree(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) { - /* This is for swsusp_free() */ - SetPageNosave(virt_to_page(res)); - ((struct eaten_page *)res)->next = eaten_pages; - eaten_pages = res; - } - } while (res && PageNosaveFree(virt_to_page(res))); - else - res = (void *)get_zeroed_page(gfp_mask); + } if (res) { SetPageNosave(virt_to_page(res)); SetPageNosaveFree(virt_to_page(res)); @@ -374,7 +449,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) * On each page we set up a list of struct_pbe elements. */ -struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, + int safe_needed) { unsigned int num; struct pbe *pblist, *pbe; @@ -642,6 +718,8 @@ static int mark_unsafe_pages(struct pbe *pblist) return -EFAULT; } + unsafe_pages = 0; + return 0; } @@ -719,42 +797,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, } /** - * create_image - use metadata contained in the PBE list + * prepare_image - use metadata contained in the PBE list * pointed to by pagedir_nosave to mark the pages that will * be overwritten in the process of restoring the system - * memory state from the image and allocate memory for - * the image avoiding these pages + * memory state from the image ("unsafe" pages) and allocate + * memory for the image + * + * The idea is to allocate the PBE list first and then + * allocate as many pages as it's needed for the image data, + * but not to assign these pages to the PBEs initially. 
+ * Instead, we just mark them as allocated and create a list + * of "safe" which will be used later */ -static int create_image(struct snapshot_handle *handle) +struct safe_page { + struct safe_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct safe_page *safe_pages; + +static int prepare_image(struct snapshot_handle *handle) { int error = 0; - struct pbe *p, *pblist; + unsigned int nr_pages = nr_copy_pages; + struct pbe *p, *pblist = NULL; p = pagedir_nosave; error = mark_unsafe_pages(p); if (!error) { - pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); if (pblist) copy_page_backup_list(pblist, p); free_pagedir(p, 0); if (!pblist) error = -ENOMEM; } - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + safe_pages = NULL; + if (!error && nr_pages > unsafe_pages) { + nr_pages -= unsafe_pages; + while (nr_pages--) { + struct safe_page *ptr; + + ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); + if (!ptr) { + error = -ENOMEM; + break; + } + if (!PageNosaveFree(virt_to_page(ptr))) { + /* The page is "safe", add it to the list */ + ptr->next = safe_pages; + safe_pages = ptr; + } + /* Mark the page as allocated */ + SetPageNosave(virt_to_page(ptr)); + SetPageNosaveFree(virt_to_page(ptr)); + } + } if (!error) { - release_eaten_pages(); pagedir_nosave = pblist; } else { - pagedir_nosave = NULL; handle->pbe = NULL; - nr_copy_pages = 0; - nr_meta_pages = 0; + swsusp_free(); } return error; } +static void *get_buffer(struct snapshot_handle *handle) +{ + struct pbe *pbe = handle->pbe, *last = handle->last_pbe; + struct page *page = virt_to_page(pbe->orig_address); + + if (PageNosave(page) && PageNosaveFree(page)) { + /* + * We have allocated the "original" page frame and we can + * use it directly to store the read page + */ + pbe->address = 0; + if (last && last->next) + last->next = NULL; + return (void *)pbe->orig_address; + } + /* + * The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the read page + */ + pbe->address = (unsigned long)safe_pages; + safe_pages = safe_pages->next; + if (last) + last->next = pbe; + handle->last_pbe = pbe; + return (void *)pbe->address; +} + /** * snapshot_write_next - used for writing the system memory snapshot. 
* @@ -799,15 +934,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) } else if (handle->prev <= nr_meta_pages) { handle->pbe = unpack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) { - error = create_image(handle); + error = prepare_image(handle); if (error) return error; handle->pbe = pagedir_nosave; - handle->buffer = (void *)handle->pbe->address; + handle->last_pbe = NULL; + handle->buffer = get_buffer(handle); } } else { handle->pbe = handle->pbe->next; - handle->buffer = (void *)handle->pbe->address; + handle->buffer = get_buffer(handle); } handle->prev = handle->page; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e..f0ee4e7780d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -62,16 +62,6 @@ unsigned long image_size = 500 * 1024 * 1024; int in_suspend __nosavedata = 0; -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); -int save_highmem(void); -int restore_highmem(void); -#else -static int save_highmem(void) { return 0; } -static int restore_highmem(void) { return 0; } -static unsigned int count_highmem_pages(void) { return 0; } -#endif - /** * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. @@ -175,6 +165,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) */ #define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} int swsusp_shrink_memory(void) { @@ -186,21 +182,23 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... "); do { - size = 2 * count_highmem_pages(); + size = 2 * count_special_pages(); size += size / 50 + count_data_pages(); size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + PAGES_FOR_IO; tmp = size; for_each_zone (zone) - if (!is_highmem(zone)) + if (!is_highmem(zone) && populated_zone(zone)) { tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } if (tmp > 0) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(tmp); if (!tmp) return -ENOMEM; pages += tmp; } else if (size > image_size / PAGE_SIZE) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); pages += tmp; } printk("\b%c", p[i++%4]); @@ -228,7 +226,7 @@ int swsusp_suspend(void) goto Enable_irqs; } - if ((error = save_highmem())) { + if ((error = save_special_mem())) { printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); goto Restore_highmem; } @@ -239,7 +237,7 @@ int swsusp_suspend(void) /* Restore control flow magically appears here */ restore_processor_state(); Restore_highmem: - restore_highmem(); + restore_special_mem(); device_power_up(); Enable_irqs: local_irq_enable(); @@ -265,7 +263,7 @@ int swsusp_resume(void) */ swsusp_free(); restore_processor_state(); - restore_highmem(); + restore_special_mem(); touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bb..20e9710fc21 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -612,14 +612,6 @@ void synchronize_rcu(void) wait_for_completion(&rcu.completion); } -/* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. 
- */ -void synchronize_kernel(void) -{ - synchronize_rcu(); -} - module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); @@ -627,7 +619,6 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ -EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7..5dbc4269447 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3886,6 +3886,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) !capable(CAP_SYS_NICE)) goto out_unlock; + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); retval = set_cpus_allowed(p, new_mask); @@ -3954,7 +3958,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (!p) goto out_unlock; - retval = 0; + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: @@ -4046,6 +4053,9 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif /* * The BKS might be reacquired before we have dropped * PREEMPT_ACTIVE, which could trigger a second diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936..90930b28d2c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -13,7 +13,6 @@ #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> -#include <linux/init.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -57,6 +56,12 @@ #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a,b) (-EINVAL) #endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -1860,23 +1865,20 @@ out: * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * - * tasklist_lock locking optimisation: - * If we are current and single threaded, we do not need to take the tasklist - * lock or the siglock. No one else can take our signal_struct away, - * no one else can reap the children to update signal->c* counters, and - * no one else can race with the signal-> fields. - * If we do not take the tasklist_lock, the signal-> fields could be read - * out of order while another thread was just exiting. So we place a - * read memory barrier when we avoid the lock. On the writer side, - * write memory barrier is implied in __exit_signal as __exit_signal releases - * the siglock spinlock after updating the signal-> fields. - * - * We don't really need the siglock when we access the non c* fields - * of the signal_struct (for RUSAGE_SELF) even in multithreaded - * case, since we take the tasklist lock for read and the non c* signal-> - * fields are updated only in __exit_signal, which is called with - * tasklist_lock taken for write, hence these two threads cannot execute - * concurrently. 
+ * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. * */ @@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; - int need_lock = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; - if (p != current || !thread_group_empty(p)) - need_lock = 1; - - if (need_lock) { - read_lock(&tasklist_lock); - if (unlikely(!p->signal)) { - read_unlock(&tasklist_lock); - return; - } - } else - /* See locking comments above */ - smp_rmb(); + rcu_read_lock(); + if (!lock_task_sighand(p, &flags)) { + rcu_read_unlock(); + return; + } switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; stime = p->signal->cstime; r->ru_nvcsw = p->signal->cnvcsw; r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; - spin_unlock_irqrestore(&p->sighand->siglock, flags); if (who == RUSAGE_CHILDREN) break; @@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } - if (need_lock) - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + rcu_read_unlock(); + cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, return -EFAULT; return 0; } + case PR_GET_ENDIAN: + error = GET_ENDIAN(current, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(current, arg2); + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f..6991bece67e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); @@ -132,3 +133,4 @@ cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0d656e61621..eb8bd214e7d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern int sysctl_panic_on_oom; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; @@ -398,7 +399,7 @@ static ctl_table kern_table[] = { .strategy = &sysctl_string, 
}, #endif -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", @@ -702,6 +703,14 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = VM_PANIC_ON_OOM, + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468..f35b3939e93 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(tvec_base_t *base, tvec_t *tv, int index) { /* cascade all the timers from tv up one level */ - struct list_head *head, *curr; + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); - head = tv->vec + index; - curr = head->next; /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. */ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); - curr = curr->next; - internal_add_timer(base, tmp); + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(timer->base != base); + internal_add_timer(base, timer); } - INIT_LIST_HEAD(head); return index; } @@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base) spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list = LIST_HEAD_INIT(work_list); + struct list_head work_list; struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + /* * Cascade timers: */ @@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_splice_init(base->tv1.vec + index, &work_list); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; diff --git a/kernel/user.c b/kernel/user.c index 4b1eb745afa..6408c042429 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) new->mq_bytes = 0; new->locked_shm = 0; - if (alloc_uid_keyring(new) < 0) { + if (alloc_uid_keyring(new, current) < 0) { kmem_cache_free(uid_cachep, new); return NULL; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f..740c5abceb0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -531,11 +531,11 @@ int current_is_keventd(void) static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - LIST_HEAD(list); + struct list_head list; struct work_struct *work; spin_lock_irq(&cwq->lock); - list_splice_init(&cwq->worklist, &list); + list_replace_init(&cwq->worklist, &list); while (!list_empty(&list)) { printk("Taking work for %s\n", wq->name); |
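The kernel/timer.c and kernel/workqueue.c hunks above switch from list_splice_init() to list_replace_init(), which re-heads every entry of a list onto a new (possibly uninitialised) head and leaves the old head empty. A minimal sketch of that behaviour is shown below, assuming only the standard struct list_head / INIT_LIST_HEAD from <linux/list.h>; the _sketch name is illustrative and not part of the patch, which uses the in-tree helper.

    /*
     * Sketch of what list_replace_init() does: move all entries from
     * 'old' onto 'new' (which need not be initialised) and leave 'old'
     * as an empty list.
     */
    static inline void list_replace_init_sketch(struct list_head *old,
                                                struct list_head *new)
    {
            new->next = old->next;        /* new head adopts old's first entry */
            new->next->prev = new;
            new->prev = old->prev;        /* ...and old's last entry */
            new->prev->next = new;
            INIT_LIST_HEAD(old);          /* old head is now an empty list */
    }

In __run_timers() this detaches the expired bucket in one step under base->lock before walking it, and take_over_work() uses it the same way to pull the per-cpu worklist onto a local head.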