12 files changed, 112 insertions, 22 deletions
diff --git a/fs/compat.c b/fs/compat.c
index a2ba78bdf7f..5333c7d7427 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1757,7 +1757,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 			goto sticky;
 		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
 		rtv.tv_sec = timeout;
-		if (compat_timeval_compare(&rtv, &tv) < 0)
+		if (compat_timeval_compare(&rtv, &tv) >= 0)
 			rtv = tv;
 		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 sticky:
@@ -1834,7 +1834,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
 			rts.tv_sec++;
 			rts.tv_nsec -= NSEC_PER_SEC;
 		}
-		if (compat_timespec_compare(&rts, &ts) < 0)
+		if (compat_timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		copy_to_user(tsp, &rts, sizeof(rts));
 	}
@@ -1934,7 +1934,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
 		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 					1000;
 		rts.tv_sec = timeout;
-		if (compat_timespec_compare(&rts, &ts) < 0)
+		if (compat_timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 sticky:
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index a2ca3107d47..86ae8e93adb 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -792,18 +792,20 @@ ext2_xattr_delete_inode(struct inode *inode)
 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
 		get_bh(bh);
 		bforget(bh);
+		unlock_buffer(bh);
 	} else {
 		HDR(bh)->h_refcount = cpu_to_le32(
 			le32_to_cpu(HDR(bh)->h_refcount) - 1);
 		if (ce)
 			mb_cache_entry_release(ce);
+		ea_bdebug(bh, "refcount now=%d",
+			le32_to_cpu(HDR(bh)->h_refcount));
+		unlock_buffer(bh);
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
 		DQUOT_FREE_BLOCK(inode, 1);
 	}
-	ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
-	unlock_buffer(bh);
 	EXT2_I(inode)->i_file_acl = 0;
 
 cleanup:
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f556a0d5c0d..0c9a2ee54c9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -66,6 +66,12 @@ static void restore_sigs(sigset_t *oldset)
 	sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
+/*
+ * Reset request, so that it can be reused
+ *
+ * The caller must be _very_ careful to make sure, that it is holding
+ * the only reference to req
+ */
 void fuse_reset_request(struct fuse_req *req)
 {
 	int preallocated = req->preallocated;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 296351615b0..6f05379b0a0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -116,9 +116,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 /* Special case for failed iget in CREATE */
 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	u64 nodeid = req->in.h.nodeid;
-	fuse_reset_request(req);
-	fuse_send_forget(fc, req, nodeid, 1);
+	/* If called from end_io_requests(), req has more than one
+	   reference and fuse_reset_request() cannot work */
+	if (fc->connected) {
+		u64 nodeid = req->in.h.nodeid;
+		fuse_reset_request(req);
+		fuse_send_forget(fc, req, nodeid, 1);
+	} else
+		fuse_put_request(fc, req);
 }
 
 void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 42eb53b5293..23ceaa7127b 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -208,6 +208,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
 #define DLM_LOCK_RES_IN_PROGRESS          0x00000010
 #define DLM_LOCK_RES_MIGRATING            0x00000020
 
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
+
 #define DLM_PURGE_INTERVAL_MS   (8 * 1000)
 
 struct dlm_lock_resource
@@ -658,6 +661,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
 
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 6001b22a997..f66e2d818cc 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 	} else {
 		mlog_errno(tmpret);
 		if (dlm_is_host_down(tmpret)) {
+			/* instead of logging the same network error over
+			 * and over, sleep here and wait for the heartbeat
+			 * to notice the node is dead.  times out after 5s. */
+			dlm_wait_for_node_death(dlm, res->owner, 
+						DLM_NODE_DEATH_WAIT_MAX);
 			ret = DLM_RECOVERING;
 			mlog(0, "node %u died so returning DLM_RECOVERING "
 			     "from convert message!\n", res->owner);
@@ -421,7 +426,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_lockstatus *lksb;
 	enum dlm_status status = DLM_NORMAL;
 	u32 flags;
-	int call_ast = 0, kick_thread = 0;
+	int call_ast = 0, kick_thread = 0, ast_reserved = 0;
 
 	if (!dlm_grab(dlm)) {
 		dlm_error(DLM_REJECTED);
@@ -490,6 +495,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	status = __dlm_lockres_state_to_status(res);
 	if (status == DLM_NORMAL) {
 		__dlm_lockres_reserve_ast(res);
+		ast_reserved = 1;
 		res->state |= DLM_LOCK_RES_IN_PROGRESS;
 		status = __dlmconvert_master(dlm, res, lock, flags,
 					     cnv->requested_type,
@@ -512,10 +518,10 @@ leave:
 	else
 		dlm_lock_put(lock);
 
-	/* either queue the ast or release it */
+	/* either queue the ast or release it, if reserved */
 	if (call_ast)
 		dlm_queue_ast(dlm, lock);
-	else
+	else if (ast_reserved)
 		dlm_lockres_release_ast(dlm, res);
 
 	if (kick_thread)
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index d1a0038557a..671d4ff222c 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 			dlm_error(status);
 		dlm_revert_pending_lock(res, lock);
 		dlm_lock_put(lock);
+	} else if (dlm_is_recovery_lock(res->lockname.name, 
+					res->lockname.len)) {
+		/* special case for the $RECOVERY lock.
+		 * there will never be an AST delivered to put
+		 * this lock on the proper secondary queue
+		 * (granted), so do it manually. */
+		mlog(0, "%s: $RECOVERY lock for this node (%u) is "
+		     "mastered by %u; got lock, manually granting (no ast)\n",
+		     dlm->name, dlm->node_num, res->owner);
+		list_del_init(&lock->list);
+		list_add_tail(&lock->list, &res->granted);
 	}
 	spin_unlock(&res->spinlock);
 
@@ -646,7 +657,19 @@ retry_lock:
 			mlog(0, "retrying lock with migration/"
 			     "recovery/in progress\n");
 			msleep(100);
-			dlm_wait_for_recovery(dlm);
+			/* no waiting for dlm_reco_thread */
+			if (recovery) {
+				if (status == DLM_RECOVERING) {
+					mlog(0, "%s: got RECOVERING "
+					     "for $REOCVERY lock, master "
+					     "was %u\n", dlm->name, 
+					     res->owner);
+					dlm_wait_for_node_death(dlm, res->owner, 
+							DLM_NODE_DEATH_WAIT_MAX);
+				}
+			} else {
+				dlm_wait_for_recovery(dlm);
+			}
 			goto retry_lock;
 		}
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a3194fe173d..2e2e95e6949 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2482,7 +2482,9 @@ top:
 				atomic_set(&mle->woken, 1);
 				spin_unlock(&mle->spinlock);
 				wake_up(&mle->wq);
-				/* final put will take care of list removal */
+				/* do not need events any longer, so detach 
+				 * from heartbeat */
+				__dlm_mle_detach_hb_events(dlm, mle);
 				__dlm_put_mle(mle);
 			}
 			continue;
@@ -2537,6 +2539,9 @@ top:
 			spin_unlock(&res->spinlock);
 			dlm_lockres_put(res);
 
+			/* about to get rid of mle, detach from heartbeat */
+			__dlm_mle_detach_hb_events(dlm, mle);
+
 			/* dump the mle */
 			spin_lock(&dlm->master_lock);
 			__dlm_put_mle(mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 186e9a76aa5..ed76bda1a53 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
 	return dead;
 }
 
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+	if (timeout) {
+		mlog(ML_NOTICE, "%s: waiting %dms for notification of "
+		     "death of node %u\n", dlm->name, timeout, node);
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_dead(dlm, node),
+			   msecs_to_jiffies(timeout));
+	} else {
+		mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
+		     "of death of node %u\n", dlm->name, node);
+		wait_event(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_dead(dlm, node));
+	}
+	/* for now, return 0 */
+	return 0;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
@@ -2032,6 +2050,30 @@ again:
 			     dlm->reco.new_master);
 			status = -EEXIST;
 		} else {
+			status = 0;
+
+			/* see if recovery was already finished elsewhere */
+			spin_lock(&dlm->spinlock);
+			if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+				status = -EINVAL;	
+				mlog(0, "%s: got reco EX lock, but "
+				     "node got recovered already\n", dlm->name);
+				if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+					mlog(ML_ERROR, "%s: new master is %u "
+					     "but no dead node!\n", 
+					     dlm->name, dlm->reco.new_master);
+					BUG();
+				}
+			}
+			spin_unlock(&dlm->spinlock);
+		}
+
+		/* if this node has actually become the recovery master,
+		 * set the master and send the messages to begin recovery */
+		if (!status) {
+			mlog(0, "%s: dead=%u, this=%u, sending "
+			     "begin_reco now\n", dlm->name, 
+			     dlm->reco.dead_node, dlm->node_num);
 			status = dlm_send_begin_reco_message(dlm,
 				      dlm->reco.dead_node);
 			/* this always succeeds */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index fa0bcac5cea..d329c9df90a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1584,10 +1584,9 @@ static int ocfs2_commit_thread(void *arg)
 	while (!(kthread_should_stop() &&
 		 atomic_read(&journal->j_num_trans) == 0)) {
 
-		wait_event_interruptible_timeout(osb->checkpoint_event,
-						 atomic_read(&journal->j_num_trans)
-						 || kthread_should_stop(),
-						 OCFS2_CHECKPOINT_INTERVAL);
+		wait_event_interruptible(osb->checkpoint_event,
+					 atomic_read(&journal->j_num_trans)
+					 || kthread_should_stop());
 
 		status = ocfs2_commit_cache(osb);
 		if (status < 0)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7d0a816184f..2f3a6acdac4 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -29,8 +29,6 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 
-#define OCFS2_CHECKPOINT_INTERVAL        (8 * HZ)
-
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
 	OCFS2_JOURNAL_LOADED,
diff --git a/fs/select.c b/fs/select.c
index 6ce68a9c897..1815a57d225 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -404,7 +404,7 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			goto sticky;
 		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
 		rtv.tv_sec = timeout;
-		if (timeval_compare(&rtv, &tv) < 0)
+		if (timeval_compare(&rtv, &tv) >= 0)
 			rtv = tv;
 		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
 sticky:
@@ -471,7 +471,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 						1000;
 		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) < 0)
+		if (timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 sticky:
@@ -775,7 +775,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
 		rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) *
 						1000;
 		rts.tv_sec = timeout;
-		if (timespec_compare(&rts, &ts) < 0)
+		if (timespec_compare(&rts, &ts) >= 0)
 			rts = ts;
 		if (copy_to_user(tsp, &rts, sizeof(rts))) {
 		sticky: