From 898efface1a5076cbae5af87b935212b1869971b Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:01:25 -0800 Subject: [PATCH] ocfs2: recheck recovery state after getting lock * after successfully taking the $RECOVERY lock in EX mode, recheck to make sure that recovery has not already begun or completed on another node Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmrecovery.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 186e9a76aa5..f9ce864966e 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2032,6 +2032,30 @@ again: dlm->reco.new_master); status = -EEXIST; } else { + status = 0; + + /* see if recovery was already finished elsewhere */ + spin_lock(&dlm->spinlock); + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + status = -EINVAL; + mlog(0, "%s: got reco EX lock, but " + "node got recovered already\n", dlm->name); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(ML_ERROR, "%s: new master is %u " + "but no dead node!\n", + dlm->name, dlm->reco.new_master); + BUG(); + } + } + spin_unlock(&dlm->spinlock); + } + + /* if this node has actually become the recovery master, + * set the master and send the messages to begin recovery */ + if (!status) { + mlog(0, "%s: dead=%u, this=%u, sending " + "begin_reco now\n", dlm->name, + dlm->reco.dead_node, dlm->node_num); status = dlm_send_begin_reco_message(dlm, dlm->reco.dead_node); /* this always succeeds */ -- cgit From e2b5e4506f5c5187b91d7a79fbad28fe3ebd2fc5 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:02:56 -0800 Subject: [PATCH] ocfs2: fix release of ast never reserved * fix a bug in dlm_convert_lock_handler where dlm_lockres_release_ast was being called even if no ast was ever reserved Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmconvert.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 6001b22a997..f5c2f1979ad 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -421,7 +421,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; - int call_ast = 0, kick_thread = 0; + int call_ast = 0, kick_thread = 0, ast_reserved = 0; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -490,6 +490,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) status = __dlm_lockres_state_to_status(res); if (status == DLM_NORMAL) { __dlm_lockres_reserve_ast(res); + ast_reserved = 1; res->state |= DLM_LOCK_RES_IN_PROGRESS; status = __dlmconvert_master(dlm, res, lock, flags, cnv->requested_type, @@ -512,10 +513,10 @@ leave: else dlm_lock_put(lock); - /* either queue the ast or release it */ + /* either queue the ast or release it, if reserved */ if (call_ast) dlm_queue_ast(dlm, lock); - else + else if (ast_reserved) dlm_lockres_release_ast(dlm, res); if (kick_thread) -- cgit From 44465a7daf7c4e34199b2b0ebb3c5101619dcb9d Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:05:38 -0800 Subject: [PATCH] ocfs2: add dlm_wait_for_node_death * add dlm_wait_for_node_death function to be used after receiving a network error. this will wait for the given timeout to allow the heartbeat callbacks to update the domain map. without this, some paths may spin and consume enough cpu that the heartbeat gets starved and never updates. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmcommon.h | 4 ++++ fs/ocfs2/dlm/dlmconvert.c | 5 +++++ fs/ocfs2/dlm/dlmlock.c | 14 +++++++++++++- fs/ocfs2/dlm/dlmrecovery.c | 18 ++++++++++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 42eb53b5293..23ceaa7127b 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -208,6 +208,9 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +/* max milliseconds to wait to sync up a network failure with a node death */ +#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) + #define DLM_PURGE_INTERVAL_MS (8 * 1000) struct dlm_lock_resource @@ -658,6 +661,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index f5c2f1979ad..f66e2d818cc 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { + /* instead of logging the same network error over + * and over, sleep here and wait for the heartbeat + * to notice the node is dead. times out after 5s. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); ret = DLM_RECOVERING; mlog(0, "node %u died so returning DLM_RECOVERING " "from convert message!\n", res->owner); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index d1a0038557a..e709412e6e3 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -646,7 +646,19 @@ retry_lock: mlog(0, "retrying lock with migration/" "recovery/in progress\n"); msleep(100); - dlm_wait_for_recovery(dlm); + /* no waiting for dlm_reco_thread */ + if (recovery) { + if (status == DLM_RECOVERING) { + mlog(0, "%s: got RECOVERING " + "for $REOCVERY lock, master " + "was %u\n", dlm->name, + res->owner); + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + } + } else { + dlm_wait_for_recovery(dlm); + } goto retry_lock; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f9ce864966e..ed76bda1a53 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) return dead; } +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(ML_NOTICE, "%s: waiting %dms for notification of " + "death of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(ML_NOTICE, "%s: waiting indefinitely for notification " + "of death of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins -- cgit From 558c70c59b75a5a53ba496fe3bccea80a9e3e6fb Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Wed, 18 Jan 2006 17:07:47 -0800 Subject: [PATCH] ocfs2: manually grant remote recovery lock * fix a hang in recovery that occurred in dlmlock_remote. the $RECOVERY lock was never moved to the granted queue even after getting DLM_NORMAL back from the master node. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmlock.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index e709412e6e3..671d4ff222c 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, dlm_error(status); dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); + } else if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* special case for the $RECOVERY lock. + * there will never be an AST delivered to put + * this lock on the proper secondary queue + * (granted), so do it manually. */ + mlog(0, "%s: $RECOVERY lock for this node (%u) is " + "mastered by %u; got lock, manually granting (no ast)\n", + dlm->name, dlm->node_num, res->owner); + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); -- cgit From 745ae8ba29e729ec922393fa4d9448c385673599 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 9 Feb 2006 13:23:39 -0800 Subject: [PATCH] ocfs2: only checkpoint journal when asked to Disable automatic checkpointing of the journal - this is a relic from older ocfs2 days. Worth quite a bit of performance on longer running single node tests. Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 7 +++---- fs/ocfs2/journal.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index fa0bcac5cea..d329c9df90a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1584,10 +1584,9 @@ static int ocfs2_commit_thread(void *arg) while (!(kthread_should_stop() && atomic_read(&journal->j_num_trans) == 0)) { - wait_event_interruptible_timeout(osb->checkpoint_event, - atomic_read(&journal->j_num_trans) - || kthread_should_stop(), - OCFS2_CHECKPOINT_INTERVAL); + wait_event_interruptible(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop()); status = ocfs2_commit_cache(osb); if (status < 0) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7d0a816184f..2f3a6acdac4 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -29,8 +29,6 @@ #include #include -#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) - enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, OCFS2_JOURNAL_LOADED, -- cgit From f671c09bce88ea253d576c842f8f39d9a2a29028 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Tue, 14 Feb 2006 11:45:21 -0800 Subject: [PATCH] ocfs2: detach from heartbeat events before freeing mle Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a3194fe173d..2e2e95e6949 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2482,7 +2482,9 @@ top: atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - /* final put will take care of list removal */ + /* do not need events any longer, so detach + * from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); __dlm_put_mle(mle); } continue; @@ -2537,6 +2539,9 @@ top: spin_unlock(&res->spinlock); dlm_lockres_put(res); + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + /* dump the mle */ spin_lock(&dlm->master_lock); __dlm_put_mle(mle); -- cgit From b2f49033d80c952a0ffc2d5647bc1a0b8a09c1b3 Mon Sep 17 00:00:00 2001 From: Peter Staubach Date: Fri, 17 Feb 2006 13:52:36 -0800 Subject: [PATCH] fix deadlock in ext2 Fix a deadlock possible in the ext2 file system implementation. This deadlock occurs when a file is removed from an ext2 file system which was mounted with the "sync" mount option. The problem is that ext2_xattr_delete_inode() was invoking the routine, sync_dirty_buffer(), using a buffer head which was previously locked via lock_buffer(). The first thing that sync_dirty_buffer() does is to lock the buffer head that it was passed. It does this via lock_buffer(). Oops. The solution is to unlock the buffer head in ext2_xattr_delete_inode() before invoking sync_dirty_buffer(). This makes the code in ext2_xattr_delete_inode() obey the same locking rules as all other callers of sync_dirty_buffer() in the ext2 file system implementation. Signed-off-by: Peter Staubach Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/xattr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index a2ca3107d47..86ae8e93adb 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -792,18 +792,20 @@ ext2_xattr_delete_inode(struct inode *inode) ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); + unlock_buffer(bh); } else { HDR(bh)->h_refcount = cpu_to_le32( le32_to_cpu(HDR(bh)->h_refcount) - 1); if (ce) mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); DQUOT_FREE_BLOCK(inode, 1); } - ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); - unlock_buffer(bh); EXT2_I(inode)->i_file_acl = 0; cleanup: -- cgit From 77e7f250f88cd62844e24c42aff4d0e95969c746 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 17 Feb 2006 13:52:52 -0800 Subject: [PATCH] fuse: fix bug in aborted fuse_release_end() There's a rather theoretical case of the BUG triggering in fuse_reset_request(): - iget() fails because of OOM after a successful CREATE_OPEN request - during IO on the resulting RELEASE request the connection is aborted Fix and add warning to fuse_reset_request(). Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++++ fs/fuse/file.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f556a0d5c0d..0c9a2ee54c9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -66,6 +66,12 @@ static void restore_sigs(sigset_t *oldset) sigprocmask(SIG_SETMASK, oldset, NULL); } +/* + * Reset request, so that it can be reused + * + * The caller must be _very_ careful to make sure, that it is holding + * the only reference to req + */ void fuse_reset_request(struct fuse_req *req) { int preallocated = req->preallocated; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 296351615b0..6f05379b0a0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -116,9 +116,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) /* Special case for failed iget in CREATE */ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - u64 nodeid = req->in.h.nodeid; - fuse_reset_request(req); - fuse_send_forget(fc, req, nodeid, 1); + /* If called from end_io_requests(), req has more than one + reference and fuse_reset_request() cannot work */ + if (fc->connected) { + u64 nodeid = req->in.h.nodeid; + fuse_reset_request(req); + fuse_send_forget(fc, req, nodeid, 1); + } else + fuse_put_request(fc, req); } void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff, -- cgit From 74910e6c7dc7471b286a883c1a7af70483ffd2ba Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 17 Feb 2006 13:52:58 -0800 Subject: [PATCH] select: time comparison fixes I got all of these backwards. We want to return min(input timeout, new timeout) to userspace to prevent increasing the time-remaining value. Thanks to Ernst Herzberg for reporting and diagnosing. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 6 +++--- fs/select.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index a2ba78bdf7f..5333c7d7427 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1757,7 +1757,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, goto sticky; rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); rtv.tv_sec = timeout; - if (compat_timeval_compare(&rtv, &tv) < 0) + if (compat_timeval_compare(&rtv, &tv) >= 0) rtv = tv; if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: @@ -1834,7 +1834,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, rts.tv_sec++; rts.tv_nsec -= NSEC_PER_SEC; } - if (compat_timespec_compare(&rts, &ts) < 0) + if (compat_timespec_compare(&rts, &ts) >= 0) rts = ts; copy_to_user(tsp, &rts, sizeof(rts)); } @@ -1934,7 +1934,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; rts.tv_sec = timeout; - if (compat_timespec_compare(&rts, &ts) < 0) + if (compat_timespec_compare(&rts, &ts) >= 0) rts = ts; if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: diff --git a/fs/select.c b/fs/select.c index 6ce68a9c897..1815a57d225 100644 --- a/fs/select.c +++ b/fs/select.c @@ -404,7 +404,7 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto sticky; rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); rtv.tv_sec = timeout; - if (timeval_compare(&rtv, &tv) < 0) + if (timeval_compare(&rtv, &tv) >= 0) rtv = tv; if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: @@ -471,7 +471,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; rts.tv_sec = timeout; - if (timespec_compare(&rts, &ts) < 0) + if (timespec_compare(&rts, &ts) >= 0) rts = ts; if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: @@ -775,7 +775,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; rts.tv_sec = timeout; - if (timespec_compare(&rts, &ts) < 0) + if (timespec_compare(&rts, &ts) >= 0) rts = ts; if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: -- cgit