From fc80c44277b3c92d808b73e9d40e120229aa4b6a Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Fri, 25 Jul 2008 01:46:29 -0700 Subject: jbd: positively dispose the unmapped data buffers in journal_commit_transaction() After ext3-ordered files are truncated, there is a possibility that the pages which cannot be estimated still remain. Remaining pages can be released when the system has really few memory. So, it is not memory leakage. But the resource management software etc. may not work correctly. It is possible that journal_unmap_buffer() cannot release the buffers, and the pages to which they belong because they are attached to a commiting transaction and journal_unmap_buffer() cannot release them. To release such the buffers and the pages later, journal_unmap_buffer() leaves it to journal_commit_transaction(). (journal_unmap_buffer() puts the mark 'BH_Freed' to the buffers so that journal_commit_transaction() can identify whether they can be released or not.) In the journalled mode and the writeback mode, jbd does with only metadata buffers. But in the ordered mode, jbd does with metadata buffers and also data buffers. Actually, journal_commit_transaction() releases only the metadata buffers of which release is demanded by journal_unmap_buffer(), and also releases the pages to which they belong if possible. As a result, the data buffers of which release is demanded by journal_unmap_buffer() remain after a transaction commits. And also the pages to which they belong remain. Such the remained pages don't have mapping any longer. Due to this fact, there is a possibility that the pages which cannot be estimated remain. The metadata buffers marked 'BH_Freed' and the pages to which they belong can be released at 'JBD: commit phase 7'. Therefore, by applying the same code into 'JBD: commit phase 2' (where the data buffers are done with), journal_commit_transaction() can also release the data buffers marked 'BH_Freed' and the pages to which they belong. As a result, all the buffers marked 'BH_Freed' can be released, and also all the pages to which these buffers belong can be released at journal_commit_transaction(). So, the page which cannot be estimated is lost. <> > spin_lock(&journal->j_list_lock); > while (commit_transaction->t_forget) { > transaction_t *cp_transaction; > struct buffer_head *bh; > > jh = commit_transaction->t_forget; >... > if (buffer_freed(bh)) { > ^^^^^^^^^^^^^^^^^^^^^^^^ > clear_buffer_freed(bh); > ^^^^^^^^^^^^^^^^^^^^^^^^ > clear_buffer_jbddirty(bh); > } > > if (buffer_jbddirty(bh)) { > JBUFFER_TRACE(jh, "add to new checkpointing trans"); > __journal_insert_checkpoint(jh, commit_transaction); > JBUFFER_TRACE(jh, "refile for checkpoint writeback"); > __journal_refile_buffer(jh); > jbd_unlock_bh_state(bh); > } else { > J_ASSERT_BH(bh, !buffer_dirty(bh)); > ... > JBUFFER_TRACE(jh, "refile or unfile freed buffer"); > __journal_refile_buffer(jh); > if (!jh->b_transaction) { > jbd_unlock_bh_state(bh); > /* needs a brelse */ > journal_remove_journal_head(bh); > release_buffer_page(bh); > ^^^^^^^^^^^^^^^^^^^^^^^^ > } else > } **************************************************************** * Apply the code of "^^^^^^" lines into 'JBD: commit phase 2' * **************************************************************** At journal_commit_transaction() code, there is one extra message in the series of jbd debug messages. ("JBD: commit phase 2") This patch fixes it, too. Signed-off-by: Toshiyuki Okajima Acked-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'fs/jbd/commit.c') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 5a8ca61498c..f943b9b3f20 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) /* * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * not successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -77,6 +77,19 @@ nope: __brelse(bh); } +/* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. + */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + release_buffer_page(bh); + } else + put_bh(bh); +} + /* * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is * held. For ranking reasons we must trylock. If we lose, schedule away and @@ -231,7 +244,7 @@ write_out_data: if (locked) unlock_buffer(bh); BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); + release_data_buffer(bh); continue; } if (locked && test_clear_buffer_dirty(bh)) { @@ -258,10 +271,10 @@ write_out_data: if (locked) unlock_buffer(bh); journal_remove_journal_head(bh); - /* Once for our safety reference, once for + /* One for our safety reference, other for * journal_remove_journal_head() */ put_bh(bh); - put_bh(bh); + release_data_buffer(bh); } if (need_resched() || spin_needbreak(&journal->j_list_lock)) { @@ -443,7 +456,7 @@ void journal_commit_transaction(journal_t *journal) } else { jbd_unlock_bh_state(bh); } - put_bh(bh); + release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); @@ -453,8 +466,6 @@ void journal_commit_transaction(journal_t *journal) journal_write_revoke_records(journal, commit_transaction); - jbd_debug(3, "JBD: commit phase 2\n"); - /* * If we found any dirty or locked buffers, then we should have * looped back up to the write_out_data label. If there weren't -- cgit From cbe5f466f6995e10a10c7ae66d6dc8608f08a6b8 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Fri, 25 Jul 2008 01:46:30 -0700 Subject: jbd: don't abort if flushing file data failed In ordered mode, the current jbd aborts the journal if a file data buffer has an error. But this behavior is unintended, and we found that it has been adopted accidentally. This patch undoes it and just calls printk() instead of aborting the journal. Additionally, set AS_EIO into the address_space object of the failed buffer which is submitted by journal_do_submit_data() so that fsync() can get -EIO. Missing error checkings are also added to inform errors on file data buffers to the user. The following buffers are targeted. (a) the buffer which has already been written out by pdflush (b) the buffer which has been unlocked before scanned in the t_locked_list loop [akpm@linux-foundation.org: improve grammar in a printk] Signed-off-by: Hidehiro Kawai Acked-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'fs/jbd/commit.c') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f943b9b3f20..2eccbfaa1d4 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -185,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) /* * Submit all the data buffers to disk */ -static void journal_submit_data_buffers(journal_t *journal, +static int journal_submit_data_buffers(journal_t *journal, transaction_t *commit_transaction) { struct journal_head *jh; @@ -193,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal, int locked; int bufs = 0; struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; /* * Whenever we unlock the journal and sleep, things can get added @@ -266,6 +267,8 @@ write_out_data: put_bh(bh); } else { BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); if (locked) @@ -284,6 +287,8 @@ write_out_data: } spin_unlock(&journal->j_list_lock); journal_do_submit_data(wbuf, bufs); + + return err; } /* @@ -423,8 +428,7 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_data_buffers(journal, commit_transaction); /* * Wait for all previously submitted IO to complete. @@ -439,10 +443,21 @@ void journal_commit_transaction(journal_t *journal) if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; spin_lock(&journal->j_list_lock); } + if (unlikely(!buffer_uptodate(bh))) { + if (TestSetPageLocked(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); @@ -461,8 +476,14 @@ void journal_commit_transaction(journal_t *journal) } spin_unlock(&journal->j_list_lock); - if (err) - journal_abort(journal, err); + if (err) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + err = 0; + } journal_write_revoke_records(journal, commit_transaction); -- cgit From 529ae9aaa08378cfe2a4350bded76f32cc8ff0ce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 2 Aug 2008 12:01:03 +0200 Subject: mm: rename page trylock Converting page lock to new locking bitops requires a change of page flag operation naming, so we might as well convert it to something nicer (!TestSetPageLocked_Lock => trylock_page, SetPageLocked => set_page_locked). This also facilitates lockdeping of page lock. Signed-off-by: Nick Piggin Acked-by: KOSAKI Motohiro Acked-by: Peter Zijlstra Acked-by: Andrew Morton Acked-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/jbd/commit.c') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 2eccbfaa1d4..81a9ad7177c 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh) goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -446,7 +446,7 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_list_lock); } if (unlikely(!buffer_uptodate(bh))) { - if (TestSetPageLocked(bh->b_page)) { + if (!trylock_page(bh->b_page)) { spin_unlock(&journal->j_list_lock); lock_page(bh->b_page); spin_lock(&journal->j_list_lock); -- cgit From ca5de404ff036a29b25e9a83f6919c9f606c5841 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 2 Aug 2008 12:02:13 +0200 Subject: fs: rename buffer trylock Like the page lock change, this also requires name change, so convert the raw test_and_set bitop to a trylock. Signed-off-by: Nick Piggin Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/jbd/commit.c') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 81a9ad7177c..ae08c057e75 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -221,7 +221,7 @@ write_out_data: * blocking lock_buffer(). */ if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { + if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); /* Write out all data to prevent deadlocks */ -- cgit From d1645e526a1e5842c9ac433d73419ba886676cf3 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Sat, 18 Oct 2008 20:27:53 -0700 Subject: jbd: abort when failed to log metadata buffers If we failed to write metadata buffers to the journal space and succeeded to write the commit record, stale data can be written back to the filesystem as metadata in the recovery phase. To avoid this, when we failed to write out metadata buffers, abort the journal before writing the commit record. Signed-off-by: Hidehiro Kawai Acked-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/jbd/commit.c') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index ae08c057e75..f1ea861b992 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -762,6 +762,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 6\n"); if (journal_write_commit_record(journal, commit_transaction)) -- cgit From 885e353c7427db7b60692789741b34e605b0b69b Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Sat, 18 Oct 2008 20:27:54 -0700 Subject: jbd: don't dirty original metadata buffer on abort Currently, original metadata buffers are dirtied when they are unfiled whether the journal has aborted or not. Eventually these buffers will be written-back to the filesystem by pdflush. This means some metadata buffers are written to the filesystem without journaling if the journal aborts. So if both journal abort and system crash happen at the same time, the filesystem would become inconsistent state. Additionally, replaying journaled metadata can overwrite the latest metadata on the filesystem partly. Because, if the journal aborts, journaled metadata are preserved and replayed during the next mount not to lose uncheckpointed metadata. This would also break the consistency of the filesystem. This patch prevents original metadata buffers from being dirtied on abort by clearing BH_JBDDirty flag from those buffers. Thus, no metadata buffers are written to the filesystem without journaling. Signed-off-by: Hidehiro Kawai Acked-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/jbd/commit.c') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f1ea861b992..d6a6659f3e4 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -518,9 +518,10 @@ void journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -855,6 +856,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); -- cgit From 0e4fb5e283870757024294bc4567a7c59d936f0b Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Sat, 18 Oct 2008 20:27:57 -0700 Subject: ext3: add an option to control error handling on file data If the journal doesn't abort when it gets an IO error in file data blocks, the file data corruption will spread silently. Because most of applications and commands do buffered writes without fsync(), they don't notice the IO error. It's scary for mission critical systems. On the other hand, if the journal aborts whenever it gets an IO error in file data blocks, the system will easily become inoperable. So this patch introduces a filesystem option to determine whether it aborts the journal or just call printk() when it gets an IO error in file data. If you mount a ext3 fs with data_err=abort option, it aborts on file data write error. If you mount it with data_err=ignore, it doesn't abort, just call printk(). data_err=ignore is the default. Signed-off-by: Hidehiro Kawai Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/jbd/commit.c') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index d6a6659f3e4..25719d902c5 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal) printk(KERN_WARNING "JBD: Detected IO errors while flushing file data " "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); err = 0; } -- cgit