From 6be9d4940134b36f9ed020aead36f831f19b49f1 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 23 May 2008 13:04:34 -0700 Subject: md: md: raid5 rate limit error printk Last night we had scsi problems and a hardware raid unit was offlined during heavy i/o. While this happened we got for about 3 minutes a huge number messages like these Apr 12 03:36:07 pfs1n14 kernel: [197510.696595] raid5:md7: read error not correctable (sector 2993096568 on sdj2). I guess the high error rate is responsible for not scheduling other events - during this time the system was not pingable and in the end also other devices run into scsi command timeouts causing problems on these unrelated devices as well. Signed-off-by: Bernd Schubert Signed-off-by: Dan Williams Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 93fde48c0f4..2f28745dacf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -94,6 +94,8 @@ #define __inline__ #endif +#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) + #if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); @@ -1143,10 +1145,12 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector + rdev->data_offset), - bdevname(rdev->bdev, b)); + printk_rl(KERN_INFO "raid5:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1160,16 +1164,22 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) - printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING -- cgit