summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/raid56.c
diff options
context:
space:
mode:
authorQu Wenruo <wqu@suse.com>2022-11-07 15:32:30 +0800
committerDavid Sterba <dsterba@suse.com>2022-12-05 18:00:55 +0100
commit75b47033296595efb208cc563cbb8cf4fb7c3ebc (patch)
treefd6875fa4e07cd145a21b38a1fbd5671f154f8cf /fs/btrfs/raid56.c
parent2942a50dea74126bf3395b3060a808fb046136fc (diff)
downloadlinux-75b47033296595efb208cc563cbb8cf4fb7c3ebc.tar.gz
linux-75b47033296595efb208cc563cbb8cf4fb7c3ebc.tar.bz2
linux-75b47033296595efb208cc563cbb8cf4fb7c3ebc.zip
btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap
Since we have rbio::error_bitmap to indicate exactly where the errors are (including read error and csum mismatch error), we can make recovery path more accurate. For example: 0 32K 64K Data 1 |XXXXXXXX| | Data 2 | |XXXXXXXXX| Parity | | | 1) Get csum mismatch when reading data 1 [0, 32K) 2) Mark corresponding range error The old code will mark the whole data 1 stripe as error. While the new code will only mark data 1 [0, 32K) as error. 3) Recovery path The old code will recover data 1 [0, 64K), all using Data 2 and parity. This means, Data 1 [32K, 64K) will be corrupted data, as data 2 [32K, 64K) is already corrupted. While the new code will only recover data 1 [0, 32K), as only that range has error so far. This new behavior can avoid populating rbio cache with incorrect data. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r--fs/btrfs/raid56.c279
1 files changed, 193 insertions, 86 deletions
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index e14968a37c08..fd67ba4648c9 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1014,6 +1014,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
}
/*
+ * Return the total numer of errors found in the vertical stripe of @sector_nr.
+ *
+ * @faila and @failb will also be updated to the first and second stripe
+ * number of the errors.
+ */
+static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
+ int *faila, int *failb)
+{
+ int stripe_nr;
+ int found_errors = 0;
+
+ ASSERT(faila && failb);
+ *faila = -1;
+ *failb = -1;
+
+ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+ int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
+
+ if (test_bit(total_sector_nr, rbio->error_bitmap)) {
+ found_errors++;
+ if (*faila < 0)
+ *faila = stripe_nr;
+ else if (*failb < 0)
+ *failb = stripe_nr;
+ }
+ }
+ return found_errors;
+}
+
+/*
* Add a single sector @sector into our list of bios for IO.
*
* Return 0 if everything went well.
@@ -1740,14 +1770,15 @@ fail:
* @*pointers are the pre-allocated pointers by the caller, so we don't
* need to allocate/free the pointers again and again.
*/
-static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
- void **pointers, void **unmap_array)
+static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+ void **pointers, void **unmap_array)
{
struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
struct sector_ptr *sector;
const u32 sectorsize = fs_info->sectorsize;
- const int faila = rbio->faila;
- const int failb = rbio->failb;
+ int found_errors;
+ int faila;
+ int failb;
int stripe_nr;
/*
@@ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
*/
if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
!test_bit(sector_nr, &rbio->dbitmap))
- return;
+ return 0;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
+ &failb);
+ /*
+ * No errors in the veritical stripe, skip it. Can happen for recovery
+ * which only part of a stripe failed csum check.
+ */
+ if (!found_errors)
+ return 0;
+
+ if (found_errors > rbio->bioc->max_errors)
+ return -EIO;
/*
* Setup our array of pointers with sectors from each stripe
@@ -1766,12 +1809,11 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
*/
for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
/*
- * If we're rebuilding a read, we have to use
- * pages from the bio list
+ * If we're rebuilding a read, we have to use pages from the
+ * bio list if possible.
*/
if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
- (stripe_nr == faila || stripe_nr == failb)) {
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
} else {
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
@@ -1859,18 +1901,19 @@ pstripe:
* Especially if we determine to cache the rbio, we need to
* have at least all data sectors uptodate.
*/
- if (rbio->faila >= 0) {
- sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr);
+ if (faila >= 0) {
+ sector = rbio_stripe_sector(rbio, faila, sector_nr);
sector->uptodate = 1;
}
- if (rbio->failb >= 0) {
- sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr);
+ if (failb >= 0) {
+ sector = rbio_stripe_sector(rbio, failb, sector_nr);
sector->uptodate = 1;
}
cleanup:
for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
kunmap_local(unmap_array[stripe_nr]);
+ return 0;
}
static int recover_sectors(struct btrfs_raid_bio *rbio)
@@ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
goto out;
}
- /* Make sure faila and fail b are in order. */
- if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb)
- swap(rbio->faila, rbio->failb);
-
if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
spin_lock_irq(&rbio->bio_list_lock);
@@ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
index_rbio_pages(rbio);
- for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
- recover_vertical(rbio, sectornr, pointers, unmap_array);
+ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
+ ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
+ if (ret < 0)
+ break;
+ }
out:
kfree(pointers);
@@ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
int sectornr = total_sector_nr % rbio->stripe_nsectors;
struct sector_ptr *sector;
- if (rbio->faila == stripe || rbio->failb == stripe) {
- /* Skip the current stripe. */
- ASSERT(sectornr == 0);
- total_sector_nr += rbio->stripe_nsectors - 1;
- atomic_inc(&rbio->error);
+ /*
+ * Skip the range which has error. It can be a range which is
+ * marked error (for csum mismatch), or it can be a missing
+ * device.
+ */
+ if (!rbio->bioc->stripes[stripe].dev->bdev ||
+ test_bit(total_sector_nr, rbio->error_bitmap)) {
+ /*
+ * Also set the error bit for missing device, which
+ * may not yet have its error bit set.
+ */
+ set_bit(total_sector_nr, rbio->error_bitmap);
continue;
}
+
sector = rbio_stripe_sector(rbio, stripe, sectornr);
ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
sectornr, REQ_OP_READ);
@@ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio)
/*
* Either we're doing recover for a read failure or degraded write,
- * caller should have set faila/b and error bitmap correctly.
+ * caller should have set error bitmap correctly.
*/
- ASSERT(rbio->faila >= 0 || rbio->failb >= 0);
ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
bio_list_init(&bio_list);
@@ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio)
submit_read_bios(rbio, &bio_list);
wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
- /* We have more errors than our tolerance during the read. */
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
- ret = -EIO;
- goto out;
- }
-
ret = recover_sectors(rbio);
out:
@@ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work)
rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}
+static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
+{
+ bool found = false;
+ int sector_nr;
+
+ /*
+ * This is for RAID6 extra recovery tries, thus mirror number should
+ * be large than 2.
+ * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
+ * RAID5 methods.
+ */
+ ASSERT(mirror_num > 2);
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int found_errors;
+ int faila;
+ int failb;
+
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ /* This vertical stripe doesn't have errors. */
+ if (!found_errors)
+ continue;
+
+ /*
+ * If we found errors, there should be only one error marked
+ * by previous set_rbio_range_error().
+ */
+ ASSERT(found_errors == 1);
+ found = true;
+
+ /* Now select another stripe to mark as error. */
+ failb = rbio->real_stripes - (mirror_num - 1);
+ if (failb <= faila)
+ failb--;
+
+ /* Set the extra bit in error bitmap. */
+ if (failb >= 0)
+ set_bit(failb * rbio->stripe_nsectors + sector_nr,
+ rbio->error_bitmap);
+ }
+
+ /* We should found at least one vertical stripe with error.*/
+ ASSERT(found);
+}
+
/*
* the main entry point for reads from the higher layers. This
* is really only called when the normal read path had a failure,
@@ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
* for 'mirror_num > 2', select a stripe to fail on every retry.
*/
if (mirror_num > 2) {
- /*
- * 'mirror == 3' is to fail the p stripe and
- * reconstruct from the q stripe. 'mirror > 3' is to
- * fail a data stripe and reconstruct from p+q stripe.
- */
+ set_rbio_raid6_extra_error(rbio, mirror_num);
rbio->failb = rbio->real_stripes - (mirror_num - 1);
ASSERT(rbio->failb > 0);
if (rbio->failb <= rbio->faila)
@@ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
- int dfail = 0, failp = -1;
+ void **pointers = NULL;
+ void **unmap_array = NULL;
+ int sector_nr;
int ret;
- /* No error case should be already handled by the caller. */
- ASSERT(rbio->faila >= 0 || rbio->failb >= 0);
+ /*
+ * @pointers array stores the pointer for each sector.
+ *
+ * @unmap_array stores copy of pointers that does not get reordered
+ * during reconstruction so that kunmap_local works.
+ */
+ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!pointers || !unmap_array) {
+ ret = -ENOMEM;
+ goto out;
+ }
- if (is_data_stripe(rbio, rbio->faila))
- dfail++;
- else if (is_parity_stripe(rbio->faila))
- failp = rbio->faila;
+ for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
+ int dfail = 0, failp = -1;
+ int faila;
+ int failb;
+ int found_errors;
- if (is_data_stripe(rbio, rbio->failb))
- dfail++;
- else if (is_parity_stripe(rbio->failb))
- failp = rbio->failb;
+ found_errors = get_rbio_veritical_errors(rbio, sector_nr,
+ &faila, &failb);
+ if (found_errors > rbio->bioc->max_errors) {
+ ret = -EIO;
+ goto out;
+ }
+ if (found_errors == 0)
+ continue;
- /*
- * Because we can not use a scrubbing parity to repair
- * the data, so the capability of the repair is declined.
- * (In the case of RAID5, we can not repair anything)
- */
- if (dfail > rbio->bioc->max_errors - 1)
- return -EIO;
+ /* We should have at least one error here. */
+ ASSERT(faila >= 0 || failb >= 0);
- /*
- * If all data is good, only parity is correctly, just
- * repair the parity.
- */
- if (dfail == 0)
- return 0;
+ if (is_data_stripe(rbio, faila))
+ dfail++;
+ else if (is_parity_stripe(faila))
+ failp = faila;
- /*
- * Here means we got one corrupted data stripe and one
- * corrupted parity on RAID6, if the corrupted parity
- * is scrubbing parity, luckily, use the other one to repair
- * the data, or we can not repair the data stripe.
- */
- if (failp != rbio->scrubp)
- return -EIO;
+ if (is_data_stripe(rbio, failb))
+ dfail++;
+ else if (is_parity_stripe(failb))
+ failp = failb;
+ /*
+ * Because we can not use a scrubbing parity to repair the
+ * data, so the capability of the repair is declined. (In the
+ * case of RAID5, we can not repair anything.)
+ */
+ if (dfail > rbio->bioc->max_errors - 1) {
+ ret = -EIO;
+ goto out;
+ }
+ /*
+ * If all data is good, only parity is correctly, just repair
+ * the parity, no need to recover data stripes.
+ */
+ if (dfail == 0)
+ continue;
- /* We have some corrupted sectors, need to repair them. */
- ret = recover_sectors(rbio);
+ /*
+ * Here means we got one corrupted data stripe and one
+ * corrupted parity on RAID6, if the corrupted parity is
+ * scrubbing parity, luckily, use the other one to repair the
+ * data, or we can not repair the data stripe.
+ */
+ if (failp != rbio->scrubp) {
+ ret = -EIO;
+ goto out;
+ }
+
+ ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
+ if (ret < 0)
+ goto out;
+ }
+out:
+ kfree(pointers);
+ kfree(unmap_array);
return ret;
}
@@ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio)
submit_read_bios(rbio, &bio_list);
wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
- if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
- ret = -EIO;
- goto cleanup;
- }
- /*
- * No error during read, can finish the scrub and need to verify the
- * P/Q sectors;
- */
- if (atomic_read(&rbio->error) == 0) {
- need_check = true;
- goto finish;
- }
-
- /* We have some failures, need to recover the failed sectors first. */
+ /* We may have some failures, recover the failed sectors first. */
ret = recover_scrub_rbio(rbio);
if (ret < 0)
goto cleanup;
-finish:
/*
* We have every sector properly prepared. Can finish the scrub
* and writeback the good content.