From 3f677f9c998a18d11641b5a2de6f11d0af32a153 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 14 Jun 2019 15:41:04 -0700 Subject: drivers: md: Unify common definitions of raid1 and raid10 These definitions are being moved to raid1-10.c. Signed-off-by: Marcos Paulo de Souza Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid1-10.c | 25 +++++++++++++++++++++++++ drivers/md/raid1.c | 29 ++--------------------------- drivers/md/raid10.c | 27 +-------------------------- 3 files changed, 28 insertions(+), 53 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 400001b815db..7d968bf08e54 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -3,6 +3,31 @@ #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +/* + * Number of guaranteed raid bios in case of extreme VM load: + */ +#define NR_RAID_BIOS 256 + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* When there are this many requests queue to be written by + * the raid thread, we become 'congested' to provide back-pressure + * for writeback. + */ +static int max_queued_requests = 1024; + /* for managing resync I/O pages */ struct resync_pages { void *raid_bio; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2aa36e570e04..e331b433d00c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -42,31 +42,6 @@ (1L << MD_HAS_PPL) | \ (1L << MD_HAS_MULTIPLE_PPLS)) -/* - * Number of guaranteed r1bios in case of extreme VM load: - */ -#define NR_RAID1_BIOS 256 - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* When there are this many requests queue to be written by - * the raid1 thread, we become 'congested' to provide back-pressure - * for writeback. 
- */ -static int max_queued_requests = 1024; - static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); @@ -2947,7 +2922,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->poolinfo) goto abort; conf->poolinfo->raid_disks = mddev->raid_disks * 2; - err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc, + err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); if (err) goto abort; @@ -3232,7 +3207,7 @@ static int raid1_reshape(struct mddev *mddev) newpoolinfo->mddev = mddev; newpoolinfo->raid_disks = raid_disks * 2; - ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc, + ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, r1bio_pool_free, newpoolinfo); if (ret) { kfree(newpoolinfo); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index aea11476fee6..1facd0153399 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -64,31 +64,6 @@ * [B A] [D C] [B A] [E C D] */ -/* - * Number of guaranteed r10bios in case of extreme VM load: - */ -#define NR_RAID10_BIOS 256 - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* When there are this many requests queued to be written by - * the raid10 thread, we become 'congested' to provide back-pressure - * for writeback. - */ -static int max_queued_requests = 1024; - static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); static int _enough(struct r10conf *conf, int previous, int ignore); @@ -3675,7 +3650,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->geo = geo; conf->copies = copies; - err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc, + err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, r10bio_pool_free, conf); if (err) goto out; -- cgit v1.2.3 From d9771f5ec46c282d518b453c793635dbdc3a2a94 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 14 Jun 2019 15:41:05 -0700 Subject: raid5-cache: Need to do start() part job after adding journal device commit d5d885fd514f ("md: introduce new personality funciton start()") splits the init job to two parts. The first part run() does the jobs that do not require the md threads. The second part start() does the jobs that require the md threads. Now it just does run() in adding new journal device. It needs to do the second part start() too. 
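Condensed, the journal hot-add branch of raid5_add_disk() then performs both
phases (a sketch of the flow in the diff below, with the surrounding checks
elided):

	ret = log_init(conf, rdev, false);	/* the run() part */
	if (ret)
		return ret;

	ret = r5l_start(conf->log);		/* the start() part */
	if (ret)
		return ret;
	return 0;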
Fixes: d5d885fd514f ("md: introduce new personality funciton start()")
Cc: stable@vger.kernel.org #v4.9+
Reported-by: Michal Soltys
Signed-off-by: Xiao Ni
Signed-off-by: Song Liu
Signed-off-by: Jens Axboe
---
 drivers/md/raid5.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b83bce2beb66..da94cbaa1a9e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7672,7 +7672,7 @@ abort:
 static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
-	int err = -EEXIST;
+	int ret, err = -EEXIST;
 	int disk;
 	struct disk_info *p;
 	int first = 0;
@@ -7687,7 +7687,14 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * The array is in readonly mode if journal is missing, so no
 		 * write requests running. We should be safe
 		 */
-		log_init(conf, rdev, false);
+		ret = log_init(conf, rdev, false);
+		if (ret)
+			return ret;
+
+		ret = r5l_start(conf->log);
+		if (ret)
+			return ret;
+
 		return 0;
 	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
-- cgit v1.2.3


From 168b305b0cfb7467a6691993f922ecbdcfc00c98 Mon Sep 17 00:00:00 2001
From: Marcos Paulo de Souza
Date: Fri, 14 Jun 2019 15:41:06 -0700
Subject: md: md.c: Return -ENODEV when mddev is NULL in rdev_attr_show

Commit c42d3240990814eec1e4b2b93fa0487fc4873aed ("md: return -ENODEV if rdev
has no mddev assigned") changed rdev_attr_store() to return -ENODEV when
rdev->mddev is NULL; now do the same for rdev_attr_show().

Signed-off-by: Marcos Paulo de Souza
Signed-off-by: Song Liu
Signed-off-by: Jens Axboe
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 04f4f131f9d6..86f4f2b5a724 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3356,7 +3356,7 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 	if (!entry->show)
 		return -EIO;
 	if (!rdev->mddev)
-		return -EBUSY;
+		return -ENODEV;
 	return entry->show(rdev, page);
 }
-- cgit v1.2.3


From e5b521ee9b58c8954ad4d75ccaed9428f4b1a0ca Mon Sep 17 00:00:00 2001
From: Yufen Yu
Date: Fri, 14 Jun 2019 15:41:07 -0700
Subject: md: fix spelling typo and add necessary space

This patch fixes a spelling typo and adds missing spaces in the code. In
addition, it gets rid of the unnecessary 'if'.
Signed-off-by: Yufen Yu Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/md.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 86f4f2b5a724..1f37a1adc926 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5639,8 +5639,7 @@ int md_run(struct mddev *mddev) spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + sysfs_link_rdev(mddev, rdev); /* failure here is OK */ if (mddev->degraded && !mddev->ro) /* This ensures that recovering status is reported immediately @@ -8190,8 +8189,7 @@ void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct mddev *mddev2; - unsigned int currspeed = 0, - window; + unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors, recovery_done; unsigned long mark[SYNC_MARKS]; unsigned long update_time; @@ -8248,7 +8246,7 @@ void md_do_sync(struct md_thread *thread) * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync * 1 == like 2, but have yielded to allow conflicting resync to - * commense + * commence * other == active in resync - this many blocks * * Before starting a resync we must have set curr_resync to @@ -8379,7 +8377,7 @@ void md_do_sync(struct md_thread *thread) /* * Tune reconstruction: */ - window = 32*(PAGE_SIZE/512); + window = 32 * (PAGE_SIZE / 512); pr_debug("md: using %dk window, over a total of %lluk.\n", window/2, (unsigned long long)max_sectors/2); @@ -9192,7 +9190,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) * perform resync with the new activated disk */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - } /* device faulty * We just want to do the minimum to mark the disk -- cgit v1.2.3 From ebfeb444fa6fd9bc7be62694fff838bc57e19a7d Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 14 Jun 2019 15:41:08 -0700 Subject: md/raid1: get rid of extra blank line and space This patch get rid of extra blank line and space, and add necessary space for code. Signed-off-by: Yufen Yu Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e331b433d00c..869c32fea1b8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1424,7 +1424,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!r1_bio->bios[i]) continue; - if (first_clone) { /* do behind I/O ? 
* Not if there are too many, or cannot @@ -1704,9 +1703,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) first = last = rdev->saved_raid_disk; for (mirror = first; mirror <= last; mirror++) { - p = conf->mirrors+mirror; + p = conf->mirrors + mirror; if (!p->rdev) { - if (mddev->gendisk) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -2863,7 +2861,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); - } return nr_sectors; } @@ -3064,7 +3061,7 @@ static int raid1_run(struct mddev *mddev) } mddev->degraded = 0; - for (i=0; i < conf->raid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->mirrors[i].rdev == NULL || !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || test_bit(Faulty, &conf->mirrors[i].rdev->flags)) @@ -3099,7 +3096,7 @@ static int raid1_run(struct mddev *mddev) mddev->queue); } - ret = md_integrity_register(mddev); + ret = md_integrity_register(mddev); if (ret) { md_unregister_thread(&mddev->thread); raid1_free(mddev, conf); -- cgit v1.2.3 From 8cf05a7841e1cfd894741d6bab43067b0ca85eb8 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 14 Jun 2019 15:41:09 -0700 Subject: md: raid10: Use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; struct boo entry[]; }; instance = kmalloc(size, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kmalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1facd0153399..f35e076ee47d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4755,8 +4755,7 @@ static int handle_reshape_read_error(struct mddev *mddev, int idx = 0; struct page **pages; - r10b = kmalloc(sizeof(*r10b) + - sizeof(struct r10dev) * conf->copies, GFP_NOIO); + r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); if (!r10b) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); return -ENOMEM; -- cgit v1.2.3 From c7afa8034b09bc2bb664d86de7db34466401f352 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 14 Jun 2019 15:41:10 -0700 Subject: md: raid1-10: Unify r{1,10}bio_pool_free Avoiding duplicated code, since they just execute a kfree. 
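The r1bio and r10bio allocators still differ in the size they compute, but a
single kfree()-based destructor can back both mempools; condensed from the
diff below:

	static void rbio_pool_free(void *rbio, void *data)
	{
		kfree(rbio);
	}

	/* raid1 */
	err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS,
			   r1bio_pool_alloc, rbio_pool_free, conf->poolinfo);
	/* raid10 */
	err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS,
			   r10bio_pool_alloc, rbio_pool_free, conf);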
Signed-off-by: Marcos Paulo de Souza Signed-off-by: Jens Axboe --- drivers/md/raid1-10.c | 5 +++++ drivers/md/raid1.c | 13 ++++--------- drivers/md/raid10.c | 11 +++-------- 3 files changed, 12 insertions(+), 17 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 7d968bf08e54..54db34163968 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -34,6 +34,11 @@ struct resync_pages { struct page *pages[RESYNC_PAGES]; }; +static void rbio_pool_free(void *rbio, void *data) +{ + kfree(rbio); +} + static inline int resync_alloc_pages(struct resync_pages *rp, gfp_t gfp_flags) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 869c32fea1b8..a7860b5f33f2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -68,11 +68,6 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) return kzalloc(size, gfp_flags); } -static void r1bio_pool_free(void *r1_bio, void *data) -{ - kfree(r1_bio); -} - #define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) @@ -148,7 +143,7 @@ out_free_bio: kfree(rps); out_free_r1bio: - r1bio_pool_free(r1_bio, data); + rbio_pool_free(r1_bio, data); return NULL; } @@ -168,7 +163,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data) /* resync pages array stored in the 1st bio's .bi_private */ kfree(rp); - r1bio_pool_free(r1bio, data); + rbio_pool_free(r1bio, data); } static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) @@ -2920,7 +2915,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) goto abort; conf->poolinfo->raid_disks = mddev->raid_disks * 2; err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, - r1bio_pool_free, conf->poolinfo); + rbio_pool_free, conf->poolinfo); if (err) goto abort; @@ -3205,7 +3200,7 @@ static int raid1_reshape(struct mddev *mddev) newpoolinfo->raid_disks = raid_disks * 2; ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, - r1bio_pool_free, newpoolinfo); + rbio_pool_free, newpoolinfo); if (ret) { kfree(newpoolinfo); return ret; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f35e076ee47d..c9a149b2ec86 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -98,11 +98,6 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) return kzalloc(size, gfp_flags); } -static void r10bio_pool_free(void *r10_bio, void *data) -{ - kfree(r10_bio); -} - #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) @@ -208,7 +203,7 @@ out_free_bio: } kfree(rps); out_free_r10bio: - r10bio_pool_free(r10_bio, conf); + rbio_pool_free(r10_bio, conf); return NULL; } @@ -236,7 +231,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data) /* resync pages array stored in the 1st bio's .bi_private */ kfree(rp); - r10bio_pool_free(r10bio, conf); + rbio_pool_free(r10bio, conf); } static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) @@ -3651,7 +3646,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->geo = geo; conf->copies = copies; err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); + rbio_pool_free, conf); if (err) goto out; -- cgit v1.2.3 From e9eeba28a1e01a55b49cdcf9c7a346d2aaa0aa7d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 15:41:11 -0700 Subject: md/raid10: read balance chooses idlest disk for SSD Andy reported that raid10 array with SSD disks has poor read 
performance. Compared with raid1, RAID-1 can be 3x faster than RAID-10 sometimes [1]. The thing is that raid10 chooses the low distance disk for read request, however, the approach doesn't work well for SSD device since it doesn't have spindle like HDD, we should just read from the SSD which has less pending IO like commit 9dedf60313fa4 ("md/raid1: read balance chooses idlest disk for SSD"). So this commit selects the idlest SSD disk for read if array has none rotational disk, otherwise, read_balance uses the previous distance priority algorithm. With the change, the performance of raid10 gets increased largely per Andy's test [2]. [1]. https://marc.info/?l=linux-raid&m=155915890004761&w=2 [2]. https://marc.info/?l=linux-raid&m=155990654223786&w=2 Tested-by: Andy Smith Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c9a149b2ec86..8a1354a08a1a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -707,15 +707,19 @@ static struct md_rdev *read_balance(struct r10conf *conf, int sectors = r10_bio->sectors; int best_good_sectors; sector_t new_distance, best_dist; - struct md_rdev *best_rdev, *rdev = NULL; + struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; int do_balance; - int best_slot; + int best_dist_slot, best_pending_slot; + bool has_nonrot_disk = false; + unsigned int min_pending; struct geom *geo = &conf->geo; raid10_find_phys(conf, r10_bio); rcu_read_lock(); - best_slot = -1; - best_rdev = NULL; + best_dist_slot = -1; + min_pending = UINT_MAX; + best_dist_rdev = NULL; + best_pending_rdev = NULL; best_dist = MaxSector; best_good_sectors = 0; do_balance = 1; @@ -737,6 +741,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, sector_t first_bad; int bad_sectors; sector_t dev_sector; + unsigned int pending; + bool nonrot; if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; @@ -773,8 +779,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, first_bad - dev_sector; if (good_sectors > best_good_sectors) { best_good_sectors = good_sectors; - best_slot = slot; - best_rdev = rdev; + best_dist_slot = slot; + best_dist_rdev = rdev; } if (!do_balance) /* Must read from here */ @@ -787,14 +793,23 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - if (best_slot >= 0) + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); + if (min_pending > pending && nonrot) { + min_pending = pending; + best_pending_slot = slot; + best_pending_rdev = rdev; + } + + if (best_dist_slot >= 0) /* At least 2 disks to choose from so failfast is OK */ set_bit(R10BIO_FailFast, &r10_bio->state); /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. 
 */
-		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
+		if (geo->near_copies > 1 && !pending)
 			new_distance = 0;
 
 		/* for far > 1 always use the lowest address */
@@ -803,15 +818,21 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
 					   conf->mirrors[disk].head_position);
+
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
-			best_slot = slot;
-			best_rdev = rdev;
+			best_dist_slot = slot;
+			best_dist_rdev = rdev;
 		}
 	}
 	if (slot >= conf->copies) {
-		slot = best_slot;
-		rdev = best_rdev;
+		if (has_nonrot_disk) {
+			slot = best_pending_slot;
+			rdev = best_pending_rdev;
+		} else {
+			slot = best_dist_slot;
+			rdev = best_dist_rdev;
+		}
 	}
 
 	if (slot >= 0) {
-- cgit v1.2.3


From 14ccb66b3f585b2bc21e7256c96090abed5a512c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 6 Jun 2019 12:29:01 +0200
Subject: block: remove the bi_phys_segments field in struct bio

We only need the number of segments in the blk-mq submission path. Remove
the field from struct bio, and return it from a variant of blk_queue_split
instead, so that it can be passed as an argument to those functions that
need the value.

This also means we stop recounting segments except for cloning and partial
segments.

To keep the number of arguments in this hot path down, remove the pointless
struct request_queue argument from any of the functions that had it and
grew a nr_segs argument.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/md/raid5.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index da94cbaa1a9e..3de4e13bde98 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5251,7 +5251,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	rcu_read_unlock();
 	raid_bio->bi_next = (void*)rdev;
 	bio_set_dev(align_bi, rdev->bdev);
-	bio_clear_flag(align_bi, BIO_SEG_VALID);
 
 	if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
 			bio_sectors(align_bi),
-- cgit v1.2.3


From 3e148a3209792e04f63ec99701235c960765fc9a Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Wed, 19 Jun 2019 17:30:46 +0800
Subject: md/raid1: fix potential data inconsistency issue with write behind
 device

For write-behind mode, we consider a write IO complete once it has reached
all the non-writemostly devices. This works fine for single-queue devices.

But for a multiqueue device, if lots of IOs come from the upper layer, the
write-behind device could issue those IOs to different queues; depending on
each queue's delay, there is no guarantee that those IOs arrive in order.

To address the issue, we need to check for collisions among write-behind
IOs: we can only continue when there is no collision, otherwise we wait for
the completion of the previous colliding IO (a worked sketch of the overlap
test follows the list below). The WBCollisionCheck flag is introduced for
multiqueue devices which work under write-behind mode.

But this patch doesn't handle the cases below, which could have the data
inconsistency issue as well; these cases will be handled in later patches.

1. modify max_write_behind by write backlog node.
2. add or remove array's bitmap dynamically.
3. the change of member disk.
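The collision test used by check_and_add_wb() treats each write-behind IO as
a half-open sector range [lo, hi); two ranges overlap exactly when each one
starts below the other's end. A worked sketch (the example ranges are
illustrative, not from the patch):

	/* [lo1, hi1) and [lo2, hi2) overlap iff */
	bool collision = hi1 > lo2 && lo1 < hi2;

	/* e.g. a queued IO over sectors [0, 8) collides with a new IO
	 * over [4, 12), since 12 > 0 && 4 < 8; a new IO over [8, 16)
	 * does not, since 8 < 8 is false. */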
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 41 ++++++++++++++++++++++++++++++++ drivers/md/md.h | 21 +++++++++++++++++ drivers/md/raid1.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 129 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1f37a1adc926..9a9762c83cc8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -124,6 +124,19 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static int rdev_init_wb(struct md_rdev *rdev) +{ + if (rdev->bdev->bd_queue->nr_hw_queues == 1) + return 0; + + spin_lock_init(&rdev->wb_list_lock); + INIT_LIST_HEAD(&rdev->wb_list); + init_waitqueue_head(&rdev->wb_io_wait); + set_bit(WBCollisionCheck, &rdev->flags); + + return 1; +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -5597,6 +5610,32 @@ int md_run(struct mddev *mddev) md_bitmap_destroy(mddev); goto abort; } + + if (mddev->bitmap_info.max_write_behind > 0) { + bool creat_pool = false; + + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags) && + rdev_init_wb(rdev)) + creat_pool = true; + } + if (creat_pool && mddev->wb_info_pool == NULL) { + mddev->wb_info_pool = + mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + if (!mddev->wb_info_pool) { + err = -ENOMEM; + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); + goto abort; + } + } + } + if (mddev->queue) { bool nonrot = true; @@ -5825,6 +5864,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index 7c930c091193..d449d514cff9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -109,6 +109,14 @@ struct md_rdev { * for reporting to userspace and storing * in superblock. */ + + /* + * The members for check collision of write behind IOs. + */ + struct list_head wb_list; + spinlock_t wb_list_lock; + wait_queue_head_t wb_io_wait; + struct work_struct del_work; /* used for delayed sysfs removal */ struct kernfs_node *sysfs_state; /* handle for 'state' @@ -193,6 +201,10 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ + WBCollisionCheck, /* + * multiqueue device should check if there + * is collision between write behind bios. 
+ */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -245,6 +257,14 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; +#define NR_WB_INFOS 8 +/* record current range of write behind IOs */ +struct wb_info { + sector_t lo; + sector_t hi; + struct list_head list; +}; + struct mddev { void *private; struct md_personality *pers; @@ -461,6 +481,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ + mempool_t *wb_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7860b5f33f2..3d44da663797 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -50,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi, *temp_wi; + unsigned long flags; + int ret = 0; + struct mddev *mddev = rdev->mddev; + + wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->wb_list, list) { + /* collision happened */ + if (hi > temp_wi->lo && lo < temp_wi->hi) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + wi->lo = lo; + wi->hi = hi; + list_add(&wi->list, &rdev->wb_list); + } else + mempool_free(wi, mddev->wb_info_pool); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + + return ret; +} + +static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi; + unsigned long flags; + int found = 0; + struct mddev *mddev = rdev->mddev; + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(wi, &rdev->wb_list, list) + if (hi == wi->hi && lo == wi->lo) { + list_del(&wi->list); + mempool_free(wi, mddev->wb_info_pool); + found = 1; + break; + } + + if (!found) + WARN_ON("The write behind IO is not recorded\n"); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + wake_up(&rdev->wb_io_wait); +} + /* * for resync bio, r1bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -446,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + remove_wb(rdev, lo, hi); + } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1443,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + wait_event(rdev->wb_io_wait, + check_and_add_wb(rdev, lo, hi) == 0); + } + if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } -- cgit v1.2.3 From 963c555e75b033202dd76cf6325a7b7c83d08d5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jun 2019 17:10:36 +0800 Subject: md: introduce mddev_create/destroy_wb_pool for the change of member device Previously, we called rdev_init_wb to avoid potential data inconsistency when array is created. 
Now we need to call the function and create the mempool when a device is
added or is newly flagged as "writemostly". So mddev_create_wb_pool is
introduced and called accordingly. For safety, we mark an implicit GFP_NOIO
allocation scope around the mempool creation done between
mddev_suspend/mddev_resume.

Conversely, the mempool should be removed after removing a member device or
clearing its "writemostly" flag, which is done by calling
mddev_destroy_wb_pool.

Reviewed-by: NeilBrown
Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.h | 2 ++
 2 files changed, 67 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9a9762c83cc8..b43207ab00b2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -37,6 +37,7 @@
  */
 
+#include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/kthread.h>
 #include <linux/blkdev.h>
@@ -137,6 +138,64 @@ static int rdev_init_wb(struct md_rdev *rdev)
 	return 1;
 }
 
+/*
+ * Create wb_info_pool if rdev is the first multi-queue device flaged
+ * with writemostly, also write-behind mode is enabled.
+ */
+void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+			  bool is_suspend)
+{
+	if (mddev->bitmap_info.max_write_behind == 0)
+		return;
+
+	if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev))
+		return;
+
+	if (mddev->wb_info_pool == NULL) {
+		unsigned int noio_flag;
+
+		if (!is_suspend)
+			mddev_suspend(mddev);
+		noio_flag = memalloc_noio_save();
+		mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS,
+						sizeof(struct wb_info));
+		memalloc_noio_restore(noio_flag);
+		if (!mddev->wb_info_pool)
+			pr_err("can't alloc memory pool for writemostly\n");
+		if (!is_suspend)
+			mddev_resume(mddev);
+	}
+}
+EXPORT_SYMBOL_GPL(mddev_create_wb_pool);
+
+/*
+ * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck.
+ */
+static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev)
+{
+	if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags))
+		return;
+
+	if (mddev->wb_info_pool) {
+		struct md_rdev *temp;
+		int num = 0;
+
+		/*
+		 * Check if other rdevs need wb_info_pool.
+		 */
+		rdev_for_each(temp, mddev)
+			if (temp != rdev &&
+			    test_bit(WBCollisionCheck, &temp->flags))
+				num++;
+		if (!num) {
+			mddev_suspend(rdev->mddev);
+			mempool_destroy(mddev->wb_info_pool);
+			mddev->wb_info_pool = NULL;
+			mddev_resume(rdev->mddev);
+		}
+	}
+}
+
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -2223,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	rdev->mddev = mddev;
 	pr_debug("md: bind<%s>\n", b);
 
+	if (mddev->raid_disks)
+		mddev_create_wb_pool(mddev, rdev, false);
+
 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
 		goto fail;
 
@@ -2259,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
 	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
 	list_del_rcu(&rdev->same_set);
 	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
+	mddev_destroy_wb_pool(rdev->mddev, rdev);
 	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 	sysfs_put(rdev->sysfs_state);
@@ -2771,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 		}
 	} else if (cmd_match(buf, "writemostly")) {
 		set_bit(WriteMostly, &rdev->flags);
+		mddev_create_wb_pool(rdev->mddev, rdev, false);
 		err = 0;
 	} else if (cmd_match(buf, "-writemostly")) {
+		mddev_destroy_wb_pool(rdev->mddev, rdev);
 		clear_bit(WriteMostly, &rdev->flags);
 		err = 0;
 	} else if (cmd_match(buf, "blocked")) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d449d514cff9..10f98200e2f8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -730,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 extern void md_reload_sb(struct mddev *mddev, int raid_disk);
 extern void md_update_sb(struct mddev *mddev, int force);
 extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev,
+				 bool is_suspend);
 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
 struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
-- cgit v1.2.3


From 10c92fca636e40dcb15d85ffe06b1b6843cd28fc Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Fri, 14 Jun 2019 17:10:37 +0800
Subject: md-bitmap: create and destroy wb_info_pool with the change of
 backlog

Since write-behind mode can be enabled by writing the backlog node, create
wb_info_pool when the mode is newly enabled, and also call
md_bitmap_update_sb to make the user aware that write-behind mode is
enabled. Conversely, wb_info_pool should be destroyed when write-behind
mode is disabled.

Besides the above, it is better to update the bitmap sb whenever the number
of max_write_behind changes.
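For example, write-behind can then be toggled at run time through the
bitmap's backlog node (the paths are illustrative, for an array named md0):

1. echo 256 > /sys/block/md0/md/bitmap/backlog   (enable, up to 256 behind writes)
2. echo 0 > /sys/block/md0/md/bitmap/backlog     (disable; wb_info_pool is destroyed)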
Reviewed-by: NeilBrown
Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md-bitmap.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c01d41198f5e..15dd817fe83b 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2462,12 +2462,26 @@ static ssize_t
 backlog_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long backlog;
+	unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
 	int rv = kstrtoul(buf, 10, &backlog);
 	if (rv)
 		return rv;
 	if (backlog > COUNTER_MAX)
 		return -EINVAL;
 	mddev->bitmap_info.max_write_behind = backlog;
+	if (!backlog && mddev->wb_info_pool) {
+		/* wb_info_pool is not needed if backlog is zero */
+		mempool_destroy(mddev->wb_info_pool);
+		mddev->wb_info_pool = NULL;
+	} else if (backlog && !mddev->wb_info_pool) {
+		/* wb_info_pool is needed since backlog is not zero */
+		struct md_rdev *rdev;
+
+		rdev_for_each(rdev, mddev)
+			mddev_create_wb_pool(mddev, rdev, false);
+	}
+	if (old_mwb != backlog)
+		md_bitmap_update_sb(mddev->bitmap);
 	return len;
 }
-- cgit v1.2.3


From 617b194a13c0f3b0a6d14fc6227c222877c23b4e Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Fri, 14 Jun 2019 17:10:38 +0800
Subject: md-bitmap: create and destroy wb_info_pool with the change of bitmap

The write-behind attribute is part of the bitmap, and a bitmap can be added
or removed dynamically with the following:

1. mdadm --grow /dev/md0 --bitmap=none
2. mdadm --grow /dev/md0 --bitmap=internal --write-behind

So we need to destroy wb_info_pool in md_bitmap_destroy, and create the
pool before loading the bitmap.

Reviewed-by: NeilBrown
Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md-bitmap.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 15dd817fe83b..b092c7b5282f 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev)
 		return;
 
 	md_bitmap_wait_behind_writes(mddev);
+	mempool_destroy(mddev->wb_info_pool);
+	mddev->wb_info_pool = NULL;
 
 	mutex_lock(&mddev->bitmap_info.mutex);
 	spin_lock(&mddev->lock);
@@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev)
 	sector_t start = 0;
 	sector_t sector = 0;
 	struct bitmap *bitmap = mddev->bitmap;
+	struct md_rdev *rdev;
 
 	if (!bitmap)
 		goto out;
 
+	rdev_for_each(rdev, mddev)
+		mddev_create_wb_pool(mddev, rdev, true);
+
 	if (mddev_is_clustered(mddev))
 		md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
-- cgit v1.2.3


From d494549ac8852ec42854d1491dd17bb9350a0abc Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Fri, 14 Jun 2019 17:10:39 +0800
Subject: md: add bitmap_abort label in md_run

Now there are two places that need to handle a failure by destroying the
bitmap, so move the common part of that cleanup under a bitmap_abort label
placed just before the abort label.
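Both failure points now funnel into a single teardown sequence; condensed
from the diff below:

	if (err)
		goto bitmap_abort;
	...
bitmap_abort:
	mddev_detach(mddev);
	if (mddev->private)
		pers->free(mddev, mddev->private);
	mddev->private = NULL;
	module_put(pers->owner);
	md_bitmap_destroy(mddev);
abort:
	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);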
Reviewed-by: NeilBrown
Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b43207ab00b2..692fc365e73c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5666,15 +5666,8 @@ int md_run(struct mddev *mddev)
 			mddev->bitmap = bitmap;
 
 	}
-	if (err) {
-		mddev_detach(mddev);
-		if (mddev->private)
-			pers->free(mddev, mddev->private);
-		mddev->private = NULL;
-		module_put(pers->owner);
-		md_bitmap_destroy(mddev);
-		goto abort;
-	}
+	if (err)
+		goto bitmap_abort;
 
 	if (mddev->bitmap_info.max_write_behind > 0) {
 		bool creat_pool = false;
@@ -5690,13 +5683,7 @@ int md_run(struct mddev *mddev)
 					    sizeof(struct wb_info));
 			if (!mddev->wb_info_pool) {
 				err = -ENOMEM;
-				mddev_detach(mddev);
-				if (mddev->private)
-					pers->free(mddev, mddev->private);
-				mddev->private = NULL;
-				module_put(pers->owner);
-				md_bitmap_destroy(mddev);
-				goto abort;
+				goto bitmap_abort;
 			}
 		}
 	}
@@ -5761,6 +5748,13 @@ int md_run(struct mddev *mddev)
 		sysfs_notify(&mddev->kobj, NULL, "degraded");
 	return 0;
 
+bitmap_abort:
+	mddev_detach(mddev);
+	if (mddev->private)
+		pers->free(mddev, mddev->private);
+	mddev->private = NULL;
+	module_put(pers->owner);
+	md_bitmap_destroy(mddev);
 abort:
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
-- cgit v1.2.3


From 16d4b74654ff7c3c5d0b6446278ef51b1de41484 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 26 Jun 2019 12:42:51 +0300
Subject: md/raid1: Fix a warning message in remove_wb()

The WARN_ON() macro doesn't take an error message, it just takes a
condition. I've changed this to use WARN(1, "...") instead.

Fixes: 3e148a320979 ("md/raid1: fix potential data inconsistency issue with write behind device")
Signed-off-by: Dan Carpenter
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3d44da663797..34e26834ad28 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -96,7 +96,7 @@ static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi)
 		}
 
 	if (!found)
-		WARN_ON("The write behind IO is not recorded\n");
+		WARN(1, "The write behind IO is not recorded\n");
 	spin_unlock_irqrestore(&rdev->wb_list_lock, flags);
 	wake_up(&rdev->wb_io_wait);
 }
-- cgit v1.2.3


From 141df8bb5dc052f605de8f48a7aa10290e1384ae Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:24 +0800
Subject: bcache: don't set max writeback rate if gc is running

When gc is running, user-space I/O processes may be waiting inside the
bcache code, so no new I/O is coming in. This is not real idle time, and
the maximum writeback rate should not be set in such a situation.
Otherwise, a faster writeback thread may compete for locks with the gc
thread, making garbage collection slower and resulting in a longer I/O
freeze period.

This patch checks c->gc_mark_valid in set_at_max_writeback_rate(). If
c->gc_mark_valid is 0 (gc running), set_at_max_writeback_rate() returns
false, and update_writeback_rate() will not set the writeback rate to the
maximum value even if c->idle_counter reaches the idle threshold. Now the
writeback thread won't interfere with gc thread performance.
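Condensed, the change is a single early-return guard at the top of
set_at_max_writeback_rate() (from the diff below; the comment is expanded
here):

	/* gc is running, so this is not real idle time;
	 * don't force the maximum writeback rate */
	if (!c->gc_mark_valid)
		return false;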
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 73f0efac2b9f..262f7ef20992 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,9 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; /* * Idle_counter is increased everytime when update_writeback_rate() is * called. If all backing devices attached to the same cache set have -- cgit v1.2.3 From b387e9b58679c60f5b1e4313939bd4878204fc37 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:25 +0800 Subject: bcache: check c->gc_thread by IS_ERR_OR_NULL in cache_set_flush() When system memory is in heavy pressure, bch_gc_thread_start() from run_cache_set() may fail due to out of memory. In such condition, c->gc_thread is assigned to -ENOMEM, not NULL pointer. Then in following failure code path bch_cache_set_error(), when cache_set_flush() gets called, the code piece to stop c->gc_thread is broken, if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); And KASAN catches such NULL pointer deference problem, with the warning information: [ 561.207881] ================================================================== [ 561.207900] BUG: KASAN: null-ptr-deref in kthread_stop+0x3b/0x440 [ 561.207904] Write of size 4 at addr 000000000000001c by task kworker/15:1/313 [ 561.207913] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G W 5.0.0-vanilla+ #3 [ 561.207916] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019 [ 561.207935] Workqueue: events cache_set_flush [bcache] [ 561.207940] Call Trace: [ 561.207948] dump_stack+0x9a/0xeb [ 561.207955] ? kthread_stop+0x3b/0x440 [ 561.207960] ? kthread_stop+0x3b/0x440 [ 561.207965] kasan_report+0x176/0x192 [ 561.207973] ? kthread_stop+0x3b/0x440 [ 561.207981] kthread_stop+0x3b/0x440 [ 561.207995] cache_set_flush+0xd4/0x6d0 [bcache] [ 561.208008] process_one_work+0x856/0x1620 [ 561.208015] ? find_held_lock+0x39/0x1d0 [ 561.208028] ? drain_workqueue+0x380/0x380 [ 561.208048] worker_thread+0x87/0xb80 [ 561.208058] ? __kthread_parkme+0xb6/0x180 [ 561.208067] ? process_one_work+0x1620/0x1620 [ 561.208072] kthread+0x326/0x3e0 [ 561.208079] ? 
kthread_create_worker_on_cpu+0xc0/0xc0 [ 561.208090] ret_from_fork+0x3a/0x50 [ 561.208110] ================================================================== [ 561.208113] Disabling lock debugging due to kernel taint [ 561.208115] irq event stamp: 11800231 [ 561.208126] hardirqs last enabled at (11800231): [] do_syscall_64+0x18/0x410 [ 561.208127] BUG: unable to handle kernel NULL pointer dereference at 000000000000001c [ 561.208129] #PF error: [WRITE] [ 561.312253] hardirqs last disabled at (11800230): [] trace_hardirqs_off_thunk+0x1a/0x1c [ 561.312259] softirqs last enabled at (11799832): [] __do_softirq+0x5c7/0x8c3 [ 561.405975] PGD 0 P4D 0 [ 561.442494] softirqs last disabled at (11799821): [] irq_exit+0x1ac/0x1e0 [ 561.791359] Oops: 0002 [#1] SMP KASAN NOPTI [ 561.791362] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G B W 5.0.0-vanilla+ #3 [ 561.791363] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019 [ 561.791371] Workqueue: events cache_set_flush [bcache] [ 561.791374] RIP: 0010:kthread_stop+0x3b/0x440 [ 561.791376] Code: 00 00 65 8b 05 26 d5 e0 7c 89 c0 48 0f a3 05 ec aa df 02 0f 82 dc 02 00 00 4c 8d 63 20 be 04 00 00 00 4c 89 e7 e8 65 c5 53 00 ff 43 20 48 8d 7b 24 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 [ 561.791377] RSP: 0018:ffff88872fc8fd10 EFLAGS: 00010286 [ 561.838895] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838916] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838934] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838948] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838966] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838979] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 561.838996] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 563.067028] RAX: 0000000000000000 RBX: fffffffffffffffc RCX: ffffffff832dd314 [ 563.067030] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000297 [ 563.067032] RBP: ffff88872fc8fe88 R08: fffffbfff0b8213d R09: fffffbfff0b8213d [ 563.067034] R10: 0000000000000001 R11: fffffbfff0b8213c R12: 000000000000001c [ 563.408618] R13: ffff88dc61cc0f68 R14: ffff888102b94900 R15: ffff88dc61cc0f68 [ 563.408620] FS: 0000000000000000(0000) GS:ffff888f7dc00000(0000) knlGS:0000000000000000 [ 563.408622] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 563.408623] CR2: 000000000000001c CR3: 0000000f48a1a004 CR4: 00000000007606e0 [ 563.408625] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 563.408627] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 563.904795] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 563.915796] PKRU: 55555554 [ 563.915797] Call Trace: [ 563.915807] cache_set_flush+0xd4/0x6d0 [bcache] [ 563.915812] process_one_work+0x856/0x1620 [ 564.001226] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 564.033563] ? find_held_lock+0x39/0x1d0 [ 564.033567] ? drain_workqueue+0x380/0x380 [ 564.033574] worker_thread+0x87/0xb80 [ 564.062823] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. [ 564.118042] ? __kthread_parkme+0xb6/0x180 [ 564.118046] ? process_one_work+0x1620/0x1620 [ 564.118048] kthread+0x326/0x3e0 [ 564.118050] ? kthread_create_worker_on_cpu+0xc0/0xc0 [ 564.167066] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree. 
[ 564.252441] ret_from_fork+0x3a/0x50
[ 564.252447] Modules linked in: msr rpcrdma sunrpc rdma_ucm ib_iser ib_umad rdma_cm ib_ipoib i40iw configfs iw_cm ib_cm libiscsi scsi_transport_iscsi mlx4_ib ib_uverbs mlx4_en ib_core nls_iso8859_1 nls_cp437 vfat fat intel_rapl skx_edac x86_pkg_temp_thermal coretemp iTCO_wdt iTCO_vendor_support crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ses raid0 aesni_intel cdc_ether enclosure usbnet ipmi_ssif joydev aes_x86_64 i40e scsi_transport_sas mii bcache md_mod crypto_simd mei_me ioatdma crc64 ptp cryptd pcspkr i2c_i801 mlx4_core glue_helper pps_core mei lpc_ich dca wmi ipmi_si ipmi_devintf nd_pmem dax_pmem nd_btt ipmi_msghandler device_dax pcc_cpufreq button hid_generic usbhid mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect xhci_pci sysimgblt fb_sys_fops xhci_hcd ttm megaraid_sas drm usbcore nfit libnvdimm sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua efivarfs
[ 564.299390] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 564.348360] CR2: 000000000000001c
[ 564.348362] ---[ end trace b7f0e5cc7b2103b0 ]---

Therefore, it is not enough to only check whether c->gc_thread is NULL; we
should use IS_ERR_OR_NULL() to check for both a NULL pointer and an error
value.

This patch changes the above buggy code piece in this way,

	if (!IS_ERR_OR_NULL(c->gc_thread))
		kthread_stop(c->gc_thread);

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1b63ac876169..64d9de89a63f 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1564,7 +1564,7 @@ static void cache_set_flush(struct closure *cl)
 	kobject_put(&c->internal);
 	kobject_del(&c->kobj);
 
-	if (c->gc_thread)
+	if (!IS_ERR_OR_NULL(c->gc_thread))
 		kthread_stop(c->gc_thread);
 
 	if (!IS_ERR_OR_NULL(c->root))
-- cgit v1.2.3


From 0ae49cb7aa005ed18fe8f4d6ccf73019b78ac7b2 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:26 +0800
Subject: bcache: fix return value error in bch_journal_read()

When everything is OK in bch_journal_read(), the return value is finally
returned by,

	return ret;

which assumes ret will be 0 here. This assumption is wrong when all journal
buckets are full and filled with valid journal entries. In such a case, the
last call to read_bucket() sets 'ret' to 1, which means a new jset was
added to the jset list. The jset list is the list 'journal' in the caller
run_cache_set(). Returning 1 to run_cache_set() means something went wrong,
so the cache set won't start, even though everything is actually OK.

This patch changes the line at the end of bch_journal_read() to directly
return 0, since everything is good if this point is reached. A bogus error
is thus fixed.

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 12dae9348147..4e5fc05720fc 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -268,7 +268,7 @@ bsearch:
 					    struct journal_replay,
 					    list)->j.seq;
 
-	return ret;
+	return 0;
 #undef read_bucket
 }
-- cgit v1.2.3


From 695277f16b3a102fcc22c97fdf2de77c7b19f0b3 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:27 +0800
Subject: Revert "bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error()"

This reverts commit 6147305c73e4511ca1a975b766b97a779d442567.
Although this patch helps the failed bcache device stop faster when too
many I/O errors are detected on the corresponding cached device, setting
the CACHE_SET_IO_DISABLE bit in cache set c->flags was not a good idea.
This operation disables all I/Os on the cache set, which means other
attached bcache devices won't work either. Without this patch, the failed
bcache device can still be stopped eventually once internal I/O (e.g.
writeback) completes. Therefore I revert it here.

Fixes: 6147305c73e4 ("bcache: set CACHE_SET_IO_DISABLE in bch_cached_dev_error()")
Reported-by: Yong Li
Signed-off-by: Coly Li
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/super.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 64d9de89a63f..ba2ad093bc80 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1437,8 +1437,6 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
 
 bool bch_cached_dev_error(struct cached_dev *dc)
 {
-	struct cache_set *c;
-
 	if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
 		return false;
 
@@ -1449,21 +1447,6 @@ bool bch_cached_dev_error(struct cached_dev *dc)
 	pr_err("stop %s: too many IO errors on backing device %s\n",
 	       dc->disk.disk->disk_name, dc->backing_dev_name);
 
-	/*
-	 * If the cached device is still attached to a cache set,
-	 * even dc->io_disable is true and no more I/O requests
-	 * accepted, cache device internal I/O (writeback scan or
-	 * garbage collection) may still prevent bcache device from
-	 * being stopped. So here CACHE_SET_IO_DISABLE should be
-	 * set to c->flags too, to make the internal I/O to cache
-	 * device rejected and stopped immediately.
-	 * If c is NULL, that means the bcache device is not attached
-	 * to any cache set, then no CACHE_SET_IO_DISABLE bit to set.
-	 */
-	c = dc->disk.c;
-	if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
-		pr_info("CACHE_SET_IO_DISABLE already set");
-
 	bcache_device_stop(&dc->disk);
 	return true;
 }
-- cgit v1.2.3


From e6dcbd3e6c91b7828cb305ec324eb7fd9bdea8a0 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:28 +0800
Subject: bcache: avoid flushing btree node in cache_set_flush() if io disabled

When cache_set_flush() is called because too many I/O errors were detected
on the cache device and the cache set is retiring, it doesn't make sense to
flush cached btree nodes from c->btree_cache, because CACHE_SET_IO_DISABLE
is already set in c->flags and all I/Os to the cache device will be
rejected.

This patch makes cache_set_flush() check whether CACHE_SET_IO_DISABLE is
set. If so, it avoids flushing the cached btree nodes, saving time and
making cache set retiring faster.
Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/super.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index ba2ad093bc80..dc6702c2c4b6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1553,13 +1553,17 @@ static void cache_set_flush(struct closure *cl)
 	if (!IS_ERR_OR_NULL(c->root))
 		list_add(&c->root->list, &c->btree_cache);
 
-	/* Should skip this if we're unregistering because of an error */
-	list_for_each_entry(b, &c->btree_cache, list) {
-		mutex_lock(&b->write_lock);
-		if (btree_node_dirty(b))
-			__bch_btree_node_write(b, NULL);
-		mutex_unlock(&b->write_lock);
-	}
+	/*
+	 * Avoid flushing cached nodes if cache set is retiring
+	 * due to too many I/O errors detected.
+	 */
+	if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
+		list_for_each_entry(b, &c->btree_cache, list) {
+			mutex_lock(&b->write_lock);
+			if (btree_node_dirty(b))
+				__bch_btree_node_write(b, NULL);
+			mutex_unlock(&b->write_lock);
+		}
 
 	for_each_cache(ca, c, i)
 		if (ca->alloc_thread)
-- cgit v1.2.3


From 578df99b1b0531d19af956530fe4da63d01a1604 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:29 +0800
Subject: bcache: ignore read-ahead request failure on backing device

When an md raid device (e.g. raid456) is used as the backing device,
read-ahead requests on a degrading and recovering md raid device might be
failed immediately by the md raid code, yet the md raid array can still be
read or written by normal I/O requests. Therefore, such failed read-ahead
requests are not real hardware failures. Furthermore, once degrading and
recovery are accomplished, read-ahead requests will be handled by the md
raid array again.

In this condition, I/O failures of read-ahead requests don't indicate the
real health status (because normal I/O is still served); they should not be
counted into the I/O error counter dc->io_errors.

Since there is no simple way to detect whether the backing device is an md
raid device, this patch simply ignores I/O failures of read-ahead bios on
the backing device, to avoid a bogus backing device failure on a degrading
md raid array.

Suggested-and-tested-by: Thorsten Knabe
Signed-off-by: Coly Li
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/io.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index c25097968319..4d93f07f63e5 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio)
 
 	WARN_ONCE(!dc, "NULL pointer of struct cached_dev");
 
+	/*
+	 * Read-ahead requests on a degrading and recovering md raid
+	 * (e.g. raid6) device might be failured immediately by md
+	 * raid code, which is not a real hardware media failure. So
+	 * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors.
+	 */
+	if (bio->bi_opf & REQ_RAHEAD) {
+		pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore",
+				    dc->backing_dev_name);
+		return;
+	}
+
 	errors = atomic_add_return(1, &dc->io_errors);
 	if (errors < dc->error_limit)
 		pr_err("%s: IO error on backing device, unrecoverable",
-- cgit v1.2.3


From 08ec1e6282f271698f0053983fab89de6e1a8217 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:30 +0800
Subject: bcache: add io error counting in write_bdev_super_endio()

When the backing device super block is written by bch_write_bdev_super(),
the bio completion callback write_bdev_super_endio() simply ignores the I/O
status. Indeed such write requests also contribute to the backing device
health status if they fail.

This patch checks bio->bi_status in write_bdev_super_endio(); if there is
an error, bch_count_backing_io_errors() is called to count an I/O error in
dc->io_errors.

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index dc6702c2c4b6..73466bda12a7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -197,7 +197,9 @@ err:
 static void write_bdev_super_endio(struct bio *bio)
 {
 	struct cached_dev *dc = bio->bi_private;
-	/* XXX: error checking */
+
+	if (bio->bi_status)
+		bch_count_backing_io_errors(dc, bio);
 
 	closure_put(&dc->sb_write);
 }
-- cgit v1.2.3


From f960facb399ece6ff88a7a2d4b4a5515e3a467a0 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:31 +0800
Subject: bcache: remove unnecessary prefetch() in bset_search_tree()

In function bset_search_tree(), when p >= t->size, t->tree[0] will be
prefetched by the following code piece,

 974                 unsigned int p = n << 4;
 975
 976                 p &= ((int) (p - t->size)) >> 31;
 977
 978                 prefetch(&t->tree[p]);

The purpose of the above code is to avoid a branch instruction, but when
p >= t->size, prefetch(&t->tree[0]) has no positive performance
contribution at all. This patch avoids the unnecessary prefetch by only
calling prefetch() when p < t->size.

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/bset.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 268f1b685084..e36a108d3648 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -970,22 +970,10 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t,
 	unsigned int inorder, j, n = 1;
 
 	do {
-		/*
-		 * A bit trick here.
-		 * If p < t->size, (int)(p - t->size) is a minus value and
-		 * the most significant bit is set, right shifting 31 bits
-		 * gets 1. If p >= t->size, the most significant bit is
-		 * not set, right shifting 31 bits gets 0.
-		 * So the following 2 lines equals to
-		 *	if (p >= t->size)
-		 *		p = 0;
-		 * but a branch instruction is avoided.
-		 */
 		unsigned int p = n << 4;
 
-		p &= ((int) (p - t->size)) >> 31;
-
-		prefetch(&t->tree[p]);
+		if (p < t->size)
+			prefetch(&t->tree[p]);
 
 		j = n;
 		f = &t->tree[j];
-- cgit v1.2.3


From 89e0341af082dbc170019f908846f4a424efc86b Mon Sep 17 00:00:00 2001
From: Alexandru Ardelean
Date: Fri, 28 Jun 2019 19:59:32 +0800
Subject: bcache: use sysfs_match_string() instead of __sysfs_match_string()

The arrays (of strings) that are passed to __sysfs_match_string() are
static, so use sysfs_match_string(), which does an implicit ARRAY_SIZE()
over these arrays.
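sysfs_match_string() applies ARRAY_SIZE() itself and returns the matched
index (or a negative errno), so a store handler reduces to the pattern
below, condensed from the diff:

	static const char * const error_actions[] = {
		"unregister",
		"panic"
	};

	v = sysfs_match_string(error_actions, buf);
	if (v < 0)
		return v;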
Functionally, this doesn't change anything. The change is mostly cosmetic: it only shrinks each static array by its one (now unneeded) NULL entry.

Signed-off-by: Alexandru Ardelean
Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/sysfs.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index bfb437ffb13c..760cf8951338 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -21,28 +21,24 @@ static const char * const bch_cache_modes[] = {
 	"writethrough",
 	"writeback",
 	"writearound",
-	"none",
-	NULL
+	"none"
 };

 /* Default is 0 ("auto") */
 static const char * const bch_stop_on_failure_modes[] = {
 	"auto",
-	"always",
-	NULL
+	"always"
 };

 static const char * const cache_replacement_policies[] = {
 	"lru",
 	"fifo",
-	"random",
-	NULL
+	"random"
 };

 static const char * const error_actions[] = {
 	"unregister",
-	"panic",
-	NULL
+	"panic"
 };

 write_attribute(attach);
@@ -333,7 +329,7 @@ STORE(__cached_dev)
 			bch_cached_dev_run(dc);

 	if (attr == &sysfs_cache_mode) {
-		v = __sysfs_match_string(bch_cache_modes, -1, buf);
+		v = sysfs_match_string(bch_cache_modes, buf);
 		if (v < 0)
 			return v;

@@ -344,7 +340,7 @@ STORE(__cached_dev)
 	}

 	if (attr == &sysfs_stop_when_cache_set_failed) {
-		v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
+		v = sysfs_match_string(bch_stop_on_failure_modes, buf);
 		if (v < 0)
 			return v;

@@ -799,7 +795,7 @@ STORE(__bch_cache_set)
 					0, UINT_MAX);

 	if (attr == &sysfs_errors) {
-		v = __sysfs_match_string(error_actions, -1, buf);
+		v = sysfs_match_string(error_actions, buf);
 		if (v < 0)
 			return v;

@@ -1063,7 +1059,7 @@ STORE(__bch_cache)
 	}

 	if (attr == &sysfs_cache_replacement_policy) {
-		v = __sysfs_match_string(cache_replacement_policies, -1, buf);
+		v = sysfs_match_string(cache_replacement_policies, buf);
 		if (v < 0)
 			return v;
--
cgit v1.2.3

From 0b13efecf5f25ce5e31f2ab3930335015cb65a7d Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:33 +0800
Subject: bcache: add return value check to bch_cached_dev_run()

This patch adds a return value check to bch_cached_dev_run(), so that errors occurring inside bch_cached_dev_run() can now be caught.
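The resulting calling convention can be seen in bch_cached_dev_attach() in the diff below; -EBUSY only means the device is already running and is not treated as a hard failure (sketch):

	ret = bch_cached_dev_run(dc);
	if (ret && (ret != -EBUSY)) {
		up_write(&dc->writeback_lock);
		return ret;	/* a real error: propagate it to the caller */
	}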
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 33 ++++++++++++++++++++++++++------- drivers/md/bcache/sysfs.c | 7 +++++-- 3 files changed, 32 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index fdf75352e16a..73a97586a2ef 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1006,7 +1006,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid); void bch_cached_dev_detach(struct cached_dev *dc); -void bch_cached_dev_run(struct cached_dev *dc); +int bch_cached_dev_run(struct cached_dev *dc); void bcache_device_stop(struct bcache_device *d); void bch_cache_set_unregister(struct cache_set *c); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 73466bda12a7..0abee44092bf 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -910,7 +910,7 @@ static int cached_dev_status_update(void *arg) } -void bch_cached_dev_run(struct cached_dev *dc) +int bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); @@ -921,11 +921,14 @@ void bch_cached_dev_run(struct cached_dev *dc) NULL, }; + if (dc->io_disable) + return -EIO; + if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); kfree(buf); - return; + return -EBUSY; } if (!d->c && @@ -951,8 +954,11 @@ void bch_cached_dev_run(struct cached_dev *dc) kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + sysfs_create_link(&disk_to_dev(d->disk)->kobj, + &d->kobj, "bcache")) { pr_debug("error creating sysfs link"); + return -ENOMEM; + } dc->status_update_thread = kthread_run(cached_dev_status_update, dc, "bcache_status_update"); @@ -961,6 +967,8 @@ void bch_cached_dev_run(struct cached_dev *dc) "continue to run without monitoring backing " "device status"); } + + return 0; } /* @@ -1056,6 +1064,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); struct uuid_entry *u; struct cached_dev *exist_dc, *t; + int ret = 0; if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) @@ -1165,7 +1174,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, bch_sectors_dirty_init(&dc->disk); - bch_cached_dev_run(dc); + ret = bch_cached_dev_run(dc); + if (ret && (ret != -EBUSY)) { + up_write(&dc->writeback_lock); + return ret; + } + bcache_device_link(&dc->disk, c, "bdev"); atomic_inc(&c->attached_dev_nr); @@ -1292,6 +1306,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, { const char *err = "cannot allocate memory"; struct cache_set *c; + int ret = -ENOMEM; bdevname(bdev, dc->backing_dev_name); memcpy(&dc->sb, sb, sizeof(struct cache_sb)); @@ -1321,14 +1336,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, bch_cached_dev_attach(dc, c, NULL); if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || - BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) - bch_cached_dev_run(dc); + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { + err = "failed to run cached device"; + ret = bch_cached_dev_run(dc); + if (ret) + goto err; + } return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); 
bcache_device_stop(&dc->disk); - return -EIO; + return ret; } /* Flash only volumes */ diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 760cf8951338..eb678e43ac00 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -325,8 +325,11 @@ STORE(__cached_dev) bch_cache_accounting_clear(&dc->accounting); if (attr == &sysfs_running && - strtoul_or_return(buf)) - bch_cached_dev_run(dc); + strtoul_or_return(buf)) { + v = bch_cached_dev_run(dc); + if (v) + return v; + } if (attr == &sysfs_cache_mode) { v = sysfs_match_string(bch_cache_modes, buf); -- cgit v1.2.3 From bd9026c8a7f33ebe25543b7b7e6276b49db60f7e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:34 +0800 Subject: bcache: remove unncessary code in bch_btree_keys_init() Function bch_btree_keys_init() initializes b->set[].size and b->set[].data to zero. As the code comments indicates, these code indeed is unncessary, because both struct btree_keys and struct bset_tree are nested embedded into struct btree, when struct btree is filled with 0 bits by kzalloc() in mca_bucket_alloc(), b->set[].size and b->set[].data are initialized to 0 (a.k.a NULL) already. This patch removes the redundant code, and add comments in bch_btree_keys_init() and mca_bucket_alloc() to explain why it's safe. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 15 ++++++--------- drivers/md/bcache/btree.c | 4 ++++ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e36a108d3648..8af9509e78bd 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) { - unsigned int i; - b->ops = ops; b->expensive_debug_checks = expensive_debug_checks; b->nsets = 0; b->last_set_unwritten = 0; - /* XXX: shouldn't be needed */ - for (i = 0; i < MAX_BSETS; i++) - b->set[i].size = 0; /* - * Second loop starts at 1 because b->keys[0]->data is the memory we - * allocated + * struct btree_keys in embedded in struct btree, and struct + * bset_tree is embedded into struct btree_keys. They are all + * initialized as 0 by kzalloc() in mca_bucket_alloc(), and + * b->set[0].data is allocated in bch_btree_keys_alloc(), so we + * don't have to initiate b->set[].size and b->set[].data here + * any more. */ - for (i = 1; i < MAX_BSETS; i++) - b->set[i].data = NULL; } EXPORT_SYMBOL(bch_btree_keys_init); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 773f5fdad25f..cf38a1b031fa 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { + /* + * kzalloc() is necessary here for initialization, + * see code comments in bch_btree_keys_init(). + */ struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) -- cgit v1.2.3 From e775339e1ae1205b47d94881db124c11385e597c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:35 +0800 Subject: bcache: check CACHE_SET_IO_DISABLE in allocator code If CACHE_SET_IO_DISABLE of a cache set flag is set by too many I/O errors, currently allocator routines can still continue allocate space which may introduce inconsistent metadata state. 
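The guard itself is cheap: CACHE_SET_IO_DISABLE is an ordinary bit in cache_set->flags, so a test_bit() at the top of each allocator routine is enough. A minimal sketch, matching the hunks below:

	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)))
		return -1;	/* allocation rejected */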
This patch checks the CACHE_SET_IO_DISABLE bit in the following allocator routines,
- bch_bucket_alloc()
- __bch_bucket_alloc_set()
Once CACHE_SET_IO_DISABLE is set on a cache set, the allocator routines reject allocation requests early, to avoid potentially inconsistent metadata.

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/alloc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index f8986effcb50..6f776823b9ba 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
 	struct bucket *b;
 	long r;

+
+	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
+	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags)))
+		return -1;
+
 	/* fastpath */
 	if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
 	    fifo_pop(&ca->free[reserve], r))
@@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
 {
 	int i;

+	/* No allocation if CACHE_SET_IO_DISABLE bit is set */
+	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+		return -1;
+
 	lockdep_assert_held(&c->bucket_lock);
 	BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
--
cgit v1.2.3

From 383ff2183ad16a8842d1fbd9dd3e1cbd66813e64 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:36 +0800
Subject: bcache: check CACHE_SET_IO_DISABLE bit in bch_journal()

When too many I/O errors have happened on a cache set and the CACHE_SET_IO_DISABLE bit is set, bch_journal() may still appear to work, because the journaling bkey might still be in the in-memory write set. The caller of bch_journal() may believe the journal still works, but in truth the in-memory journal write set will never be written to the cache device any more. This behavior may introduce potentially inconsistent metadata.

This patch checks the CACHE_SET_IO_DISABLE bit at the head of bch_journal(); if the bit is set, bch_journal() returns NULL immediately to notify the caller that the journal no longer works.

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/journal.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 4e5fc05720fc..54f8886b6177 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -811,6 +811,10 @@ atomic_t *bch_journal(struct cache_set *c,
 	struct journal_write *w;
 	atomic_t *ret;

+	/* No journaling if CACHE_SET_IO_DISABLE set already */
+	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
+		return NULL;
+
 	if (!CACHE_SYNC(&c->sb))
 		return NULL;
--
cgit v1.2.3

From 4b6efb4bdbce25097f1a6329e18c2b77c4f27722 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:37 +0800
Subject: bcache: more detailed error message to bcache_device_link()

This patch adds a more accurate error message for each specific sysfs_create_link() call, to help debug failures during bcache device start-up.
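The pattern is one sysfs_create_link() call per symlink, each checked and reported separately, instead of a single combined WARN(). For example (sketch, mirroring the bcache_device_link() hunk below):

	ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
	if (ret < 0)
		pr_err("Couldn't create device -> cache set symlink");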
Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/super.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 0abee44092bf..d4d8d1300faf 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -693,6 +693,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 {
 	unsigned int i;
 	struct cache *ca;
+	int ret;

 	for_each_cache(ca, d->c, i)
 		bd_link_disk_holder(ca->bdev, d->disk);
@@ -700,9 +701,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 	snprintf(d->name, BCACHEDEVNAME_SIZE,
 		 "%s%u", name, d->id);

-	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
-	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
-	     "Couldn't create device <-> cache set symlinks");
+	ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
+	if (ret < 0)
+		pr_err("Couldn't create device -> cache set symlink");
+
+	ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
+	if (ret < 0)
+		pr_err("Couldn't create cache set -> device symlink");

 	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
 }
--
cgit v1.2.3

From 633bb2ce60b949e2990c15324be162c54788c027 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:38 +0800
Subject: bcache: add more error message in bch_cached_dev_attach()

This patch adds more error messages for attaching a cached device; this is helpful for debugging failures during bcache device start-up.

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index d4d8d1300faf..a836910ef368 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1169,6 +1169,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	down_write(&dc->writeback_lock);
 	if (bch_cached_dev_writeback_start(dc)) {
 		up_write(&dc->writeback_lock);
+		pr_err("Couldn't start writeback facilities for %s",
+		       dc->disk.disk->disk_name);
 		return -ENOMEM;
 	}

@@ -1182,6 +1184,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	ret = bch_cached_dev_run(dc);
 	if (ret && (ret != -EBUSY)) {
 		up_write(&dc->writeback_lock);
+		pr_err("Couldn't run cached device %s",
+		       dc->backing_dev_name);
 		return ret;
 	}
--
cgit v1.2.3

From e0faa3d7f79f7e1abb43de168e88c76061518ea4 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:39 +0800
Subject: bcache: improve error message in bch_cached_dev_run()

This patch adds more error messages in bch_cached_dev_run() to indicate the exact reason why an error value is returned. Note that the "is running already" message uses pr_info() rather than pr_err(): although -EBUSY is returned in this case, the bcache device can still attach to the cache set and run, so it should not appear as an error-level message in the kernel log.
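A sketch of that distinction, taken from the hunk below:

	if (atomic_xchg(&dc->running, 1)) {
		/* ... free the uevent env strings ... */
		pr_info("cached dev %s is running already",
			dc->backing_dev_name);
		return -EBUSY;	/* informational only; the caller tolerates -EBUSY */
	}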
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a836910ef368..e9e6d653bf70 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -926,13 +926,18 @@ int bch_cached_dev_run(struct cached_dev *dc) NULL, }; - if (dc->io_disable) + if (dc->io_disable) { + pr_err("I/O disabled on cached dev %s", + dc->backing_dev_name); return -EIO; + } if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); kfree(buf); + pr_info("cached dev %s is running already", + dc->backing_dev_name); return -EBUSY; } @@ -961,7 +966,7 @@ int bch_cached_dev_run(struct cached_dev *dc) if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) { - pr_debug("error creating sysfs link"); + pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); return -ENOMEM; } -- cgit v1.2.3 From 68a53c95a0fce541321fbca74a7f72c71361f496 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:40 +0800 Subject: bcache: remove "XXX:" comment line from run_cache_set() In previous bcache patches for Linux v5.2, the failure code path of run_cache_set() is tested and fixed. So now the following comment line can be removed from run_cache_set(), /* XXX: test this, it's broken */ Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e9e6d653bf70..c53fe0f1629f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1979,7 +1979,7 @@ err: } closure_sync(&cl); - /* XXX: test this, it's broken */ + bch_cache_set_error(c, "%s", err); return -EIO; -- cgit v1.2.3 From 944a4f340a65c21ee311d2d3e617034bef9d0b25 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:41 +0800 Subject: bcache: make bset_search_tree() be more understandable The purpose of following code in bset_search_tree() is to avoid a branch instruction, 994 if (likely(f->exponent != 127)) 995 n = j * 2 + (((unsigned int) 996 (f->mantissa - 997 bfloat_mantissa(search, f))) >> 31); 998 else 999 n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) 1000 ? j * 2 1001 : j * 2 + 1; This piece of code is not very clear to understand, even when I tried to add code comment for it, I made mistake. This patch removes the implict bit operation and uses explicit branch to calculate next location in binary tree search. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 8af9509e78bd..08768796b543 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -975,25 +975,17 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, j = n; f = &t->tree[j]; - /* - * Similar bit trick, use subtract operation to avoid a branch - * instruction. - * - * n = (f->mantissa > bfloat_mantissa()) - * ? 
j * 2 - * : j * 2 + 1; - * - * We need to subtract 1 from f->mantissa for the sign bit trick - * to work - that's done in make_bfloat() - */ - if (likely(f->exponent != 127)) - n = j * 2 + (((unsigned int) - (f->mantissa - - bfloat_mantissa(search, f))) >> 31); - else - n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) - ? j * 2 - : j * 2 + 1; + if (likely(f->exponent != 127)) { + if (f->mantissa >= bfloat_mantissa(search, f)) + n = j * 2; + else + n = j * 2 + 1; + } else { + if (bkey_cmp(tree_to_bkey(t, j), search) > 0) + n = j * 2; + else + n = j * 2 + 1; + } } while (n < t->size); inorder = to_inorder(j, t); -- cgit v1.2.3 From 0c277e211aae056b26513358fc060291d8523747 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:42 +0800 Subject: bcache: add pendings_cleanup to stop pending bcache device If a bcache device is in dirty state and its cache set is not registered, this bcache device will not appear in /dev/bcache, and there is no way to stop it or remove the bcache kernel module. This is an as-designed behavior, but sometimes people has to reboot whole system to release or stop the pending backing device. This sysfs interface may remove such pending bcache devices when write anything into the sysfs file manually. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c53fe0f1629f..c4c4b2d99dc2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2273,9 +2273,13 @@ err: static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size); +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); +kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); static bool bch_is_open_backing(struct block_device *bdev) { @@ -2400,6 +2404,56 @@ err: goto out; } + +struct pdev { + struct list_head list; + struct cached_dev *dc; +}; + +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, + size_t size) +{ + LIST_HEAD(pending_devs); + ssize_t ret = size; + struct cached_dev *dc, *tdc; + struct pdev *pdev, *tpdev; + struct cache_set *c, *tc; + + mutex_lock(&bch_register_lock); + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { + pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); + if (!pdev) + break; + pdev->dc = dc; + list_add(&pdev->list, &pending_devs); + } + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + char *pdev_set_uuid = pdev->dc->sb.set_uuid; + char *set_uuid = c->sb.uuid; + + if (!memcmp(pdev_set_uuid, set_uuid, 16)) { + list_del(&pdev->list); + kfree(pdev); + break; + } + } + } + mutex_unlock(&bch_register_lock); + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + pr_info("delete pdev %p", pdev); + list_del(&pdev->list); + bcache_device_stop(&pdev->dc->disk); + kfree(pdev); + } + + return ret; +} + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) { if (code == SYS_DOWN || @@ -2518,6 +2572,7 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, + 
&ksysfs_pendings_cleanup.attr, NULL }; -- cgit v1.2.3 From 5461999848e0462c14f306a62923d22de820a59c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:43 +0800 Subject: bcache: fix mistaken sysfs entry for io_error counter In bch_cached_dev_files[] from driver/md/bcache/sysfs.c, sysfs_errors is incorrectly inserted in. The correct entry should be sysfs_io_errors. This patch fixes the problem and now I/O errors of cached device can be read from /sys/block/bcache/bcache/io_errors. Fixes: c7b7bd07404c5 ("bcache: add io_disable to struct cached_dev") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index eb678e43ac00..dddb8d4048ce 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -176,7 +176,7 @@ SHOW(__bch_cached_dev) var_print(writeback_percent); sysfs_hprint(writeback_rate, wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); - sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); + sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors)); sysfs_printf(io_error_limit, "%i", dc->error_limit); sysfs_printf(io_disable, "%i", dc->io_disable); var_print(writeback_rate_update_seconds); @@ -463,7 +463,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_minimum, &sysfs_writeback_rate_debug, - &sysfs_errors, + &sysfs_io_errors, &sysfs_io_error_limit, &sysfs_io_disable, &sysfs_dirty_data, -- cgit v1.2.3 From f54d801dda14942dbefa00541d10603015b7859c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:44 +0800 Subject: bcache: destroy dc->writeback_write_wq if failed to create dc->writeback_thread Commit 9baf30972b55 ("bcache: fix for gc and write-back race") added a new work queue dc->writeback_write_wq, but forgot to destroy it in the error condition when creating dc->writeback_thread failed. This patch destroys dc->writeback_write_wq if kthread_create() returns error pointer to dc->writeback_thread, then a memory leak is avoided. Fixes: 9baf30972b55 ("bcache: fix for gc and write-back race") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 262f7ef20992..21081febcb59 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -833,6 +833,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) { cached_dev_put(dc); + destroy_workqueue(dc->writeback_write_wq); return PTR_ERR(dc->writeback_thread); } dc->writeback_running = true; -- cgit v1.2.3 From 5c2a634cbfaf1971cb6453fe5f86d83585257790 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:45 +0800 Subject: bcache: stop writeback kthread and kworker when bch_cached_dev_run() failed In bch_cached_dev_attach() after bch_cached_dev_writeback_start() called, the wrireback kthread and writeback rate update kworker of the cached device are created, if the following bch_cached_dev_run() failed, bch_cached_dev_attach() will return with -ENOMEM without stopping the writeback related kthread and kworker. 
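In other words, the error path must undo what bch_cached_dev_writeback_start() set up; the teardown it is missing is roughly:

	kthread_stop(dc->writeback_thread);
	cancel_writeback_rate_update_dwork(dc);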
This patch stops the writeback kthread and the writeback rate update kworker before returning -ENOMEM when bch_cached_dev_run() returns an error.

Signed-off-by: Coly Li
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/super.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c4c4b2d99dc2..791cb930b353 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1189,6 +1189,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
 	ret = bch_cached_dev_run(dc);
 	if (ret && (ret != -EBUSY)) {
 		up_write(&dc->writeback_lock);
+		/*
+		 * bch_register_lock is held, bcache_device_stop() is not
+		 * able to be directly called. The kthread and kworker
+		 * created previously in bch_cached_dev_writeback_start()
+		 * have to be stopped manually here.
+		 */
+		kthread_stop(dc->writeback_thread);
+		cancel_writeback_rate_update_dwork(dc);
 		pr_err("Couldn't run cached device %s",
 		       dc->backing_dev_name);
 		return ret;
--
cgit v1.2.3

From a59ff6ccc2bf2e2934b31bbf734f0bc04b5ec78a Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Fri, 28 Jun 2019 19:59:46 +0800
Subject: bcache: avoid a deadlock in bcache_reboot()

Deadlocks in bcache_reboot() are observed quite frequently, and they hang the whole system reboot process. The reason is that bcache_reboot() holds the mutex bch_register_lock while calling bch_cache_set_stop() and bcache_device_stop(), but the process of stopping a cache set or a bcache device acquires bch_register_lock again. With the mutex already held, the stopping process deadlocks, and as a result the system reboot hangs.

The fix is to avoid holding bch_register_lock across the following loops in bcache_reboot(),

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		bch_cache_set_stop(c);

	list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
		bcache_device_stop(&dc->disk);

A module-scope variable 'bcache_is_reboot' is added; it is set to true in bcache_reboot(). In register_bcache(), if bcache_is_reboot is observed to be true, the registration is rejected by returning -EBUSY immediately.
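The corresponding guard at the top of register_bcache() is a simple early rejection (see the diff below), paired with an smp_mb() so that the flag set by bcache_reboot() on another CPU is observed:

	/* For latest state of bcache_is_reboot */
	smp_mb();
	if (bcache_is_reboot)
		return -EBUSY;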
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 40 +++++++++++++++++++++++++++++++++++++++- drivers/md/bcache/sysfs.c | 26 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 791cb930b353..a88238ad5da1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -40,6 +40,7 @@ static const char invalid_uuid[] = { static struct kobject *bcache_kobj; struct mutex bch_register_lock; +bool bcache_is_reboot; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); @@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; struct workqueue_struct *bch_journal_wq; + #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) /* limitation of partitions number on single bcache device */ #define BCACHE_MINORS 128 @@ -2335,6 +2337,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; + /* For latest state of bcache_is_reboot */ + smp_mb(); + if (bcache_is_reboot) + return -EBUSY; + path = kstrndup(buffer, size, GFP_KERNEL); if (!path) goto err; @@ -2464,6 +2471,9 @@ static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) { + if (bcache_is_reboot) + return NOTIFY_DONE; + if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF) { @@ -2476,19 +2486,45 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) mutex_lock(&bch_register_lock); + if (bcache_is_reboot) + goto out; + + /* New registration is rejected since now */ + bcache_is_reboot = true; + /* + * Make registering caller (if there is) on other CPU + * core know bcache_is_reboot set to true earlier + */ + smp_mb(); + if (list_empty(&bch_cache_sets) && list_empty(&uncached_devices)) goto out; + mutex_unlock(&bch_register_lock); + pr_info("Stopping all devices:"); + /* + * The reason bch_register_lock is not held to call + * bch_cache_set_stop() and bcache_device_stop() is to + * avoid potential deadlock during reboot, because cache + * set or bcache device stopping process will acqurie + * bch_register_lock too. + * + * We are safe here because bcache_is_reboot sets to + * true already, register_bcache() will reject new + * registration now. bcache_is_reboot also makes sure + * bcache_reboot() won't be re-entered on by other thread, + * so there is no race in following list iteration by + * list_for_each_entry_safe(). 
+ */ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) bch_cache_set_stop(c); list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk); - mutex_unlock(&bch_register_lock); /* * Give an early chance for other kthreads and @@ -2616,6 +2652,8 @@ static int __init bcache_init(void) bch_debug_init(); closure_debug_init(); + bcache_is_reboot = false; + return 0; err: bcache_exit(); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index dddb8d4048ce..d62e28643109 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,6 +16,8 @@ #include #include +extern bool bcache_is_reboot; + /* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", @@ -267,6 +269,10 @@ STORE(__cached_dev) struct cache_set *c; struct kobj_uevent_env *env; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + #define d_strtoul(var) sysfs_strtoul(var, dc->var) #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) @@ -407,6 +413,10 @@ STORE(bch_cached_dev) struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size); @@ -510,6 +520,10 @@ STORE(__bch_flash_dev) kobj); struct uuid_entry *u = &d->c->uuids[d->id]; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + sysfs_strtoul(data_csum, d->data_csum); if (attr == &sysfs_size) { @@ -745,6 +759,10 @@ STORE(__bch_cache_set) struct cache_set *c = container_of(kobj, struct cache_set, kobj); ssize_t v; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_unregister) bch_cache_set_unregister(c); @@ -864,6 +882,10 @@ STORE(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal); + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + return bch_cache_set_store(&c->kobj, attr, buf, size); } @@ -1049,6 +1071,10 @@ STORE(__bch_cache) struct cache *ca = container_of(kobj, struct cache, kobj); ssize_t v; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); -- cgit v1.2.3 From 97ba3b816e2cdea798398bc8486125f79f2453c1 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:47 +0800 Subject: bcache: acquire bch_register_lock later in cached_dev_detach_finish() Now there is variable bcache_is_reboot to prevent device register or unregister during reboot, it is unncessary to still hold mutex lock bch_register_lock before stopping writeback_rate_update kworker and writeback kthread. And if the stopping kworker or kthread holding bch_register_lock inside their routine (we used to have such problem in writeback thread, thanks to Junhui Wang fixed it), it is very easy to introduce deadlock during reboot/shutdown procedure. Therefore in this patch, the location to acquire bch_register_lock is moved to the location before calling calc_cached_dev_sectors(). Which is later then original location in cached_dev_detach_finish(). 
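The resulting ordering in cached_dev_detach_finish() is, schematically:

	/* stop the rate-update kworker with no bch_register_lock held */
	if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
		cancel_writeback_rate_update_dwork(dc);
	/* ... write out the backing device super block ... */

	/* take the lock only for the accounting and list updates */
	mutex_lock(&bch_register_lock);
	calc_cached_dev_sectors(dc->disk.c);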
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a88238ad5da1..40d857e690f9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1018,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w) BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count)); - mutex_lock(&bch_register_lock); if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1034,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bch_write_bdev_super(dc, &cl); closure_sync(&cl); + mutex_lock(&bch_register_lock); + calc_cached_dev_sectors(dc->disk.c); bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); -- cgit v1.2.3 From 80265d8dfd77792e133793cef44a21323aac2908 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:48 +0800 Subject: bcache: acquire bch_register_lock later in cached_dev_free() When enable lockdep engine, a lockdep warning can be observed when reboot or shutdown system, [ 3142.764557][ T1] bcache: bcache_reboot() Stopping all devices: [ 3142.776265][ T2649] [ 3142.777159][ T2649] ====================================================== [ 3142.780039][ T2649] WARNING: possible circular locking dependency detected [ 3142.782869][ T2649] 5.2.0-rc4-lp151.20-default+ #1 Tainted: G W [ 3142.785684][ T2649] ------------------------------------------------------ [ 3142.788479][ T2649] kworker/3:67/2649 is trying to acquire lock: [ 3142.790738][ T2649] 00000000aaf02291 ((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x4c0 [ 3142.794678][ T2649] [ 3142.794678][ T2649] but task is already holding lock: [ 3142.797402][ T2649] 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache] [ 3142.801462][ T2649] [ 3142.801462][ T2649] which lock already depends on the new lock. 
[ 3142.801462][ T2649] [ 3142.805277][ T2649] [ 3142.805277][ T2649] the existing dependency chain (in reverse order) is: [ 3142.808902][ T2649] [ 3142.808902][ T2649] -> #2 (&bch_register_lock){+.+.}: [ 3142.812396][ T2649] __mutex_lock+0x7a/0x9d0 [ 3142.814184][ T2649] cached_dev_free+0x17/0x120 [bcache] [ 3142.816415][ T2649] process_one_work+0x2a4/0x640 [ 3142.818413][ T2649] worker_thread+0x39/0x3f0 [ 3142.820276][ T2649] kthread+0x125/0x140 [ 3142.822061][ T2649] ret_from_fork+0x3a/0x50 [ 3142.823965][ T2649] [ 3142.823965][ T2649] -> #1 ((work_completion)(&cl->work)#2){+.+.}: [ 3142.827244][ T2649] process_one_work+0x277/0x640 [ 3142.829160][ T2649] worker_thread+0x39/0x3f0 [ 3142.830958][ T2649] kthread+0x125/0x140 [ 3142.832674][ T2649] ret_from_fork+0x3a/0x50 [ 3142.834915][ T2649] [ 3142.834915][ T2649] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}: [ 3142.838121][ T2649] lock_acquire+0xb4/0x1c0 [ 3142.840025][ T2649] flush_workqueue+0xae/0x4c0 [ 3142.842035][ T2649] drain_workqueue+0xa9/0x180 [ 3142.844042][ T2649] destroy_workqueue+0x17/0x250 [ 3142.846142][ T2649] cached_dev_free+0x52/0x120 [bcache] [ 3142.848530][ T2649] process_one_work+0x2a4/0x640 [ 3142.850663][ T2649] worker_thread+0x39/0x3f0 [ 3142.852464][ T2649] kthread+0x125/0x140 [ 3142.854106][ T2649] ret_from_fork+0x3a/0x50 [ 3142.855880][ T2649] [ 3142.855880][ T2649] other info that might help us debug this: [ 3142.855880][ T2649] [ 3142.859663][ T2649] Chain exists of: [ 3142.859663][ T2649] (wq_completion)bcache_writeback_wq --> (work_completion)(&cl->work)#2 --> &bch_register_lock [ 3142.859663][ T2649] [ 3142.865424][ T2649] Possible unsafe locking scenario: [ 3142.865424][ T2649] [ 3142.868022][ T2649] CPU0 CPU1 [ 3142.869885][ T2649] ---- ---- [ 3142.871751][ T2649] lock(&bch_register_lock); [ 3142.873379][ T2649] lock((work_completion)(&cl->work)#2); [ 3142.876399][ T2649] lock(&bch_register_lock); [ 3142.879727][ T2649] lock((wq_completion)bcache_writeback_wq); [ 3142.882064][ T2649] [ 3142.882064][ T2649] *** DEADLOCK *** [ 3142.882064][ T2649] [ 3142.885060][ T2649] 3 locks held by kworker/3:67/2649: [ 3142.887245][ T2649] #0: 00000000e774cdd0 ((wq_completion)events){+.+.}, at: process_one_work+0x21e/0x640 [ 3142.890815][ T2649] #1: 00000000f7df89da ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 [ 3142.894884][ T2649] #2: 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache] [ 3142.898797][ T2649] [ 3142.898797][ T2649] stack backtrace: [ 3142.900961][ T2649] CPU: 3 PID: 2649 Comm: kworker/3:67 Tainted: G W 5.2.0-rc4-lp151.20-default+ #1 [ 3142.904789][ T2649] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 [ 3142.909168][ T2649] Workqueue: events cached_dev_free [bcache] [ 3142.911422][ T2649] Call Trace: [ 3142.912656][ T2649] dump_stack+0x85/0xcb [ 3142.914181][ T2649] print_circular_bug+0x19a/0x1f0 [ 3142.916193][ T2649] __lock_acquire+0x16cd/0x1850 [ 3142.917936][ T2649] ? __lock_acquire+0x6a8/0x1850 [ 3142.919704][ T2649] ? lock_acquire+0xb4/0x1c0 [ 3142.921335][ T2649] ? find_held_lock+0x34/0xa0 [ 3142.923052][ T2649] lock_acquire+0xb4/0x1c0 [ 3142.924635][ T2649] ? flush_workqueue+0x87/0x4c0 [ 3142.926375][ T2649] flush_workqueue+0xae/0x4c0 [ 3142.928047][ T2649] ? flush_workqueue+0x87/0x4c0 [ 3142.929824][ T2649] ? 
drain_workqueue+0xa9/0x180 [ 3142.931686][ T2649] drain_workqueue+0xa9/0x180 [ 3142.933534][ T2649] destroy_workqueue+0x17/0x250 [ 3142.935787][ T2649] cached_dev_free+0x52/0x120 [bcache] [ 3142.937795][ T2649] process_one_work+0x2a4/0x640 [ 3142.939803][ T2649] worker_thread+0x39/0x3f0 [ 3142.941487][ T2649] ? process_one_work+0x640/0x640 [ 3142.943389][ T2649] kthread+0x125/0x140 [ 3142.944894][ T2649] ? kthread_create_worker_on_cpu+0x70/0x70 [ 3142.947744][ T2649] ret_from_fork+0x3a/0x50 [ 3142.970358][ T2649] bcache: bcache_device_free() bcache0 stopped Here is how the deadlock happens. 1) bcache_reboot() calls bcache_device_stop(), then inside bcache_device_stop() BCACHE_DEV_CLOSING bit is set on d->flags. Then closure_queue(&d->cl) is called to invoke cached_dev_flush(). 2) In cached_dev_flush(), cached_dev_free() is called by continu_at(). 3) In cached_dev_free(), when stopping the writeback kthread of the cached device by kthread_stop(), dc->writeback_thread will be waken up to quite the kthread while-loop, then cached_dev_put() is called in bch_writeback_thread(). 4) Calling cached_dev_put() in writeback kthread may drop dc->count to 0, then dc->detach kworker is scheduled, which is initialized as cached_dev_detach_finish(). 5) Inside cached_dev_detach_finish(), the last line of code is to call closure_put(&dc->disk.cl), which drops the last reference counter of closrure dc->disk.cl, then the callback cached_dev_flush() gets called. Now cached_dev_flush() is called for second time in the code path, the first time is in step 2). And again bch_register_lock will be acquired again, and a A-A lock (lockdep terminology) is happening. The root cause of the above A-A lock is in cached_dev_free(), mutex bch_register_lock is held before stopping writeback kthread and other kworkers. Fortunately now we have variable 'bcache_is_reboot', which may prevent device registration or unregistration during reboot/shutdown time, so it is unncessary to hold bch_register_lock such early now. This is how this patch fixes the reboot/shutdown time A-A lock issue: After moving mutex_lock(&bch_register_lock) to a later location where before atomic_read(&dc->running) in cached_dev_free(), such A-A lock problem can be solved without any reboot time registration race. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 40d857e690f9..8a12a8313367 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1231,8 +1231,6 @@ static void cached_dev_free(struct closure *cl) { struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - mutex_lock(&bch_register_lock); - if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1243,6 +1241,8 @@ static void cached_dev_free(struct closure *cl) if (!IS_ERR_OR_NULL(dc->status_update_thread)) kthread_stop(dc->status_update_thread); + mutex_lock(&bch_register_lock); + if (atomic_read(&dc->running)) bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); -- cgit v1.2.3 From 7e865eba00a3df2dc8c4746173a8ca1c1c7f042e Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:49 +0800 Subject: bcache: fix potential deadlock in cached_def_free() When enable lockdep and reboot system with a writeback mode bcache device, the following potential deadlock warning is reported by lockdep engine. 
[ 101.536569][ T401] kworker/2:2/401 is trying to acquire lock: [ 101.538575][ T401] 00000000bbf6e6c7 ((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x4c0 [ 101.542054][ T401] [ 101.542054][ T401] but task is already holding lock: [ 101.544587][ T401] 00000000f5f305b3 ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 [ 101.548386][ T401] [ 101.548386][ T401] which lock already depends on the new lock. [ 101.548386][ T401] [ 101.551874][ T401] [ 101.551874][ T401] the existing dependency chain (in reverse order) is: [ 101.555000][ T401] [ 101.555000][ T401] -> #1 ((work_completion)(&cl->work)#2){+.+.}: [ 101.557860][ T401] process_one_work+0x277/0x640 [ 101.559661][ T401] worker_thread+0x39/0x3f0 [ 101.561340][ T401] kthread+0x125/0x140 [ 101.562963][ T401] ret_from_fork+0x3a/0x50 [ 101.564718][ T401] [ 101.564718][ T401] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}: [ 101.567701][ T401] lock_acquire+0xb4/0x1c0 [ 101.569651][ T401] flush_workqueue+0xae/0x4c0 [ 101.571494][ T401] drain_workqueue+0xa9/0x180 [ 101.573234][ T401] destroy_workqueue+0x17/0x250 [ 101.575109][ T401] cached_dev_free+0x44/0x120 [bcache] [ 101.577304][ T401] process_one_work+0x2a4/0x640 [ 101.579357][ T401] worker_thread+0x39/0x3f0 [ 101.581055][ T401] kthread+0x125/0x140 [ 101.582709][ T401] ret_from_fork+0x3a/0x50 [ 101.584592][ T401] [ 101.584592][ T401] other info that might help us debug this: [ 101.584592][ T401] [ 101.588355][ T401] Possible unsafe locking scenario: [ 101.588355][ T401] [ 101.590974][ T401] CPU0 CPU1 [ 101.592889][ T401] ---- ---- [ 101.594743][ T401] lock((work_completion)(&cl->work)#2); [ 101.596785][ T401] lock((wq_completion)bcache_writeback_wq); [ 101.600072][ T401] lock((work_completion)(&cl->work)#2); [ 101.602971][ T401] lock((wq_completion)bcache_writeback_wq); [ 101.605255][ T401] [ 101.605255][ T401] *** DEADLOCK *** [ 101.605255][ T401] [ 101.608310][ T401] 2 locks held by kworker/2:2/401: [ 101.610208][ T401] #0: 00000000cf2c7d17 ((wq_completion)events){+.+.}, at: process_one_work+0x21e/0x640 [ 101.613709][ T401] #1: 00000000f5f305b3 ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640 [ 101.617480][ T401] [ 101.617480][ T401] stack backtrace: [ 101.619539][ T401] CPU: 2 PID: 401 Comm: kworker/2:2 Tainted: G W 5.2.0-rc4-lp151.20-default+ #1 [ 101.623225][ T401] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 [ 101.627210][ T401] Workqueue: events cached_dev_free [bcache] [ 101.629239][ T401] Call Trace: [ 101.630360][ T401] dump_stack+0x85/0xcb [ 101.631777][ T401] print_circular_bug+0x19a/0x1f0 [ 101.633485][ T401] __lock_acquire+0x16cd/0x1850 [ 101.635184][ T401] ? __lock_acquire+0x6a8/0x1850 [ 101.636863][ T401] ? lock_acquire+0xb4/0x1c0 [ 101.638421][ T401] ? find_held_lock+0x34/0xa0 [ 101.640015][ T401] lock_acquire+0xb4/0x1c0 [ 101.641513][ T401] ? flush_workqueue+0x87/0x4c0 [ 101.643248][ T401] flush_workqueue+0xae/0x4c0 [ 101.644832][ T401] ? flush_workqueue+0x87/0x4c0 [ 101.646476][ T401] ? drain_workqueue+0xa9/0x180 [ 101.648303][ T401] drain_workqueue+0xa9/0x180 [ 101.649867][ T401] destroy_workqueue+0x17/0x250 [ 101.651503][ T401] cached_dev_free+0x44/0x120 [bcache] [ 101.653328][ T401] process_one_work+0x2a4/0x640 [ 101.655029][ T401] worker_thread+0x39/0x3f0 [ 101.656693][ T401] ? process_one_work+0x640/0x640 [ 101.658501][ T401] kthread+0x125/0x140 [ 101.660012][ T401] ? 
kthread_create_worker_on_cpu+0x70/0x70 [ 101.661985][ T401] ret_from_fork+0x3a/0x50 [ 101.691318][ T401] bcache: bcache_device_free() bcache0 stopped Here is how the above potential deadlock may happen in reboot/shutdown code path, 1) bcache_reboot() is called firstly in the reboot/shutdown code path, then in bcache_reboot(), bcache_device_stop() is called. 2) bcache_device_stop() sets BCACHE_DEV_CLOSING on d->falgs, then call closure_queue(&d->cl) to invoke cached_dev_flush(). And in turn cached_dev_flush() calls cached_dev_free() via closure_at() 3) In cached_dev_free(), after stopped writebach kthread dc->writeback_thread, the kwork dc->writeback_write_wq is stopping by destroy_workqueue(). 4) Inside destroy_workqueue(), drain_workqueue() is called. Inside drain_workqueue(), flush_workqueue() is called. Then wq->lockdep_map is acquired by lock_map_acquire() in flush_workqueue(). After the lock acquired the rest part of flush_workqueue() just wait for the workqueue to complete. 5) Now we look back at writeback thread routine bch_writeback_thread(), in the main while-loop, write_dirty() is called via continue_at() in read_dirty_submit(), which is called via continue_at() in while-loop level called function read_dirty(). Inside write_dirty() it may be re-called on workqueeu dc->writeback_write_wq via continue_at(). It means when the writeback kthread is stopped in cached_dev_free() there might be still one kworker queued on dc->writeback_write_wq to execute write_dirty() again. 6) Now this kworker is scheduled on dc->writeback_write_wq to run by process_one_work() (which is called by worker_thread()). Before calling the kwork routine, wq->lockdep_map is acquired. 7) But wq->lockdep_map is acquired already in step 4), so a A-A lock (lockdep terminology) scenario happens. Indeed on multiple cores syatem, the above deadlock is very rare to happen, just as the code comments in process_one_work() says, 2263 * AFAICT there is no possible deadlock scenario between the 2264 * flush_work() and complete() primitives (except for single-threaded 2265 * workqueues), so hiding them isn't a problem. But it is still good to fix such lockdep warning, even no one running bcache on single core system. The fix is simple. This patch solves the above potential deadlock by, - Do not destroy workqueue dc->writeback_write_wq in cached_dev_free(). - Flush and destroy dc->writeback_write_wq in writebach kthread routine bch_writeback_thread(), where after quit the thread main while-loop and before cached_dev_put() is called. By this fix, dc->writeback_write_wq will be stopped and destroy before the writeback kthread stopped, so the chance for a A-A locking on wq->lockdep_map is disappeared, such A-A deadlock won't happen any more. 
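The shape of the fix, as applied to bch_writeback_thread() in the writeback.c hunk below: after leaving the main loop, the kthread drains and destroys its own workqueue before dropping its reference, so cached_dev_free() never has to flush it.

	if (dc->writeback_write_wq) {
		flush_workqueue(dc->writeback_write_wq);
		destroy_workqueue(dc->writeback_write_wq);
	}
	cached_dev_put(dc);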
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 -- drivers/md/bcache/writeback.c | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8a12a8313367..a8ea4e2086a9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1236,8 +1236,6 @@ static void cached_dev_free(struct closure *cl) if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); if (!IS_ERR_OR_NULL(dc->status_update_thread)) kthread_stop(dc->status_update_thread); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 21081febcb59..d60268fe49e1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -738,6 +738,10 @@ static int bch_writeback_thread(void *arg) } } + if (dc->writeback_write_wq) { + flush_workqueue(dc->writeback_write_wq); + destroy_workqueue(dc->writeback_write_wq); + } cached_dev_put(dc); wait_for_kthread_stop(); -- cgit v1.2.3 From 2464b693148e5d5ca42b6064bb40c1a275ea61cd Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:50 +0800 Subject: bcache: add code comments for journal_read_bucket() This patch adds more code comments in journal_read_bucket(), this is an effort to make the code to be more understandable. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 54f8886b6177..98ee467ec3f7 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset; blocks = set_blocks(j, block_bytes(ca->set)); + /* + * Nodes in 'list' are in linear increasing order of + * i->j.seq, the node on head has the smallest (oldest) + * journal seq, the node on tail has the biggest + * (latest) journal seq. + */ + + /* + * Check from the oldest jset for last_seq. If + * i->j.seq < j->last_seq, it means the oldest jset + * in list is expired and useless, remove it from + * this list. Otherwise, j is a condidate jset for + * further following checks. + */ while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); @@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset; kfree(i); } + /* iterate list in reverse order (from latest jset) */ list_for_each_entry_reverse(i, list, list) { if (j->seq == i->j.seq) goto next_set; + /* + * if j->seq is less than any i->j.last_seq + * in list, j is an expired and useless jset. + */ if (j->seq < i->j.last_seq) goto next_set; + /* + * 'where' points to first jset in list which + * is elder then j. + */ if (j->seq > i->j.seq) { where = &i->list; goto add; @@ -129,6 +152,7 @@ add: if (!i) return -ENOMEM; memcpy(&i->j, j, bytes); + /* Add to the location after 'where' points to */ list_add(&i->list, where); ret = 1; -- cgit v1.2.3 From a231f07a5fe30a522b402011c5190cb936641a66 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:51 +0800 Subject: bcache: set largest seq to ja->seq[bucket_index] in journal_read_bucket() In journal_read_bucket() when setting ja->seq[bucket_index], there might be potential case that a later non-maximum overwrites a better sequence number to ja->seq[bucket_index]. 
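In other words, ja->seq[bucket_index] must only ever move forward; the guard is a max-style update (sketch):

	if (j->seq > ja->seq[bucket_index])
		ja->seq[bucket_index] = j->seq;	/* keep the largest seq seen */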
This patch adds a check to make sure that ja->seq[bucket_index] will be only set a new value if it is bigger then current value. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 98ee467ec3f7..3d321bffddc9 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -156,7 +156,8 @@ add: list_add(&i->list, where); ret = 1; - ja->seq[bucket_index] = j->seq; + if (j->seq > ja->seq[bucket_index]) + ja->seq[bucket_index] = j->seq; next_set: offset += blocks * ca->sb.block_size; len -= blocks * ca->sb.block_size; -- cgit v1.2.3 From 1df3877ff6a4810054237c3259d900ded4468969 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:52 +0800 Subject: bcache: shrink btree node cache after bch_btree_check() When cache set starts, bch_btree_check() will check all bkeys on cache device by calculating the checksum. This operation will consume a huge number of system memory if there are a lot of data cached. Since bcache uses its own mca cache to maintain all its read-in btree nodes, and only releases the cache space when system memory manage code starts to shrink caches. Then before memory manager code to call the mca cache shrinker callback, bcache mca cache will compete memory resource with user space application, which may have nagive effect to performance of user space workloads (e.g. data base, or I/O service of distributed storage node). This patch tries to call bcache mca shrinker routine to proactively release mca cache memory, to decrease the memory pressure of system and avoid negative effort of the overall system I/O performance. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a8ea4e2086a9..26e374fbf57c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1880,6 +1880,23 @@ static int run_cache_set(struct cache_set *c) if (bch_btree_check(c)) goto err; + /* + * bch_btree_check() may occupy too much system memory which + * has negative effects to user space application (e.g. data + * base) performance. Shrink the mca cache memory proactively + * here to avoid competing memory with user space workloads.. + */ + if (!c->shrinker_disabled) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = c->btree_cache_used * c->btree_pages; + /* first run to clear b->accessed tag */ + c->shrink.scan_objects(&c->shrink, &sc); + /* second run to reap non-accessed nodes */ + c->shrink.scan_objects(&c->shrink, &sc); + } + bch_journal_mark(c, &journal); bch_initial_gc_finish(c); pr_debug("btree_check() done"); -- cgit v1.2.3 From ba82c1ac1667d6efb91a268edb13fc9cdaecec9b Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:53 +0800 Subject: bcache: Revert "bcache: free heap cache_set->flush_btree in bch_journal_free" This reverts commit 6268dc2c4703aabfb0b35681be709acf4c2826c6. This patch depends on commit c4dc2497d50d ("bcache: fix high CPU occupancy during journal") which is reverted in previous patch. So revert this one too. 
Fixes: 6268dc2c4703 ("bcache: free heap cache_set->flush_btree in bch_journal_free") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Cc: Shenghui Wang Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 3d321bffddc9..11d8c93b88bb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -884,7 +884,6 @@ void bch_journal_free(struct cache_set *c) free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); free_fifo(&c->journal.pin); - free_heap(&c->flush_btree); } int bch_journal_alloc(struct cache_set *c) -- cgit v1.2.3 From 249a5f6da57c28a903c75d81505d58ec8c10030d Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:54 +0800 Subject: bcache: Revert "bcache: fix high CPU occupancy during journal" This reverts commit c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0. This patch enlarges a race between normal btree flush code path and flush_btree_write(), which causes deadlock when journal space is exhausted. Reverts this patch makes the race window from 128 btree nodes to only 1 btree nodes. Fixes: c4dc2497d50d ("bcache: fix high CPU occupancy during journal") Signed-off-by: Coly Li Cc: stable@vger.kernel.org Cc: Tang Junhui Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 -- drivers/md/bcache/journal.c | 47 +++++++++++++++------------------------------ drivers/md/bcache/util.h | 2 -- 3 files changed, 15 insertions(+), 36 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 73a97586a2ef..cb268d7c6cea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -726,8 +726,6 @@ struct cache_set { #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; - - DECLARE_HEAP(struct btree *, flush_btree); }; struct bbio { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 11d8c93b88bb..14a4e2c44de9 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -416,12 +416,6 @@ err: } /* Journalling */ -#define journal_max_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) -#define journal_min_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) static void btree_flush_write(struct cache_set *c) { @@ -429,35 +423,25 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b; - int i; + struct btree *b, *best; + unsigned int i; atomic_long_inc(&c->flush_write); - retry: - spin_lock(&c->journal.lock); - if (heap_empty(&c->flush_btree)) { - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!heap_full(&c->flush_btree)) - heap_add(&c->flush_btree, b, - journal_max_cmp); - else if (journal_max_cmp(b, - heap_peek(&c->flush_btree))) { - c->flush_btree.data[0] = b; - heap_sift(&c->flush_btree, 0, - journal_max_cmp); - } + best = NULL; + + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best)->journal, + btree_current_write(b)->journal)) { + best = b; } + } - for (i = 
c->flush_btree.used / 2 - 1; i >= 0; --i) - heap_sift(&c->flush_btree, i, journal_min_cmp); - } - - b = NULL; - heap_pop(&c->flush_btree, b, journal_min_cmp); - spin_unlock(&c->journal.lock); - + b = best; if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { @@ -898,8 +882,7 @@ int bch_journal_alloc(struct cache_set *c) j->w[0].c = c; j->w[1].c = c; - if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || - !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) return -ENOMEM; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1fbced94e4cc..c029f7443190 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -113,8 +113,6 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) -#define heap_empty(h) ((h)->used == 0) - #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ -- cgit v1.2.3 From e5ec5f4765ada9c75fb3eee93a6e72f0e50599d5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:55 +0800 Subject: bcache: only clear BTREE_NODE_dirty bit when it is set In bch_btree_cache_free() and btree_node_free(), the BTREE_NODE_dirty bit is always cleared no matter whether the btree node is dirty or not. The code looks like this, if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); Indeed, if btree_node_dirty(b) returns false, the BTREE_NODE_dirty bit is already cleared, and it is unnecessary to clear it again. This patch only clears BTREE_NODE_dirty when btree_node_dirty(b) is true (the bit is set), to save a few CPU cycles. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index cf38a1b031fa..88e5aa3fbb07 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -782,10 +782,10 @@ void bch_btree_cache_free(struct cache_set *c) while (!list_empty(&c->btree_cache)) { b = list_first_entry(&c->btree_cache, struct btree, list); - if (btree_node_dirty(b)) + if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); - + clear_bit(BTREE_NODE_dirty, &b->flags); + } mca_data_free(b); } @@ -1073,9 +1073,10 @@ static void btree_node_free(struct btree *b) mutex_lock(&b->write_lock); - if (btree_node_dirty(b)) + if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); + clear_bit(BTREE_NODE_dirty, &b->flags); + } mutex_unlock(&b->write_lock); -- cgit v1.2.3 From 41508bb7d46b74dba631017e5a702a86caf1db8c Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:56 +0800 Subject: bcache: add comments for mutex_lock(&b->write_lock) When accessing or modifying the BTREE_NODE_dirty bit, it is not always necessary to acquire b->write_lock. This patch adds comments to mca_reap() and bch_btree_cache_free() to explain why mutex_lock(&b->write_lock) is, or is not, needed for checking or clearing the BTREE_NODE_dirty bit there.
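Both patches above arrange the same discipline: test the dirty flag and clear it under the lock that writers hold. A user-space sketch of that check-then-clear pattern with pthreads follows; struct node and complete_write() are simplified stand-ins for the kernel types, not bcache code.

#include <pthread.h>
#include <stdbool.h>

struct node {
	pthread_mutex_t write_lock;	/* stands in for b->write_lock */
	bool dirty;			/* stands in for BTREE_NODE_dirty */
};

static void complete_write(struct node *b)
{
	(void)b;	/* write-back bookkeeping would happen here */
}

static void free_node(struct node *b)
{
	pthread_mutex_lock(&b->write_lock);
	if (b->dirty) {		/* only touch the flag when it is actually set */
		complete_write(b);
		b->dirty = false;
	}
	pthread_mutex_unlock(&b->write_lock);
	/* ... release the node's memory here ... */
}

Doing the test and the clear inside the same critical section is what closes the window against a concurrent flusher clearing the flag between the check and the clear.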
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 88e5aa3fbb07..846306c3a887 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -659,6 +659,11 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) up(&b->io_mutex); } + /* + * BTREE_NODE_dirty might be cleared in btree_flush_btree() by + * __bch_btree_node_write(). To avoid an extra flush, acquire + * b->write_lock before checking BTREE_NODE_dirty bit. + */ mutex_lock(&b->write_lock); if (btree_node_dirty(b)) __bch_btree_node_write(b, &cl); @@ -782,6 +787,11 @@ void bch_btree_cache_free(struct cache_set *c) while (!list_empty(&c->btree_cache)) { b = list_first_entry(&c->btree_cache, struct btree, list); + /* + * This function is called by cache_set_free(), no I/O + * request on cache now, it is unnecessary to acquire + * b->write_lock before clearing BTREE_NODE_dirty anymore. + */ if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); -- cgit v1.2.3 From d91ce7574daf48a4567ba62733d43284f5d2a3f4 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:57 +0800 Subject: bcache: remove retry_flush_write from struct cache_set In struct cache_set, retry_flush_write was added for commit c4dc2497d50d ("bcache: fix high CPU occupancy during journal"), which is reverted in a previous patch. It is now useless, and this patch removes it from the bcache code. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/journal.c | 1 - drivers/md/bcache/sysfs.c | 5 ----- 3 files changed, 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index cb268d7c6cea..35396248a7d5 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -706,7 +706,6 @@ struct cache_set { atomic_long_t reclaim; atomic_long_t flush_write; - atomic_long_t retry_flush_write; enum { ON_ERROR_UNREGISTER, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 14a4e2c44de9..1218e3cada3c 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -447,7 +447,6 @@ retry: if (!btree_current_write(b)->journal) { mutex_unlock(&b->write_lock); /* We raced */ - atomic_long_inc(&c->retry_flush_write); goto retry; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index d62e28643109..701a386a954c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -83,7 +83,6 @@ read_attribute(state); read_attribute(cache_read_races); read_attribute(reclaim); read_attribute(flush_write); -read_attribute(retry_flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); @@ -709,9 +708,6 @@ SHOW(__bch_cache_set) sysfs_print(flush_write, atomic_long_read(&c->flush_write)); - sysfs_print(retry_flush_write, - atomic_long_read(&c->retry_flush_write)); - sysfs_print(writeback_keys_done, atomic_long_read(&c->writeback_keys_done)); sysfs_print(writeback_keys_failed, @@ -936,7 +932,6 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_cache_read_races, &sysfs_reclaim, &sysfs_flush_write, - &sysfs_retry_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, -- cgit v1.2.3 From 50a260e859964002dab162513a10f91ae9d3bcd3 Mon Sep 17 00:00:00 2001 From:
Coly Li Date: Fri, 28 Jun 2019 19:59:58 +0800 Subject: bcache: fix race in btree_flush_write() There is a race between mca_reap(), btree_node_free() and the journal code btree_flush_write(), which results in very rare and strange deadlocks or panics that are very hard to reproduce. Let me explain how the race happens. In btree_flush_write() one btree node with the oldest journal pin is selected, then it is flushed to the cache device; the select-and-flush is a two-step operation. Between these two steps, the following may happen inside the race window, - The selected btree node was reaped by mca_reap() and its memory was allocated to other requesters for another btree node. - The selected btree node was selected, flushed and released by the mca shrink callback bch_mca_scan(). When btree_flush_write() tries to flush the selected btree node, it first holds b->write_lock by mutex_lock(). If the race happens and the memory of the selected btree node has been allocated to another btree node whose write_lock is already held, a deadlock very probably happens here. A worse case is when the memory of the selected btree node has been released: then every reference to this btree node (e.g. b->write_lock) will trigger a NULL pointer dereference panic. This race was introduced in commit cafe56359144 ("bcache: A block layer cache"), and enlarged by commit c4dc2497d50d ("bcache: fix high CPU occupancy during journal"), which selected 128 btree nodes and flushed them one-by-one over quite a long time period. Such a race was not easy to reproduce before that. On a Lenovo SR650 server with 48 Xeon cores, with 1 NVMe SSD configured as the cache device and an MD raid0 device assembled from 3 NVMe SSDs as the backing device, this race can be observed around every 10,000 calls to btree_flush_write(). Both deadlocks and kernel panics happened as aftermath of the race. The idea of the fix is to add a btree flag BTREE_NODE_journal_flush. It is set when btree nodes are selected, and cleared after they are flushed. When mca_reap() encounters a btree node with this bit set, the node is skipped. Since mca_reap() only reaps btree nodes without the BTREE_NODE_journal_flush flag, the race is avoided. One corner case should be noticed, that is btree_node_free(). It might be called in some error handling code path. For example the following code piece from btree_split(), 2149 err_free2: 2150 bkey_put(b->c, &n2->key); 2151 btree_node_free(n2); 2152 rw_unlock(true, n2); 2153 err_free1: 2154 bkey_put(b->c, &n1->key); 2155 btree_node_free(n1); 2156 rw_unlock(true, n1); At lines 2151 and 2155, the btree nodes n2 and n1 are released without mca_reap(), so BTREE_NODE_journal_flush also needs to be checked in btree_node_free(). If btree_node_free() is called directly in such an error handling path and the selected btree node has the BTREE_NODE_journal_flush bit set, it just delays for 1 us and retries. In this case the btree node won't be skipped; btree_node_free() retries until the BTREE_NODE_journal_flush bit is cleared, and then frees the btree node memory.
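A minimal sketch of the claim-and-retry protocol described above, using C11 atomics and pthreads in place of the kernel's bit helpers; all names are illustrative, and the bucket_lock serialization that bcache uses around claiming is elided here.

#include <pthread.h>
#include <stdatomic.h>
#include <unistd.h>

struct node {
	pthread_mutex_t write_lock;
	atomic_bool journal_flush;	/* stands in for BTREE_NODE_journal_flush */
};

/* Journal side: claim the node for the whole select-and-flush sequence. */
static void flush_node(struct node *b)
{
	atomic_store(&b->journal_flush, true);	/* set while selecting */
	pthread_mutex_lock(&b->write_lock);
	/* ... write the node back, as __bch_btree_node_write() would ... */
	atomic_store(&b->journal_flush, false);	/* cleared after flushing */
	pthread_mutex_unlock(&b->write_lock);
}

/* Reaper side: never tear down a node the journal has claimed. */
static void reap_node(struct node *b)
{
retry:
	pthread_mutex_lock(&b->write_lock);
	if (atomic_load(&b->journal_flush)) {
		pthread_mutex_unlock(&b->write_lock);
		usleep(1);	/* the kernel patch uses udelay(1) */
		goto retry;
	}
	/* ... safe to free or repurpose the node's memory here ... */
	pthread_mutex_unlock(&b->write_lock);
}

The flag is set before the flush begins and cleared only while write_lock is held, so a reaper that observes it clear under write_lock cannot be racing with an in-flight select-and-flush on the same node.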
Fixes: cafe56359144 ("bcache: A block layer cache") Signed-off-by: Coly Li Reported-and-tested-by: kbuild test robot Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 28 +++++++++++++++++++++++++++- drivers/md/bcache/btree.h | 2 ++ drivers/md/bcache/journal.c | 7 +++++++ 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 846306c3a887..ba434d9ac720 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -35,7 +35,7 @@ #include <linux/rcupdate.h> #include <linux/sched/clock.h> #include <linux/rculist.h> - +#include <linux/delay.h> #include <trace/events/bcache.h> /* @@ -659,12 +659,25 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) up(&b->io_mutex); } +retry: /* * BTREE_NODE_dirty might be cleared in btree_flush_btree() by * __bch_btree_node_write(). To avoid an extra flush, acquire * b->write_lock before checking BTREE_NODE_dirty bit. */ mutex_lock(&b->write_lock); + /* + * If this btree node is selected in btree_flush_write() by journal + * code, delay and retry until the node is flushed by journal code + * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). + */ + if (btree_node_journal_flush(b)) { + pr_debug("bnode %p is flushing by journal, retry", b); + mutex_unlock(&b->write_lock); + udelay(1); + goto retry; + } + if (btree_node_dirty(b)) __bch_btree_node_write(b, &cl); mutex_unlock(&b->write_lock); @@ -1081,7 +1094,20 @@ static void btree_node_free(struct btree *b) BUG_ON(b == b->c->root); +retry: mutex_lock(&b->write_lock); + /* + * If the btree node is selected and flushing in btree_flush_write(), + * delay and retry until the BTREE_NODE_journal_flush bit cleared, + * then it is safe to free the btree node here. Otherwise this btree + * node will be in race condition.
+ */ + if (btree_node_journal_flush(b)) { + mutex_unlock(&b->write_lock); + pr_debug("bnode %p journal_flush set, retry", b); + udelay(1); + goto retry; + } if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d1c72ef64edf..76cfd121a486 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -158,11 +158,13 @@ enum btree_flags { BTREE_NODE_io_error, BTREE_NODE_dirty, BTREE_NODE_write_idx, + BTREE_NODE_journal_flush, }; BTREE_FLAG(io_error); BTREE_FLAG(dirty); BTREE_FLAG(write_idx); +BTREE_FLAG(journal_flush); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1218e3cada3c..a1e3e1fcea6e 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -430,6 +430,7 @@ static void btree_flush_write(struct cache_set *c) retry: best = NULL; + mutex_lock(&c->bucket_lock); for_each_cached_btree(b, c, i) if (btree_current_write(b)->journal) { if (!best) @@ -442,15 +443,21 @@ retry: } b = best; + if (b) + set_btree_node_journal_flush(b); + mutex_unlock(&c->bucket_lock); + if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); /* We raced */ goto retry; } __bch_btree_node_write(b, NULL); + clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); } } -- cgit v1.2.3 From 91be66e1318f67ed5888ca10e10cc8ffdc24f661 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 19:59:59 +0800 Subject: bcache: performance improvement for btree_flush_write() This patch improves the performance of btree_flush_write() in the following ways, - Use another spinlock, journal.flush_write_lock, to replace the very hot journal.lock. We don't have to use journal.lock here: selecting candidate btree nodes takes a lot of time, and holding journal.lock would block other journaling threads and drop overall I/O performance. - Only select the btree node to flush from the c->btree_cache list. When the machine has a large system memory, the mca cache may hold a huge number of cached btree nodes. Iterating all the cached nodes takes a lot of CPU time, and most of the nodes on the c->btree_cache_freeable and c->btree_cache_freed lists are cleared and have no need to flush. So traversing only the mca list c->btree_cache to select a btree node to flush should be enough for most cases. - Don't iterate the whole c->btree_cache list; only select the first BTREE_FLUSH_NR btree nodes in reverse order to flush (see the sketch below). Iterating all btree nodes of c->btree_cache and selecting the ones with the oldest journal pins consumes a huge number of CPU cycles if the list is huge (pushing and popping a node into/out of a heap is expensive). The last several dirty btree nodes at the tail of the c->btree_cache list are the earliest allocated and cached btree nodes, so they correspond to the oldest journal pins. Therefore flushing only BTREE_FLUSH_NR btree nodes from the tail of c->btree_cache probably covers the btree nodes with the oldest journal pins. In my testing, the above changes decrease CPU consumption by 50%+ when journal space is full. Sometimes IOPS drops to 0 for 5-8 seconds; compared with I/O blocking for 120+ seconds in the previous code, this is much better. Maybe there is room to improve in the future, but at this moment the fix looks fine and performs well in my testing.
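A sketch of that bounded tail selection in plain C; the list layout and names are illustrative stand-ins, while the real code walks c->btree_cache with list_for_each_entry_safe_reverse().

#include <stdbool.h>
#include <stddef.h>

#define BTREE_FLUSH_NR 8

struct node {
	struct node *prev;	/* toward the head of the LRU-ordered list */
	bool dirty;
};

/*
 * Walk from the tail, where the earliest-allocated (and so likely
 * oldest-journal-pin) nodes live, and stop after BTREE_FLUSH_NR dirty
 * candidates instead of scoring the whole cache with a heap.
 */
static unsigned int pick_victims(struct node *tail,
				 struct node *out[BTREE_FLUSH_NR])
{
	unsigned int n = 0;

	for (struct node *b = tail; b != NULL && n < BTREE_FLUSH_NR; b = b->prev)
		if (b->dirty)
			out[n++] = b;
	return n;
}

The design trades exactness for a walk whose cost no longer grows with the size of the mca cache: the tail heuristic usually captures the oldest journal pins, and the candidate count is capped at BTREE_FLUSH_NR regardless of how many nodes are cached.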
Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 85 +++++++++++++++++++++++++++++++++------------ drivers/md/bcache/journal.h | 4 +++ 2 files changed, 67 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a1e3e1fcea6e..8bcd8f1bf8cb 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -419,47 +419,87 @@ err: static void btree_flush_write(struct cache_set *c) { - /* - * Try to find the btree node with that references the oldest journal - * entry, best is our current candidate and is locked if non NULL: - */ - struct btree *b, *best; - unsigned int i; + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; + unsigned int i, n; + + if (c->journal.btree_flushing) + return; + + spin_lock(&c->journal.flush_write_lock); + if (c->journal.btree_flushing) { + spin_unlock(&c->journal.flush_write_lock); + return; + } + c->journal.btree_flushing = true; + spin_unlock(&c->journal.flush_write_lock); atomic_long_inc(&c->flush_write); -retry: - best = NULL; + memset(btree_nodes, 0, sizeof(btree_nodes)); + n = 0; mutex_lock(&c->bucket_lock); - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best)->journal, - btree_current_write(b)->journal)) { - best = b; - } + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (btree_node_journal_flush(b)) + pr_err("BUG: flush_write bit should not be set here!"); + + mutex_lock(&b->write_lock); + + if (!btree_node_dirty(b)) { + mutex_unlock(&b->write_lock); + continue; + } + + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + continue; } - b = best; - if (b) set_btree_node_journal_flush(b); + + mutex_unlock(&b->write_lock); + + btree_nodes[n++] = b; + if (n == BTREE_FLUSH_NR) + break; + } mutex_unlock(&c->bucket_lock); - if (b) { + for (i = 0; i < n; i++) { + b = btree_nodes[i]; + if (!b) { + pr_err("BUG: btree_nodes[%d] is NULL", i); + continue; + } + + /* safe to check without holding b->write_lock */ + if (!btree_node_journal_flush(b)) { + pr_err("BUG: bnode %p: journal_flush bit cleaned", b); + continue; + } + mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); - /* We raced */ - goto retry; + pr_debug("bnode %p: written by others", b); + continue; + } + + if (!btree_node_dirty(b)) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: dirty bit cleaned by others", b); + continue; } __bch_btree_node_write(b, NULL); clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); } + + spin_lock(&c->journal.flush_write_lock); + c->journal.btree_flushing = false; + spin_unlock(&c->journal.flush_write_lock); } #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -881,6 +921,7 @@ int bch_journal_alloc(struct cache_set *c) struct journal *j = &c->journal; spin_lock_init(&j->lock); + spin_lock_init(&j->flush_write_lock); INIT_DELAYED_WORK(&j->work, journal_write_work); c->journal_delay_ms = 100; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 66f0facff84b..f2ea34d5f431 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -103,6 +103,8 @@ struct journal_write { /* Embedded in struct cache_set */ struct journal { spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; /* 
used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; @@ -154,6 +156,8 @@ struct journal_device { struct bio_vec bv[8]; }; +#define BTREE_FLUSH_NR 8 + #define journal_pin_cmp(c, l, r) \ (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) -- cgit v1.2.3 From dff90d58a1c815b87b2603295382c97e78064349 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 28 Jun 2019 20:00:00 +0800 Subject: bcache: add reclaimed_journal_buckets to struct cache_set Now we have counters for how many times the journal is reclaimed and how many times cached dirty btree nodes are flushed, but we don't know how many journal buckets are really reclaimed. This patch adds reclaimed_journal_buckets to struct cache_set; it is an increase-only counter that tells how many journal buckets have been reclaimed since the cache set started running. From these three counters (reclaim, reclaimed_journal_buckets, flush_write), we can get an idea of how well the current journal space reclaim code works. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/journal.c | 1 + drivers/md/bcache/sysfs.c | 5 +++++ 3 files changed, 7 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 35396248a7d5..013e35a9e317 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -705,6 +705,7 @@ struct cache_set { atomic_long_t writeback_keys_failed; atomic_long_t reclaim; + atomic_long_t reclaimed_journal_buckets; atomic_long_t flush_write; enum { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8bcd8f1bf8cb..be2a2a201603 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -614,6 +614,7 @@ static void journal_reclaim(struct cache_set *c) k->ptr[n++] = MAKE_PTR(0, bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); } if (n) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 701a386a954c..9f0826712845 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -82,6 +82,7 @@ read_attribute(bset_tree_stats); read_attribute(state); read_attribute(cache_read_races); read_attribute(reclaim); +read_attribute(reclaimed_journal_buckets); read_attribute(flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); @@ -705,6 +706,9 @@ SHOW(__bch_cache_set) sysfs_print(reclaim, atomic_long_read(&c->reclaim)); + sysfs_print(reclaimed_journal_buckets, + atomic_long_read(&c->reclaimed_journal_buckets)); + sysfs_print(flush_write, atomic_long_read(&c->flush_write)); @@ -931,6 +935,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_bset_tree_stats, &sysfs_cache_read_races, &sysfs_reclaim, + &sysfs_reclaimed_journal_buckets, &sysfs_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, -- cgit v1.2.3
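As a closing illustration, the new statistic follows a common observability pattern: an increase-only counter bumped at the event site and exported read-only. A user-space analogue with C11 atomics (the names mirror the patch, but the code is a stand-in, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long reclaimed_journal_buckets;	/* increase-only, never reset */

static void reclaim_one_bucket(void)
{
	/* ... the actual bucket reclaim work would go here ... */
	atomic_fetch_add(&reclaimed_journal_buckets, 1);
}

/* Analogous to the sysfs show hook the patch adds. */
static void show_stats(FILE *out)
{
	fprintf(out, "reclaimed_journal_buckets: %ld\n",
		atomic_load(&reclaimed_journal_buckets));
}

Because the counter only ever grows, two readings taken some interval apart yield a rate, which is what makes the trio of reclaim, reclaimed_journal_buckets and flush_write useful for judging how well journal space reclaim keeps up.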