From 792732d9852c0e4505aceff4631ea2168fd02480 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 25 Apr 2019 00:48:29 +0800 Subject: bcache: use kmemdup_nul for CACHED_LABEL buffer This patch uses kmemdup_nul to create a NUL-terminated string from dc->sb.label. This is better than open coding it. With this, we can move env[2] initialization into env[] array to make code more elegant. Signed-off-by: Geliang Tang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a697a3a923cd..6e618cb6126c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -906,21 +906,18 @@ static int cached_dev_status_update(void *arg) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; - char buf[SB_LABEL_SIZE + 1]; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { "DRIVER=bcache", kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), - NULL, + kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? 
: ""), NULL, }; - memcpy(buf, dc->sb.label, SB_LABEL_SIZE); - buf[SB_LABEL_SIZE] = '\0'; - env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); + kfree(buf); return; } @@ -944,6 +941,7 @@ void bch_cached_dev_run(struct cached_dev *dc) kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); kfree(env[2]); + kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) -- cgit v1.2.3 From a4b732a248d12cbdb46999daf0bf288c011335eb Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Thu, 25 Apr 2019 00:48:31 +0800 Subject: bcache: fix a race between cache register and cacheset unregister There is a race between cache device register and cache set unregister. For an already registered cache device, register_bcache will call bch_is_open to iterate through all cachesets and check every cache there. The race occurs if cache_set_free executes at the same time and clears the caches right before ca is dereferenced in bch_is_open_cache. To close the race, let's make sure the clean up work is protected by the bch_register_lock as well. 
This issue can be reproduced as follows, while true; do echo /dev/XXX> /sys/fs/bcache/register ; done& while true; do echo 1> /sys/block/XXX/bcache/set/unregister ; done & and results in the following oops, [ +0.000053] BUG: unable to handle kernel NULL pointer dereference at 0000000000000998 [ +0.000457] #PF error: [normal kernel read fault] [ +0.000464] PGD 800000003ca9d067 P4D 800000003ca9d067 PUD 3ca9c067 PMD 0 [ +0.000388] Oops: 0000 [#1] SMP PTI [ +0.000269] CPU: 1 PID: 3266 Comm: bash Not tainted 5.0.0+ #6 [ +0.000346] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.fc28 04/01/2014 [ +0.000472] RIP: 0010:register_bcache+0x1829/0x1990 [bcache] [ +0.000344] Code: b0 48 83 e8 50 48 81 fa e0 e1 10 c0 0f 84 a9 00 00 00 48 89 c6 48 89 ca 0f b7 ba 54 04 00 00 4c 8b 82 60 0c 00 00 85 ff 74 2f <49> 3b a8 98 09 00 00 74 4e 44 8d 47 ff 31 ff 49 c1 e0 03 eb 0d [ +0.000839] RSP: 0018:ffff92ee804cbd88 EFLAGS: 00010202 [ +0.000328] RAX: ffffffffc010e190 RBX: ffff918b5c6b5000 RCX: ffff918b7d8e0000 [ +0.000399] RDX: ffff918b7d8e0000 RSI: ffffffffc010e190 RDI: 0000000000000001 [ +0.000398] RBP: ffff918b7d318340 R08: 0000000000000000 R09: ffffffffb9bd2d7a [ +0.000385] R10: ffff918b7eb253c0 R11: ffffb95980f51200 R12: ffffffffc010e1a0 [ +0.000411] R13: fffffffffffffff2 R14: 000000000000000b R15: ffff918b7e232620 [ +0.000384] FS: 00007f955bec2740(0000) GS:ffff918b7eb00000(0000) knlGS:0000000000000000 [ +0.000420] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000801] CR2: 0000000000000998 CR3: 000000003cad6000 CR4: 00000000001406e0 [ +0.000837] Call Trace: [ +0.000682] ? _cond_resched+0x10/0x20 [ +0.000691] ? __kmalloc+0x131/0x1b0 [ +0.000710] kernfs_fop_write+0xfa/0x170 [ +0.000733] __vfs_write+0x2e/0x190 [ +0.000688] ? inode_security+0x10/0x30 [ +0.000698] ? selinux_file_permission+0xd2/0x120 [ +0.000752] ? 
security_file_permission+0x2b/0x100 [ +0.000753] vfs_write+0xa8/0x1a0 [ +0.000676] ksys_write+0x4d/0xb0 [ +0.000699] do_syscall_64+0x3a/0xf0 [ +0.000692] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Liang Chen Cc: stable@vger.kernel.org Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6e618cb6126c..53c5e3e0ac22 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1514,6 +1514,7 @@ static void cache_set_free(struct closure *cl) bch_btree_cache_free(c); bch_journal_free(c); + mutex_lock(&bch_register_lock); for_each_cache(ca, c, i) if (ca) { ca->set = NULL; @@ -1532,7 +1533,6 @@ static void cache_set_free(struct closure *cl) mempool_exit(&c->search); kfree(c->devices); - mutex_lock(&bch_register_lock); list_del(&c->list); mutex_unlock(&bch_register_lock); -- cgit v1.2.3 From ce3e4cfb59cb382f8e5ce359238aa580d4ae7778 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 25 Apr 2019 00:48:34 +0800 Subject: bcache: add failure check to run_cache_set() for journal replay Currently run_cache_set() has no return value, so if there is a failure in bch_journal_replay(), the caller of run_cache_set() has no idea about such failure and just continues to execute the code following run_cache_set(). The internal failure is triggered inside bch_journal_replay() and is handled asynchronously. This behavior is inefficient: while the failure is being handled inside bch_journal_replay(), the cache register code is still running to start the cache set. Registering and unregistering code running at the same time may introduce some rare race conditions, and makes the code harder to understand. This patch adds a return value to run_cache_set(), and returns -EIO if bch_journal_replay() fails. 
Then the caller of run_cache_set() may detect such a failure and stop the registering code flow immediately inside register_cache_set(). If journal replay fails, run_cache_set() can report the error immediately to register_cache_set(). This patch makes the failure handling for bch_journal_replay() synchronous, easier to understand and debug, and avoids a potential race condition when registering and unregistering at the same time. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 53c5e3e0ac22..8c7fdada0acf 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1773,7 +1773,7 @@ err: return NULL; } -static void run_cache_set(struct cache_set *c) +static int run_cache_set(struct cache_set *c) { const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; @@ -1867,7 +1867,9 @@ static void run_cache_set(struct cache_set *c) if (j->version < BCACHE_JSET_VERSION_UUID) __uuid_write(c); - bch_journal_replay(c, &journal); + err = "bcache: replay journal failed"; + if (bch_journal_replay(c, &journal)) + goto err; } else { pr_notice("invalidating existing data"); @@ -1935,11 +1937,13 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); set_bit(CACHE_SET_RUNNING, &c->flags); - return; + return 0; err: closure_sync(&cl); /* XXX: test this, it's broken */ bch_cache_set_error(c, "%s", err); + + return -EIO; } static bool can_attach_cache(struct cache *ca, struct cache_set *c) @@ -2003,8 +2007,11 @@ found: ca->set->cache[ca->sb.nr_this_dev] = ca; c->cache_by_alloc[c->caches_loaded++] = ca; - if (c->caches_loaded == c->sb.nr_in_set) - run_cache_set(c); + if (c->caches_loaded == c->sb.nr_in_set) { + err = "failed to run cache set"; + if (run_cache_set(c) < 0) + goto err; + } return NULL; err: -- cgit v1.2.3 From 
2d17456eb1cc78803b999fdd503c2dbd42a7d3da Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 25 Apr 2019 00:48:35 +0800 Subject: bcache: add comments for kobj release callback routine Bcache has several routines to release resources in implicit way, they are called when the associated kobj released. This patch adds code comments to notice when and which release callback will be called, - When dc->disk.kobj released: void bch_cached_dev_release(struct kobject *kobj) - When d->kobj released: void bch_flash_dev_release(struct kobject *kobj) - When c->kobj released: void bch_cache_set_release(struct kobject *kobj) - When ca->kobj released void bch_cache_release(struct kobject *kobj) Signed-off-by: Coly Li Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8c7fdada0acf..f8d80adcafec 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1172,6 +1172,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return 0; } +/* when dc->disk.kobj released */ void bch_cached_dev_release(struct kobject *kobj) { struct cached_dev *dc = container_of(kobj, struct cached_dev, @@ -1324,6 +1325,7 @@ err: /* Flash only volumes */ +/* When d->kobj released */ void bch_flash_dev_release(struct kobject *kobj) { struct bcache_device *d = container_of(kobj, struct bcache_device, @@ -1494,6 +1496,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) 
return true; } +/* When c->kobj released */ void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); @@ -2021,6 +2024,7 @@ err: /* Cache device */ +/* When ca->kobj released */ void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); -- cgit v1.2.3 From 88c12d42d2bb6e05deb3cfd24d12f6fe80544575 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 25 Apr 2019 00:48:37 +0800 Subject: bcache: add error check for calling register_bdev() This patch adds return value to register_bdev(). Then if failure happens inside register_bdev(), its caller register_bcache() may detect and handle the failure more properly. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f8d80adcafec..fde334939545 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1279,7 +1279,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ -static void register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { @@ -1317,10 +1317,11 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) bch_cached_dev_run(dc); - return; + return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); + return -EIO; } /* Flash only volumes */ @@ -2271,7 +2272,7 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = size; + ssize_t ret = -EINVAL; const char *err = 
"cannot allocate memory"; char *path = NULL; struct cache_sb *sb = NULL; @@ -2305,7 +2306,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto out; + goto quiet_out; } goto err; } @@ -2326,8 +2327,10 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto err_close; mutex_lock(&bch_register_lock); - register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); + if (ret < 0) + goto err; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -2337,6 +2340,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (register_cache(sb, sb_page, bdev, ca) != 0) goto err; } +quiet_out: + ret = size; out: if (sb_page) put_page(sb_page); @@ -2349,7 +2354,6 @@ err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: pr_info("error %s: %s", path, err); - ret = -EINVAL; goto out; } -- cgit v1.2.3 From bb6d355c2aff42d4075a8e7428dd72cb009d6143 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 25 Apr 2019 00:48:38 +0800 Subject: bcache: Add comments for blkdev_put() in registration code path Add comments to explain why in register_bcache() blkdev_put() won't be called in two location. Add comments to explain why blkdev_put() must be called in register_cache() when cache_alloc() failed. 
Signed-off-by: Coly Li Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fde334939545..fa856b2ca7af 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2189,6 +2189,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ret = cache_alloc(ca); if (ret != 0) { + /* + * If we failed here, it means ca->kobj is not initialized yet, + * kobject_put() won't be called and there is no chance to + * call blkdev_put() to bdev in bch_cache_release(). So we + * explicitly call blkdev_put() here. + */ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; @@ -2329,6 +2335,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, mutex_lock(&bch_register_lock); ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); + /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) goto err; } else { @@ -2337,6 +2344,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!ca) goto err_close; + /* blkdev_put() will be called in bch_cache_release() */ if (register_cache(sb, sb_page, bdev, ca) != 0) goto err; } -- cgit v1.2.3 From 63d63b51d70fb5155754dcf0baa2c1700bcafcb0 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 25 Apr 2019 00:48:39 +0800 Subject: bcache: add comments for closure_fn to be called in closure_queue() Add code comments to explain which call back function might be called for the closure_queue(). This is an effort to make code to be more understandable for readers. 
Signed-off-by: Coly Li Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fa856b2ca7af..0363ab534c8e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -662,6 +662,11 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) + /* + * closure_fn set to + * - cached device: cached_dev_flush() + * - flash dev: flash_dev_flush() + */ closure_queue(&d->cl); } @@ -1675,6 +1680,7 @@ static void __cache_set_unregister(struct closure *cl) void bch_cache_set_stop(struct cache_set *c) { if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + /* closure_fn set to __cache_set_unregister() */ closure_queue(&c->caching); } -- cgit v1.2.3 From eb8cbb6df38f6e5124a3d5f1f8a3dbf519537c60 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 25 Apr 2019 00:48:40 +0800 Subject: bcache: improve bcache_reboot() This patch tries to release mutex bch_register_lock early, to give a chance to stop the cache sets and bcache devices early. This patch also extends the timeout for stopping all bcache devices from 2 seconds to 10 seconds, because stopping the writeback rate update worker may be delayed for 5 seconds, so 2 seconds is not enough. After this patch is applied, stopping bcache devices during system reboot or shutdown is very hard to be observed any more. 
Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0363ab534c8e..3f34b96ebbc3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2397,10 +2397,19 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk); + mutex_unlock(&bch_register_lock); + + /* + * Give an early chance for other kthreads and + * kworkers to stop themselves + */ + schedule(); + /* What's a condition variable? */ while (1) { - long timeout = start + 2 * HZ - jiffies; + long timeout = start + 10 * HZ - jiffies; + mutex_lock(&bch_register_lock); stopped = list_empty(&bch_cache_sets) && list_empty(&uncached_devices); @@ -2412,7 +2421,6 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) mutex_unlock(&bch_register_lock); schedule_timeout(timeout); - mutex_lock(&bch_register_lock); } finish_wait(&unregister_wait, &wait); -- cgit v1.2.3 From 95f18c9d1310730d075499a75aaf13bcd60405a7 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Thu, 25 Apr 2019 00:48:43 +0800 Subject: bcache: avoid potential memleak of list of journal_replay(s) in the CACHE_SYNC branch of run_cache_set In the CACHE_SYNC branch of run_cache_set(), LIST_HEAD(journal) is used to collect journal_replay(s) and is filled by bch_journal_read(). If all goes well, bch_journal_replay() will release the list of journal_replay(s) at the end of the branch. If something goes wrong, code flow will jump to the label "err:" and leave the list unreleased. This patch releases the list of journal_replay(s) when an error is detected. v1 -> v2: * Move the release code to the location after label 'err:' to simplify the change. 
Signed-off-by: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3f34b96ebbc3..0ffe9acee9d8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1790,6 +1790,8 @@ static int run_cache_set(struct cache_set *c) struct cache *ca; struct closure cl; unsigned int i; + LIST_HEAD(journal); + struct journal_replay *l; closure_init_stack(&cl); @@ -1949,6 +1951,12 @@ static int run_cache_set(struct cache_set *c) set_bit(CACHE_SET_RUNNING, &c->flags); return 0; err: + while (!list_empty(&journal)) { + l = list_first_entry(&journal, struct journal_replay, list); + list_del(&l->list); + kfree(l); + } + closure_sync(&cl); /* XXX: test this, it's broken */ bch_cache_set_error(c, "%s", err); -- cgit v1.2.3 From cdca22bcbc64fc83dadb8d927df400a8d86ddabb Mon Sep 17 00:00:00 2001 From: Coly Li Date: Tue, 30 Apr 2019 22:02:25 +0800 Subject: bcache: remove redundant LIST_HEAD(journal) from run_cache_set() Commit 95f18c9d1310 ("bcache: avoid potential memleak of list of journal_replay(s) in the CACHE_SYNC branch of run_cache_set") forgot to remove the original definition of LIST_HEAD(journal), which makes the change not take effect. This patch removes the redundant variable LIST_HEAD(journal) from run_cache_set(), to make Shenghui's fix work. 
Fixes: 95f18c9d1310 ("bcache: avoid potential memleak of list of journal_replay(s) in the CACHE_SYNC branch of run_cache_set") Reported-by: Juha Aatrokoski Cc: Shenghui Wang Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md/bcache/super.c') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 0ffe9acee9d8..1b63ac876169 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1800,7 +1800,6 @@ static int run_cache_set(struct cache_set *c) set_gc_sectors(c); if (CACHE_SYNC(&c->sb)) { - LIST_HEAD(journal); struct bkey *k; struct jset *j; -- cgit v1.2.3