From 0b96da639a4874311e9b5156405f69ef9fc3bef8 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Feb 2020 22:12:05 +0800 Subject: bcache: ignore pending signals when creating gc and allocator thread When run a cache set, all the bcache btree node of this cache set will be checked by bch_btree_check(). If the bcache btree is very large, iterating all the btree nodes will occupy too much system memory and the bcache registering process might be selected and killed by system OOM killer. kthread_run() will fail if current process has pending signal, therefore the kthread creating in run_cache_set() for gc and allocator kernel threads are very probably failed for a very large bcache btree. Indeed such OOM is safe and the registering process will exit after the registration done. Therefore this patch flushes pending signals during the cache set start up, specificly in bch_cache_allocator_start() and bch_gc_thread_start(), to make sure run_cache_set() won't fail for large cahced data set. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 18 ++++++++++++++++-- drivers/md/bcache/btree.c | 13 +++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..8bc1faf71ff2 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #define MAX_OPEN_BUCKETS 128 @@ -733,8 +734,21 @@ int bch_open_buckets_alloc(struct cache_set *c) int bch_cache_allocator_start(struct cache *ca) { - struct task_struct *k = kthread_run(bch_allocator_thread, - ca, "bcache_allocator"); + struct task_struct *k; + + /* + * In case previous btree check operation occupies too many + * system memory for bcache btree node cache, and the + * registering process is selected by OOM killer. Here just + * ignore the SIGKILL sent by OOM killer if there is, to + * avoid kthread_run() being failed by pending signals. The + * bcache registering process will exit after the registration + * done. + */ + if (signal_pending(current)) + flush_signals(current); + + k = kthread_run(bch_allocator_thread, ca, "bcache_allocator"); if (IS_ERR(k)) return PTR_ERR(k); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fa872df4e770..b12186c87f52 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1913,6 +1914,18 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { + /* + * In case previous btree check operation occupies too many + * system memory for bcache btree node cache, and the + * registering process is selected by OOM killer. Here just + * ignore the SIGKILL sent by OOM killer if there is, to + * avoid kthread_run() being failed by pending signals. The + * bcache registering process will exit after the registration + * done. + */ + if (signal_pending(current)) + flush_signals(current); + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); return PTR_ERR_OR_ZERO(c->gc_thread); } -- cgit v1.2.3 From 309cc719a2c869b71a7388209a0a80d4284d98fd Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Feb 2020 22:12:06 +0800 Subject: bcache: Revert "bcache: shrink btree node cache after bch_btree_check()" This reverts commit 1df3877ff6a4810054237c3259d900ded4468969. In my testing, sometimes even all the cached btree nodes are freed, creating gc and allocator kernel threads may still fail. Finally it turns out that kthread_run() may fail if there is pending signal for current task. And the pending signal is sent from OOM killer which is triggered by memory consuption in bch_btree_check(). Therefore explicitly shrinking bcache btree node here does not help, and after the shrinker callback is improved, as well as pending signals are ignored before creating kernel threads, now such operation is unncessary anymore. This patch reverts the commit 1df3877ff6a4 ("bcache: shrink btree node cache after bch_btree_check()") because we have better improvement now. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2749daf09724..0c3c5419c52b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1917,23 +1917,6 @@ static int run_cache_set(struct cache_set *c) if (bch_btree_check(c)) goto err; - /* - * bch_btree_check() may occupy too much system memory which - * has negative effects to user space application (e.g. data - * base) performance. Shrink the mca cache memory proactively - * here to avoid competing memory with user space workloads.. - */ - if (!c->shrinker_disabled) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = c->btree_cache_used * c->btree_pages; - /* first run to clear b->accessed tag */ - c->shrink.scan_objects(&c->shrink, &sc); - /* second run to reap non-accessed nodes */ - c->shrink.scan_objects(&c->shrink, &sc); - } - bch_journal_mark(c, &journal); bch_initial_gc_finish(c); pr_debug("btree_check() done"); -- cgit v1.2.3 From 4ec31cb6241d95879aac337cc6b50c45dd10cfcb Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Feb 2020 22:12:07 +0800 Subject: bcache: remove macro nr_to_fifo_front() Macro nr_to_fifo_front() is only used once in btree_flush_write(), it is unncessary indeed. This patch removes this macro and does calculation directly in place. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6730820780b0..0e3ff9745ac7 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -417,8 +417,6 @@ err: /* Journalling */ -#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask)) - static void btree_flush_write(struct cache_set *c) { struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; @@ -510,9 +508,8 @@ static void btree_flush_write(struct cache_set *c) * journal entry can be reclaimed). These selected nodes * will be ignored and skipped in the folowing for-loop. */ - if (nr_to_fifo_front(btree_current_write(b)->journal, - fifo_front_p, - mask) != 0) { + if (((btree_current_write(b)->journal - fifo_front_p) & + mask) != 0) { mutex_unlock(&b->write_lock); continue; } -- cgit v1.2.3