From a5049a8ae34950249a7ae94c385d7c5c98914412 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 19 Jun 2014 17:42:57 -0400 Subject: blkcg: fix use-after-free in __blkg_release_rcu() by making blkcg_gq refcnt an atomic_t Hello, So, this patch should do. Joe, Vivek, can one of you guys please verify that the oops goes away with this patch? Jens, the original thread can be read at http://thread.gmane.org/gmane.linux.kernel/1720729 The fix converts blkg->refcnt from int to atomic_t. It does some overhead but it should be minute compared to everything else which is going on and the involved cacheline bouncing, so I think it's highly unlikely to cause any noticeable difference. Also, the refcnt in question should be converted to a perpcu_ref for blk-mq anyway, so the atomic_t is likely to go away pretty soon anyway. Thanks. ------- 8< ------- __blkg_release_rcu() may be invoked after the associated request_queue is released with a RCU grace period inbetween. As such, the function and callbacks invoked from it must not dereference the associated request_queue. This is clearly indicated in the comment above the function. Unfortunately, while trying to fix a different issue, 2a4fd070ee85 ("blkcg: move bulk of blkcg_gq release operations to the RCU callback") ignored this and added [un]locking of @blkg->q->queue_lock to __blkg_release_rcu(). This of course can cause oops as the request_queue may be long gone by the time this code gets executed. general protection fault: 0000 [#1] SMP CPU: 21 PID: 30 Comm: rcuos/21 Not tainted 3.15.0 #1 Hardware name: Stratus ftServer 6400/G7LAZ, BIOS BIOS Version 6.3:57 12/25/2013 task: ffff880854021de0 ti: ffff88085403c000 task.ti: ffff88085403c000 RIP: 0010:[] [] _raw_spin_lock_irq+0x15/0x60 RSP: 0018:ffff88085403fdf0 EFLAGS: 00010086 RAX: 0000000000020000 RBX: 0000000000000010 RCX: 0000000000000000 RDX: 000060ef80008248 RSI: 0000000000000286 RDI: 6b6b6b6b6b6b6b6b RBP: ffff88085403fdf0 R08: 0000000000000286 R09: 0000000000009f39 R10: 0000000000020001 R11: 0000000000020001 R12: ffff88103c17a130 R13: ffff88103c17a080 R14: 0000000000000000 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88107fca0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000006e5ab8 CR3: 000000000193d000 CR4: 00000000000407e0 Stack: ffff88085403fe18 ffffffff812cbfc2 ffff88103c17a130 0000000000000000 ffff88103c17a130 ffff88085403fec0 ffffffff810d1d28 ffff880854021de0 ffff880854021de0 ffff88107fcaec58 ffff88085403fe80 ffff88107fcaec30 Call Trace: [] __blkg_release_rcu+0x72/0x150 [] rcu_nocb_kthread+0x1e8/0x300 [] kthread+0xe1/0x100 [] ret_from_fork+0x7c/0xb0 Code: ff 47 04 48 8b 7d 08 be 00 02 00 00 e8 55 48 a4 ff 5d c3 0f 1f 00 66 66 66 66 90 55 48 89 e5 +fa 66 66 90 66 66 90 b8 00 00 02 00 0f c1 07 89 c2 c1 ea 10 66 39 c2 75 02 5d c3 83 e2 fe 0f +b7 RIP [] _raw_spin_lock_irq+0x15/0x60 RSP The request_queue locking was added because blkcg_gq->refcnt is an int protected with the queue lock and __blkg_release_rcu() needs to put the parent. Let's fix it by making blkcg_gq->refcnt an atomic_t and dropping queue locking in the function. Given the general heavy weight of the current request_queue and blkcg operations, this is unlikely to cause any noticeable overhead. Moreover, blkcg_gq->refcnt is likely to be converted to percpu_ref in the near future, so whatever (most likely negligible) overhead it may add is temporary. Signed-off-by: Tejun Heo Reported-by: Joe Lawrence Acked-by: Vivek Goyal Link: http://lkml.kernel.org/g/alpine.DEB.2.02.1406081816540.17948@jlaw-desktop.mno.stratus.com Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 ++----- block/blk-cgroup.h | 17 +++++++---------- 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 069bc202ffe3..1463ca6b7aae 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -80,7 +80,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; - blkg->refcnt = 1; + atomic_set(&blkg->refcnt, 1); /* root blkg uses @q->root_rl, init rl only for !root blkgs */ if (blkcg != &blkcg_root) { @@ -399,11 +399,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head) /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - if (blkg->parent) { - spin_lock_irq(blkg->q->queue_lock); + if (blkg->parent) blkg_put(blkg->parent); - spin_unlock_irq(blkg->q->queue_lock); - } blkg_free(blkg); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index cbb7f943f78a..1c401b057151 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -18,6 +18,7 @@ #include #include #include +#include /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -104,7 +105,7 @@ struct blkcg_gq { struct request_list rl; /* reference count */ - int refcnt; + atomic_t refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; @@ -257,13 +258,12 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) * blkg_get - get a blkg reference * @blkg: blkg to get * - * The caller should be holding queue_lock and an existing reference. + * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { - lockdep_assert_held(blkg->q->queue_lock); - WARN_ON_ONCE(!blkg->refcnt); - blkg->refcnt++; + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + atomic_inc(&blkg->refcnt); } void __blkg_release_rcu(struct rcu_head *rcu); @@ -271,14 +271,11 @@ void __blkg_release_rcu(struct rcu_head *rcu); /** * blkg_put - put a blkg reference * @blkg: blkg to put - * - * The caller should be holding queue_lock. */ static inline void blkg_put(struct blkcg_gq *blkg) { - lockdep_assert_held(blkg->q->queue_lock); - WARN_ON_ONCE(blkg->refcnt <= 0); - if (!--blkg->refcnt) + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); + if (atomic_dec_and_test(&blkg->refcnt)) call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -- cgit v1.2.3 From d5bf02914ea3ecf28db4f830f136dc04146b2317 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Jun 2014 16:31:56 -0600 Subject: Revert "block: add __init to blkcg_policy_register" This reverts commit a2d445d440003f2d70ee4cd4970ea82ace616fee. The original commit is buggy, we do use the registration functions at runtime for modular builds. --- block/blk-cgroup.c | 2 +- block/blk-cgroup.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1463ca6b7aae..b9f4cc494ece 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ -int __init blkcg_policy_register(struct blkcg_policy *pol) +int blkcg_policy_register(struct blkcg_policy *pol) { int i, ret; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 1c401b057151..d3fd7aa3d2a3 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -146,7 +146,7 @@ void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); /* Blkio controller policy registration */ -int __init blkcg_policy_register(struct blkcg_policy *pol); +int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol); @@ -577,7 +577,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { ret static inline int blkcg_init_queue(struct request_queue *q) { return 0; } static inline void blkcg_drain_queue(struct request_queue *q) { } static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int __init blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { return 0; } -- cgit v1.2.3 From e567bf7112518824830978d644dfb5a991e67d54 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 22 Jun 2014 16:32:48 -0600 Subject: Revert "block: add __init to elv_register" This reverts commit b5097e956a4d2919ee248d6481e4204c5568ed5c. The original commit is buggy, we do use the registration functions at runtime, for instance when loading IO schedulers through sysfs. Reported-by: Damien Wyart --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 34bded18910e..24c28b659bb3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -825,7 +825,7 @@ void elv_unregister_queue(struct request_queue *q) } EXPORT_SYMBOL(elv_unregister_queue); -int __init elv_register(struct elevator_type *e) +int elv_register(struct elevator_type *e) { char *def = ""; -- cgit v1.2.3 From 66cb45aa41315d1d9972cada354fbdf7870d7714 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Jun 2014 16:22:24 -0600 Subject: block: add support for limiting gaps in SG lists Another restriction inherited for NVMe - those devices don't support SG lists that have "gaps" in them. Gaps refers to cases where the previous SG entry doesn't end on a page boundary. For NVMe, all SG entries must start at offset 0 (except the first) and end on a page boundary (except the last). Signed-off-by: Jens Axboe --- block/bio.c | 8 ++++++++ block/blk-merge.c | 10 ++++++++++ 2 files changed, 18 insertions(+) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8c2e55e39a1b..0ec61c9e536c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -746,6 +746,14 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page goto done; } + + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. + */ + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && + bvec_gap_to_prev(prev, offset)) + return 0; } if (bio->bi_vcnt >= bio->bi_max_vecs) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df0f4c2..54535831f1e1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -568,6 +568,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_rq_merge_ok(struct request *rq, struct bio *bio) { + struct request_queue *q = rq->q; + if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; @@ -591,6 +593,14 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; + if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { + struct bio_vec *bprev; + + bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; + if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) + return false; + } + return true; } -- cgit v1.2.3 From 0ffbce80c263821161190f20e74a12f7aa8eab7b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Jun 2014 08:22:34 -0600 Subject: blk-mq: blk_mq_start_hw_queue() should use blk_mq_run_hw_queue() Currently it calls __blk_mq_run_hw_queue(), which depends on the CPU placement being correct. This means it's not possible to call blk_mq_start_hw_queues(q) from a context that is correct for all queues, leading to triggering the WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); in __blk_mq_run_hw_queue(). Reported-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 0ef2dc7f01bf..ad69ef657e85 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -878,7 +878,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) clear_bit(BLK_MQ_S_STOPPED, &hctx->state); preempt_disable(); - __blk_mq_run_hw_queue(hctx); + blk_mq_run_hw_queue(hctx, false); preempt_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); -- cgit v1.2.3