diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-22 13:38:05 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-22 13:38:05 -0700 |
commit | 5bed49adfe899667887db0739830190309c9011b (patch) | |
tree | aec2108f17a63b6ff8faba95e492b2a7887c7bc4 /block | |
parent | fe6f0ed0dac7df01014ef17fdad45e3eaf21b949 (diff) | |
parent | 1e7da865b8c0428b9bcb18ba05ba0f6f47bcfdb4 (diff) | |
download | linux-5bed49adfe899667887db0739830190309c9011b.tar.gz linux-5bed49adfe899667887db0739830190309c9011b.tar.bz2 linux-5bed49adfe899667887db0739830190309c9011b.zip |
Merge tag 'for-4.19/post-20180822' of git://git.kernel.dk/linux-block
Pull more block updates from Jens Axboe:
- Set of bcache fixes and changes (Coly)
- The flush warn fix (me)
- Small series of BFQ fixes (Paolo)
- wbt hang fix (Ming)
- blktrace fix (Steven)
- blk-mq hardware queue count update fix (Jianchao)
- Various little fixes
* tag 'for-4.19/post-20180822' of git://git.kernel.dk/linux-block: (31 commits)
block/DAC960.c: make some arrays static const, shrinks object size
blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter
blk-mq: init hctx sched after update ctx and hctx mapping
block: remove duplicate initialization
tracing/blktrace: Fix to allow setting same value
pktcdvd: fix setting of 'ret' error return for a few cases
block: change return type to bool
block, bfq: return nbytes and not zero from struct cftype .write() method
block, bfq: improve code of bfq_bfqq_charge_time
block, bfq: reduce write overcharge
block, bfq: always update the budget of an entity when needed
block, bfq: readd missing reset of parent-entity service
blk-wbt: fix IO hang in wbt_wait()
block: don't warn for flush on read-only device
bcache: add the missing comments for smp_mb()/smp_wmb()
bcache: remove unnecessary space before ioctl function pointer arguments
bcache: add missing SPDX header
bcache: move open brace at end of function definitions to next line
bcache: add static const prefix to char * array declarations
bcache: fix code comments style
...
Diffstat (limited to 'block')
-rw-r--r-- | block/bfq-cgroup.c | 3 | ||||
-rw-r--r-- | block/bfq-iosched.c | 54 | ||||
-rw-r--r-- | block/bfq-wf2q.c | 22 | ||||
-rw-r--r-- | block/blk-core.c | 5 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 44 | ||||
-rw-r--r-- | block/blk-mq-sched.h | 5 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 14 | ||||
-rw-r--r-- | block/blk-mq.c | 96 | ||||
-rw-r--r-- | block/blk-wbt.c | 6 | ||||
-rw-r--r-- | block/blk.h | 4 | ||||
-rw-r--r-- | block/elevator.c | 20 |
11 files changed, 173 insertions, 100 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index a9e8633388f4..58c6efa9f9a9 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -913,7 +913,8 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, if (ret) return ret; - return bfq_io_set_weight_legacy(of_css(of), NULL, weight); + ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); + return ret ?: nbytes; } #ifdef CONFIG_DEBUG_BLK_CGROUP diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 41d9036b1822..653100fb719e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -187,11 +187,25 @@ static const int bfq_stats_min_budgets = 194; static const int bfq_default_max_budget = 16 * 1024; /* - * Async to sync throughput distribution is controlled as follows: - * when an async request is served, the entity is charged the number - * of sectors of the request, multiplied by the factor below + * When a sync request is dispatched, the queue that contains that + * request, and all the ancestor entities of that queue, are charged + * with the number of sectors of the request. In constrast, if the + * request is async, then the queue and its ancestor entities are + * charged with the number of sectors of the request, multiplied by + * the factor below. This throttles the bandwidth for async I/O, + * w.r.t. to sync I/O, and it is done to counter the tendency of async + * writes to steal I/O throughput to reads. + * + * The current value of this parameter is the result of a tuning with + * several hardware and software configurations. We tried to find the + * lowest value for which writes do not cause noticeable problems to + * reads. In fact, the lower this parameter, the stabler I/O control, + * in the following respect. The lower this parameter is, the less + * the bandwidth enjoyed by a group decreases + * - when the group does writes, w.r.t. to when it does reads; + * - when other groups do reads, w.r.t. to when they do writes. */ -static const int bfq_async_charge_factor = 10; +static const int bfq_async_charge_factor = 3; /* Default timeout values, in jiffies, approximating CFQ defaults. */ const int bfq_timeout = HZ / 8; @@ -853,16 +867,7 @@ static unsigned long bfq_serv_to_charge(struct request *rq, if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) return blk_rq_sectors(rq); - /* - * If there are no weight-raised queues, then amplify service - * by just the async charge factor; otherwise amplify service - * by twice the async charge factor, to further reduce latency - * for weight-raised queues. - */ - if (bfqq->bfqd->wr_busy_queues == 0) - return blk_rq_sectors(rq) * bfq_async_charge_factor; - - return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; + return blk_rq_sectors(rq) * bfq_async_charge_factor; } /** @@ -3298,6 +3303,27 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, */ } else entity->service = 0; + + /* + * Reset the received-service counter for every parent entity. + * Differently from what happens with bfqq->entity.service, + * the resetting of this counter never needs to be postponed + * for parent entities. In fact, in case bfqq may have a + * chance to go on being served using the last, partially + * consumed budget, bfqq->entity.service needs to be kept, + * because if bfqq then actually goes on being served using + * the same budget, the last value of bfqq->entity.service is + * needed to properly decrement bfqq->entity.budget by the + * portion already consumed. In contrast, it is not necessary + * to keep entity->service for parent entities too, because + * the bubble up of the new value of bfqq->entity.budget will + * make sure that the budgets of parent entities are correct, + * even in case bfqq and thus parent entities go on receiving + * service with the same budget. + */ + entity = entity->parent; + for_each_entity(entity) + entity->service = 0; } /* diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index dbc07b456059..ae52bff43ce4 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -130,10 +130,14 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, if (!change_without_lookup) /* lookup needed */ next_in_service = bfq_lookup_next_entity(sd, expiration); - if (next_in_service) - parent_sched_may_change = !sd->next_in_service || + if (next_in_service) { + bool new_budget_triggers_change = bfq_update_parent_budget(next_in_service); + parent_sched_may_change = !sd->next_in_service || + new_budget_triggers_change; + } + sd->next_in_service = next_in_service; if (!next_in_service) @@ -877,15 +881,11 @@ void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned long time_ms) { struct bfq_entity *entity = &bfqq->entity; - int tot_serv_to_charge = entity->service; - unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); - - if (time_ms > 0 && time_ms < timeout_ms) - tot_serv_to_charge = - (bfqd->bfq_max_budget * time_ms) / timeout_ms; - - if (tot_serv_to_charge < entity->service) - tot_serv_to_charge = entity->service; + unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); + unsigned long bounded_time_ms = min(time_ms, timeout_ms); + int serv_to_charge_for_time = + (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; + int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service); /* Increase budget to avoid inconsistencies */ if (tot_serv_to_charge > entity->budget) diff --git a/block/blk-core.c b/block/blk-core.c index 12550340418d..dee56c282efb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1036,7 +1036,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, NULL); - INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP @@ -2162,7 +2161,9 @@ static inline bool should_fail_request(struct hd_struct *part, static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { - if (part->policy && op_is_write(bio_op(bio))) { + const int op = bio_op(bio); + + if (part->policy && (op_is_write(op) && !op_is_flush(op))) { char b[BDEVNAME_SIZE]; WARN_ONCE(1, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index cf9c66c6d35a..29bfe8017a2d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -462,50 +462,6 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) blk_mq_sched_free_tags(set, hctx, i); } -int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - struct elevator_queue *e = q->elevator; - int ret; - - if (!e) - return 0; - - ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx); - if (ret) - return ret; - - if (e->type->ops.mq.init_hctx) { - ret = e->type->ops.mq.init_hctx(hctx, hctx_idx); - if (ret) { - blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); - return ret; - } - } - - blk_mq_debugfs_register_sched_hctx(q, hctx); - - return 0; -} - -void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - struct elevator_queue *e = q->elevator; - - if (!e) - return; - - blk_mq_debugfs_unregister_sched_hctx(hctx); - - if (e->type->ops.mq.exit_hctx && hctx->sched_data) { - e->type->ops.mq.exit_hctx(hctx, hctx_idx); - hctx->sched_data = NULL; - } - - blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); -} - int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0cb8f938dff9..4e028ee42430 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -28,11 +28,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); -int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx); -void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx); - static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 816923bf874d..94e1ed667b6e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -320,6 +320,18 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, struct blk_mq_hw_ctx *hctx; int i; + /* + * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and + * queue_hw_ctx after freeze the queue. So we could use q_usage_counter + * to avoid race with it. __blk_mq_update_nr_hw_queues will users + * synchronize_rcu to ensure all of the users go out of the critical + * section below and see zeroed q_usage_counter. + */ + rcu_read_lock(); + if (percpu_ref_is_zero(&q->q_usage_counter)) { + rcu_read_unlock(); + return; + } queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; @@ -335,7 +347,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - + rcu_read_unlock(); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, diff --git a/block/blk-mq.c b/block/blk-mq.c index 72a0033ccee9..85a1c1a59c72 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2145,8 +2145,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); - blk_mq_sched_exit_hctx(q, hctx, hctx_idx); - if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -2214,12 +2212,9 @@ static int blk_mq_init_hctx(struct request_queue *q, set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; - if (blk_mq_sched_init_hctx(q, hctx, hctx_idx)) - goto exit_hctx; - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); if (!hctx->fq) - goto sched_exit_hctx; + goto exit_hctx; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; @@ -2233,8 +2228,6 @@ static int blk_mq_init_hctx(struct request_queue *q, free_fq: kfree(hctx->fq); - sched_exit_hctx: - blk_mq_sched_exit_hctx(q, hctx, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -2896,10 +2889,81 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +/* + * request_queue and elevator_type pair. + * It is just used by __blk_mq_update_nr_hw_queues to cache + * the elevator_type associated with a request_queue. + */ +struct blk_mq_qe_pair { + struct list_head node; + struct request_queue *q; + struct elevator_type *type; +}; + +/* + * Cache the elevator_type in qe pair list and switch the + * io scheduler to 'none' + */ +static bool blk_mq_elv_switch_none(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + + if (!q->elevator) + return true; + + qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); + if (!qe) + return false; + + INIT_LIST_HEAD(&qe->node); + qe->q = q; + qe->type = q->elevator->type; + list_add(&qe->node, head); + + mutex_lock(&q->sysfs_lock); + /* + * After elevator_switch_mq, the previous elevator_queue will be + * released by elevator_release. The reference of the io scheduler + * module get by elevator_get will also be put. So we need to get + * a reference of the io scheduler module here to prevent it to be + * removed. + */ + __module_get(qe->type->elevator_owner); + elevator_switch_mq(q, NULL); + mutex_unlock(&q->sysfs_lock); + + return true; +} + +static void blk_mq_elv_switch_back(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + struct elevator_type *t = NULL; + + list_for_each_entry(qe, head, node) + if (qe->q == q) { + t = qe->type; + break; + } + + if (!t) + return; + + list_del(&qe->node); + kfree(qe); + + mutex_lock(&q->sysfs_lock); + elevator_switch_mq(q, t); + mutex_unlock(&q->sysfs_lock); +} + static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; + LIST_HEAD(head); lockdep_assert_held(&set->tag_list_lock); @@ -2910,6 +2974,18 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); + /* + * Sync with blk_mq_queue_tag_busy_iter. + */ + synchronize_rcu(); + /* + * Switch IO scheduler to 'none', cleaning up the data associated + * with the previous scheduler. We will switch back once we are done + * updating the new sw to hw queue mappings. + */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + if (!blk_mq_elv_switch_none(&head, q)) + goto switch_back; set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); @@ -2918,6 +2994,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_queue_reinit(q); } +switch_back: + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_elv_switch_back(&head, q); + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); } diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 1d94a20374fc..bb93c7c2b182 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -576,12 +576,8 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; - if (!rwb_enabled(rwb)) - return; - flags = bio_to_wbt_flags(rwb, bio); - - if (!wbt_should_throttle(rwb, bio)) { + if (!(flags & WBT_TRACKED)) { if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); return; diff --git a/block/blk.h b/block/blk.h index d4d67e948920..9db4e389582c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -234,6 +234,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq int elevator_init(struct request_queue *); int elevator_init_mq(struct request_queue *q); +int elevator_switch_mq(struct request_queue *q, + struct elevator_type *new_e); void elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q); @@ -297,7 +299,7 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int); * b) the queue had IO stats enabled when this request was started, and * c) it's a file system request */ -static inline int blk_do_io_stat(struct request *rq) +static inline bool blk_do_io_stat(struct request *rq) { return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT) && diff --git a/block/elevator.c b/block/elevator.c index fa828b5bfd4b..5ea6e7d600e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -933,16 +933,13 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static int elevator_switch_mq(struct request_queue *q, +int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e) { int ret; lockdep_assert_held(&q->sysfs_lock); - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - if (q->elevator) { if (q->elevator->registered) elv_unregister_queue(q); @@ -968,8 +965,6 @@ static int elevator_switch_mq(struct request_queue *q, blk_add_trace_msg(q, "elv switch: none"); out: - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); return ret; } @@ -1021,8 +1016,17 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) lockdep_assert_held(&q->sysfs_lock); - if (q->mq_ops) - return elevator_switch_mq(q, new_e); + if (q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + err = elevator_switch_mq(q, new_e); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return err; + } /* * Turn on BYPASS and drain all requests w/ elevator private data. |