summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-08-22 13:38:05 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-22 13:38:05 -0700
commit5bed49adfe899667887db0739830190309c9011b (patch)
treeaec2108f17a63b6ff8faba95e492b2a7887c7bc4 /block
parentfe6f0ed0dac7df01014ef17fdad45e3eaf21b949 (diff)
parent1e7da865b8c0428b9bcb18ba05ba0f6f47bcfdb4 (diff)
downloadlinux-5bed49adfe899667887db0739830190309c9011b.tar.gz
linux-5bed49adfe899667887db0739830190309c9011b.tar.bz2
linux-5bed49adfe899667887db0739830190309c9011b.zip
Merge tag 'for-4.19/post-20180822' of git://git.kernel.dk/linux-block
Pull more block updates from Jens Axboe: - Set of bcache fixes and changes (Coly) - The flush warn fix (me) - Small series of BFQ fixes (Paolo) - wbt hang fix (Ming) - blktrace fix (Steven) - blk-mq hardware queue count update fix (Jianchao) - Various little fixes * tag 'for-4.19/post-20180822' of git://git.kernel.dk/linux-block: (31 commits) block/DAC960.c: make some arrays static const, shrinks object size blk-mq: sync the update nr_hw_queues with blk_mq_queue_tag_busy_iter blk-mq: init hctx sched after update ctx and hctx mapping block: remove duplicate initialization tracing/blktrace: Fix to allow setting same value pktcdvd: fix setting of 'ret' error return for a few cases block: change return type to bool block, bfq: return nbytes and not zero from struct cftype .write() method block, bfq: improve code of bfq_bfqq_charge_time block, bfq: reduce write overcharge block, bfq: always update the budget of an entity when needed block, bfq: readd missing reset of parent-entity service blk-wbt: fix IO hang in wbt_wait() block: don't warn for flush on read-only device bcache: add the missing comments for smp_mb()/smp_wmb() bcache: remove unnecessary space before ioctl function pointer arguments bcache: add missing SPDX header bcache: move open brace at end of function definitions to next line bcache: add static const prefix to char * array declarations bcache: fix code comments style ...
Diffstat (limited to 'block')
-rw-r--r--block/bfq-cgroup.c3
-rw-r--r--block/bfq-iosched.c54
-rw-r--r--block/bfq-wf2q.c22
-rw-r--r--block/blk-core.c5
-rw-r--r--block/blk-mq-sched.c44
-rw-r--r--block/blk-mq-sched.h5
-rw-r--r--block/blk-mq-tag.c14
-rw-r--r--block/blk-mq.c96
-rw-r--r--block/blk-wbt.c6
-rw-r--r--block/blk.h4
-rw-r--r--block/elevator.c20
11 files changed, 173 insertions, 100 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index a9e8633388f4..58c6efa9f9a9 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -913,7 +913,8 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
if (ret)
return ret;
- return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+ ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+ return ret ?: nbytes;
}
#ifdef CONFIG_DEBUG_BLK_CGROUP
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 41d9036b1822..653100fb719e 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -187,11 +187,25 @@ static const int bfq_stats_min_budgets = 194;
static const int bfq_default_max_budget = 16 * 1024;
/*
- * Async to sync throughput distribution is controlled as follows:
- * when an async request is served, the entity is charged the number
- * of sectors of the request, multiplied by the factor below
+ * When a sync request is dispatched, the queue that contains that
+ * request, and all the ancestor entities of that queue, are charged
+ * with the number of sectors of the request. In constrast, if the
+ * request is async, then the queue and its ancestor entities are
+ * charged with the number of sectors of the request, multiplied by
+ * the factor below. This throttles the bandwidth for async I/O,
+ * w.r.t. to sync I/O, and it is done to counter the tendency of async
+ * writes to steal I/O throughput to reads.
+ *
+ * The current value of this parameter is the result of a tuning with
+ * several hardware and software configurations. We tried to find the
+ * lowest value for which writes do not cause noticeable problems to
+ * reads. In fact, the lower this parameter, the stabler I/O control,
+ * in the following respect. The lower this parameter is, the less
+ * the bandwidth enjoyed by a group decreases
+ * - when the group does writes, w.r.t. to when it does reads;
+ * - when other groups do reads, w.r.t. to when they do writes.
*/
-static const int bfq_async_charge_factor = 10;
+static const int bfq_async_charge_factor = 3;
/* Default timeout values, in jiffies, approximating CFQ defaults. */
const int bfq_timeout = HZ / 8;
@@ -853,16 +867,7 @@ static unsigned long bfq_serv_to_charge(struct request *rq,
if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
return blk_rq_sectors(rq);
- /*
- * If there are no weight-raised queues, then amplify service
- * by just the async charge factor; otherwise amplify service
- * by twice the async charge factor, to further reduce latency
- * for weight-raised queues.
- */
- if (bfqq->bfqd->wr_busy_queues == 0)
- return blk_rq_sectors(rq) * bfq_async_charge_factor;
-
- return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
+ return blk_rq_sectors(rq) * bfq_async_charge_factor;
}
/**
@@ -3298,6 +3303,27 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
*/
} else
entity->service = 0;
+
+ /*
+ * Reset the received-service counter for every parent entity.
+ * Differently from what happens with bfqq->entity.service,
+ * the resetting of this counter never needs to be postponed
+ * for parent entities. In fact, in case bfqq may have a
+ * chance to go on being served using the last, partially
+ * consumed budget, bfqq->entity.service needs to be kept,
+ * because if bfqq then actually goes on being served using
+ * the same budget, the last value of bfqq->entity.service is
+ * needed to properly decrement bfqq->entity.budget by the
+ * portion already consumed. In contrast, it is not necessary
+ * to keep entity->service for parent entities too, because
+ * the bubble up of the new value of bfqq->entity.budget will
+ * make sure that the budgets of parent entities are correct,
+ * even in case bfqq and thus parent entities go on receiving
+ * service with the same budget.
+ */
+ entity = entity->parent;
+ for_each_entity(entity)
+ entity->service = 0;
}
/*
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index dbc07b456059..ae52bff43ce4 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -130,10 +130,14 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
if (!change_without_lookup) /* lookup needed */
next_in_service = bfq_lookup_next_entity(sd, expiration);
- if (next_in_service)
- parent_sched_may_change = !sd->next_in_service ||
+ if (next_in_service) {
+ bool new_budget_triggers_change =
bfq_update_parent_budget(next_in_service);
+ parent_sched_may_change = !sd->next_in_service ||
+ new_budget_triggers_change;
+ }
+
sd->next_in_service = next_in_service;
if (!next_in_service)
@@ -877,15 +881,11 @@ void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
unsigned long time_ms)
{
struct bfq_entity *entity = &bfqq->entity;
- int tot_serv_to_charge = entity->service;
- unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
-
- if (time_ms > 0 && time_ms < timeout_ms)
- tot_serv_to_charge =
- (bfqd->bfq_max_budget * time_ms) / timeout_ms;
-
- if (tot_serv_to_charge < entity->service)
- tot_serv_to_charge = entity->service;
+ unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout);
+ unsigned long bounded_time_ms = min(time_ms, timeout_ms);
+ int serv_to_charge_for_time =
+ (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms;
+ int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service);
/* Increase budget to avoid inconsistencies */
if (tot_serv_to_charge > entity->budget)
diff --git a/block/blk-core.c b/block/blk-core.c
index 12550340418d..dee56c282efb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1036,7 +1036,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
laptop_mode_timer_fn, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, NULL);
- INIT_LIST_HEAD(&q->queue_head);
INIT_LIST_HEAD(&q->timeout_list);
INIT_LIST_HEAD(&q->icq_list);
#ifdef CONFIG_BLK_CGROUP
@@ -2162,7 +2161,9 @@ static inline bool should_fail_request(struct hd_struct *part,
static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
{
- if (part->policy && op_is_write(bio_op(bio))) {
+ const int op = bio_op(bio);
+
+ if (part->policy && (op_is_write(op) && !op_is_flush(op))) {
char b[BDEVNAME_SIZE];
WARN_ONCE(1,
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index cf9c66c6d35a..29bfe8017a2d 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -462,50 +462,6 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
blk_mq_sched_free_tags(set, hctx, i);
}
-int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
-{
- struct elevator_queue *e = q->elevator;
- int ret;
-
- if (!e)
- return 0;
-
- ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
- if (ret)
- return ret;
-
- if (e->type->ops.mq.init_hctx) {
- ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
- if (ret) {
- blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
- return ret;
- }
- }
-
- blk_mq_debugfs_register_sched_hctx(q, hctx);
-
- return 0;
-}
-
-void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
-{
- struct elevator_queue *e = q->elevator;
-
- if (!e)
- return;
-
- blk_mq_debugfs_unregister_sched_hctx(hctx);
-
- if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
- e->type->ops.mq.exit_hctx(hctx, hctx_idx);
- hctx->sched_data = NULL;
- }
-
- blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
-}
-
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
struct blk_mq_hw_ctx *hctx;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 0cb8f938dff9..4e028ee42430 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -28,11 +28,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
-int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx);
-void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx);
-
static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 816923bf874d..94e1ed667b6e 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -320,6 +320,18 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
struct blk_mq_hw_ctx *hctx;
int i;
+ /*
+ * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
+ * queue_hw_ctx after freeze the queue. So we could use q_usage_counter
+ * to avoid race with it. __blk_mq_update_nr_hw_queues will users
+ * synchronize_rcu to ensure all of the users go out of the critical
+ * section below and see zeroed q_usage_counter.
+ */
+ rcu_read_lock();
+ if (percpu_ref_is_zero(&q->q_usage_counter)) {
+ rcu_read_unlock();
+ return;
+ }
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
@@ -335,7 +347,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
}
-
+ rcu_read_unlock();
}
static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 72a0033ccee9..85a1c1a59c72 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2145,8 +2145,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (set->ops->exit_request)
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
- blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
-
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -2214,12 +2212,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto free_bitmap;
- if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
- goto exit_hctx;
-
hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
if (!hctx->fq)
- goto sched_exit_hctx;
+ goto exit_hctx;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
goto free_fq;
@@ -2233,8 +2228,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
free_fq:
kfree(hctx->fq);
- sched_exit_hctx:
- blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -2896,10 +2889,81 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
return ret;
}
+/*
+ * request_queue and elevator_type pair.
+ * It is just used by __blk_mq_update_nr_hw_queues to cache
+ * the elevator_type associated with a request_queue.
+ */
+struct blk_mq_qe_pair {
+ struct list_head node;
+ struct request_queue *q;
+ struct elevator_type *type;
+};
+
+/*
+ * Cache the elevator_type in qe pair list and switch the
+ * io scheduler to 'none'
+ */
+static bool blk_mq_elv_switch_none(struct list_head *head,
+ struct request_queue *q)
+{
+ struct blk_mq_qe_pair *qe;
+
+ if (!q->elevator)
+ return true;
+
+ qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
+ if (!qe)
+ return false;
+
+ INIT_LIST_HEAD(&qe->node);
+ qe->q = q;
+ qe->type = q->elevator->type;
+ list_add(&qe->node, head);
+
+ mutex_lock(&q->sysfs_lock);
+ /*
+ * After elevator_switch_mq, the previous elevator_queue will be
+ * released by elevator_release. The reference of the io scheduler
+ * module get by elevator_get will also be put. So we need to get
+ * a reference of the io scheduler module here to prevent it to be
+ * removed.
+ */
+ __module_get(qe->type->elevator_owner);
+ elevator_switch_mq(q, NULL);
+ mutex_unlock(&q->sysfs_lock);
+
+ return true;
+}
+
+static void blk_mq_elv_switch_back(struct list_head *head,
+ struct request_queue *q)
+{
+ struct blk_mq_qe_pair *qe;
+ struct elevator_type *t = NULL;
+
+ list_for_each_entry(qe, head, node)
+ if (qe->q == q) {
+ t = qe->type;
+ break;
+ }
+
+ if (!t)
+ return;
+
+ list_del(&qe->node);
+ kfree(qe);
+
+ mutex_lock(&q->sysfs_lock);
+ elevator_switch_mq(q, t);
+ mutex_unlock(&q->sysfs_lock);
+}
+
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
+ LIST_HEAD(head);
lockdep_assert_held(&set->tag_list_lock);
@@ -2910,6 +2974,18 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
+ /*
+ * Sync with blk_mq_queue_tag_busy_iter.
+ */
+ synchronize_rcu();
+ /*
+ * Switch IO scheduler to 'none', cleaning up the data associated
+ * with the previous scheduler. We will switch back once we are done
+ * updating the new sw to hw queue mappings.
+ */
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ if (!blk_mq_elv_switch_none(&head, q))
+ goto switch_back;
set->nr_hw_queues = nr_hw_queues;
blk_mq_update_queue_map(set);
@@ -2918,6 +2994,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_queue_reinit(q);
}
+switch_back:
+ list_for_each_entry(q, &set->tag_list, tag_set_list)
+ blk_mq_elv_switch_back(&head, q);
+
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue(q);
}
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 1d94a20374fc..bb93c7c2b182 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -576,12 +576,8 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
struct rq_wb *rwb = RQWB(rqos);
enum wbt_flags flags;
- if (!rwb_enabled(rwb))
- return;
-
flags = bio_to_wbt_flags(rwb, bio);
-
- if (!wbt_should_throttle(rwb, bio)) {
+ if (!(flags & WBT_TRACKED)) {
if (flags & WBT_READ)
wb_timestamp(rwb, &rwb->last_issue);
return;
diff --git a/block/blk.h b/block/blk.h
index d4d67e948920..9db4e389582c 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -234,6 +234,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq
int elevator_init(struct request_queue *);
int elevator_init_mq(struct request_queue *q);
+int elevator_switch_mq(struct request_queue *q,
+ struct elevator_type *new_e);
void elevator_exit(struct request_queue *, struct elevator_queue *);
int elv_register_queue(struct request_queue *q);
void elv_unregister_queue(struct request_queue *q);
@@ -297,7 +299,7 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int);
* b) the queue had IO stats enabled when this request was started, and
* c) it's a file system request
*/
-static inline int blk_do_io_stat(struct request *rq)
+static inline bool blk_do_io_stat(struct request *rq)
{
return rq->rq_disk &&
(rq->rq_flags & RQF_IO_STAT) &&
diff --git a/block/elevator.c b/block/elevator.c
index fa828b5bfd4b..5ea6e7d600e4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -933,16 +933,13 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
-static int elevator_switch_mq(struct request_queue *q,
+int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e)
{
int ret;
lockdep_assert_held(&q->sysfs_lock);
- blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
-
if (q->elevator) {
if (q->elevator->registered)
elv_unregister_queue(q);
@@ -968,8 +965,6 @@ static int elevator_switch_mq(struct request_queue *q,
blk_add_trace_msg(q, "elv switch: none");
out:
- blk_mq_unquiesce_queue(q);
- blk_mq_unfreeze_queue(q);
return ret;
}
@@ -1021,8 +1016,17 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
lockdep_assert_held(&q->sysfs_lock);
- if (q->mq_ops)
- return elevator_switch_mq(q, new_e);
+ if (q->mq_ops) {
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
+ err = elevator_switch_mq(q, new_e);
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
+ return err;
+ }
/*
* Turn on BYPASS and drain all requests w/ elevator private data.