diff options
Diffstat (limited to 'block')
34 files changed, 1704 insertions, 744 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index da1525ec4c87..d819dc77fe65 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -775,10 +775,11 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) unsigned long flags; int i; + spin_lock_irqsave(&bfqd->lock, flags); + if (!entity) /* root group */ - return; + goto put_async_queues; - spin_lock_irqsave(&bfqd->lock, flags); /* * Empty all service_trees belonging to this group before * deactivating the group itself. @@ -809,6 +810,8 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) } __bfq_deactivate_entity(entity, false); + +put_async_queues: bfq_put_async_queues(bfqd, bfqg); spin_unlock_irqrestore(&bfqd->lock, flags); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index bcb6d21baf12..aeca22d91101 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -166,6 +166,20 @@ static const int bfq_async_charge_factor = 10; /* Default timeout values, in jiffies, approximating CFQ defaults. */ const int bfq_timeout = HZ / 8; +/* + * Time limit for merging (see comments in bfq_setup_cooperator). Set + * to the slowest value that, in our tests, proved to be effective in + * removing false positives, while not causing true positives to miss + * queue merging. + * + * As can be deduced from the low time limit below, queue merging, if + * successful, happens at the very beggining of the I/O of the involved + * cooperating processes, as a consequence of the arrival of the very + * first requests from each cooperator. After that, there is very + * little chance to find cooperators. + */ +static const unsigned long bfq_merge_time_limit = HZ/10; + static struct kmem_cache *bfq_pool; /* Below this threshold (in ns), we consider thinktime immediate. */ @@ -178,7 +192,7 @@ static struct kmem_cache *bfq_pool; #define BFQQ_SEEK_THR (sector_t)(8 * 100) #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) /* Min number of samples required to perform peak-rate update */ #define BFQ_RATE_MIN_SAMPLES 32 @@ -195,15 +209,17 @@ static struct kmem_cache *bfq_pool; * interactive applications automatically, using the following formula: * duration = (R / r) * T, where r is the peak rate of the device, and * R and T are two reference parameters. - * In particular, R is the peak rate of the reference device (see below), - * and T is a reference time: given the systems that are likely to be - * installed on the reference device according to its speed class, T is - * about the maximum time needed, under BFQ and while reading two files in - * parallel, to load typical large applications on these systems. - * In practice, the slower/faster the device at hand is, the more/less it - * takes to load applications with respect to the reference device. - * Accordingly, the longer/shorter BFQ grants weight raising to interactive - * applications. + * In particular, R is the peak rate of the reference device (see + * below), and T is a reference time: given the systems that are + * likely to be installed on the reference device according to its + * speed class, T is about the maximum time needed, under BFQ and + * while reading two files in parallel, to load typical large + * applications on these systems (see the comments on + * max_service_from_wr below, for more details on how T is obtained). + * In practice, the slower/faster the device at hand is, the more/less + * it takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to + * interactive applications. * * BFQ uses four different reference pairs (R, T), depending on: * . whether the device is rotational or non-rotational; @@ -240,6 +256,60 @@ static int T_slow[2]; static int T_fast[2]; static int device_speed_thresh[2]; +/* + * BFQ uses the above-detailed, time-based weight-raising mechanism to + * privilege interactive tasks. This mechanism is vulnerable to the + * following false positives: I/O-bound applications that will go on + * doing I/O for much longer than the duration of weight + * raising. These applications have basically no benefit from being + * weight-raised at the beginning of their I/O. On the opposite end, + * while being weight-raised, these applications + * a) unjustly steal throughput to applications that may actually need + * low latency; + * b) make BFQ uselessly perform device idling; device idling results + * in loss of device throughput with most flash-based storage, and may + * increase latencies when used purposelessly. + * + * BFQ tries to reduce these problems, by adopting the following + * countermeasure. To introduce this countermeasure, we need first to + * finish explaining how the duration of weight-raising for + * interactive tasks is computed. + * + * For a bfq_queue deemed as interactive, the duration of weight + * raising is dynamically adjusted, as a function of the estimated + * peak rate of the device, so as to be equal to the time needed to + * execute the 'largest' interactive task we benchmarked so far. By + * largest task, we mean the task for which each involved process has + * to do more I/O than for any of the other tasks we benchmarked. This + * reference interactive task is the start-up of LibreOffice Writer, + * and in this task each process/bfq_queue needs to have at most ~110K + * sectors transferred. + * + * This last piece of information enables BFQ to reduce the actual + * duration of weight-raising for at least one class of I/O-bound + * applications: those doing sequential or quasi-sequential I/O. An + * example is file copy. In fact, once started, the main I/O-bound + * processes of these applications usually consume the above 110K + * sectors in much less time than the processes of an application that + * is starting, because these I/O-bound processes will greedily devote + * almost all their CPU cycles only to their target, + * throughput-friendly I/O operations. This is even more true if BFQ + * happens to be underestimating the device peak rate, and thus + * overestimating the duration of weight raising. But, according to + * our measurements, once transferred 110K sectors, these processes + * have no right to be weight-raised any longer. + * + * Basing on the last consideration, BFQ ends weight-raising for a + * bfq_queue if the latter happens to have received an amount of + * service at least equal to the following constant. The constant is + * set to slightly more than 110K, to have a minimum safety margin. + * + * This early ending of weight-raising reduces the amount of time + * during which interactive false positives cause the two problems + * described at the beginning of these comments. + */ +static const unsigned long max_service_from_wr = 120000; + #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) @@ -403,6 +473,82 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. + */ +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +{ + bfqd->sb_shift = bt->sb.shift; + + /* + * In-word depths if no bfq_queue is being weight-raised: + * leaving 25% of tags only for sync reads. + * + * In next formulas, right-shift the value + * (1U<<bfqd->sb_shift), instead of computing directly + * (1U<<(bfqd->sb_shift - something)), to be robust against + * any possible value of bfqd->sb_shift, without having to + * limit 'something'. + */ + /* no more than 50% of tags for async I/O */ + bfqd->word_depths[0][0] = max((1U<<bfqd->sb_shift)>>1, 1U); + /* + * no more than 75% of tags for sync writes (25% extra tags + * w.r.t. async I/O, to prevent async I/O from starving sync + * writes) + */ + bfqd->word_depths[0][1] = max(((1U<<bfqd->sb_shift) * 3)>>2, 1U); + + /* + * In-word depths in case some bfq_queue is being weight- + * raised: leaving ~63% of tags for sync reads. This is the + * highest percentage for which, in our tests, application + * start-up times didn't suffer from any regression due to tag + * shortage. + */ + /* no more than ~18% of tags for async I/O */ + bfqd->word_depths[1][0] = max(((1U<<bfqd->sb_shift) * 3)>>4, 1U); + /* no more than ~37% of tags for sync writes (~20% extra tags) */ + bfqd->word_depths[1][1] = max(((1U<<bfqd->sb_shift) * 6)>>4, 1U); +} + +/* + * Async I/O can easily starve sync I/O (both sync reads and sync + * writes), by consuming all tags. Similarly, storms of sync writes, + * such as those that sync(2) may trigger, can starve sync reads. + * Limit depths of async I/O and sync writes so as to counter both + * problems. + */ +static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct sbitmap_queue *bt; + + if (op_is_sync(op) && !op_is_write(op)) + return; + + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return; + } + bt = &tags->breserved_tags; + } else + bt = &tags->bitmap_tags; + + if (unlikely(bfqd->sb_shift != bt->sb.shift)) + bfq_update_depths(bfqd, bt); + + data->shallow_depth = + bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", + __func__, bfqd->wr_busy_queues, op_is_sync(op), + data->shallow_depth); +} + static struct bfq_queue * bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, sector_t sector, struct rb_node **ret_parent, @@ -444,6 +590,13 @@ bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, return bfqq; } +static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) +{ + return bfqq->service_from_backlogged > 0 && + time_is_before_jiffies(bfqq->first_IO_time + + bfq_merge_time_limit); +} + void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct rb_node **p, *parent; @@ -454,6 +607,14 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->pos_root = NULL; } + /* + * bfqq cannot be merged any longer (see comments in + * bfq_setup_cooperator): no point in adding bfqq into the + * position tree. + */ + if (bfq_too_late_for_merging(bfqq)) + return; + if (bfq_class_idle(bfqq)) return; if (!bfqq->next_rq) @@ -1247,6 +1408,7 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, if (old_wr_coeff == 1 && wr_or_deserves_wr) { /* start a weight-raising period */ if (interactive) { + bfqq->service_from_wr = 0; bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { @@ -1627,6 +1789,8 @@ static void bfq_remove_request(struct request_queue *q, rb_erase(&bfqq->pos_node, bfqq->pos_root); bfqq->pos_root = NULL; } + } else { + bfq_pos_tree_add_move(bfqd, bfqq); } if (rq->cmd_flags & REQ_META) @@ -1933,6 +2097,9 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) { + if (bfq_too_late_for_merging(new_bfqq)) + return false; + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || (bfqq->ioprio_class != new_bfqq->ioprio_class)) return false; @@ -1957,20 +2124,6 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, } /* - * If this function returns true, then bfqq cannot be merged. The idea - * is that true cooperation happens very early after processes start - * to do I/O. Usually, late cooperations are just accidental false - * positives. In case bfqq is weight-raised, such false positives - * would evidently degrade latency guarantees for bfqq. - */ -static bool wr_from_too_long(struct bfq_queue *bfqq) -{ - return bfqq->wr_coeff > 1 && - time_is_before_jiffies(bfqq->last_wr_start_finish + - msecs_to_jiffies(100)); -} - -/* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return * NULL if no merge was scheduled, a pointer to the shared bfq_queue @@ -1983,11 +2136,6 @@ static bool wr_from_too_long(struct bfq_queue *bfqq) * to maintain. Besides, in such a critical condition as an out of memory, * the benefits of queue merging may be little relevant, or even negligible. * - * Weight-raised queues can be merged only if their weight-raising - * period has just started. In fact cooperating processes are usually - * started together. Thus, with this filter we avoid false positives - * that would jeopardize low-latency guarantees. - * * WARNING: queue merging may impair fairness among non-weight raised * queues, for at least two reasons: 1) the original weight of a * merged queue may change during the merged state, 2) even being the @@ -2001,12 +2149,24 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_queue *in_service_bfqq, *new_bfqq; + /* + * Prevent bfqq from being merged if it has been created too + * long ago. The idea is that true cooperating processes, and + * thus their associated bfq_queues, are supposed to be + * created shortly after each other. This is the case, e.g., + * for KVM/QEMU and dump I/O threads. Basing on this + * assumption, the following filtering greatly reduces the + * probability that two non-cooperating processes, which just + * happen to do close I/O for some short time interval, have + * their queues merged by mistake. + */ + if (bfq_too_late_for_merging(bfqq)) + return NULL; + if (bfqq->new_bfqq) return bfqq->new_bfqq; - if (!io_struct || - wr_from_too_long(bfqq) || - unlikely(bfqq == &bfqd->oom_bfqq)) + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; /* If there is only one backlogged queue, don't search. */ @@ -2015,12 +2175,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, in_service_bfqq = bfqd->in_service_queue; - if (!in_service_bfqq || in_service_bfqq == bfqq - || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - - if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + if (in_service_bfqq && in_service_bfqq != bfqq && + likely(in_service_bfqq != &bfqd->oom_bfqq) && + bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && bfqq->entity.parent == in_service_bfqq->entity.parent && bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); @@ -2032,12 +2189,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * queues. The only thing we need is that the bio/request is not * NULL, as we need it to establish whether a cooperator exists. */ -check_scheduled: new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, bfq_io_struct_pos(io_struct, request)); - if (new_bfqq && !wr_from_too_long(new_bfqq) && - likely(new_bfqq != &bfqd->oom_bfqq) && + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && bfq_may_be_close_cooperator(bfqq, new_bfqq)) return bfq_setup_merge(bfqq, new_bfqq); @@ -2062,7 +2217,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); if (unlikely(bfq_bfqq_just_created(bfqq) && - !bfq_bfqq_in_large_burst(bfqq))) { + !bfq_bfqq_in_large_burst(bfqq) && + bfqq->bfqd->low_latency)) { /* * bfqq being merged right after being created: bfqq * would have deserved interactive weight raising, but @@ -2917,45 +3073,87 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, * whereas soft_rt_next_start is set to infinity for applications that do * not. * - * Unfortunately, even a greedy application may happen to behave in an - * isochronous way if the CPU load is high. In fact, the application may - * stop issuing requests while the CPUs are busy serving other processes, - * then restart, then stop again for a while, and so on. In addition, if - * the disk achieves a low enough throughput with the request pattern - * issued by the application (e.g., because the request pattern is random - * and/or the device is slow), then the application may meet the above - * bandwidth requirement too. To prevent such a greedy application to be - * deemed as soft real-time, a further rule is used in the computation of - * soft_rt_next_start: soft_rt_next_start must be higher than the current - * time plus the maximum time for which the arrival of a request is waited - * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. - * This filters out greedy applications, as the latter issue instead their - * next request as soon as possible after the last one has been completed - * (in contrast, when a batch of requests is completed, a soft real-time - * application spends some time processing data). + * Unfortunately, even a greedy (i.e., I/O-bound) application may + * happen to meet, occasionally or systematically, both the above + * bandwidth and isochrony requirements. This may happen at least in + * the following circumstances. First, if the CPU load is high. The + * application may stop issuing requests while the CPUs are busy + * serving other processes, then restart, then stop again for a while, + * and so on. The other circumstances are related to the storage + * device: the storage device is highly loaded or reaches a low-enough + * throughput with the I/O of the application (e.g., because the I/O + * is random and/or the device is slow). In all these cases, the + * I/O of the application may be simply slowed down enough to meet + * the bandwidth and isochrony requirements. To reduce the probability + * that greedy applications are deemed as soft real-time in these + * corner cases, a further rule is used in the computation of + * soft_rt_next_start: the return value of this function is forced to + * be higher than the maximum between the following two quantities. * - * Unfortunately, the last filter may easily generate false positives if - * only bfqd->bfq_slice_idle is used as a reference time interval and one - * or both the following cases occur: - * 1) HZ is so low that the duration of a jiffy is comparable to or higher - * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with - * HZ=100. + * (a) Current time plus: (1) the maximum time for which the arrival + * of a request is waited for when a sync queue becomes idle, + * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We + * postpone for a moment the reason for adding a few extra + * jiffies; we get back to it after next item (b). Lower-bounding + * the return value of this function with the current time plus + * bfqd->bfq_slice_idle tends to filter out greedy applications, + * because the latter issue their next request as soon as possible + * after the last one has been completed. In contrast, a soft + * real-time application spends some time processing data, after a + * batch of its requests has been completed. + * + * (b) Current value of bfqq->soft_rt_next_start. As pointed out + * above, greedy applications may happen to meet both the + * bandwidth and isochrony requirements under heavy CPU or + * storage-device load. In more detail, in these scenarios, these + * applications happen, only for limited time periods, to do I/O + * slowly enough to meet all the requirements described so far, + * including the filtering in above item (a). These slow-speed + * time intervals are usually interspersed between other time + * intervals during which these applications do I/O at a very high + * speed. Fortunately, exactly because of the high speed of the + * I/O in the high-speed intervals, the values returned by this + * function happen to be so high, near the end of any such + * high-speed interval, to be likely to fall *after* the end of + * the low-speed time interval that follows. These high values are + * stored in bfqq->soft_rt_next_start after each invocation of + * this function. As a consequence, if the last value of + * bfqq->soft_rt_next_start is constantly used to lower-bound the + * next value that this function may return, then, from the very + * beginning of a low-speed interval, bfqq->soft_rt_next_start is + * likely to be constantly kept so high that any I/O request + * issued during the low-speed interval is considered as arriving + * to soon for the application to be deemed as soft + * real-time. Then, in the high-speed interval that follows, the + * application will not be deemed as soft real-time, just because + * it will do I/O at a high speed. And so on. + * + * Getting back to the filtering in item (a), in the following two + * cases this filtering might be easily passed by a greedy + * application, if the reference quantity was just + * bfqd->bfq_slice_idle: + * 1) HZ is so low that the duration of a jiffy is comparable to or + * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow + * devices with HZ=100. The time granularity may be so coarse + * that the approximation, in jiffies, of bfqd->bfq_slice_idle + * is rather lower than the exact value. * 2) jiffies, instead of increasing at a constant rate, may stop increasing * for a while, then suddenly 'jump' by several units to recover the lost * increments. This seems to happen, e.g., inside virtual machines. - * To address this issue, we do not use as a reference time interval just - * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In - * particular we add the minimum number of jiffies for which the filter - * seems to be quite precise also in embedded systems and KVM/QEMU virtual - * machines. + * To address this issue, in the filtering in (a) we do not use as a + * reference time interval just bfqd->bfq_slice_idle, but + * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the + * minimum number of jiffies for which the filter seems to be quite + * precise also in embedded systems and KVM/QEMU virtual machines. */ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, - jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); + return max3(bfqq->soft_rt_next_start, + bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } /** @@ -3000,17 +3198,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); /* - * Increase service_from_backlogged before next statement, - * because the possible next invocation of - * bfq_bfqq_charge_time would likely inflate - * entity->service. In contrast, service_from_backlogged must - * contain real service, to enable the soft real-time - * heuristic to correctly compute the bandwidth consumed by - * bfqq. - */ - bfqq->service_from_backlogged += entity->service; - - /* * As above explained, charge slow (typically seeky) and * timed-out queues with the time and not the service * received, to favor sequential workloads. @@ -3535,6 +3722,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->entity.prio_changed = 1; } } + if (bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && + bfqq->service_from_wr > max_service_from_wr) { + /* see comments on max_service_from_wr */ + bfq_bfqq_end_wr(bfqq); + } } /* * To improve latency (for this or other queues), immediately @@ -3630,20 +3823,22 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) } /* - * We exploit the put_rq_private hook to decrement - * rq_in_driver, but put_rq_private will not be - * invoked on this request. So, to avoid unbalance, - * just start this request, without incrementing - * rq_in_driver. As a negative consequence, - * rq_in_driver is deceptively lower than it should be - * while this request is in service. This may cause - * bfq_schedule_dispatch to be invoked uselessly. + * We exploit the bfq_finish_requeue_request hook to + * decrement rq_in_driver, but + * bfq_finish_requeue_request will not be invoked on + * this request. So, to avoid unbalance, just start + * this request, without incrementing rq_in_driver. As + * a negative consequence, rq_in_driver is deceptively + * lower than it should be while this request is in + * service. This may cause bfq_schedule_dispatch to be + * invoked uselessly. * * As for implementing an exact solution, the - * put_request hook, if defined, is probably invoked - * also on this request. So, by exploiting this hook, - * we could 1) increment rq_in_driver here, and 2) - * decrement it in put_request. Such a solution would + * bfq_finish_requeue_request hook, if defined, is + * probably invoked also on this request. So, by + * exploiting this hook, we could 1) increment + * rq_in_driver here, and 2) decrement it in + * bfq_finish_requeue_request. Such a solution would * let the value of the counter be always accurate, * but it would entail using an extra interface * function. This cost seems higher than the benefit, @@ -3689,35 +3884,16 @@ exit: return rq; } -static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct request *rq; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - struct bfq_queue *in_serv_queue, *bfqq; - bool waiting_rq, idle_timer_disabled; -#endif - - spin_lock_irq(&bfqd->lock); - #if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - - rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - -#else - rq = __bfq_dispatch_request(hctx); -#endif - spin_unlock_irq(&bfqd->lock); +static void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) +{ + struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - bfqq = rq ? RQ_BFQQ(rq) : NULL; if (!idle_timer_disabled && !bfqq) - return rq; + return; /* * rq and bfqq are guaranteed to exist until this function @@ -3732,7 +3908,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(hctx->queue->queue_lock); + spin_lock_irq(q->queue_lock); if (idle_timer_disabled) /* * Since the idle timer has been disabled, @@ -3751,9 +3927,37 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) bfqg_stats_set_start_empty_time(bfqg); bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); } - spin_unlock_irq(hctx->queue->queue_lock); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) {} #endif +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled; + + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + + rq = __bfq_dispatch_request(hctx); + + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + + spin_unlock_irq(&bfqd->lock); + + bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, + idle_timer_disabled); + return rq; } @@ -4002,10 +4206,15 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->split_time = bfq_smallest_from_now(); /* - * Set to the value for which bfqq will not be deemed as - * soft rt when it becomes backlogged. + * To not forget the possibly high bandwidth consumed by a + * process/queue in the recent past, + * bfq_bfqq_softrt_next_start() returns a value at least equal + * to the current value of bfqq->soft_rt_next_start (see + * comments on bfq_bfqq_softrt_next_start). Set + * soft_rt_next_start to now, to mean that bfqq has consumed + * no bandwidth so far. */ - bfqq->soft_rt_next_start = bfq_greatest_from_now(); + bfqq->soft_rt_next_start = jiffies; /* first request is almost certainly seeky */ bfqq->seek_history = 1; @@ -4276,16 +4485,48 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) return idle_timer_disabled; } +#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) +static void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) +{ + if (!bfqq) + return; + + /* + * bfqq still exists, because it can disappear only after + * either it is merged with another queue, or the process it + * is associated with exits. But both actions must be taken by + * the same process currently executing this flow of + * instructions. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(q->queue_lock); + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); + if (idle_timer_disabled) + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + spin_unlock_irq(q->queue_lock); +} +#else +static inline void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) {} +#endif + +static void bfq_prepare_request(struct request *rq, struct bio *bio); + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *bfqq = RQ_BFQQ(rq); bool idle_timer_disabled = false; unsigned int cmd_flags; -#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4304,7 +4545,18 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) + if (WARN_ON_ONCE(!bfqq)) { + /* + * This should never happen. Most likely rq is + * a requeued regular request, being + * re-inserted without being first + * re-prepared. Do a prepare, to avoid + * failure. + */ + bfq_prepare_request(rq, rq->bio); + bfqq = RQ_BFQQ(rq); + } + idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4312,9 +4564,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * redirected into a new queue. */ bfqq = RQ_BFQQ(rq); -#else - __bfq_insert_request(bfqd, rq); -#endif if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4323,35 +4572,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* * Cache cmd_flags before releasing scheduler lock, because rq * may disappear afterwards (for example, because of a request * merge). */ cmd_flags = rq->cmd_flags; -#endif + spin_unlock_irq(&bfqd->lock); -#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - if (!bfqq) - return; - /* - * bfqq still exists, because it can disappear only after - * either it is merged with another queue, or the process it - * is associated with exits. But both actions must be taken by - * the same process currently executing this flow of - * instruction. - * - * In addition, the following queue lock guarantees that - * bfqq_group(bfqq) exists as well. - */ - spin_lock_irq(q->queue_lock); - bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); - if (idle_timer_disabled) - bfqg_stats_update_idle_time(bfqq_group(bfqq)); - spin_unlock_irq(q->queue_lock); -#endif + bfq_update_insert_stats(q, bfqq, idle_timer_disabled, + cmd_flags); } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, @@ -4482,22 +4713,44 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_schedule_dispatch(bfqd); } -static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) +static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) { bfqq->allocated--; bfq_put_queue(bfqq); } -static void bfq_finish_request(struct request *rq) +/* + * Handle either a requeue or a finish for rq. The things to do are + * the same in both cases: all references to rq are to be dropped. In + * particular, rq is considered completed from the point of view of + * the scheduler. + */ +static void bfq_finish_requeue_request(struct request *rq) { - struct bfq_queue *bfqq; + struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; - if (!rq->elv.icq) + /* + * Requeue and finish hooks are invoked in blk-mq without + * checking whether the involved request is actually still + * referenced in the scheduler. To handle this fact, the + * following two checks make this function exit in case of + * spurious invocations, for which there is nothing to do. + * + * First, check whether rq has nothing to do with an elevator. + */ + if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) + return; + + /* + * rq either is not associated with any icq, or is an already + * requeued request that has not (yet) been re-inserted into + * a bfq_queue. + */ + if (!rq->elv.icq || !bfqq) return; - bfqq = RQ_BFQQ(rq); bfqd = bfqq->bfqd; if (rq->rq_flags & RQF_STARTED) @@ -4512,13 +4765,14 @@ static void bfq_finish_request(struct request *rq) spin_lock_irqsave(&bfqd->lock, flags); bfq_completed_request(bfqq, bfqd); - bfq_put_rq_priv_body(bfqq); + bfq_finish_requeue_request_body(bfqq); spin_unlock_irqrestore(&bfqd->lock, flags); } else { /* * Request rq may be still/already in the scheduler, - * in which case we need to remove it. And we cannot + * in which case we need to remove it (this should + * never happen in case of requeue). And we cannot * defer such a check and removal, to avoid * inconsistencies in the time interval from the end * of this function to the start of the deferred work. @@ -4533,9 +4787,26 @@ static void bfq_finish_request(struct request *rq) bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } - bfq_put_rq_priv_body(bfqq); + bfq_finish_requeue_request_body(bfqq); } + /* + * Reset private fields. In case of a requeue, this allows + * this function to correctly do nothing if it is spuriously + * invoked again on this same request (see the check at the + * beginning of the function). Probably, a better general + * design would be to prevent blk-mq from invoking the requeue + * or finish hooks of an elevator, for a request that is not + * referred by that elevator. + * + * Resetting the following fields would break the + * request-insertion logic if rq is re-inserted into a bfq + * internal queue, without a re-preparation. Here we assume + * that re-insertions of requeued requests, without + * re-preparation, can happen only for pass_through or at_head + * requests (which are not re-inserted into bfq internal + * queues). + */ rq->elv.priv[0] = NULL; rq->elv.priv[1] = NULL; } @@ -4818,6 +5089,9 @@ static void bfq_exit_queue(struct elevator_queue *e) hrtimer_cancel(&bfqd->idle_slice_timer); #ifdef CONFIG_BFQ_GROUP_IOSCHED + /* release oom-queue reference to root group */ + bfqg_and_blkg_put(bfqd->root_group); + blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); #else spin_lock_irq(&bfqd->lock); @@ -5206,8 +5480,10 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { + .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, - .finish_request = bfq_finish_request, + .requeue_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_requeue_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 91c4390903a1..350c39ae2896 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -337,6 +337,11 @@ struct bfq_queue { * last transition from idle to backlogged. */ unsigned long service_from_backlogged; + /* + * Cumulative service received from the @bfq_queue since its + * last transition to weight-raised state. + */ + unsigned long service_from_wr; /* * Value of wr start time when switching to soft rt @@ -344,6 +349,8 @@ struct bfq_queue { unsigned long wr_start_at_switch_to_srt; unsigned long split_time; /* time of last split */ + + unsigned long first_IO_time; /* time of first I/O for this queue */ }; /** @@ -627,6 +634,18 @@ struct bfq_data { struct bfq_io_cq *bio_bic; /* bfqq associated with the task issuing current bio for merging */ struct bfq_queue *bio_bfqq; + + /* + * Cached sbitmap shift, used to compute depth limits in + * bfq_update_depths. + */ + unsigned int sb_shift; + + /* + * Depth limits used in bfq_limit_depth (see comments on the + * function) + */ + unsigned int word_depths[2][2]; }; enum bfqq_state_flags { diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index e495d3f9b4b0..4498c43245e2 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -835,6 +835,13 @@ void bfq_bfqq_served(struct bfq_queue *bfqq, int served) struct bfq_entity *entity = &bfqq->entity; struct bfq_service_tree *st; + if (!bfqq->service_from_backlogged) + bfqq->first_IO_time = jiffies; + + if (bfqq->wr_coeff > 1) + bfqq->service_from_wr += served; + + bfqq->service_from_backlogged += served; for_each_entity(entity) { st = bfq_entity_service_tree(entity); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 23b42e8aa03e..9cfdd6c83b5b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -374,7 +374,6 @@ static void bio_integrity_verify_fn(struct work_struct *work) /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio - * @error: Pointer to errno * * Description: Completion for integrity I/O * diff --git a/block/bio.c b/block/bio.c index 8bfdea58159b..e1708db48258 100644 --- a/block/bio.c +++ b/block/bio.c @@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_disk = bio_src->bi_disk; bio->bi_partno = bio_src->bi_partno; bio_set_flag(bio, BIO_CLONED); + if (bio_flagged(bio_src, BIO_THROTTLED)) + bio_set_flag(bio, BIO_THROTTLED); bio->bi_opf = bio_src->bi_opf; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; @@ -969,34 +971,6 @@ void bio_advance(struct bio *bio, unsigned bytes) EXPORT_SYMBOL(bio_advance); /** - * bio_alloc_pages - allocates a single page for each bvec in a bio - * @bio: bio to allocate pages for - * @gfp_mask: flags for allocation - * - * Allocates pages up to @bio->bi_vcnt. - * - * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are - * freed. - */ -int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, i) { - bv->bv_page = alloc_page(gfp_mask); - if (!bv->bv_page) { - while (--bv >= bio->bi_io_vec) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} -EXPORT_SYMBOL(bio_alloc_pages); - -/** * bio_copy_data - copy contents of data buffers from one chain of bios to * another * @src: source bio list @@ -1836,7 +1810,7 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) - bio_set_flag(bio, BIO_TRACE_COMPLETION); + bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } diff --git a/block/blk-core.c b/block/blk-core.c index b8881750a3ac..2d1a7bbe0634 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> #include <linux/debugfs.h> +#include <linux/bpf.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> @@ -126,6 +127,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); } EXPORT_SYMBOL(blk_rq_init); @@ -143,6 +146,7 @@ static const struct { [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, /* device mapper special case, should not leak out: */ @@ -562,6 +566,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) } } +void blk_drain_queue(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + __blk_drain_queue(q, true); + spin_unlock_irq(q->queue_lock); +} + /** * blk_queue_bypass_start - enter queue bypass mode * @q: queue of interest @@ -689,11 +700,18 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); spin_lock_irq(lock); - if (!q->mq_ops) - __blk_drain_queue(q, true); queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); + /* + * make sure all in-progress dispatch are completed because + * blk_freeze_queue() can only complete all requests, and + * dispatch may still be in-progress since we dispatch requests + * from more than one contexts + */ + if (q->mq_ops) + blk_mq_quiesce_queue(q); + /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); @@ -1641,6 +1659,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) lockdep_assert_held(q->queue_lock); + blk_req_zone_write_unlock(req); blk_pm_put_request(req); elv_completed_request(q, req); @@ -2050,6 +2069,29 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +{ + if (part->policy && op_is_write(bio_op(bio))) { + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR + "generic_make_request: Trying to write " + "to read-only block-device %s (partno %d)\n", + bio_devname(bio, b), part->partno); + return true; + } + + return false; +} + +static noinline int should_fail_bio(struct bio *bio) +{ + if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + return -EIO; + return 0; +} +ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); + /* * Remap block n of partition p to block n+start(p) of the disk. */ @@ -2058,27 +2100,28 @@ static inline int blk_partition_remap(struct bio *bio) struct hd_struct *p; int ret = 0; + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) || + bio_check_ro(bio, p))) { + ret = -EIO; + goto out; + } + /* * Zone reset does not include bi_size so bio_sectors() is always 0. * Include a test for the reset op code and perform the remap if needed. */ - if (!bio->bi_partno || - (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)) - return 0; + if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET) + goto out; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) { - bio->bi_iter.bi_sector += p->start_sect; - bio->bi_partno = 0; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); - } else { - printk("%s: fail for partition %d\n", __func__, bio->bi_partno); - ret = -EIO; - } - rcu_read_unlock(); + bio->bi_iter.bi_sector += p->start_sect; + bio->bi_partno = 0; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); +out: + rcu_read_unlock(); return ret; } @@ -2137,15 +2180,19 @@ generic_make_request_checks(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) goto not_supported; - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_bio(bio)) goto end_io; - if (blk_partition_remap(bio)) - goto end_io; + if (!bio->bi_partno) { + if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + goto end_io; + } else { + if (blk_partition_remap(bio)) + goto end_io; + } if (bio_check_eod(bio, nr_sectors)) goto end_io; @@ -2488,8 +2535,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_request_bypass_insert(rq, true); - return BLK_STS_OK; + return blk_mq_request_issue_directly(rq); } spin_lock_irqsave(q->queue_lock, flags); @@ -2841,7 +2887,7 @@ void blk_start_request(struct request *req) wbt_issue(req->q->rq_wb, &req->issue_stat); } - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); + BUG_ON(blk_rq_is_complete(req)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); @@ -3246,6 +3292,8 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, { if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); + else if (bio_op(bio) == REQ_OP_DISCARD) + rq->nr_phys_segments = 1; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; @@ -3410,20 +3458,6 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); -int kblockd_schedule_delayed_work(struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work(kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work); - -int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); -} -EXPORT_SYMBOL(kblockd_schedule_delayed_work_on); - /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized diff --git a/block/blk-exec.c b/block/blk-exec.c index 5c0f3dc446dc..f7b292f12449 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -61,7 +61,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be reused after dying flag is set */ if (q->mq_ops) { - blk_mq_sched_insert_request(rq, at_head, true, false, false); + blk_mq_sched_insert_request(rq, at_head, true, false); return; } diff --git a/block/blk-lib.c b/block/blk-lib.c index 2bc544ce3d2e..a676084d4740 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -37,6 +37,9 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secure_erase(q)) return -EOPNOTSUPP; @@ -156,6 +159,9 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; @@ -233,6 +239,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); @@ -287,6 +296,9 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, if (!q) return -ENXIO; + if (bdev_read_only(bdev)) + return -EPERM; + while (nr_sects != 0) { bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); diff --git a/block/blk-map.c b/block/blk-map.c index b21f8e86f120..db9373bd31ac 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -12,22 +12,29 @@ #include "blk.h" /* - * Append a bio to a passthrough request. Only works can be merged into - * the request based on the driver constraints. + * Append a bio to a passthrough request. Only works if the bio can be merged + * into the request based on the driver constraints. */ -int blk_rq_append_bio(struct request *rq, struct bio *bio) +int blk_rq_append_bio(struct request *rq, struct bio **bio) { - blk_queue_bounce(rq->q, &bio); + struct bio *orig_bio = *bio; + + blk_queue_bounce(rq->q, bio); if (!rq->bio) { - blk_rq_bio_prep(rq->q, rq, bio); + blk_rq_bio_prep(rq->q, rq, *bio); } else { - if (!ll_back_merge_fn(rq->q, rq, bio)) + if (!ll_back_merge_fn(rq->q, rq, *bio)) { + if (orig_bio != *bio) { + bio_put(*bio); + *bio = orig_bio; + } return -EINVAL; + } - rq->biotail->bi_next = bio; - rq->biotail = bio; - rq->__data_len += bio->bi_iter.bi_size; + rq->biotail->bi_next = *bio; + rq->biotail = *bio; + rq->__data_len += (*bio)->bi_iter.bi_size; } return 0; @@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq, * We link the bounce buffer in and could have to traverse it * later so we have to get a ref to prevent it from being freed */ - ret = blk_rq_append_bio(rq, bio); - bio_get(bio); + ret = blk_rq_append_bio(rq, &bio); if (ret) { - bio_endio(bio); __blk_rq_unmap_user(orig_bio); - bio_put(bio); return ret; } + bio_get(bio); return 0; } @@ -114,7 +119,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; - int ret; + int ret = -EINVAL; if (!iter_is_iovec(iter)) goto fail; @@ -143,7 +148,7 @@ unmap_rq: __blk_rq_unmap_user(bio); fail: rq->bio = NULL; - return -EINVAL; + return ret; } EXPORT_SYMBOL(blk_rq_map_user_iov); @@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; int do_copy = 0; - struct bio *bio; + struct bio *bio, *orig_bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) @@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (do_copy) rq->rq_flags |= RQF_COPY_USER; - ret = blk_rq_append_bio(rq, bio); + orig_bio = bio; + ret = blk_rq_append_bio(rq, &bio); if (unlikely(ret)) { /* request is too big */ - bio_put(bio); + bio_put(orig_bio); return ret; } diff --git a/block/blk-merge.c b/block/blk-merge.c index f5dedd57dff6..782940c65d8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -128,9 +128,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, nsegs++; sectors = max_sectors; } - if (sectors) - goto split; - /* Make this single bvec as the 1st segment */ + goto split; } if (bvprvp && blk_queue_cluster(q)) { @@ -146,22 +144,21 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, bvprvp = &bvprv; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; continue; } new_segment: if (nsegs == queue_max_segments(q)) goto split; + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + nsegs++; bvprv = bv; bvprvp = &bvprv; seg_size = bv.bv_len; sectors += bv.bv_len >> 9; - if (nsegs == 1 && seg_size > front_seg_size) - front_seg_size = seg_size; } do_split = false; @@ -174,6 +171,8 @@ split: bio = new; } + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; bio->bi_seg_front_size = front_seg_size; if (seg_size > bio->bi_seg_back_size) bio->bi_seg_back_size = seg_size; @@ -551,6 +550,24 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } +static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, + struct request *next) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(next->bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next); + return true; +no_merge: + req_set_nomerge(q, req); + return false; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -684,9 +701,13 @@ static struct request *attempt_merge(struct request_queue *q, * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn * will have updated segment counts, update sector - * counts here. + * counts here. Handle DISCARDs separately, as they + * have separate settings. */ - if (!ll_merge_requests_fn(q, req, next)) + if (req_op(req) == REQ_OP_DISCARD) { + if (!req_attempt_discard_merge(q, req, next)) + return NULL; + } else if (!ll_merge_requests_fn(q, req, next)) return NULL; /* @@ -716,7 +737,8 @@ static struct request *attempt_merge(struct request_queue *q, req->__data_len += blk_rq_bytes(next); - elv_merge_requests(q, req, next); + if (req_op(req) != REQ_OP_DISCARD) + elv_merge_requests(q, req, next); /* * 'next' is going away, so update stats accordingly diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index b56a4f35720d..21cbc1f071c6 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -289,17 +289,12 @@ static const char *const rqf_name[] = { RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), + RQF_NAME(ZONE_WRITE_LOCKED), + RQF_NAME(MQ_TIMEOUT_EXPIRED), + RQF_NAME(MQ_POLL_SLEPT), }; #undef RQF_NAME -#define RQAF_NAME(name) [REQ_ATOM_##name] = #name -static const char *const rqaf_name[] = { - RQAF_NAME(COMPLETE), - RQAF_NAME(STARTED), - RQAF_NAME(POLL_SLEPT), -}; -#undef RQAF_NAME - int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; @@ -316,8 +311,7 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); - seq_puts(m, ", .atomic_flags="); - blk_flags_show(m, rq->atomic_flags, rqaf_name, ARRAY_SIZE(rqaf_name)); + seq_printf(m, ", complete=%d", blk_rq_is_complete(rq)); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) @@ -409,7 +403,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) const struct show_busy_params *params = data; if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && - test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + blk_mq_rq_state(rq) != MQ_RQ_IDLE) __blk_mq_debugfs_rq_show(params->m, list_entry_rq(&rq->queuelist)); } @@ -703,7 +697,11 @@ static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, const struct blk_mq_debugfs_attr *attr = m->private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private; - if (!attr->write) + /* + * Attributes that only implement .seq_ops are read-only and 'attr' is + * the same with 'data' in this case. + */ + if (attr == data || !attr->write) return -EPERM; return attr->write(data, buf, count, ppos); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c117bd8fd1f6..25c14c58385c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -172,7 +172,6 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) WRITE_ONCE(hctx->dispatch_from, ctx); } -/* return true if hw queue need to be run again */ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -260,6 +259,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, if (!*merged_request) elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); return true; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio); default: return false; } @@ -428,7 +429,7 @@ done: } void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block) + bool run_queue, bool async) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index ba1d1418a96d..1e9c9018ace1 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -18,7 +18,7 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async, bool can_block); + bool run_queue, bool async); void blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 79969c3c234f..a54b4b070f1c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -248,7 +248,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; @@ -265,13 +265,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) q->mq_sysfs_init_done = false; } -void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) -{ - mutex_lock(&q->sysfs_lock); - __blk_mq_unregister_dev(dev, q); - mutex_unlock(&q->sysfs_lock); -} - void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index c81b40ecd3f1..336dde07b230 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -134,12 +134,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ws = bt_wait_ptr(bt, data->hctx); drop_ctx = data->ctx == NULL; do { - prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - - tag = __blk_mq_get_tag(data, bt); - if (tag != -1) - break; - /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for @@ -155,6 +149,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; + prepare_to_wait_exclusive(&ws->wait, &wait, + TASK_UNINTERRUPTIBLE); + + tag = __blk_mq_get_tag(data, bt); + if (tag != -1) + break; + if (data->ctx) blk_mq_put_ctx(data->ctx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 11097477eeab..df93102e2149 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,8 +95,7 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) && - !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) { /* * index[0] counts the specific partition that was asked * for. index[1] counts the ones that are active on the @@ -161,6 +160,8 @@ void blk_freeze_queue(struct request_queue *q) * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); + if (!q->mq_ops) + blk_drain_queue(q); blk_mq_freeze_queue_wait(q); } @@ -220,7 +221,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->queue_rq_srcu); + synchronize_srcu(hctx->srcu); else rcu = true; } @@ -270,15 +271,14 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - - rq->rq_flags = 0; + req_flags_t rq_flags = 0; if (data->flags & BLK_MQ_REQ_INTERNAL) { rq->tag = -1; rq->internal_tag = tag; } else { if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; + rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } rq->tag = tag; @@ -286,27 +286,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, data->hctx->tags->rqs[rq->tag] = rq; } - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->rq_flags = rq_flags; + rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; - /* do not touch atomic flags, it needs atomic ops against the timer */ - rq->cpu = -1; + INIT_LIST_HEAD(&rq->queuelist); INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; rq->start_time = jiffies; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; - set_start_time_ns(rq); - rq->io_start_time_ns = 0; -#endif rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; @@ -314,6 +309,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; + rq->__deadline = 0; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; @@ -322,6 +318,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->end_io_data = NULL; rq->next_rq = NULL; +#ifdef CONFIG_BLK_CGROUP + rq->rl = NULL; + set_start_time_ns(rq); + rq->io_start_time_ns = 0; +#endif + data->ctx->rq_dispatched[op_is_sync(op)]++; return rq; } @@ -441,7 +443,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_queue_exit(q); return ERR_PTR(-EXDEV); } - cpu = cpumask_first(alloc_data.hctx->cpumask); + cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); rq = blk_mq_get_request(q, NULL, op, &alloc_data); @@ -483,8 +485,7 @@ void blk_mq_free_request(struct request *rq) if (blk_rq_rl(rq)) blk_put_rl(blk_rq_rl(rq)); - clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -530,6 +531,9 @@ static void __blk_mq_complete_request(struct request *rq) bool shared = false; int cpu; + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT); + blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE); + if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); if (rq->rq_flags & RQF_STATS) { @@ -557,6 +561,56 @@ static void __blk_mq_complete_request(struct request *rq) put_cpu(); } +static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) + __releases(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + rcu_read_unlock(); + else + srcu_read_unlock(hctx->srcu, srcu_idx); +} + +static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) + __acquires(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + /* shut up gcc false positive */ + *srcu_idx = 0; + rcu_read_lock(); + } else + *srcu_idx = srcu_read_lock(hctx->srcu); +} + +static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate) +{ + unsigned long flags; + + /* + * blk_mq_rq_aborted_gstate() is used from the completion path and + * can thus be called from irq context. u64_stats_fetch in the + * middle of update on the same CPU leads to lockup. Disable irq + * while updating. + */ + local_irq_save(flags); + u64_stats_update_begin(&rq->aborted_gstate_sync); + rq->aborted_gstate = gstate; + u64_stats_update_end(&rq->aborted_gstate_sync); + local_irq_restore(flags); +} + +static u64 blk_mq_rq_aborted_gstate(struct request *rq) +{ + unsigned int start; + u64 aborted_gstate; + + do { + start = u64_stats_fetch_begin(&rq->aborted_gstate_sync); + aborted_gstate = rq->aborted_gstate; + } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start)); + + return aborted_gstate; +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -568,17 +622,33 @@ static void __blk_mq_complete_request(struct request *rq) void blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); + int srcu_idx; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + + /* + * If @rq->aborted_gstate equals the current instance, timeout is + * claiming @rq and we lost. This is synchronized through + * hctx_lock(). See blk_mq_timeout_work() for details. + * + * Completion path never blocks and we can directly use RCU here + * instead of hctx_lock() which can be either RCU or SRCU. + * However, that would complicate paths which want to synchronize + * against us. Let stay in sync with the issue path so that + * hctx_lock() covers both issue and completion paths. + */ + hctx_lock(hctx, &srcu_idx); + if (blk_mq_rq_aborted_gstate(rq) != rq->gstate) __blk_mq_complete_request(rq); + hctx_unlock(hctx, srcu_idx); } EXPORT_SYMBOL(blk_mq_complete_request); int blk_mq_request_started(struct request *rq) { - return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } EXPORT_SYMBOL_GPL(blk_mq_request_started); @@ -596,34 +666,27 @@ void blk_mq_start_request(struct request *rq) wbt_issue(q->rq_wb, &rq->issue_stat); } - blk_add_timer(rq); - - WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); /* - * Mark us as started and clear complete. Complete might have been - * set if requeue raced with timeout, which then marked it as - * complete. So be sure to clear complete again when we start - * the request, otherwise we'll ignore the completion event. + * Mark @rq in-flight which also advances the generation number, + * and register for timeout. Protect with a seqcount to allow the + * timeout path to read both @rq->gstate and @rq->deadline + * coherently. * - * Ensure that ->deadline is visible before we set STARTED, such that - * blk_mq_check_expired() is guaranteed to observe our ->deadline when - * it observes STARTED. + * This is the only place where a request is marked in-flight. If + * the timeout path reads an in-flight @rq->gstate, the + * @rq->deadline it reads together under @rq->gstate_seq is + * guaranteed to be the matching one. */ - smp_wmb(); - set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { - /* - * Coherence order guarantees these consecutive stores to a - * single variable propagate in the specified order. Thus the - * clear_bit() is ordered _after_ the set bit. See - * blk_mq_check_expired(). - * - * (the bits must be part of the same byte for this to be - * true). - */ - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); - } + preempt_disable(); + write_seqcount_begin(&rq->gstate_seq); + + blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT); + blk_add_timer(rq); + + write_seqcount_end(&rq->gstate_seq); + preempt_enable(); if (q->dma_drain_size && blk_rq_bytes(rq)) { /* @@ -637,13 +700,9 @@ void blk_mq_start_request(struct request *rq) EXPORT_SYMBOL(blk_mq_start_request); /* - * When we reach here because queue is busy, REQ_ATOM_COMPLETE - * flag isn't set yet, so there may be race with timeout handler, - * but given rq->deadline is just set in .queue_rq() under - * this situation, the race won't be possible in reality because - * rq->timeout should be set as big enough to cover the window - * between blk_mq_start_request() called from .queue_rq() and - * clearing REQ_ATOM_STARTED here. + * When we reach here because queue is busy, it's safe to change the state + * to IDLE without checking @rq->aborted_gstate because we should still be + * holding the RCU read lock and thus protected against timeout. */ static void __blk_mq_requeue_request(struct request *rq) { @@ -655,7 +714,8 @@ static void __blk_mq_requeue_request(struct request *rq) wbt_requeue(q->rq_wb, &rq->issue_stat); blk_mq_sched_requeue_request(rq); - if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) { + blk_mq_rq_update_state(rq, MQ_RQ_IDLE); if (q->dma_drain_size && blk_rq_bytes(rq)) rq->nr_phys_segments--; } @@ -687,13 +747,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false, true); + blk_mq_sched_insert_request(rq, true, false, false); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false, true); + blk_mq_sched_insert_request(rq, false, false, false); } blk_mq_run_hw_queues(q, false); @@ -727,7 +787,7 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_delayed_work(&q->requeue_work, 0); + kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); @@ -753,24 +813,15 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq); struct blk_mq_timeout_data { unsigned long next; unsigned int next_set; + unsigned int nr_expired; }; -void blk_mq_rq_timed_out(struct request *req, bool reserved) +static void blk_mq_rq_timed_out(struct request *req, bool reserved) { const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - /* - * We know that complete is set at this point. If STARTED isn't set - * anymore, then the request isn't active and the "timeout" should - * just be ignored. This can happen due to the bitflag ordering. - * Timeout first checks if STARTED is set, and if it is, assumes - * the request is active. But if we race with completion, then - * both flags will get cleared. So check here again, and ignore - * a timeout event with a request that isn't active. - */ - if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) - return; + req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED; if (ops->timeout) ret = ops->timeout(req, reserved); @@ -780,8 +831,13 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) __blk_mq_complete_request(req); break; case BLK_EH_RESET_TIMER: + /* + * As nothing prevents from completion happening while + * ->aborted_gstate is set, this may lead to ignored + * completions and further spurious timeouts. + */ + blk_mq_rq_update_aborted_gstate(req, 0); blk_add_timer(req); - blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: break; @@ -795,50 +851,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - unsigned long deadline; + unsigned long gstate, deadline; + int start; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) - return; + might_sleep(); - /* - * Ensures that if we see STARTED we must also see our - * up-to-date deadline, see blk_mq_start_request(). - */ - smp_rmb(); + if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) + return; - deadline = READ_ONCE(rq->deadline); + /* read coherent snapshots of @rq->state_gen and @rq->deadline */ + while (true) { + start = read_seqcount_begin(&rq->gstate_seq); + gstate = READ_ONCE(rq->gstate); + deadline = blk_rq_deadline(rq); + if (!read_seqcount_retry(&rq->gstate_seq, start)) + break; + cond_resched(); + } - /* - * The rq being checked may have been freed and reallocated - * out already here, we avoid this race by checking rq->deadline - * and REQ_ATOM_COMPLETE flag together: - * - * - if rq->deadline is observed as new value because of - * reusing, the rq won't be timed out because of timing. - * - if rq->deadline is observed as previous value, - * REQ_ATOM_COMPLETE flag won't be cleared in reuse path - * because we put a barrier between setting rq->deadline - * and clearing the flag in blk_mq_start_request(), so - * this rq won't be timed out too. - */ - if (time_after_eq(jiffies, deadline)) { - if (!blk_mark_rq_complete(rq)) { - /* - * Again coherence order ensures that consecutive reads - * from the same variable must be in that order. This - * ensures that if we see COMPLETE clear, we must then - * see STARTED set and we'll ignore this timeout. - * - * (There's also the MB implied by the test_and_clear()) - */ - blk_mq_rq_timed_out(rq, reserved); - } + /* if in-flight && overdue, mark for abortion */ + if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT && + time_after_eq(jiffies, deadline)) { + blk_mq_rq_update_aborted_gstate(rq, gstate); + data->nr_expired++; + hctx->nr_expired++; } else if (!data->next_set || time_after(data->next, deadline)) { data->next = deadline; data->next_set = 1; } } +static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + /* + * We marked @rq->aborted_gstate and waited for RCU. If there were + * completions that we lost to, they would have finished and + * updated @rq->gstate by now; otherwise, the completion path is + * now guaranteed to see @rq->aborted_gstate and yield. If + * @rq->aborted_gstate still matches @rq->gstate, @rq is ours. + */ + if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) && + READ_ONCE(rq->gstate) == rq->aborted_gstate) + blk_mq_rq_timed_out(rq, reserved); +} + static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = @@ -846,7 +903,9 @@ static void blk_mq_timeout_work(struct work_struct *work) struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, + .nr_expired = 0, }; + struct blk_mq_hw_ctx *hctx; int i; /* A deadlock might occur if a request is stuck requiring a @@ -865,14 +924,46 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; + /* scan for the expired ones and set their ->aborted_gstate */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); + if (data.nr_expired) { + bool has_rcu = false; + + /* + * Wait till everyone sees ->aborted_gstate. The + * sequential waits for SRCUs aren't ideal. If this ever + * becomes a problem, we can add per-hw_ctx rcu_head and + * wait in parallel. + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->nr_expired) + continue; + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + has_rcu = true; + else + synchronize_srcu(hctx->srcu); + + hctx->nr_expired = 0; + } + if (has_rcu) + synchronize_rcu(); + + /* terminate the ones we won */ + blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL); + } + if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { - struct blk_mq_hw_ctx *hctx; - + /* + * Request timeouts are handled as a forward rolling timer. If + * we end up here it means that no requests are pending and + * also that no request has been pending for a while. Mark + * each hctx as idle. + */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1008,68 +1099,71 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, /* * Mark us waiting for a tag. For shared tags, this involves hooking us into - * the tag wakeups. For non-shared tags, we can simply mark us nedeing a - * restart. For both caes, take care to check the condition again after + * the tag wakeups. For non-shared tags, we can simply mark us needing a + * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, struct request *rq) { struct blk_mq_hw_ctx *this_hctx = *hctx; - bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0; struct sbq_wait_state *ws; wait_queue_entry_t *wait; bool ret; - if (!shared_tags) { + if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); - } else { - wait = &this_hctx->dispatch_wait; - if (!list_empty_careful(&wait->entry)) - return false; - spin_lock(&this_hctx->lock); - if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); - return false; - } + /* + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. + * + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. + */ + return blk_mq_get_driver_tag(rq, hctx, false); + } + + wait = &this_hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); - add_wait_queue(&ws->wait, wait); + spin_lock(&this_hctx->lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&this_hctx->lock); + return false; } + ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + add_wait_queue(&ws->wait, wait); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq, hctx, false); - - if (!shared_tags) { - /* - * Don't clear RESTART here, someone else could have set it. - * At most this will cost an extra queue run. - */ - return ret; - } else { - if (!ret) { - spin_unlock(&this_hctx->lock); - return false; - } - - /* - * We got a tag, remove ourselves from the wait queue to ensure - * someone else gets the wakeup. - */ - spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); + if (!ret) { spin_unlock(&this_hctx->lock); - return true; + return false; } + + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); + spin_unlock(&this_hctx->lock); + + return true; } +#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ + bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bool got_budget) { @@ -1077,6 +1171,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, struct request *rq, *nxt; bool no_tag = false; int errors, queued; + blk_status_t ret = BLK_STS_OK; if (list_empty(list)) return false; @@ -1089,7 +1184,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, errors = queued = 0; do { struct blk_mq_queue_data bd; - blk_status_t ret; rq = list_first_entry(list, struct request, queuelist); if (!blk_mq_get_driver_tag(rq, &hctx, false)) { @@ -1134,7 +1228,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, } ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE) { + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { /* * If an I/O scheduler has been configured and we got a * driver tag for the next request already, free it @@ -1165,6 +1259,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * that is where we will continue on next queue run. */ if (!list_empty(list)) { + bool needs_restart; + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1188,10 +1284,17 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. + * + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART + * bit is set, run queue after a delay to avoid IO stalls + * that could otherwise occur if the queue is idle. */ - if (!blk_mq_sched_needs_restart(hctx) || + needs_restart = blk_mq_sched_needs_restart(hctx); + if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); + else if (needs_restart && (ret == BLK_STS_RESOURCE)) + blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); } return (queued + errors) != 0; @@ -1204,9 +1307,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) /* * We should be running this queue from one of the CPUs that * are mapped to it. + * + * There are at least two related races now between setting + * hctx->next_cpu from blk_mq_hctx_next_cpu() and running + * __blk_mq_run_hw_queue(): + * + * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), + * but later it becomes online, then this warning is harmless + * at all + * + * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), + * but later it becomes offline, then the warning can't be + * triggered, and we depend on blk-mq timeout handler to + * handle dispatched requests to this hctx */ - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); + if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)) { + printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", + raw_smp_processor_id(), + cpumask_empty(hctx->cpumask) ? "inactive": "active"); + dump_stack(); + } /* * We can't run the queue inline with ints disabled. Ensure that @@ -1214,17 +1335,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ WARN_ON_ONCE(in_interrupt()); - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - blk_mq_sched_dispatch_requests(hctx); - rcu_read_unlock(); - } else { - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - blk_mq_sched_dispatch_requests(hctx); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + blk_mq_sched_dispatch_requests(hctx); + hctx_unlock(hctx, srcu_idx); } /* @@ -1235,20 +1350,47 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { + bool tried = false; + if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { int next_cpu; - - next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); +select_cpu: + next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask, + cpu_online_mask); if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(hctx->cpumask); + next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask); - hctx->next_cpu = next_cpu; + /* + * No online CPU is found, so have to make sure hctx->next_cpu + * is set correctly for not breaking workqueue. + */ + if (next_cpu >= nr_cpu_ids) + hctx->next_cpu = cpumask_first(hctx->cpumask); + else + hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + /* + * Do unbound schedule if we can't find a online CPU for this hctx, + * and it should only happen in the path of handling CPU DEAD. + */ + if (!cpu_online(hctx->next_cpu)) { + if (!tried) { + tried = true; + goto select_cpu; + } + + /* + * Make sure to re-select CPU next time once after CPUs + * in hctx->cpumask become online again. + */ + hctx->next_cpu_batch = 1; + return WORK_CPU_UNBOUND; + } return hctx->next_cpu; } @@ -1272,9 +1414,8 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, - msecs_to_jiffies(msecs)); + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, + msecs_to_jiffies(msecs)); } void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) @@ -1285,7 +1426,23 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (blk_mq_hctx_has_pending(hctx)) { + int srcu_idx; + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even __blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + hctx_lock(hctx, &srcu_idx); + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx); + hctx_unlock(hctx, srcu_idx); + + if (need_run) { __blk_mq_delay_run_hw_queue(hctx, async, 0); return true; } @@ -1593,9 +1750,9 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } -static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool may_sleep) +static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { @@ -1604,15 +1761,53 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, }; blk_qc_t new_cookie; blk_status_t ret; + + new_cookie = request_to_qc_t(hctx, rq); + + /* + * For OK queue, we are done. For error, caller may kill it. + * Any other error (busy), just add it to our list as we + * previously would have done. + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + switch (ret) { + case BLK_STS_OK: + *cookie = new_cookie; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + __blk_mq_requeue_request(rq); + break; + default: + *cookie = BLK_QC_T_NONE; + break; + } + + return ret; +} + +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, + bool bypass_insert) +{ + struct request_queue *q = rq->q; bool run_queue = true; - /* RCU or SRCU read lock is needed before checking quiesced flag */ + /* + * RCU or SRCU read lock is needed before checking quiesced flag. + * + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. + */ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; + bypass_insert = false; goto insert; } - if (q->elevator) + if (q->elevator && !bypass_insert) goto insert; if (!blk_mq_get_driver_tag(rq, NULL, false)) @@ -1623,47 +1818,47 @@ static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - new_cookie = request_to_qc_t(hctx, rq); - - /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done - */ - ret = q->mq_ops->queue_rq(hctx, &bd); - switch (ret) { - case BLK_STS_OK: - *cookie = new_cookie; - return; - case BLK_STS_RESOURCE: - __blk_mq_requeue_request(rq); - goto insert; - default: - *cookie = BLK_QC_T_NONE; - blk_mq_end_request(rq, ret); - return; - } - + return __blk_mq_issue_directly(hctx, rq, cookie); insert: - blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep); + if (bypass_insert) + return BLK_STS_RESOURCE; + + blk_mq_sched_insert_request(rq, false, run_queue, false); + return BLK_STS_OK; } static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie) { - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - rcu_read_lock(); - __blk_mq_try_issue_directly(hctx, rq, cookie, false); - rcu_read_unlock(); - } else { - unsigned int srcu_idx; + blk_status_t ret; + int srcu_idx; - might_sleep(); + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - srcu_idx = srcu_read_lock(hctx->queue_rq_srcu); - __blk_mq_try_issue_directly(hctx, rq, cookie, true); - srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx); - } + hctx_lock(hctx, &srcu_idx); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) + blk_mq_sched_insert_request(rq, false, true, false); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_issue_directly(struct request *rq) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + hctx_unlock(hctx, srcu_idx); + + return ret; } static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) @@ -1774,7 +1969,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } else if (q->elevator) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_sched_insert_request(rq, false, true, true, true); + blk_mq_sched_insert_request(rq, false, true, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); @@ -1867,6 +2062,22 @@ static size_t order_to_size(unsigned int order) return (size_t)PAGE_SIZE << order; } +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, int node) +{ + int ret; + + if (set->ops->init_request) { + ret = set->ops->init_request(set, rq, hctx_idx, node); + if (ret) + return ret; + } + + seqcount_init(&rq->gstate_seq); + u64_stats_init(&rq->aborted_gstate_sync); + return 0; +} + int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { @@ -1928,12 +2139,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct request *rq = p; tags->static_rqs[i] = rq; - if (set->ops->init_request) { - if (set->ops->init_request(set, rq, hctx_idx, - node)) { - tags->static_rqs[i] = NULL; - goto fail; - } + if (blk_mq_init_request(set, rq, hctx_idx, node)) { + tags->static_rqs[i] = NULL; + goto fail; } p += rq_size; @@ -1992,7 +2200,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, { blk_mq_debugfs_unregister_hctx(hctx); - blk_mq_tag_idle(hctx); + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); @@ -2003,7 +2212,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, set->ops->exit_hctx(hctx, hctx_idx); if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->queue_rq_srcu); + cleanup_srcu_struct(hctx->srcu); blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); @@ -2072,13 +2281,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->fq) goto sched_exit_hctx; - if (set->ops->init_request && - set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, - node)) + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->queue_rq_srcu); + init_srcu_struct(hctx->srcu); blk_mq_debugfs_register_hctx(q, hctx); @@ -2114,16 +2321,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; - /* If the cpu isn't present, the cpu is mapped to first hctx */ - if (!cpu_present(i)) - continue; - - hctx = blk_mq_map_queue(q, i); - /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ + hctx = blk_mq_map_queue(q, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = local_memory_node(cpu_to_node(i)); } @@ -2180,7 +2382,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * * If the cpu isn't present, the cpu is mapped to first hctx. */ - for_each_present_cpu(i) { + for_each_possible_cpu(i) { hctx_idx = q->mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && @@ -2234,7 +2436,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) /* * Initialize batch roundrobin counts */ - hctx->next_cpu = cpumask_first(hctx->cpumask); + hctx->next_cpu = cpumask_first_and(hctx->cpumask, + cpu_online_mask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } @@ -2367,7 +2570,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) { int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu), + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), __alignof__(struct blk_mq_hw_ctx)) != sizeof(struct blk_mq_hw_ctx)); @@ -2384,6 +2587,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; blk_mq_sysfs_unregister(q); + + /* protect against switching io scheduler */ + mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int node; @@ -2428,6 +2634,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, } } q->nr_hw_queues = i; + mutex_unlock(&q->sysfs_lock); blk_mq_sysfs_register(q); } @@ -2599,9 +2806,27 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) + if (set->ops->map_queues) { + int cpu; + /* + * transport .map_queues is usually done in the following + * way: + * + * for (queue = 0; queue < set->nr_hw_queues; queue++) { + * mask = get_cpu_mask(queue) + * for_each_cpu(cpu, mask) + * set->mq_map[cpu] = queue; + * } + * + * When we need to remap, the table has to be cleared for + * killing stale mapping since one CPU may not be mapped + * to any hw queue. + */ + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; + return set->ops->map_queues(set); - else + } else return blk_mq_map_queues(set); } @@ -2710,6 +2935,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return -EINVAL; blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { @@ -2733,6 +2959,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; @@ -2848,7 +3075,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, unsigned int nsecs; ktime_t kt; - if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + if (rq->rq_flags & RQF_MQ_POLL_SLEPT) return false; /* @@ -2868,7 +3095,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, if (!nsecs) return false; - set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + rq->rq_flags |= RQF_MQ_POLL_SLEPT; /* * This will be replaced with the stats tracking code, using @@ -2882,7 +3109,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, hrtimer_init_sleeper(&hs, current); do { - if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) break; set_current_state(TASK_UNINTERRUPTIBLE); hrtimer_start_expires(&hs.timer, mode); @@ -2968,12 +3195,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) static int __init blk_mq_init(void) { - /* - * See comment in block/blk.h rq_atomic_flags enum - */ - BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) != - (REQ_ATOM_COMPLETE / BITS_PER_BYTE)); - cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); return 0; diff --git a/block/blk-mq.h b/block/blk-mq.h index 6c7c3ff5bf62..88c558f71819 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -27,6 +27,20 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; +/* + * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value + * and the upper bits the generation number. + */ +enum mq_rq_state { + MQ_RQ_IDLE = 0, + MQ_RQ_IN_FLIGHT = 1, + MQ_RQ_COMPLETE = 2, + + MQ_RQ_STATE_BITS = 2, + MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1, + MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS, +}; + void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); @@ -60,6 +74,9 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_issue_directly(struct request *rq); + /* * CPU -> queue mappings */ @@ -81,10 +98,41 @@ extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); -extern void blk_mq_rq_timed_out(struct request *req, bool reserved); - void blk_mq_release(struct request_queue *q); +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline int blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK; +} + +/** + * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request + * @rq: target request. + * @state: new state to set. + * + * Set @rq's state to @state. The caller is responsible for ensuring that + * there are no other updaters. A request can transition into IN_FLIGHT + * only from IDLE and doing so increments the generation number. + */ +static inline void blk_mq_rq_update_state(struct request *rq, + enum mq_rq_state state) +{ + u64 old_val = READ_ONCE(rq->gstate); + u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state; + + if (state == MQ_RQ_IN_FLIGHT) { + WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE); + new_val += MQ_RQ_GEN_INC; + } + + /* avoid exposing interim values */ + WRITE_ONCE(rq->gstate, new_val); +} + static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 870484eaed1f..cbea895a5547 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -853,6 +853,10 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +/** + * blk_register_queue - register a block layer queue with sysfs + * @disk: Disk of which the request queue should be registered with sysfs. + */ int blk_register_queue(struct gendisk *disk) { int ret; @@ -909,11 +913,12 @@ int blk_register_queue(struct gendisk *disk) if (q->request_fn || (q->mq_ops && q->elevator)) { ret = elv_register_queue(q); if (ret) { + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(dev); kobject_put(&dev->kobj); - goto unlock; + return ret; } } ret = 0; @@ -921,7 +926,15 @@ unlock: mutex_unlock(&q->sysfs_lock); return ret; } +EXPORT_SYMBOL_GPL(blk_register_queue); +/** + * blk_unregister_queue - counterpart of blk_register_queue() + * @disk: Disk of which the request queue should be unregistered from sysfs. + * + * Note: the caller is responsible for guaranteeing that this function is called + * after blk_register_queue() has finished. + */ void blk_unregister_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -929,21 +942,39 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; - mutex_lock(&q->sysfs_lock); - queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); - mutex_unlock(&q->sysfs_lock); + /* Return early if disk->queue was never registered. */ + if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) + return; - wbt_exit(q); + /* + * Since sysfs_remove_dir() prevents adding new directory entries + * before removal of existing entries starts, protect against + * concurrent elv_iosched_store() calls. + */ + mutex_lock(&q->sysfs_lock); + spin_lock_irq(q->queue_lock); + queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + spin_unlock_irq(q->queue_lock); + /* + * Remove the sysfs attributes before unregistering the queue data + * structures that can be modified through sysfs. + */ if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); - - if (q->request_fn || (q->mq_ops && q->elevator)) - elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); blk_trace_remove_sysfs(disk_to_dev(disk)); + + wbt_exit(q); + + mutex_lock(&q->sysfs_lock); + if (q->request_fn || (q->mq_ops && q->elevator)) + elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); + kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 825bc29767e6..c5a131673733 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -216,9 +216,9 @@ struct throtl_data unsigned int scale; - struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets; + struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets[2]; unsigned long last_calculate_time; unsigned long filtered_latency; @@ -1511,10 +1511,20 @@ static struct cftype throtl_legacy_files[] = { .seq_show = blkg_print_stat_bytes, }, { + .name = "throttle.io_service_bytes_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes_recursive, + }, + { .name = "throttle.io_serviced", .private = (unsigned long)&blkcg_policy_throtl, .seq_show = blkg_print_stat_ios, }, + { + .name = "throttle.io_serviced_recursive", + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios_recursive, + }, { } /* terminate */ }; @@ -2040,10 +2050,10 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_update_latency_buckets(struct throtl_data *td) { - struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; - int i, cpu; - unsigned long last_latency = 0; - unsigned long latency; + struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; + int i, cpu, rw; + unsigned long last_latency[2] = { 0 }; + unsigned long latency[2]; if (!blk_queue_nonrot(td->queue)) return; @@ -2052,56 +2062,67 @@ static void throtl_update_latency_buckets(struct throtl_data *td) td->last_calculate_time = jiffies; memset(avg_latency, 0, sizeof(avg_latency)); - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets, cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets[rw], + cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } - if (tmp->samples >= 32) { - int samples = tmp->samples; + if (tmp->samples >= 32) { + int samples = tmp->samples; - latency = tmp->total_latency; + latency[rw] = tmp->total_latency; - tmp->total_latency = 0; - tmp->samples = 0; - latency /= samples; - if (latency == 0) - continue; - avg_latency[i].latency = latency; + tmp->total_latency = 0; + tmp->samples = 0; + latency[rw] /= samples; + if (latency[rw] == 0) + continue; + avg_latency[rw][i].latency = latency[rw]; + } } } - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[i].latency) { - if (td->avg_buckets[i].latency < last_latency) - td->avg_buckets[i].latency = last_latency; - continue; - } + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[rw][i].latency) { + if (td->avg_buckets[rw][i].latency < last_latency[rw]) + td->avg_buckets[rw][i].latency = + last_latency[rw]; + continue; + } - if (!td->avg_buckets[i].valid) - latency = avg_latency[i].latency; - else - latency = (td->avg_buckets[i].latency * 7 + - avg_latency[i].latency) >> 3; + if (!td->avg_buckets[rw][i].valid) + latency[rw] = avg_latency[rw][i].latency; + else + latency[rw] = (td->avg_buckets[rw][i].latency * 7 + + avg_latency[rw][i].latency) >> 3; - td->avg_buckets[i].latency = max(latency, last_latency); - td->avg_buckets[i].valid = true; - last_latency = td->avg_buckets[i].latency; + td->avg_buckets[rw][i].latency = max(latency[rw], + last_latency[rw]); + td->avg_buckets[rw][i].valid = true; + last_latency[rw] = td->avg_buckets[rw][i].latency; + } } for (i = 0; i < LATENCY_BUCKET_SIZE; i++) throtl_log(&td->service_queue, - "Latency bucket %d: latency=%ld, valid=%d", i, - td->avg_buckets[i].latency, td->avg_buckets[i].valid); + "Latency bucket %d: read latency=%ld, read valid=%d, " + "write latency=%ld, write valid=%d", i, + td->avg_buckets[READ][i].latency, + td->avg_buckets[READ][i].valid, + td->avg_buckets[WRITE][i].latency, + td->avg_buckets[WRITE][i].valid); } #else static inline void throtl_update_latency_buckets(struct throtl_data *td) @@ -2226,13 +2247,7 @@ again: out_unlock: spin_unlock_irq(q->queue_lock); out: - /* - * As multiple blk-throtls may stack in the same issue path, we - * don't want bios to leave with the flag set. Clear the flag if - * being issued. - */ - if (!throttled) - bio_clear_flag(bio, BIO_THROTTLED); + bio_set_flag(bio, BIO_THROTTLED); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) @@ -2248,16 +2263,17 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size, struct latency_bucket *latency; int index; - if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || + if (!td || td->limit_index != LIMIT_LOW || + !(op == REQ_OP_READ || op == REQ_OP_WRITE) || !blk_queue_nonrot(td->queue)) return; index = request_bucket_index(size); - latency = get_cpu_ptr(td->latency_buckets); + latency = get_cpu_ptr(td->latency_buckets[op]); latency[index].total_latency += time; latency[index].samples++; - put_cpu_ptr(td->latency_buckets); + put_cpu_ptr(td->latency_buckets[op]); } void blk_throtl_stat_add(struct request *rq, u64 time_ns) @@ -2276,6 +2292,7 @@ void blk_throtl_bio_endio(struct bio *bio) unsigned long finish_time; unsigned long start_time; unsigned long lat; + int rw = bio_data_dir(bio); tg = bio->bi_cg_private; if (!tg) @@ -2304,7 +2321,7 @@ void blk_throtl_bio_endio(struct bio *bio) bucket = request_bucket_index( blk_stat_size(&bio->bi_issue_stat)); - threshold = tg->td->avg_buckets[bucket].latency + + threshold = tg->td->avg_buckets[rw][bucket].latency + tg->latency_target; if (lat > threshold) tg->bad_bio_cnt++; @@ -2397,9 +2414,16 @@ int blk_throtl_init(struct request_queue *q) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; - td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * + td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets[READ]) { + kfree(td); + return -ENOMEM; + } + td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets) { + if (!td->latency_buckets[WRITE]) { + free_percpu(td->latency_buckets[READ]); kfree(td); return -ENOMEM; } @@ -2418,7 +2442,8 @@ int blk_throtl_init(struct request_queue *q) /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); if (ret) { - free_percpu(td->latency_buckets); + free_percpu(td->latency_buckets[READ]); + free_percpu(td->latency_buckets[WRITE]); kfree(td); } return ret; @@ -2429,7 +2454,8 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!q->td); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets); + free_percpu(q->td->latency_buckets[READ]); + free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } @@ -2447,15 +2473,17 @@ void blk_throtl_register_queue(struct request_queue *q) } else { td->throtl_slice = DFL_THROTL_SLICE_HD; td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY; + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; + td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; + } } #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ td->throtl_slice = DFL_THROTL_SLICE_HD; #endif - td->track_bio_latency = !q->mq_ops && !q->request_fn; + td->track_bio_latency = !queue_is_rq_based(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 764ecf9aeb30..a05e3676d24a 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -112,7 +112,9 @@ static void blk_rq_timed_out(struct request *req) static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, unsigned int *next_set) { - if (time_after_eq(jiffies, rq->deadline)) { + const unsigned long deadline = blk_rq_deadline(rq); + + if (time_after_eq(jiffies, deadline)) { list_del_init(&rq->timeout_list); /* @@ -120,8 +122,8 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout */ if (!blk_mark_rq_complete(rq)) blk_rq_timed_out(rq); - } else if (!*next_set || time_after(*next_timeout, rq->deadline)) { - *next_timeout = rq->deadline; + } else if (!*next_set || time_after(*next_timeout, deadline)) { + *next_timeout = deadline; *next_set = 1; } } @@ -156,12 +158,17 @@ void blk_timeout_work(struct work_struct *work) */ void blk_abort_request(struct request *req) { - if (blk_mark_rq_complete(req)) - return; - if (req->q->mq_ops) { - blk_mq_rq_timed_out(req, false); + /* + * All we need to ensure is that timeout scan takes place + * immediately and that scan sees the new timeout value. + * No need for fancy synchronizations. + */ + blk_rq_set_deadline(req, jiffies); + mod_timer(&req->q->timeout, 0); } else { + if (blk_mark_rq_complete(req)) + return; blk_delete_timer(req); blk_rq_timed_out(req); } @@ -208,7 +215,8 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; - WRITE_ONCE(req->deadline, jiffies + req->timeout); + blk_rq_set_deadline(req, jiffies + req->timeout); + req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED; /* * Only the non-mq case needs to add the request to a protected list. @@ -222,7 +230,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(req->deadline)); + expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk-wbt.c b/block/blk-wbt.c index ae8de9780085..f92fc84b5e2c 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -697,7 +697,15 @@ u64 wbt_default_latency_nsec(struct request_queue *q) static int wbt_data_dir(const struct request *rq) { - return rq_data_dir(rq); + const int op = req_op(rq); + + if (op == REQ_OP_READ) + return READ; + else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH) + return WRITE; + + /* don't account */ + return -1; } int wbt_init(struct request_queue *q) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ff57fb51b338..acb7252c7e81 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -22,6 +22,48 @@ static inline sector_t blk_zone_start(struct request_queue *q, } /* + * Return true if a request is a write requests that needs zone write locking. + */ +bool blk_req_needs_zone_write_lock(struct request *rq) +{ + if (!rq->q->seq_zones_wlock) + return false; + + if (blk_rq_is_passthrough(rq)) + return false; + + switch (req_op(rq)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return blk_rq_zone_is_seq(rq); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); + +void __blk_req_zone_write_lock(struct request *rq) +{ + if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock))) + return; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); + +void __blk_req_zone_write_unlock(struct request *rq) +{ + rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; + if (rq->q->seq_zones_wlock) + WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock)); +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); + +/* * Check that a zone report belongs to the partition. * If yes, fix its start sector and write pointer, copy it in the * zone information array and return true. Return false otherwise. diff --git a/block/blk.h b/block/blk.h index 3f1446937aec..46db5dc83dcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -120,33 +120,23 @@ void blk_account_io_completion(struct request *req, unsigned int bytes); void blk_account_io_done(struct request *req); /* - * Internal atomic flags for request handling - */ -enum rq_atomic_flags { - /* - * Keep these two bits first - not because we depend on the - * value of them, but we do depend on them being in the same - * byte of storage to ensure ordering on writes. Keeping them - * first will achieve that nicely. - */ - REQ_ATOM_COMPLETE = 0, - REQ_ATOM_STARTED, - - REQ_ATOM_POLL_SLEPT, -}; - -/* * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them succeeds + * sure that only one of them succeeds. Steal the bottom bit of the + * __deadline field for this. */ static inline int blk_mark_rq_complete(struct request *rq) { - return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + return test_and_set_bit(0, &rq->__deadline); } static inline void blk_clear_rq_complete(struct request *rq) { - clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); + clear_bit(0, &rq->__deadline); +} + +static inline bool blk_rq_is_complete(struct request *rq) +{ + return test_bit(0, &rq->__deadline); } /* @@ -172,6 +162,9 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq e->type->ops.sq.elevator_deactivate_req_fn(q, rq); } +int elv_register_queue(struct request_queue *q); +void elv_unregister_queue(struct request_queue *q); + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); #ifdef CONFIG_FAIL_IO_TIMEOUT @@ -246,6 +239,21 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) } /* + * Steal a bit from this field for legacy IO path atomic IO marking. Note that + * setting the deadline clears the bottom bit, potentially clearing the + * completed bit. The user has to be OK with this (current ones are fine). + */ +static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) +{ + rq->__deadline = time & ~0x1UL; +} + +static inline unsigned long blk_rq_deadline(struct request *rq) +{ + return rq->__deadline & ~0x1UL; +} + +/* * Internal io_context interface */ void get_io_context(struct io_context *ioc); @@ -330,4 +338,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) } #endif /* CONFIG_BOUNCE */ +extern void blk_drain_queue(struct request_queue *q); + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index fceb1a96480b..6a3e68292273 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -113,45 +113,50 @@ int init_emergency_isa_pool(void) static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { unsigned char *vfrom; - struct bio_vec tovec, *fromvec = from->bi_io_vec; + struct bio_vec tovec, fromvec; struct bvec_iter iter; + /* + * The bio of @from is created by bounce, so we can iterate + * its bvec from start to end, but the @from->bi_iter can't be + * trusted because it might be changed by splitting. + */ + struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; bio_for_each_segment(tovec, to, iter) { - if (tovec.bv_page != fromvec->bv_page) { + fromvec = bio_iter_iovec(from, from_iter); + if (tovec.bv_page != fromvec.bv_page) { /* * fromvec->bv_offset and fromvec->bv_len might have * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec->bv_page) + + vfrom = page_address(fromvec.bv_page) + tovec.bv_offset; bounce_copy_vec(&tovec, vfrom); flush_dcache_page(tovec.bv_page); } - - fromvec++; + bio_advance_iter(from, &from_iter, tovec.bv_len); } } static void bounce_end_io(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; + struct bio_vec *bvec, orig_vec; int i; - int start = bio_orig->bi_iter.bi_idx; + struct bvec_iter orig_iter = bio_orig->bi_iter; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i + start; - - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); + orig_vec = bio_iter_iovec(bio_orig, orig_iter); + if (bvec->bv_page != orig_vec.bv_page) { + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } bio_orig->bi_status = bio->bi_status; @@ -200,6 +205,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, unsigned i = 0; bool bounce = false; int sectors = 0; + bool passthrough = bio_is_passthrough(*bio_orig); bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_PAGES) @@ -210,13 +216,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!bounce) return; - if (sectors < bio_sectors(*bio_orig)) { + if (!passthrough && sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split); bio_chain(bio, *bio_orig); generic_make_request(*bio_orig); *bio_orig = bio; } - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set); + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL : + bounce_bio_set); bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 15d25ccd51a5..1474153f73e3 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -30,7 +30,7 @@ /** * bsg_teardown_job - routine to teardown a bsg job - * @job: bsg_job that is to be torn down + * @kref: kref inside bsg_job that is to be torn down */ static void bsg_teardown_job(struct kref *kref) { @@ -251,6 +251,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) * @name: device to give bsg device * @job_fn: bsg job handler * @dd_job_size: size of LLD data needed for each job + * @release: @dev release function */ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, bsg_job_fn *job_fn, int dd_job_size, diff --git a/block/bsg.c b/block/bsg.c index 452f94f1c5d4..06dc96e1f670 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -32,6 +32,9 @@ #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" #define BSG_VERSION "0.4" +#define bsg_dbg(bd, fmt, ...) \ + pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) + struct bsg_device { struct request_queue *queue; spinlock_t lock; @@ -55,14 +58,6 @@ enum { #define BSG_DEFAULT_CMDS 64 #define BSG_MAX_DEVS 32768 -#undef BSG_DEBUG - -#ifdef BSG_DEBUG -#define dprintk(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ##args) -#else -#define dprintk(fmt, args...) -#endif - static DEFINE_MUTEX(bsg_mutex); static DEFINE_IDR(bsg_minor_idr); @@ -123,7 +118,7 @@ static struct bsg_command *bsg_alloc_command(struct bsg_device *bd) bc->bd = bd; INIT_LIST_HEAD(&bc->list); - dprintk("%s: returning free cmd %p\n", bd->name, bc); + bsg_dbg(bd, "returning free cmd %p\n", bc); return bc; out: spin_unlock_irq(&bd->lock); @@ -222,7 +217,8 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t mode) if (!bcd->class_dev) return ERR_PTR(-ENXIO); - dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, + bsg_dbg(bd, "map hdr %llx/%u %llx/%u\n", + (unsigned long long) hdr->dout_xferp, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, hdr->din_xfer_len); @@ -299,8 +295,8 @@ static void bsg_rq_end_io(struct request *rq, blk_status_t status) struct bsg_device *bd = bc->bd; unsigned long flags; - dprintk("%s: finished rq %p bc %p, bio %p\n", - bd->name, rq, bc, bc->bio); + bsg_dbg(bd, "finished rq %p bc %p, bio %p\n", + rq, bc, bc->bio); bc->hdr.duration = jiffies_to_msecs(jiffies - bc->hdr.duration); @@ -333,7 +329,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, list_add_tail(&bc->list, &bd->busy_list); spin_unlock_irq(&bd->lock); - dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); + bsg_dbg(bd, "queueing rq %p, bc %p\n", rq, bc); rq->end_io_data = bc; blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); @@ -379,7 +375,7 @@ static struct bsg_command *bsg_get_done_cmd(struct bsg_device *bd) } } while (1); - dprintk("%s: returning done %p\n", bd->name, bc); + bsg_dbg(bd, "returning done %p\n", bc); return bc; } @@ -390,7 +386,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct scsi_request *req = scsi_req(rq); int ret = 0; - dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); + pr_debug("rq %p bio %p 0x%x\n", rq, bio, req->result); /* * fill in all the output members */ @@ -469,7 +465,7 @@ static int bsg_complete_all_commands(struct bsg_device *bd) struct bsg_command *bc; int ret, tret; - dprintk("%s: entered\n", bd->name); + bsg_dbg(bd, "entered\n"); /* * wait for all commands to complete @@ -572,7 +568,7 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) int ret; ssize_t bytes_read; - dprintk("%s: read %zd bytes\n", bd->name, count); + bsg_dbg(bd, "read %zd bytes\n", count); bsg_set_block(bd, file); @@ -646,7 +642,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) ssize_t bytes_written; int ret; - dprintk("%s: write %zd bytes\n", bd->name, count); + bsg_dbg(bd, "write %zd bytes\n", count); if (unlikely(uaccess_kernel())) return -EINVAL; @@ -664,7 +660,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) if (!bytes_written || err_block_err(ret)) bytes_written = ret; - dprintk("%s: returning %zd\n", bd->name, bytes_written); + bsg_dbg(bd, "returning %zd\n", bytes_written); return bytes_written; } @@ -717,7 +713,7 @@ static int bsg_put_device(struct bsg_device *bd) hlist_del(&bd->dev_list); mutex_unlock(&bsg_mutex); - dprintk("%s: tearing down\n", bd->name); + bsg_dbg(bd, "tearing down\n"); /* * close can always block @@ -744,9 +740,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, struct file *file) { struct bsg_device *bd; -#ifdef BSG_DEBUG unsigned char buf[32]; -#endif if (!blk_queue_scsi_passthrough(rq)) { WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); @@ -771,7 +765,7 @@ static struct bsg_device *bsg_add_device(struct inode *inode, hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); - dprintk("bound to <%s>, max queue %d\n", + bsg_dbg(bd, "bound to <%s>, max queue %d\n", format_dev_t(buf, inode->i_rdev), bd->max_queue); mutex_unlock(&bsg_mutex); @@ -845,19 +839,19 @@ static int bsg_release(struct inode *inode, struct file *file) return bsg_put_device(bd); } -static unsigned int bsg_poll(struct file *file, poll_table *wait) +static __poll_t bsg_poll(struct file *file, poll_table *wait) { struct bsg_device *bd = file->private_data; - unsigned int mask = 0; + __poll_t mask = 0; poll_wait(file, &bd->wq_done, wait); poll_wait(file, &bd->wq_free, wait); spin_lock_irq(&bd->lock); if (!list_empty(&bd->done_list)) - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; if (bd->queued_cmds < bd->max_queue) - mask |= POLLOUT; + mask |= EPOLLOUT; spin_unlock_irq(&bd->lock); return mask; diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index b83f77460d28..9de9f156e203 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -50,8 +50,6 @@ struct deadline_data { int front_merges; }; -static void deadline_move_request(struct deadline_data *, struct request *); - static inline struct rb_root * deadline_rb_root(struct deadline_data *dd, struct request *rq) { @@ -100,6 +98,12 @@ deadline_add_request(struct request_queue *q, struct request *rq) struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + deadline_add_rq_rb(dd, rq); /* @@ -190,6 +194,12 @@ deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) { struct request_queue *q = rq->q; + /* + * For a zoned block device, write requests must write lock their + * target zone. + */ + blk_req_zone_write_lock(rq); + deadline_remove_request(q, rq); elv_dispatch_add_tail(q, rq); } @@ -231,6 +241,69 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) } /* + * For the specified data direction, return the next request to dispatch using + * arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + if (list_empty(&dd->fifo_list[data_dir])) + return NULL; + + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + return rq; + } + + return NULL; +} + +/* + * For the specified data direction, return the next request to dispatch using + * sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + rq = dd->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + return rq; + rq = deadline_latter_request(rq); + } + + return NULL; +} + +/* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ @@ -239,16 +312,15 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) struct deadline_data *dd = q->elevator->elevator_data; const int reads = !list_empty(&dd->fifo_list[READ]); const int writes = !list_empty(&dd->fifo_list[WRITE]); - struct request *rq; + struct request *rq, *next_rq; int data_dir; /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; - else - rq = dd->next_rq[READ]; + rq = deadline_next_request(dd, WRITE); + if (!rq) + rq = deadline_next_request(dd, READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -262,7 +334,8 @@ static int deadline_dispatch_requests(struct request_queue *q, int force) if (reads) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (deadline_fifo_request(dd, WRITE) && + (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -291,21 +364,29 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + next_rq = deadline_next_request(dd, data_dir); + if (deadline_check_fifo(dd, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = deadline_fifo_request(dd, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = next_rq; } + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return 0; + dd->batching = 0; dispatch_request: @@ -318,6 +399,16 @@ dispatch_request: return 1; } +/* + * For zoned block devices, write unlock the target zone of completed + * write requests. + */ +static void +deadline_completed_request(struct request_queue *q, struct request *rq) +{ + blk_req_zone_write_unlock(rq); +} + static void deadline_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -439,6 +530,7 @@ static struct elevator_type iosched_deadline = { .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, .elevator_dispatch_fn = deadline_dispatch_requests, + .elevator_completed_req_fn = deadline_completed_request, .elevator_add_req_fn = deadline_add_request, .elevator_former_req_fn = elv_rb_former_request, .elevator_latter_req_fn = elv_rb_latter_request, diff --git a/block/elevator.c b/block/elevator.c index 7bda083d5968..e87e9b43aba0 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -869,6 +869,8 @@ int elv_register_queue(struct request_queue *q) struct elevator_queue *e = q->elevator; int error; + lockdep_assert_held(&q->sysfs_lock); + error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; @@ -886,10 +888,11 @@ int elv_register_queue(struct request_queue *q) } return error; } -EXPORT_SYMBOL(elv_register_queue); void elv_unregister_queue(struct request_queue *q) { + lockdep_assert_held(&q->sysfs_lock); + if (q) { struct elevator_queue *e = q->elevator; @@ -900,7 +903,6 @@ void elv_unregister_queue(struct request_queue *q) wbt_enable_default(q); } } -EXPORT_SYMBOL(elv_unregister_queue); int elv_register(struct elevator_type *e) { @@ -967,7 +969,10 @@ static int elevator_switch_mq(struct request_queue *q, { int ret; + lockdep_assert_held(&q->sysfs_lock); + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); if (q->elevator) { if (q->elevator->registered) @@ -994,6 +999,7 @@ static int elevator_switch_mq(struct request_queue *q, blk_add_trace_msg(q, "elv switch: none"); out: + blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; } @@ -1010,6 +1016,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) bool old_registered = false; int err; + lockdep_assert_held(&q->sysfs_lock); + if (q->mq_ops) return elevator_switch_mq(q, new_e); diff --git a/block/genhd.c b/block/genhd.c index 96a66f671720..88a53c188cb7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -629,16 +629,18 @@ exit: } /** - * device_add_disk - add partitioning information to kernel list + * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information + * @register_queue: register the queue if set to true * * This function registers the partitioning information in @disk * with the kernel. * * FIXME: error handling */ -void device_add_disk(struct device *parent, struct gendisk *disk) +static void __device_add_disk(struct device *parent, struct gendisk *disk, + bool register_queue) { dev_t devt; int retval; @@ -682,7 +684,8 @@ void device_add_disk(struct device *parent, struct gendisk *disk) exact_match, exact_lock, disk); } register_disk(parent, disk); - blk_register_queue(disk); + if (register_queue) + blk_register_queue(disk); /* * Take an extra ref on queue which will be put on disk_release() @@ -693,8 +696,19 @@ void device_add_disk(struct device *parent, struct gendisk *disk) disk_add_events(disk); blk_integrity_add(disk); } + +void device_add_disk(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, true); +} EXPORT_SYMBOL(device_add_disk); +void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, false); +} +EXPORT_SYMBOL(device_add_disk_no_queue_reg); + void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -725,7 +739,8 @@ void del_gendisk(struct gendisk *disk) * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - bdi_unregister(disk->queue->backing_dev_info); + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->queue->backing_dev_info); blk_unregister_queue(disk); } else { WARN_ON(1); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index b4df317c2916..f95c60774ce8 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -100,9 +100,13 @@ struct kyber_hctx_data { unsigned int cur_domain; unsigned int batching; wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; + struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; atomic_t wait_index[KYBER_NUM_DOMAINS]; }; +static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, + void *key); + static int rq_sched_domain(const struct request *rq) { unsigned int op = rq->cmd_flags; @@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) for (i = 0; i < KYBER_NUM_DOMAINS; i++) { INIT_LIST_HEAD(&khd->rqs[i]); + init_waitqueue_func_entry(&khd->domain_wait[i], + kyber_domain_wake); + khd->domain_wait[i].private = hctx; INIT_LIST_HEAD(&khd->domain_wait[i].entry); atomic_set(&khd->wait_index[i], 0); } @@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, int nr; nr = __sbitmap_queue_get(domain_tokens); - if (nr >= 0) - return nr; /* * If we failed to get a domain token, make sure the hardware queue is * run when one becomes available. Note that this is serialized on * khd->lock, but we still need to be careful about the waker. */ - if (list_empty_careful(&wait->entry)) { - init_waitqueue_func_entry(wait, kyber_domain_wake); - wait->private = hctx; + if (nr < 0 && list_empty_careful(&wait->entry)) { ws = sbq_wait_ptr(domain_tokens, &khd->wait_index[sched_domain]); + khd->domain_ws[sched_domain] = ws; add_wait_queue(&ws->wait, wait); /* * Try again in case a token was freed before we got on the wait - * queue. The waker may have already removed the entry from the - * wait queue, but list_del_init() is okay with that. + * queue. */ nr = __sbitmap_queue_get(domain_tokens); - if (nr >= 0) { - unsigned long flags; + } - spin_lock_irqsave(&ws->wait.lock, flags); - list_del_init(&wait->entry); - spin_unlock_irqrestore(&ws->wait.lock, flags); - } + /* + * If we got a token while we were on the wait queue, remove ourselves + * from the wait queue to ensure that all wake ups make forward + * progress. It's possible that the waker already deleted the entry + * between the !list_empty_careful() check and us grabbing the lock, but + * list_del_init() is okay with that. + */ + if (nr >= 0 && !list_empty_careful(&wait->entry)) { + ws = khd->domain_ws[sched_domain]; + spin_lock_irq(&ws->wait.lock); + list_del_init(&wait->entry); + spin_unlock_irq(&ws->wait.lock); } + return nr; } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 0179e484ec98..c56f211c8440 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -59,6 +59,7 @@ struct deadline_data { int front_merges; spinlock_t lock; + spinlock_t zone_lock; struct list_head dispatch; }; @@ -192,13 +193,83 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) } /* + * For the specified data direction, return the next request to + * dispatch using arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + unsigned long flags; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + if (list_empty(&dd->fifo_list[data_dir])) + return NULL; + + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + goto out; + } + rq = NULL; +out: + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * For the specified data direction, return the next request to + * dispatch using sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + unsigned long flags; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + rq = dd->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + break; + rq = deadline_latter_request(rq); + } + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ -static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +static struct request *__dd_dispatch_request(struct deadline_data *dd) { - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - struct request *rq; + struct request *rq, *next_rq; bool reads, writes; int data_dir; @@ -214,10 +285,9 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) /* * batches are currently reads XOR writes */ - if (dd->next_rq[WRITE]) - rq = dd->next_rq[WRITE]; - else - rq = dd->next_rq[READ]; + rq = deadline_next_request(dd, WRITE); + if (!rq) + rq = deadline_next_request(dd, READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -231,7 +301,8 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) if (reads) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - if (writes && (dd->starved++ >= dd->writes_starved)) + if (deadline_fifo_request(dd, WRITE) && + (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; @@ -260,21 +331,29 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { + next_rq = deadline_next_request(dd, data_dir); + if (deadline_check_fifo(dd, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = deadline_fifo_request(dd, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ - rq = dd->next_rq[data_dir]; + rq = next_rq; } + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return NULL; + dd->batching = 0; dispatch_request: @@ -284,17 +363,27 @@ dispatch_request: dd->batching++; deadline_move_request(dd, rq); done: + /* + * If the request needs its target zone locked, do it. + */ + blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } +/* + * One confusing aspect here is that we get called for a specific + * hardware queue, but we return a request that may not be for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. + */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; spin_lock(&dd->lock); - rq = __dd_dispatch_request(hctx); + rq = __dd_dispatch_request(dd); spin_unlock(&dd->lock); return rq; @@ -339,6 +428,7 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); + spin_lock_init(&dd->zone_lock); INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; @@ -395,6 +485,12 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + if (blk_mq_sched_try_insert_merge(q, rq)) return; @@ -439,6 +535,26 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } +/* + * For zoned block devices, write unlock the target zone of + * completed write requests. Do this while holding the zone lock + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * while deadline_next_request() are executing. + */ +static void dd_completed_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (blk_queue_is_zoned(q)) { + struct deadline_data *dd = q->elevator->elevator_data; + unsigned long flags; + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } +} + static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; @@ -640,6 +756,7 @@ static struct elevator_type mq_deadline = { .ops.mq = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, + .completed_request = dd_completed_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 0af3a3db6fb0..82c44f7df911 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -301,7 +301,9 @@ static void parse_bsd(struct parsed_partitions *state, continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); - if (memcmp(flavour, "bsd\0", 4) == 0) + /* FreeBSD has relative offset if C partition offset is zero */ + if (memcmp(flavour, "bsd\0", 4) == 0 && + le32_to_cpu(l->d_partitions[2].p_offset) == 0) bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index edcfff974527..60b471f8621b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -384,9 +384,10 @@ out_put_request: /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl - * @file: file this ioctl operates on (optional) * @q: request queue to send scsi commands down * @disk: gendisk to operate on (option) + * @mode: mode used to open the file through which the ioctl has been + * submitted * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below @@ -415,10 +416,10 @@ out_put_request: * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status. */ -#define OMAX_SB_LEN 16 /* For backward compatibility */ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, struct scsi_ioctl_command __user *sic) { + enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ struct request *rq; struct scsi_request *req; int err; @@ -692,38 +693,9 @@ int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) if (bd && bd == bd->bd_contains) return 0; - /* Actually none of these is particularly useful on a partition, - * but they are safe. - */ - switch (cmd) { - case SCSI_IOCTL_GET_IDLUN: - case SCSI_IOCTL_GET_BUS_NUMBER: - case SCSI_IOCTL_GET_PCI: - case SCSI_IOCTL_PROBE_HOST: - case SG_GET_VERSION_NUM: - case SG_SET_TIMEOUT: - case SG_GET_TIMEOUT: - case SG_GET_RESERVED_SIZE: - case SG_SET_RESERVED_SIZE: - case SG_EMULATED_HOST: - return 0; - case CDROM_GET_CAPABILITY: - /* Keep this until we remove the printk below. udev sends it - * and we do not want to spam dmesg about it. CD-ROMs do - * not have partitions, so we get here only for disks. - */ - return -ENOIOCTLCMD; - default: - break; - } - if (capable(CAP_SYS_RAWIO)) return 0; - /* In particular, rule out all resets and host-specific ioctls. */ - printk_ratelimited(KERN_WARNING - "%s: sending ioctl %x to a partition!\n", current->comm, cmd); - return -ENOIOCTLCMD; } EXPORT_SYMBOL(scsi_verify_blk_ioctl); |