From 797476b88bde2a6001f9552f383f147e58c1a330 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 18 Oct 2016 15:40:29 +0900 Subject: block: Add 'zoned' queue limit Add the zoned queue limit to indicate the zoning model of a block device. Defined values are 0 (BLK_ZONED_NONE) for regular block devices, 1 (BLK_ZONED_HA) for host-aware zone block devices and 2 (BLK_ZONED_HM) for host-managed zone block devices. The standards defined drive managed model is not defined here since these block devices do not provide any command for accessing zone information. Drive managed model devices will be reported as BLK_ZONED_NONE. The helper functions blk_queue_zoned_model and bdev_zoned_model return the zoned limit and the functions blk_queue_is_zoned and bdev_is_zoned return a boolean for callers to test if a block device is zoned. The zoned attribute is also exported as a string to applications via sysfs. BLK_ZONED_NONE shows as "none", BLK_ZONED_HA as "host-aware" and BLK_ZONED_HM as "host-managed". Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/blk-settings.c | 1 + block/blk-sysfs.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index f679ae122843..b1d5b7fa4d07 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -107,6 +107,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->io_opt = 0; lim->misaligned = 0; lim->cluster = 1; + lim->zoned = BLK_ZONED_NONE; } EXPORT_SYMBOL(blk_set_default_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9cc8d7c5439a..ff9cd9cd35a3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -257,6 +257,18 @@ QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); #undef QUEUE_SYSFS_BIT_FNS +static ssize_t queue_zoned_show(struct request_queue *q, char *page) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + return sprintf(page, "host-aware\n"); + case BLK_ZONED_HM: + return sprintf(page, "host-managed\n"); + default: + return sprintf(page, "none\n"); + } +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -485,6 +497,11 @@ static struct queue_sysfs_entry queue_nonrot_entry = { .store = queue_store_nonrot, }; +static struct queue_sysfs_entry queue_zoned_entry = { + .attr = {.name = "zoned", .mode = S_IRUGO }, + .show = queue_zoned_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR }, .show = queue_nomerges_show, @@ -546,6 +563,7 @@ static struct attribute *default_attrs[] = { &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nonrot_entry.attr, + &queue_zoned_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, -- cgit v1.2.3 From 87caf97cf54b5082e56af241b88a2b8a30d17ef3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Oct 2016 15:40:30 +0900 Subject: blk-sysfs: Add 'chunk_sectors' to sysfs attributes The queue limits already have a 'chunk_sectors' setting, so we should be presenting it via sysfs. Signed-off-by: Hannes Reinecke [Damien: Updated Documentation/ABI/testing/sysfs-block] Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ff9cd9cd35a3..488c2e28feb8 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -130,6 +130,11 @@ static ssize_t queue_physical_block_size_show(struct request_queue *q, char *pag return queue_var_show(queue_physical_block_size(q), page); } +static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.chunk_sectors, page); +} + static ssize_t queue_io_min_show(struct request_queue *q, char *page) { return queue_var_show(queue_io_min(q), page); @@ -455,6 +460,11 @@ static struct queue_sysfs_entry queue_physical_block_size_entry = { .show = queue_physical_block_size_show, }; +static struct queue_sysfs_entry queue_chunk_sectors_entry = { + .attr = {.name = "chunk_sectors", .mode = S_IRUGO }, + .show = queue_chunk_sectors_show, +}; + static struct queue_sysfs_entry queue_io_min_entry = { .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, .show = queue_io_min_show, @@ -555,6 +565,7 @@ static struct attribute *default_attrs[] = { &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, + &queue_chunk_sectors_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, -- cgit v1.2.3 From 987b3b26eb7b19960160505faf9b2f50ae77e14d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Oct 2016 15:40:31 +0900 Subject: block: update chunk_sectors in blk_stack_limits() Signed-off-by: Hannes Reinecke Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/blk-settings.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index b1d5b7fa4d07..55369a65dea2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -631,6 +631,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_granularity; } + if (b->chunk_sectors) + t->chunk_sectors = min_not_zero(t->chunk_sectors, + b->chunk_sectors); + return ret; } EXPORT_SYMBOL(blk_stack_limits); -- cgit v1.2.3 From 2d253440b5afb128d22ccdae812dde9ba77a2cca Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 18 Oct 2016 15:40:32 +0900 Subject: block: Define zoned block device operations Define REQ_OP_ZONE_REPORT and REQ_OP_ZONE_RESET for handling zones of host-managed and host-aware zoned block devices. With with these two new operations, the total number of operations defined reaches 8 and still fits with the 3 bits definition of REQ_OP_BITS. Signed-off-by: Shaun Tancheff Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 14d7c0740dc0..e4eda5d2aa56 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1941,6 +1941,10 @@ generic_make_request_checks(struct bio *bio) case REQ_OP_WRITE_SAME: if (!bdev_write_same(bio->bi_bdev)) goto not_supported; + case REQ_OP_ZONE_REPORT: + case REQ_OP_ZONE_RESET: + if (!bdev_is_zoned(bio->bi_bdev)) + goto not_supported; break; default: break; -- cgit v1.2.3 From 6a0cb1bc106fc07ce0443303bcdb7f7da5131e5c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Oct 2016 15:40:33 +0900 Subject: block: Implement support for zoned block devices Implement zoned block device zone information reporting and reset. Zone information are reported as struct blk_zone. This implementation does not differentiate between host-aware and host-managed device models and is valid for both. Two functions are provided: blkdev_report_zones for discovering the zone configuration of a zoned block device, and blkdev_reset_zones for resetting the write pointer of sequential zones. The helper function blk_queue_zone_size and bdev_zone_size are also provided for, as the name suggest, obtaining the zone size (in 512B sectors) of the zones of the device. Signed-off-by: Hannes Reinecke [Damien: * Removed the zone cache * Implement report zones operation based on earlier proposal by Shaun Tancheff ] Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/Kconfig | 8 ++ block/Makefile | 1 + block/blk-zoned.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 266 insertions(+) create mode 100644 block/blk-zoned.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 1d4d624492fc..6b0ad08f0677 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -89,6 +89,14 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_ZONED + bool "Zoned block device support" + ---help--- + Block layer zoned block device support. This option enables + support for ZAC/ZBC host-managed and host-aware zoned block devices. + + Say yes here if you have a ZAC or ZBC storage device. 
+ config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y diff --git a/block/Makefile b/block/Makefile index 36acdd7545be..934dac73fb37 100644 --- a/block/Makefile +++ b/block/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o +obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o diff --git a/block/blk-zoned.c b/block/blk-zoned.c new file mode 100644 index 000000000000..1603573f9605 --- /dev/null +++ b/block/blk-zoned.c @@ -0,0 +1,257 @@ +/* + * Zoned block device handling + * + * Copyright (c) 2015, Hannes Reinecke + * Copyright (c) 2015, SUSE Linux GmbH + * + * Copyright (c) 2016, Damien Le Moal + * Copyright (c) 2016, Western Digital + */ + +#include +#include +#include +#include + +static inline sector_t blk_zone_start(struct request_queue *q, + sector_t sector) +{ + sector_t zone_mask = blk_queue_zone_size(q) - 1; + + return sector & ~zone_mask; +} + +/* + * Check that a zone report belongs to the partition. + * If yes, fix its start sector and write pointer, copy it in the + * zone information array and return true. Return false otherwise. + */ +static bool blkdev_report_zone(struct block_device *bdev, + struct blk_zone *rep, + struct blk_zone *zone) +{ + sector_t offset = get_start_sect(bdev); + + if (rep->start < offset) + return false; + + rep->start -= offset; + if (rep->start + rep->len > bdev->bd_part->nr_sects) + return false; + + if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) + rep->wp = rep->start + rep->len; + else + rep->wp -= offset; + memcpy(zone, rep, sizeof(struct blk_zone)); + + return true; +} + +/** + * blkdev_report_zones - Get zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @zones: Array of zone structures where to return the zones information + * @nr_zones: Number of zone structures in the zone array + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Get zone information starting from the zone containing @sector. + * The number of zone information reported may be less than the number + * requested by @nr_zones. The number of zones actually reported is + * returned in @nr_zones. + */ +int blkdev_report_zones(struct block_device *bdev, + sector_t sector, + struct blk_zone *zones, + unsigned int *nr_zones, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct blk_zone_report_hdr *hdr; + unsigned int nrz = *nr_zones; + struct page *page; + unsigned int nr_rep; + size_t rep_bytes; + unsigned int nr_pages; + struct bio *bio; + struct bio_vec *bv; + unsigned int i, n, nz; + unsigned int ofst; + void *addr; + int ret = 0; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (!nrz) + return 0; + + if (sector > bdev->bd_part->nr_sects) { + *nr_zones = 0; + return 0; + } + + /* + * The zone report has a header. So make room for it in the + * payload. Also make sure that the report fits in a single BIO + * that will not be split down the stack. 
+ */ + rep_bytes = sizeof(struct blk_zone_report_hdr) + + sizeof(struct blk_zone) * nrz; + rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK; + if (rep_bytes > (queue_max_sectors(q) << 9)) + rep_bytes = queue_max_sectors(q) << 9; + + nr_pages = min_t(unsigned int, BIO_MAX_PAGES, + rep_bytes >> PAGE_SHIFT); + nr_pages = min_t(unsigned int, nr_pages, + queue_max_segments(q)); + + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return -ENOMEM; + + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blk_zone_start(q, sector); + bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); + + for (i = 0; i < nr_pages; i++) { + page = alloc_page(gfp_mask); + if (!page) { + ret = -ENOMEM; + goto out; + } + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + __free_page(page); + break; + } + } + + if (i == 0) + ret = -ENOMEM; + else + ret = submit_bio_wait(bio); + if (ret) + goto out; + + /* + * Process the report result: skip the header and go through the + * reported zones to fixup and fixup the zone information for + * partitions. At the same time, return the zone information into + * the zone array. + */ + n = 0; + nz = 0; + nr_rep = 0; + bio_for_each_segment_all(bv, bio, i) { + + if (!bv->bv_page) + break; + + addr = kmap_atomic(bv->bv_page); + + /* Get header in the first page */ + ofst = 0; + if (!nr_rep) { + hdr = (struct blk_zone_report_hdr *) addr; + nr_rep = hdr->nr_zones; + ofst = sizeof(struct blk_zone_report_hdr); + } + + /* Fixup and report zones */ + while (ofst < bv->bv_len && + n < nr_rep && nz < nrz) { + if (blkdev_report_zone(bdev, addr + ofst, &zones[nz])) + nz++; + ofst += sizeof(struct blk_zone); + n++; + } + + kunmap_atomic(addr); + + if (n >= nr_rep || nz >= nrz) + break; + + } + +out: + bio_for_each_segment_all(bv, bio, i) + __free_page(bv->bv_page); + bio_put(bio); + + if (ret == 0) + *nr_zones = nz; + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_report_zones); + +/** + * blkdev_reset_zones - Reset zones write pointer + * @bdev: Target block device + * @sector: Start sector of the first zone to reset + * @nr_sectors: Number of sectors, at least the length of one zone + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Reset the write pointer of the zones contained in the range + * @sector..@sector+@nr_sectors. Specifying the entire disk sector range + * is valid, but the specified range should not contain conventional zones. 
+ */ +int blkdev_reset_zones(struct block_device *bdev, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t zone_sectors; + sector_t end_sector = sector + nr_sectors; + struct bio *bio; + int ret; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (end_sector > bdev->bd_part->nr_sects) + /* Out of range */ + return -EINVAL; + + /* Check alignment (handle eventual smaller last zone) */ + zone_sectors = blk_queue_zone_size(q); + if (sector & (zone_sectors - 1)) + return -EINVAL; + + if ((nr_sectors & (zone_sectors - 1)) && + end_sector != bdev->bd_part->nr_sects) + return -EINVAL; + + while (sector < end_sector) { + + bio = bio_alloc(gfp_mask, 0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + + ret = submit_bio_wait(bio); + bio_put(bio); + + if (ret) + return ret; + + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + + } + + return 0; +} +EXPORT_SYMBOL_GPL(blkdev_reset_zones); -- cgit v1.2.3 From 3ed05a987e0f63b21e634101e0b460d32f3581c3 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 18 Oct 2016 15:40:35 +0900 Subject: blk-zoned: implement ioctls Adds the new BLKREPORTZONE and BLKRESETZONE ioctls for respectively obtaining the zone configuration of a zoned block device and resetting the write pointer of sequential zones of a zoned block device. The BLKREPORTZONE ioctl maps directly to a single call of the function blkdev_report_zones. The zone information result is passed as an array of struct blk_zone identical to the structure used internally for processing the REQ_OP_ZONE_REPORT operation. The BLKRESETZONE ioctl maps to a single call of the blkdev_reset_zones function. Signed-off-by: Shaun Tancheff Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-zoned.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ block/ioctl.c | 4 +++ 2 files changed, 97 insertions(+) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1603573f9605..667f95d86695 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -255,3 +255,96 @@ int blkdev_reset_zones(struct block_device *bdev, return 0; } EXPORT_SYMBOL_GPL(blkdev_reset_zones); + +/** + * BLKREPORTZONE ioctl processing. + * Called from blkdev_ioctl. 
+ */ +int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_report rep; + struct blk_zone *zones; + int ret; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) + return -EFAULT; + + if (!rep.nr_zones) + return -EINVAL; + + zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) + return -ENOMEM; + + ret = blkdev_report_zones(bdev, rep.sector, + zones, &rep.nr_zones, + GFP_KERNEL); + if (ret) + goto out; + + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { + ret = -EFAULT; + goto out; + } + + if (rep.nr_zones) { + if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, + sizeof(struct blk_zone) * rep.nr_zones)) + ret = -EFAULT; + } + + out: + kfree(zones); + + return ret; +} + +/** + * BLKRESETZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_range zrange; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) + return -EFAULT; + + return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); +} diff --git a/block/ioctl.c b/block/ioctl.c index 755119c3c1b9..f856963204f4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -519,6 +519,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKREPORTZONE: + return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + case BLKRESETZONE: + return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKRAGET: -- cgit v1.2.3 From 3c4da75814c4b8871116940eb32d3a5243026918 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 21 Oct 2016 17:42:33 +0200 Subject: block: zoned: fix harmless maybe-uninitialized warning The blkdev_report_zones produces a harmless warning when -Wmaybe-uninitialized is set, after gcc gets a little confused about the multiple 'goto' here: block/blk-zoned.c: In function 'blkdev_report_zones': block/blk-zoned.c:188:13: error: 'nz' may be used uninitialized in this function [-Werror=maybe-uninitialized] Moving the assignment to nr_zones makes this a little simpler while also avoiding the warning reliably. I'm removing the extraneous initialization of 'int ret' in the same patch, as that is semi-related and could cause an uninitialized use of that variable to not produce a warning. 
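For readers unfamiliar with this class of warning, here is a minimal self-contained sketch in plain user-space C (function and variable names are invented for illustration) of the goto pattern that confuses gcc, and of the fix of assigning the output on the success path, before the cleanup label, as the change below does for *nr_zones:

#include <stdio.h>

/*
 * 'nz' plays the role of nz in blkdev_report_zones(): it is only
 * meaningful when ret == 0, but gcc cannot always prove that. A
 * construct such as "if (ret == 0) *out = nz;" placed after the
 * label may therefore trigger -Wmaybe-uninitialized even though nz
 * is never actually read uninitialized. Assigning *out before the
 * label avoids the warning.
 */
static int report(int fail_early, int *out)
{
	int ret;
	int nz;

	if (fail_early) {
		ret = -1;
		goto out;	/* nz is never assigned on this path */
	}

	nz = 42;
	ret = 0;
	*out = nz;		/* assigned before the label */
out:
	return ret;
}

int main(void)
{
	int v = 0;

	printf("ret=%d v=%d\n", report(0, &v), v);
	printf("ret=%d v=%d\n", report(1, &v), v);
	return 0;
}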
Fixes: 6a0cb1bc106f ("block: Implement support for zoned block devices") Signed-off-by: Arnd Bergmann Reviewed-by: Shaun Tancheff Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-zoned.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 667f95d86695..472211fa183a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -80,7 +80,7 @@ int blkdev_report_zones(struct block_device *bdev, unsigned int i, n, nz; unsigned int ofst; void *addr; - int ret = 0; + int ret; if (!q) return -ENXIO; @@ -179,14 +179,12 @@ int blkdev_report_zones(struct block_device *bdev, } + *nr_zones = nz; out: bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); bio_put(bio); - if (ret == 0) - *nr_zones = nz; - return ret; } EXPORT_SYMBOL_GPL(blkdev_report_zones); -- cgit v1.2.3 From 7dd2fb6877b601a0b9c7284534ca4916cd7ac4bd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Oct 2016 09:49:19 -0600 Subject: blk-mq: update hardware and software queues for sleeping alloc If we end up sleeping due to running out of requests, we should update the hardware and software queues in the map ctx structure. Otherwise we could end up having rq->mq_ctx point to the pre-sleep context, and risk corrupting ctx->rq_list since we'll be grabbing the wrong lock when inserting the request. Reported-by: Dave Jones Reported-by: Chris Mason Tested-by: Chris Mason Fixes: 63581af3f31e ("blk-mq: remove non-blocking pass in blk_mq_map_request") Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ddc2eed64771..f3d27a6dee09 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1217,9 +1217,9 @@ static struct request *blk_mq_map_request(struct request_queue *q, blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - hctx->queued++; - data->hctx = hctx; - data->ctx = ctx; + data->hctx = alloc_data.hctx; + data->ctx = alloc_data.ctx; + data->hctx->queued++; return rq; } -- cgit v1.2.3 From 2552e3f878c2b43b41d7728a328821d8220c28da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Oct 2016 19:03:55 -0600 Subject: blk-mq: get rid of confusing blk_map_ctx structure We can just use struct blk_mq_alloc_data - it has a few more members, but we allocate it further down the stack anyway. So this cleans up the code, and reduces the stack overhead a bit. 
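To make the relationship concrete, here is a rough sketch of the structure involved; the field list is approximated for this series and the name carries a _sketch suffix to make clear it is illustrative rather than the kernel's definition. struct blk_mq_alloc_data already carries the software and hardware queue pointers that the removed blk_map_ctx wrapper duplicated, which is why callers can simply hand the allocation data itself down the stack.

/* Opaque kernel types, forward-declared for the sketch only. */
struct request_queue;
struct blk_mq_ctx;	/* software, per-cpu queue */
struct blk_mq_hw_ctx;	/* hardware dispatch queue */

/* Approximation of struct blk_mq_alloc_data as used in this series. */
struct blk_mq_alloc_data_sketch {
	struct request_queue *q;
	unsigned int flags;
	struct blk_mq_ctx *ctx;		/* what blk_map_ctx.ctx held */
	struct blk_mq_hw_ctx *hctx;	/* what blk_map_ctx.hctx held */
};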
Signed-off-by: Jens Axboe --- block/blk-mq.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f3d27a6dee09..d74a74a9f9ef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1190,21 +1190,15 @@ insert_rq: } } -struct blk_map_ctx { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; -}; - static struct request *blk_mq_map_request(struct request_queue *q, struct bio *bio, - struct blk_map_ctx *data) + struct blk_mq_alloc_data *data) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct request *rq; int op = bio_data_dir(bio); int op_flags = 0; - struct blk_mq_alloc_data alloc_data; blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); @@ -1214,11 +1208,9 @@ static struct request *blk_mq_map_request(struct request_queue *q, op_flags |= REQ_SYNC; trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); + blk_mq_set_alloc_data(data, q, 0, ctx, hctx); + rq = __blk_mq_alloc_request(data, op, op_flags); - data->hctx = alloc_data.hctx; - data->ctx = alloc_data.ctx; data->hctx->queued++; return rq; } @@ -1267,7 +1259,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); - struct blk_map_ctx data; + struct blk_mq_alloc_data data; struct request *rq; unsigned int request_count = 0; struct blk_plug *plug; @@ -1363,7 +1355,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; - struct blk_map_ctx data; + struct blk_mq_alloc_data data; struct request *rq; blk_qc_t cookie; -- cgit v1.2.3 From c4aebd0332da831a3403faf2035af45059ab6b7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:09 +0200 Subject: block: remove bio_is_rw With the addition of the zoned operations the tests in this function became incorrect. But I think it's much better to just open code the allow operations in the only caller anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 63f72f00c72e..5384713d48bc 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -172,7 +172,7 @@ bool bio_integrity_enabled(struct bio *bio) { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - if (!bio_is_rw(bio)) + if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return false; /* Already protected? */ -- cgit v1.2.3 From 8d2bbd4c8236e9e38e6b36ac9e2c54fdcfe5b335 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:12 +0200 Subject: block: replace REQ_THROTTLED with a bio flag It's the last bio-only REQ_* flag, and we have space for it in the bio bi_flags field. 
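The distinction being exploited here, sketched with simplified stand-ins (the sk_ names below are illustrative, not the kernel's): bi_opf is a bitmask that carries the operation plus the REQ_* submission flags and is tested with plain bit operations, whereas bi_flags stores internal per-bio state as bit numbers accessed through bio_flagged(), bio_set_flag() and bio_clear_flag(). That is what lets BIO_THROTTLED move out of the shared REQ_* space without losing anything.

/* Simplified stand-ins for the bi_flags bit helpers; the bit value is
 * made up for the example. */
#define SK_BIO_THROTTLED	10U	/* a bit number, not a mask */

static inline int sk_bio_flagged(unsigned int bi_flags, unsigned int bit)
{
	return (bi_flags & (1U << bit)) != 0;
}

static inline unsigned int sk_bio_set_flag(unsigned int bi_flags, unsigned int bit)
{
	return bi_flags | (1U << bit);
}

static inline unsigned int sk_bio_clear_flag(unsigned int bi_flags, unsigned int bit)
{
	return bi_flags & ~(1U << bit);
}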
Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/blk-throttle.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a3ea8260c94c..a6bb4fe326c3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -818,13 +818,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) tg->io_disp[rw]++; /* - * REQ_THROTTLED is used to prevent the same bio to be throttled + * BIO_THROTTLED is used to prevent the same bio to be throttled * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. */ - if (!(bio->bi_opf & REQ_THROTTLED)) - bio->bi_opf |= REQ_THROTTLED; + if (!bio_flagged(bio, BIO_THROTTLED)) + bio_set_flag(bio, BIO_THROTTLED); } /** @@ -1401,7 +1401,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, WARN_ON_ONCE(!rcu_read_lock_held()); /* see throtl_charge_bio() */ - if ((bio->bi_opf & REQ_THROTTLED) || !tg->has_rules[rw]) + if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) goto out; spin_lock_irq(q->queue_lock); @@ -1480,7 +1480,7 @@ out: * being issued. */ if (!throttled) - bio->bi_opf &= ~REQ_THROTTLED; + bio_clear_flag(bio, BIO_THROTTLED); return throttled; } -- cgit v1.2.3 From e806402130c9c494e22c73ae9ead4e79d2a5811c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:13 +0200 Subject: block: split out request-only flags into a new namespace A lot of the REQ_* flags are only used on struct requests, and only of use to the block layer and a few drivers that dig into struct request internals. This patch adds a new req_flags_t rq_flags field to struct request for them, and thus dramatically shrinks the number of common requests. It also removes the unfortunate situation where we have to fit the fields from the same enum into 32 bits for struct bio and 64 bits for struct request. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/blk-core.c | 71 +++++++++++++++++++++++++++++-------------------------- block/blk-exec.c | 2 +- block/blk-flush.c | 9 +++---- block/blk-map.c | 4 ++-- block/blk-merge.c | 8 +++---- block/blk-mq.c | 19 +++++++-------- block/blk-tag.c | 6 ++--- block/blk.h | 4 ++-- block/elevator.c | 32 ++++++++++++------------- 9 files changed, 78 insertions(+), 77 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e4eda5d2aa56..fd416651a676 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -145,13 +145,13 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (error) bio->bi_error = error; - if (unlikely(rq->cmd_flags & REQ_QUIET)) + if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } @@ -899,7 +899,7 @@ EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_list *rl, struct request *rq) { - if (rq->cmd_flags & REQ_ELVPRIV) { + if (rq->rq_flags & RQF_ELVPRIV) { elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); @@ -961,14 +961,14 @@ static void __freed_request(struct request_list *rl, int sync) * A request has just been released. 
Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_list *rl, int op, unsigned int flags) +static void freed_request(struct request_list *rl, bool sync, + req_flags_t rq_flags) { struct request_queue *q = rl->q; - int sync = rw_is_sync(op, flags); q->nr_rqs[sync]--; rl->count[sync]--; - if (flags & REQ_ELVPRIV) + if (rq_flags & RQF_ELVPRIV) q->nr_rqs_elvpriv--; __freed_request(rl, sync); @@ -1079,6 +1079,7 @@ static struct request *__get_request(struct request_list *rl, int op, struct io_cq *icq = NULL; const bool is_sync = rw_is_sync(op, op_flags) != 0; int may_queue; + req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); @@ -1127,7 +1128,7 @@ static struct request *__get_request(struct request_list *rl, int op, /* * Decide whether the new request will be managed by elevator. If - * so, mark @op_flags and increment elvpriv. Non-zero elvpriv will + * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will * prevent the current elevator from being destroyed until the new * request is freed. This guarantees icq's won't be destroyed and * makes creating new ones safe. @@ -1136,14 +1137,14 @@ static struct request *__get_request(struct request_list *rl, int op, * it will be created after releasing queue_lock. */ if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { - op_flags |= REQ_ELVPRIV; + rq_flags |= RQF_ELVPRIV; q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; + rq_flags |= RQF_IO_STAT; spin_unlock_irq(q->queue_lock); /* allocate and init request */ @@ -1153,10 +1154,11 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags | REQ_ALLOCED); + req_set_op_attrs(rq, op, op_flags); + rq->rq_flags = rq_flags; /* init elvpriv */ - if (op_flags & REQ_ELVPRIV) { + if (rq_flags & RQF_ELVPRIV) { if (unlikely(et->icq_cache && !icq)) { if (ioc) icq = ioc_create_icq(ioc, q, gfp_mask); @@ -1195,7 +1197,7 @@ fail_elvpriv: printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", __func__, dev_name(q->backing_dev_info.dev)); - rq->cmd_flags &= ~REQ_ELVPRIV; + rq->rq_flags &= ~RQF_ELVPRIV; rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); @@ -1212,7 +1214,7 @@ fail_alloc: * queue, but this is pretty rare. 
*/ spin_lock_irq(q->queue_lock); - freed_request(rl, op, op_flags); + freed_request(rl, is_sync, rq_flags); /* * in the very unlikely event that allocation failed and no @@ -1347,7 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - if (rq->cmd_flags & REQ_QUEUED) + if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); @@ -1409,7 +1411,7 @@ EXPORT_SYMBOL_GPL(part_round_stats); #ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) + if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending) pm_runtime_mark_last_busy(rq->q->dev); } #else @@ -1421,6 +1423,8 @@ static inline void blk_pm_put_request(struct request *rq) {} */ void __blk_put_request(struct request_queue *q, struct request *req) { + req_flags_t rq_flags = req->rq_flags; + if (unlikely(!q)) return; @@ -1440,16 +1444,15 @@ void __blk_put_request(struct request_queue *q, struct request *req) * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools */ - if (req->cmd_flags & REQ_ALLOCED) { - unsigned int flags = req->cmd_flags; - int op = req_op(req); + if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); + bool sync = rw_is_sync(req_op(req), req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); blk_free_request(rl, req); - freed_request(rl, op, flags); + freed_request(rl, sync, rq_flags); blk_put_rl(rl); } } @@ -2214,7 +2217,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) unsigned int bytes = 0; struct bio *bio; - if (!(rq->cmd_flags & REQ_MIXED_MERGE)) + if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* @@ -2257,7 +2260,7 @@ void blk_account_io_done(struct request *req) * normal IO on queueing nor completion. Accounting the * containing request is enough. */ - if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { + if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -2285,7 +2288,7 @@ static struct request *blk_pm_peek_request(struct request_queue *q, struct request *rq) { if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM)))) + (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) return NULL; else return rq; @@ -2361,13 +2364,13 @@ struct request *blk_peek_request(struct request_queue *q) if (!rq) break; - if (!(rq->cmd_flags & REQ_STARTED)) { + if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver * sees this request (possibly after * requeueing). Notify IO scheduler. */ - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_activate_rq(q, rq); /* @@ -2375,7 +2378,7 @@ struct request *blk_peek_request(struct request_queue *q) * it, a request that has been delayed should * not be passed by new incoming requests */ - rq->cmd_flags |= REQ_STARTED; + rq->rq_flags |= RQF_STARTED; trace_block_rq_issue(q, rq); } @@ -2384,7 +2387,7 @@ struct request *blk_peek_request(struct request_queue *q) q->boundary_rq = NULL; } - if (rq->cmd_flags & REQ_DONTPREP) + if (rq->rq_flags & RQF_DONTPREP) break; if (q->dma_drain_size && blk_rq_bytes(rq)) { @@ -2407,11 +2410,11 @@ struct request *blk_peek_request(struct request_queue *q) /* * the request may have been (partially) prepped. 
* we need to keep this request in the front to - * avoid resource deadlock. REQ_STARTED will + * avoid resource deadlock. RQF_STARTED will * prevent other fs requests from passing this one. */ if (q->dma_drain_size && blk_rq_bytes(rq) && - !(rq->cmd_flags & REQ_DONTPREP)) { + !(rq->rq_flags & RQF_DONTPREP)) { /* * remove the space for the drain we added * so that we don't add it again @@ -2424,7 +2427,7 @@ struct request *blk_peek_request(struct request_queue *q) } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. @@ -2561,7 +2564,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) req->errors = 0; if (error && req->cmd_type == REQ_TYPE_FS && - !(req->cmd_flags & REQ_QUIET)) { + !(req->rq_flags & RQF_QUIET)) { char *error_type; switch (error) { @@ -2634,7 +2637,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ - if (req->cmd_flags & REQ_MIXED_MERGE) { + if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } @@ -2687,7 +2690,7 @@ void blk_unprep_request(struct request *req) { struct request_queue *q = req->q; - req->cmd_flags &= ~REQ_DONTPREP; + req->rq_flags &= ~RQF_DONTPREP; if (q->unprep_rq_fn) q->unprep_rq_fn(q, req); } @@ -2698,7 +2701,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ void blk_finish_request(struct request *req, int error) { - if (req->cmd_flags & REQ_QUEUED) + if (req->rq_flags & RQF_QUEUED) blk_queue_end_tag(req->q, req); BUG_ON(blk_queued_rq(req)); @@ -2708,7 +2711,7 @@ void blk_finish_request(struct request *req, int error) blk_delete_timer(req); - if (req->cmd_flags & REQ_DONTPREP) + if (req->rq_flags & RQF_DONTPREP) blk_unprep_request(req); blk_account_io_done(req); diff --git a/block/blk-exec.c b/block/blk-exec.c index 7ea04325d02f..3ecb00a6cf45 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -72,7 +72,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; rq->errors = -ENXIO; __blk_end_request_all(rq, rq->errors); spin_unlock_irq(q->queue_lock); diff --git a/block/blk-flush.c b/block/blk-flush.c index 6a14b68b9135..3990b9cfbda5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -56,7 +56,7 @@ * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole - * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). 
* * The above peculiarity requires that each FLUSH/FUA request has only one @@ -127,7 +127,7 @@ static void blk_flush_restore_request(struct request *rq) rq->bio = rq->biotail; /* make @rq a normal request */ - rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } @@ -330,7 +330,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ); + req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH); + flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; @@ -433,7 +434,7 @@ void blk_insert_flush(struct request *rq) */ memset(&rq->flush, 0, sizeof(rq->flush)); INIT_LIST_HEAD(&rq->flush.list); - rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ if (q->mq_ops) { rq->end_io = mq_flush_data_end_io; diff --git a/block/blk-map.c b/block/blk-map.c index b8657fa8dc9a..2c5ae5fef473 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -135,7 +135,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; return 0; unmap_rq: @@ -232,7 +232,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); if (do_copy) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 2642e5fc8b69..fda6a12fc776 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -456,7 +456,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); - if (unlikely(rq->cmd_flags & REQ_COPY_USER) && + if (unlikely(rq->rq_flags & RQF_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; @@ -634,7 +634,7 @@ void blk_rq_set_mixed_merge(struct request *rq) unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; - if (rq->cmd_flags & REQ_MIXED_MERGE) + if (rq->rq_flags & RQF_MIXED_MERGE) return; /* @@ -647,7 +647,7 @@ void blk_rq_set_mixed_merge(struct request *rq) (bio->bi_opf & REQ_FAILFAST_MASK) != ff); bio->bi_opf |= ff; } - rq->cmd_flags |= REQ_MIXED_MERGE; + rq->rq_flags |= RQF_MIXED_MERGE; } static void blk_account_io_merge(struct request *req) @@ -709,7 +709,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, * makes sure that all involved bios have mixable attributes * set properly. 
*/ - if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) || (req->cmd_flags & REQ_FAILFAST_MASK) != (next->cmd_flags & REQ_FAILFAST_MASK)) { blk_rq_set_mixed_merge(req); diff --git a/block/blk-mq.c b/block/blk-mq.c index d74a74a9f9ef..b49c6658eb05 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -142,14 +142,13 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, struct request *rq, int op, unsigned int op_flags) { - if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; req_set_op_attrs(rq, op, op_flags); + if (blk_queue_io_stat(q)) + rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; INIT_HLIST_NODE(&rq->hash); @@ -198,7 +197,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) rq = data->hctx->tags->rqs[tag]; if (blk_mq_tag_busy(data->hctx)) { - rq->cmd_flags = REQ_MQ_INFLIGHT; + rq->rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } @@ -298,9 +297,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - if (rq->cmd_flags & REQ_MQ_INFLIGHT) + if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); - rq->cmd_flags = 0; + rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx, ctx, tag); @@ -489,10 +488,10 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irqrestore(&q->requeue_lock, flags); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + if (!(rq->rq_flags & RQF_SOFTBARRIER)) continue; - rq->cmd_flags &= ~REQ_SOFTBARRIER; + rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); blk_mq_insert_request(rq, true, false, false); } @@ -519,11 +518,11 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) * We abuse this flag that is otherwise used by the I/O scheduler to * request head insertation from the workqueue. 
*/ - BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); spin_lock_irqsave(&q->requeue_lock, flags); if (at_head) { - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->requeue_list); } else { list_add_tail(&rq->queuelist, &q->requeue_list); diff --git a/block/blk-tag.c b/block/blk-tag.c index f0344e6939d5..bae1decb6ec3 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -270,7 +270,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); - rq->cmd_flags &= ~REQ_QUEUED; + rq->rq_flags &= ~RQF_QUEUED; rq->tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) @@ -316,7 +316,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) unsigned max_depth; int tag; - if (unlikely((rq->cmd_flags & REQ_QUEUED))) { + if (unlikely((rq->rq_flags & RQF_QUEUED))) { printk(KERN_ERR "%s: request %p for device [%s] already tagged %d", __func__, rq, @@ -371,7 +371,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) */ bqt->next_tag = (tag + 1) % bqt->max_depth; - rq->cmd_flags |= REQ_QUEUED; + rq->rq_flags |= RQF_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; blk_start_request(rq); diff --git a/block/blk.h b/block/blk.h index 74444c49078f..aa132dea598c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -130,7 +130,7 @@ static inline void blk_clear_rq_complete(struct request *rq) /* * Internal elevator interface */ -#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED) +#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) void blk_insert_flush(struct request *rq); @@ -247,7 +247,7 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int); static inline int blk_do_io_stat(struct request *rq) { return rq->rq_disk && - (rq->cmd_flags & REQ_IO_STAT) && + (rq->rq_flags & RQF_IO_STAT) && (rq->cmd_type == REQ_TYPE_FS); } diff --git a/block/elevator.c b/block/elevator.c index f7d973a56fd7..ac80f89a0842 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -245,7 +245,7 @@ EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { hash_del(&rq->hash); - rq->cmd_flags &= ~REQ_HASHED; + rq->rq_flags &= ~RQF_HASHED; } static void elv_rqhash_del(struct request_queue *q, struct request *rq) @@ -260,7 +260,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq) BUG_ON(ELV_ON_HASH(rq)); hash_add(e->hash, &rq->hash, rq_hash_key(rq)); - rq->cmd_flags |= REQ_HASHED; + rq->rq_flags |= RQF_HASHED; } static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) @@ -352,7 +352,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) { sector_t boundary; struct list_head *entry; - int stop_flags; if (q->last_merge == rq) q->last_merge = NULL; @@ -362,7 +361,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -370,7 +368,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; - if (pos->cmd_flags & stop_flags) + if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) break; if (blk_rq_pos(rq) >= boundary) { if (blk_rq_pos(pos) < boundary) @@ -510,7 +508,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e 
= q->elevator; - const int next_sorted = next->cmd_flags & REQ_SORTED; + const int next_sorted = next->rq_flags & RQF_SORTED; if (next_sorted && e->type->ops.elevator_merge_req_fn) e->type->ops.elevator_merge_req_fn(q, rq, next); @@ -537,13 +535,13 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, #ifdef CONFIG_PM static void blk_pm_requeue_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM)) + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) rq->q->nr_pending--; } static void blk_pm_add_request(struct request_queue *q, struct request *rq) { - if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 && + if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 && (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING)) pm_request_resume(q->dev); } @@ -563,11 +561,11 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_deactivate_rq(q, rq); } - rq->cmd_flags &= ~REQ_STARTED; + rq->rq_flags &= ~RQF_STARTED; blk_pm_requeue_request(rq); @@ -597,13 +595,13 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) rq->q = q; - if (rq->cmd_flags & REQ_SOFTBARRIER) { + if (rq->rq_flags & RQF_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } - } else if (!(rq->cmd_flags & REQ_ELVPRIV) && + } else if (!(rq->rq_flags & RQF_ELVPRIV) && (where == ELEVATOR_INSERT_SORT || where == ELEVATOR_INSERT_SORT_MERGE)) where = ELEVATOR_INSERT_BACK; @@ -611,12 +609,12 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) switch (where) { case ELEVATOR_INSERT_REQUEUE: case ELEVATOR_INSERT_FRONT: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_BACK: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; elv_drain_elevator(q); list_add_tail(&rq->queuelist, &q->queue_head); /* @@ -642,7 +640,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) break; case ELEVATOR_INSERT_SORT: BUG_ON(rq->cmd_type != REQ_TYPE_FS); - rq->cmd_flags |= REQ_SORTED; + rq->rq_flags |= RQF_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -659,7 +657,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) break; case ELEVATOR_INSERT_FLUSH: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; blk_insert_flush(rq); break; default: @@ -735,7 +733,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if ((rq->cmd_flags & REQ_SORTED) && + if ((rq->rq_flags & RQF_SORTED) && e->type->ops.elevator_completed_req_fn) e->type->ops.elevator_completed_req_fn(q, rq); } -- cgit v1.2.3 From ef295ecf090d3e86e5b742fc6ab34f1122a43773 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2016 08:48:16 -0600 Subject: block: better op and flags encoding Now that we don't need the common flags to overflow outside the range of a 32-bit type we can encode them the same way for both the bio and request fields. This in addition allows us to place the operation first (and make some room for more ops while we're at it) and to stop having to shift around the operation values. 
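To make the new layout concrete, a minimal sketch of the single-field encoding described above (the sk_ names and exact bit split are illustrative; the real REQ_OP_BITS/REQ_OP_MASK constants and the bio_op()/req_op() helpers live in the kernel headers):

/*
 * One 32-bit word per bio/request:
 *   low bits  : the operation (read, write, discard, zone report, ...)
 *   high bits : REQ_* flags (REQ_SYNC, REQ_META, REQ_FUA, ...)
 */
#define SK_REQ_OP_BITS	8
#define SK_REQ_OP_MASK	((1U << SK_REQ_OP_BITS) - 1)

static inline unsigned int sk_op(unsigned int op_and_flags)
{
	return op_and_flags & SK_REQ_OP_MASK;	/* cf. bio_op()/req_op() */
}

static inline unsigned int sk_flags(unsigned int op_and_flags)
{
	return op_and_flags & ~SK_REQ_OP_MASK;
}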
In addition this allows passing around only one value in the block layer instead of two (and eventuall also in the file systems, but we can do that later) and thus clean up a lot of code. Last but not least this allows decreasing the size of the cmd_flags field in struct request to 32-bits. Various functions passing this value could also be updated, but I'd like to avoid the churn for now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 60 ++++++++++++++++-------------------------------- block/blk-flush.c | 2 +- block/blk-lib.c | 2 +- block/blk-map.c | 2 ++ block/blk-mq.c | 28 +++++++++-------------- block/cfq-iosched.c | 66 ++++++++++++++++++++++++++--------------------------- block/elevator.c | 4 ++-- 7 files changed, 69 insertions(+), 95 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index fd416651a676..0bfaa54d3e9f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1056,8 +1056,7 @@ static struct io_context *rq_ioc(struct bio *bio) /** * __get_request - get a free request * @rl: request list to allocate from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1068,23 +1067,22 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ -static struct request *__get_request(struct request_list *rl, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); int may_queue; req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); - may_queue = elv_may_queue(q, op, op_flags); + may_queue = elv_may_queue(q, op); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -1154,7 +1152,7 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; rq->rq_flags = rq_flags; /* init elvpriv */ @@ -1232,8 +1230,7 @@ rq_starved: /** * get_request - get a free request * @q: request_queue to allocate request from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1244,18 +1241,17 @@ rq_starved: * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. 
*/ -static struct request *get_request(struct request_queue *q, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *get_request(struct request_queue *q, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, op_flags, bio, gfp_mask); + rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; @@ -1297,7 +1293,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, 0, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1446,7 +1442,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) */ if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); - bool sync = rw_is_sync(req_op(req), req->cmd_flags); + bool sync = op_is_sync(req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); @@ -1652,8 +1648,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; - - req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK; if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; @@ -1665,9 +1659,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { - const bool sync = !!(bio->bi_opf & REQ_SYNC); struct blk_plug *plug; - int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; + int el_ret, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; @@ -1722,24 +1715,11 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the - * rq allocator and io schedulers. - */ - if (sync) - rw_flags |= REQ_SYNC; - - /* - * Add in META/PRIO flags, if set, before we get to the IO scheduler - */ - rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO)); - /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. 
*/ - req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { bio->bi_error = PTR_ERR(req); bio_endio(bio); @@ -2946,8 +2926,6 @@ EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - req_set_op(rq, bio_op(bio)); - if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -3031,8 +3009,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - req_set_op_attrs(dst, req_op(src), - (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE); + dst->cmd_flags = src->cmd_flags | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -3537,8 +3514,11 @@ EXPORT_SYMBOL(blk_set_runtime_active); int __init blk_dev_init(void) { - BUILD_BUG_ON(__REQ_NR_BITS > 8 * + BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + FIELD_SIZEOF(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", diff --git a/block/blk-flush.c b/block/blk-flush.c index 3990b9cfbda5..95f1d4d357df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH); + flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH; flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; diff --git a/block/blk-lib.c b/block/blk-lib.c index 46fe9248410d..18abda862915 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; - enum req_op op; + unsigned int op; int alignment; sector_t bs_mask; diff --git a/block/blk-map.c b/block/blk-map.c index 2c5ae5fef473..0173a72a8aa9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -16,6 +16,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) { if (!rq->bio) { + rq->cmd_flags &= REQ_OP_MASK; + rq->cmd_flags |= (bio->bi_opf & REQ_OP_MASK); blk_rq_bio_prep(rq->q, rq, bio); } else { if (!ll_back_merge_fn(rq->q, rq, bio)) diff --git a/block/blk-mq.c b/block/blk-mq.c index b49c6658eb05..2da1a0ee3318 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -139,14 +139,13 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) EXPORT_SYMBOL(blk_mq_can_queue); static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, int op, - unsigned int op_flags) + struct request *rq, unsigned int op) { INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; if (blk_queue_io_stat(q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ @@ -183,11 +182,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; + ctx->rq_dispatched[op_is_sync(op)]++; } static struct 
request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) +__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) { struct request *rq; unsigned int tag; @@ -202,7 +201,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) } rq->tag = tag; - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); + blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } @@ -225,7 +224,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); blk_mq_put_ctx(ctx); if (!rq) { @@ -277,7 +276,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); if (!rq) { ret = -EWOULDBLOCK; goto out_queue_exit; @@ -1196,19 +1195,14 @@ static struct request *blk_mq_map_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct request *rq; - int op = bio_data_dir(bio); - int op_flags = 0; blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); - if (rw_is_sync(bio_op(bio), bio->bi_opf)) - op_flags |= REQ_SYNC; - - trace_block_getrq(q, bio, op); + trace_block_getrq(q, bio, bio->bi_opf); blk_mq_set_alloc_data(data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(data, op, op_flags); + rq = __blk_mq_alloc_request(data, bio->bi_opf); data->hctx->queued++; return rq; @@ -1256,7 +1250,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_mq_alloc_data data; struct request *rq; @@ -1350,7 +1344,7 @@ done: */ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5e24d880306c..c96186adaa66 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -667,10 +667,10 @@ static inline void cfqg_put(struct cfq_group *cfqg) } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, - int op_flags) + struct cfq_group *curr_cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.queued, op, 1); cfqg_stats_end_empty_time(&cfqg->stats); cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); } @@ -684,30 +684,29 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, #endif } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1); + blkg_rwstat_add(&cfqg->stats.queued, op, -1); } -static inline void 
cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.merged, op, 1); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { struct cfqg_stats *stats = &cfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, op_flags, + blkg_rwstat_add(&stats->wait_time, op, io_start_time - start_time); } @@ -786,16 +785,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, int op_flags) { } + struct cfq_group *curr_cfqg, unsigned int op) { } static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) { } + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -2474,10 +2473,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - req_op(rq), rq->cmd_flags); + rq->cmd_flags); } static struct request * @@ -2530,7 +2529,7 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; @@ -2565,7 +2564,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); + cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); } static void @@ -2588,7 +2587,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); cfqq = RQ_CFQQ(next); /* @@ -4142,7 +4141,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) 
rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq), + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, rq->cmd_flags); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -4240,8 +4239,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); + rq_io_start_time_ns(rq), rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -4319,14 +4317,14 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags) +static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) { /* * If REQ_PRIO is set, boost class and prio level, if it's below * BE/NORM. If prio is not set, restore the potentially boosted * class/prio level. */ - if (!(op_flags & REQ_PRIO)) { + if (!(op & REQ_PRIO)) { cfqq->ioprio_class = cfqq->org_ioprio_class; cfqq->ioprio = cfqq->org_ioprio; } else { @@ -4347,7 +4345,7 @@ static inline int __cfq_may_queue(struct cfq_queue *cfqq) return ELV_MQUEUE_MAY; } -static int cfq_may_queue(struct request_queue *q, int op, int op_flags) +static int cfq_may_queue(struct request_queue *q, unsigned int op) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -4364,10 +4362,10 @@ static int cfq_may_queue(struct request_queue *q, int op, int op_flags) if (!cic) return ELV_MQUEUE_MAY; - cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags)); + cfqq = cic_to_cfqq(cic, op_is_sync(op)); if (cfqq) { cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op_flags); + cfqq_boost_on_prio(cfqq, op); return __cfq_may_queue(cfqq); } diff --git a/block/elevator.c b/block/elevator.c index ac80f89a0842..a18a5db274e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -714,12 +714,12 @@ void elv_put_request(struct request_queue *q, struct request *rq) e->type->ops.elevator_put_req_fn(rq); } -int elv_may_queue(struct request_queue *q, int op, int op_flags) +int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_may_queue_fn) - return e->type->ops.elevator_may_queue_fn(q, op, op_flags); + return e->type->ops.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } -- cgit v1.2.3 From aa39ebd404423e62f74cfd3e27e9ffe7e38b2a25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:02 -0600 Subject: cfq-iosched: use op_is_sync instead of opencoding it Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c96186adaa66..f28db97c3fe0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -911,15 +911,6 @@ static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) return cic->icq.q->elevator->elevator_data; } -/* - * We regard a request as SYNC, if it's either a read or has the SYNC bit - * set (in which case it could also be direct WRITE). 
- */ -static inline bool cfq_bio_sync(struct bio *bio) -{ - return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC); -} - /* * scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing @@ -2490,7 +2481,7 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) if (!cic) return NULL; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf)); if (cfqq) return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); @@ -2604,13 +2595,14 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct cfq_data *cfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); struct cfq_io_cq *cic; struct cfq_queue *cfqq; /* * Disallow merge of a sync bio into an async request. */ - if (cfq_bio_sync(bio) && !rq_is_sync(rq)) + if (is_sync && !rq_is_sync(rq)) return false; /* @@ -2621,7 +2613,7 @@ static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, if (!cic) return false; - cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); + cfqq = cic_to_cfqq(cic, is_sync); return cfqq == RQ_CFQQ(rq); } -- cgit v1.2.3 From a2b809672ee6fcb4d5756ea815725b3dbaea654e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:09 -0600 Subject: block: replace REQ_NOIDLE with REQ_IDLE Noidle should be the default for writes as seen by all the compounds definitions in fs.h using it. In fact only direct I/O really should be using NODILE, so turn the whole flag around to get the defaults right, which will make our life much easier especially onces the WRITE_* defines go away. This assumes all the existing "raw" users of REQ_SYNC for writes want noidle behavior, which seems to be spot on from a quick audit. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f28db97c3fe0..dcbed8c9c82c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3914,6 +3914,12 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } +static inline bool req_noidle(struct request *req) +{ + return req_op(req) == REQ_OP_WRITE && + (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC; +} + /* * Disable idle window if the process thinks too long or seeks so much that * it doesn't matter @@ -3935,7 +3941,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq->queued[0] + cfqq->queued[1] >= 4) cfq_mark_cfqq_deep(cfqq); - if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + if (cfqq->next_rq && req_noidle(cfqq->next_rq)) enable_idle = 0; else if (!atomic_read(&cic->icq.ioc->active_ref) || !cfqd->cfq_slice_idle || @@ -4220,8 +4226,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) const int sync = rq_is_sync(rq); u64 now = ktime_get_ns(); - cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", - !!(rq->cmd_flags & REQ_NOIDLE)); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq)); cfq_update_hw_tag(cfqd); -- cgit v1.2.3 From 70fd76140a6cb63262bd47b68d57b42e889c10ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:10 -0600 Subject: block,fs: use REQ_* flags directly Remove the WRITE_* and READ_SYNC wrappers, and just use the flags directly. Where applicable this also drops usage of the bio_set_op_attrs wrapper. 
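For out-of-tree callers doing the same conversion, the change is mechanical: compose the operation and the REQ_* flags directly in bi_opf instead of going through the wrapper macros. A minimal sketch (the two helpers below are made up for illustration; only the flags, bi_opf and submit_bio() come from the interfaces this series touches):

#include <linux/bio.h>
#include <linux/blk_types.h>

/* was: bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); */
static void submit_sync_read(struct bio *bio)
{
	bio->bi_opf = REQ_OP_READ | REQ_SYNC;
	submit_bio(bio);
}

/* was: bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); */
static void submit_preflush_write(struct bio *bio)
{
	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	submit_bio(bio);
}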
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 95f1d4d357df..d35beca18481 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH; + flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; @@ -486,7 +486,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); -- cgit v1.2.3 From 2cefe4dbaadf83b236caab46705b4b5a4958e3b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Oct 2016 11:59:24 -0600 Subject: block: add bio_iov_iter_get_pages() This is a helper that pins down a range from an iov_iter and adds it to a bio without requiring a separate memory allocation for the page array. It will be used for upcoming direct I/O implementations for block devices and iomap based file systems. Signed-off-by: Kent Overstreet [hch: ported to the iov_iter interface, renamed and added comments. All blame should be directed to me and all fame should go to Kent after this!] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index db85c5753a76..2cf6ebabc68c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -847,6 +847,55 @@ done: } EXPORT_SYMBOL(bio_add_page); +/** + * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * @bio: bio to add pages to + * @iter: iov iterator describing the region to be mapped + * + * Pins as many pages from *iter and appends them to @bio's bvec array. The + * pages will have to be released using put_page() when done. + */ +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + size_t offset, diff; + ssize_t size; + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE; + + /* + * Deep magic below: We need to walk the pinned pages backwards + * because we are abusing the space allocated for the bio_vecs + * for the page array. Because the bio_vecs are larger than the + * page pointers by definition this will always work. But it also + * means we can't use bio_add_page, so any changes to it's semantics + * need to be reflected here as well. 
+ */ + bio->bi_iter.bi_size += size; + bio->bi_vcnt += nr_pages; + + diff = (nr_pages * PAGE_SIZE - offset) - size; + while (nr_pages--) { + bv[nr_pages].bv_page = pages[nr_pages]; + bv[nr_pages].bv_len = PAGE_SIZE; + bv[nr_pages].bv_offset = 0; + } + + bv[0].bv_offset += offset; + bv[0].bv_len -= offset; + if (diff) + bv[bio->bi_vcnt - 1].bv_len -= diff; + + iov_iter_advance(iter, size); + return 0; +} +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); + struct submit_bio_ret { struct completion event; int error; -- cgit v1.2.3 From bc27c01b5c46d3bfec42c96537c7a3fae0bb2cc4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:18:48 -0700 Subject: blk-mq: Do not invoke .queue_rq() for a stopped queue The meaning of the BLK_MQ_S_STOPPED flag is "do not call .queue_rq()". Hence modify blk_mq_make_request() such that requests are queued instead of issued if a queue has been stopped. Reported-by: Ming Lei Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Cc: Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 2da1a0ee3318..92edd28b7e76 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1317,9 +1317,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); if (!old_rq) goto done; - if (!blk_mq_direct_issue_request(old_rq, &cookie)) - goto done; - blk_mq_insert_request(old_rq, false, true, true); + if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) || + blk_mq_direct_issue_request(old_rq, &cookie) != 0) + blk_mq_insert_request(old_rq, false, true, true); goto done; } -- cgit v1.2.3 From 5d1b25c1ecabb37f8eb58c8e9dd74f77f703e5d9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:19:15 -0700 Subject: blk-mq: Introduce blk_mq_hctx_stopped() Multiple functions test the BLK_MQ_S_STOPPED bit so introduce a helper function that performs this test. 
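The conversion at each call site is a one-line change; a sketch of the pattern (my_kick_hctx() is a hypothetical internal caller, the real conversions are in the diff below):

#include "blk-mq.h"	/* block-layer internal header that gains the helper */

static void my_kick_hctx(struct blk_mq_hw_ctx *hctx)
{
	/* was: if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return; */
	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	/* ... dispatch work for this hardware context ... */
}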
Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ block/blk-mq.h | 5 +++++ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 92edd28b7e76..2864e191cc86 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -785,7 +785,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) struct list_head *dptr; int queued; - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (unlikely(blk_mq_hctx_stopped(hctx))) return; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && @@ -910,8 +910,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || - !blk_mq_hw_queue_mapped(hctx))) + if (unlikely(blk_mq_hctx_stopped(hctx) || + !blk_mq_hw_queue_mapped(hctx))) return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { @@ -936,7 +936,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) queue_for_each_hw_ctx(q, hctx, i) { if ((!blk_mq_hctx_has_pending(hctx) && list_empty_careful(&hctx->dispatch)) || - test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + blk_mq_hctx_stopped(hctx)) continue; blk_mq_run_hw_queue(hctx, async); @@ -986,7 +986,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) + if (!blk_mq_hctx_stopped(hctx)) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); @@ -1317,7 +1317,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); if (!old_rq) goto done; - if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) || + if (blk_mq_hctx_stopped(data.hctx) || blk_mq_direct_issue_request(old_rq, &cookie) != 0) blk_mq_insert_request(old_rq, false, true, true); goto done; diff --git a/block/blk-mq.h b/block/blk-mq.h index e5d25249028c..ac772dac7ce8 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -100,6 +100,11 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_STOPPED, &hctx->state); +} + static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; -- cgit v1.2.3 From fd00144301d64f1742541a3c5e64cd1c51f39c55 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:19:37 -0700 Subject: blk-mq: Introduce blk_mq_queue_stopped() The function blk_queue_stopped() allows to test whether or not a traditional request queue has been stopped. Introduce a helper function that allows block drivers to query easily whether or not one or more hardware contexts of a blk-mq queue have been stopped. 
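A driver-side usage sketch, assuming a recovery path that only wants to restart I/O when it had previously stopped the hardware queues (my_recover_queue() is hypothetical; blk_mq_queue_stopped() and blk_mq_start_stopped_hw_queues() are the real interfaces):

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

static void my_recover_queue(struct request_queue *q)
{
	/*
	 * Per the kerneldoc below, the caller must serialize this
	 * against blk_mq_{start,stop}_hw_queue().
	 */
	if (blk_mq_queue_stopped(q))
		blk_mq_start_stopped_hw_queues(q, true);
}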
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 2864e191cc86..28bf667bfe09 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -944,6 +944,26 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) } EXPORT_SYMBOL(blk_mq_run_hw_queues); +/** + * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped + * @q: request queue. + * + * The caller is responsible for serializing this function against + * blk_mq_{start,stop}_hw_queue(). + */ +bool blk_mq_queue_stopped(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + if (blk_mq_hctx_stopped(hctx)) + return true; + + return false; +} +EXPORT_SYMBOL(blk_mq_queue_stopped); + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_work(&hctx->run_work); -- cgit v1.2.3 From 2253efc850c4cf690516bbc07854eeb1077202ba Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:20:02 -0700 Subject: blk-mq: Move more code into blk_mq_direct_issue_request() Move the "hctx stopped" test and the insert request calls into blk_mq_direct_issue_request(). Rename that function into blk_mq_try_issue_directly() to reflect its new semantics. Pass the hctx pointer to that function instead of looking it up a second time. These changes avoid that code has to be duplicated in the next patch. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 28bf667bfe09..447c37f39e32 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1228,11 +1228,11 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } -static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) +static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_qc_t *cookie) { int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, @@ -1240,6 +1240,9 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) }; blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + if (blk_mq_hctx_stopped(hctx)) + goto insert; + /* * For OK queue, we are done. For error, kill it. 
Any other * error (busy), just add it to our list as we previously @@ -1248,7 +1251,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_MQ_RQ_QUEUE_OK) { *cookie = new_cookie; - return 0; + return; } __blk_mq_requeue_request(rq); @@ -1257,10 +1260,11 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) *cookie = BLK_QC_T_NONE; rq->errors = -EIO; blk_mq_end_request(rq, rq->errors); - return 0; + return; } - return -1; +insert: + blk_mq_insert_request(rq, false, true, true); } /* @@ -1337,9 +1341,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); if (!old_rq) goto done; - if (blk_mq_hctx_stopped(data.hctx) || - blk_mq_direct_issue_request(old_rq, &cookie) != 0) - blk_mq_insert_request(old_rq, false, true, true); + blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); goto done; } -- cgit v1.2.3 From 52d7f1b5c2f33b5d34dc2b6af5175fb6a44999f6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:20:32 -0700 Subject: blk-mq: Avoid that requeueing starts stopped queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since blk_mq_requeue_work() starts stopped queues and since execution of this function can be scheduled after a queue has been stopped it is not possible to stop queues without using an additional state variable to track whether or not the queue has been stopped. Hence modify blk_mq_requeue_work() such that it does not start stopped queues. My conclusion after a review of the blk_mq_stop_hw_queues() and blk_mq_{delay_,}kick_requeue_list() callers is as follows: * In the dm driver starting and stopping queues should only happen if __dm_suspend() or __dm_resume() is called and not if the requeue list is processed. * In the SCSI core queue stopping and starting should only be performed by the scsi_internal_device_block() and scsi_internal_device_unblock() functions but not by any other function. Although the blk_mq_stop_hw_queue() call in scsi_queue_rq() may help to reduce CPU load if a LLD queue is full, figuring out whether or not a queue should be restarted when requeueing a command would require to introduce additional locking in scsi_mq_requeue_cmd() to avoid a race with scsi_internal_device_block(). Avoid this complexity by removing the blk_mq_stop_hw_queue() call from scsi_queue_rq(). * In the NVMe core only the functions that call blk_mq_start_stopped_hw_queues() explicitly should start stopped queues. * A blk_mq_start_stopped_hwqueues() call must be added in the xen-blkfront driver in its blkif_recover() function. Signed-off-by: Bart Van Assche Cc: Konrad Rzeszutek Wilk Cc: Roger Pau Monné Cc: Mike Snitzer Cc: James Bottomley Cc: Martin K. Petersen Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 447c37f39e32..d95034ae64f6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -501,11 +501,7 @@ static void blk_mq_requeue_work(struct work_struct *work) blk_mq_insert_request(rq, false, false, false); } - /* - * Use the start variant of queue running here, so that running - * the requeue work will kick stopped queues. 
- */ - blk_mq_start_hw_queues(q); + blk_mq_run_hw_queues(q, false); } void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) -- cgit v1.2.3 From 9b7dd572cc439fa92e120290eb74d0295567c5a0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:20:49 -0700 Subject: blk-mq: Remove blk_mq_cancel_requeue_work() Since blk_mq_requeue_work() no longer restarts stopped queues canceling requeue work is no longer needed to prevent that a stopped queue would be restarted. Hence remove this function. Signed-off-by: Bart Van Assche Cc: Mike Snitzer Cc: Keith Busch Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d95034ae64f6..a461823644fb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -526,12 +526,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) } EXPORT_SYMBOL(blk_mq_add_to_requeue_list); -void blk_mq_cancel_requeue_work(struct request_queue *q) -{ - cancel_delayed_work_sync(&q->requeue_work); -} -EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); - void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_schedule_delayed_work(&q->requeue_work, 0); -- cgit v1.2.3 From 6a83e74d214a47a1371cd2e6a783264fcba7d428 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 2 Nov 2016 10:09:51 -0600 Subject: blk-mq: Introduce blk_mq_quiesce_queue() blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations have finished. This function does *not* wait until all outstanding requests have finished (this means invocation of request.end_io()). The algorithm used by blk_mq_quiesce_queue() is as follows: * Hold either an RCU read lock or an SRCU read lock around .queue_rq() calls. The former is used if .queue_rq() does not block and the latter if .queue_rq() may block. * blk_mq_quiesce_queue() first calls blk_mq_stop_hw_queues() followed by synchronize_srcu() or synchronize_rcu(). The latter call waits for .queue_rq() invocations that started before blk_mq_quiesce_queue() was called. * The blk_mq_hctx_stopped() calls that control whether or not .queue_rq() will be called are called with the (S)RCU read lock held. This is necessary to avoid race conditions against blk_mq_quiesce_queue(). Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Kconfig | 1 + block/blk-mq.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 65 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 6b0ad08f0677..3a024440a669 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y select SBITMAP + select SRCU help Provide block layer support for the kernel. diff --git a/block/blk-mq.c b/block/blk-mq.c index a461823644fb..3dc323543293 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -115,6 +115,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/** + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished + * @q: request queue. + * + * Note: this function does not prevent that the struct request end_io() + * callback function is invoked. 
Additionally, it is not prevented that + * new queue_rq() calls occur unless the queue has been stopped first. + */ +void blk_mq_quiesce_queue(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool rcu = false; + + blk_mq_stop_hw_queues(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(&hctx->queue_rq_srcu); + else + rcu = true; + } + if (rcu) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -766,7 +793,7 @@ static inline unsigned int queued_to_index(unsigned int queued) * of IO. In particular, we'd like FIFO behaviour on handling existing * items on the hctx->dispatch list. Ignore that for now. */ -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct request *rq; @@ -778,9 +805,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx))) return; - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); - hctx->run++; /* @@ -871,6 +895,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } } +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + int srcu_idx; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)); + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_process_rq_list(hctx); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + blk_mq_process_rq_list(hctx); + srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + } +} + /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. @@ -1268,7 +1310,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_mq_alloc_data data; struct request *rq; - unsigned int request_count = 0; + unsigned int request_count = 0, srcu_idx; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; @@ -1311,7 +1353,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); /* - * We do limited pluging. If the bio can be merged, do that. + * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be * issued. 
So the plug list will have one request at most */ @@ -1331,7 +1373,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); if (!old_rq) goto done; - blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + + if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); + blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); + } goto done; } @@ -1610,6 +1661,9 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + if (hctx->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(&hctx->queue_rq_srcu); + blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); @@ -1690,6 +1744,9 @@ static int blk_mq_init_hctx(struct request_queue *q, flush_start_tag + hctx_idx, node)) goto free_fq; + if (hctx->flags & BLK_MQ_F_BLOCKING) + init_srcu_struct(&hctx->queue_rq_srcu); + return 0; free_fq: -- cgit v1.2.3 From 2b053aca76b48e681be57b34ca3a8c2c10b275c5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:21:41 -0700 Subject: blk-mq: Add a kick_requeue_list argument to blk_mq_requeue_request() Most blk_mq_requeue_request() and blk_mq_add_to_requeue_list() calls are followed by kicking the requeue list. Hence add an argument to these two functions that allows to kick the requeue list. This was proposed by Christoph Hellwig. Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Cc: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-flush.c | 5 +---- block/blk-mq.c | 10 +++++++--- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index d35beca18481..c486b7aa62ee 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -134,10 +134,7 @@ static void blk_flush_restore_request(struct request *rq) static bool blk_flush_queue_rq(struct request *rq, bool add_front) { if (rq->q->mq_ops) { - struct request_queue *q = rq->q; - - blk_mq_add_to_requeue_list(rq, add_front); - blk_mq_kick_requeue_list(q); + blk_mq_add_to_requeue_list(rq, add_front, true); return false; } else { if (add_front) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3dc323543293..8d3de5bd4d6f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -492,12 +492,12 @@ static void __blk_mq_requeue_request(struct request *rq) } } -void blk_mq_requeue_request(struct request *rq) +void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { __blk_mq_requeue_request(rq); BUG_ON(blk_queued_rq(rq)); - blk_mq_add_to_requeue_list(rq, true); + blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -531,7 +531,8 @@ static void blk_mq_requeue_work(struct work_struct *work) blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; @@ -550,6 +551,9 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) list_add_tail(&rq->queuelist, &q->requeue_list); } spin_unlock_irqrestore(&q->requeue_lock, flags); + + if (kick_requeue_list) + blk_mq_kick_requeue_list(q); } 
EXPORT_SYMBOL(blk_mq_add_to_requeue_list); -- cgit v1.2.3 From 46f3cc176210b2db5db89d9c0e62796cad7e3cdc Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 3 Nov 2016 03:38:54 -0600 Subject: block: drop q argument from bsg_validate_sgv4_hdr bsg_validate_sgv4_hdr() doesn't care about the request_queue, so drop it from it's arguments. Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/bsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index d214e929ce18..8a05a404ae70 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -176,7 +176,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, * Check if sg_io_v4 from user is allowed and valid */ static int -bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) +bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *rw) { int ret = 0; @@ -226,7 +226,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, hdr->din_xfer_len); - ret = bsg_validate_sgv4_hdr(q, hdr, &rw); + ret = bsg_validate_sgv4_hdr(hdr, &rw); if (ret) return ERR_PTR(ret); -- cgit v1.2.3 From 50d24c34403c62ad29e8b6db559d491bae20b4b7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Nov 2016 17:03:53 -0700 Subject: block: immediately dispatch big size request Currently block plug holds up to 16 non-mergeable requests. This makes sense if the request size is small, eg, reduce lock contention. But if request size is big enough, we don't need to worry about lock contention. Holding such request makes no sense and it lows the disk utilization. In practice, this improves 10% throughput for my raid5 sequential write workload. The size (128k) is arbitrary right now, but it makes sure lock contention is small. This probably could be more intelligent, eg, check average request size holded. Since this is mainly for sequential IO, probably not worthy. V2: check the last request instead of the first request, so as long as there is one big size request we flush the plug. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 0bfaa54d3e9f..2deca48a4a05 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1746,7 +1746,9 @@ get_rq: if (!request_count) trace_block_plug(q); else { - if (request_count >= BLK_MAX_REQUEST_COUNT) { + struct request *last = list_entry_rq(plug->list.prev); + if (request_count >= BLK_MAX_REQUEST_COUNT || + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { blk_flush_plug_list(plug, false); trace_block_plug(q); } -- cgit v1.2.3 From 600271d9000027c013c01be87cbb90a5a18c5c3f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Nov 2016 17:03:54 -0700 Subject: blk-mq: immediately dispatch big size request This is corresponding part for blk-mq. Disk with multiple hardware queues doesn't need this as we only hold 1 request at most. 
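Both this patch and the previous one reduce to the same plug-flush predicate; a minimal sketch of the heuristic (should_flush_plug() is a made-up name; BLK_MAX_REQUEST_COUNT, BLK_PLUG_FLUSH_SIZE and blk_rq_bytes() are the symbols the patches actually use):

#include <linux/blkdev.h>

/*
 * Flush the plug once we either hold too many requests, or the last
 * request added is already big enough that holding it back gains
 * nothing on lock contention and only lowers disk utilization.
 */
static bool should_flush_plug(struct request *last, unsigned int request_count)
{
	return request_count >= BLK_MAX_REQUEST_COUNT ||
	       (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE);
}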
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8d3de5bd4d6f..077c2416a955 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1453,13 +1453,18 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) */ plug = current->plug; if (plug) { + struct request *last = NULL; + blk_mq_bio_to_request(rq, bio); if (!request_count) trace_block_plug(q); + else + last = list_entry_rq(plug->mq_list.prev); blk_mq_put_ctx(data.ctx); - if (request_count >= BLK_MAX_REQUEST_COUNT) { + if (request_count >= BLK_MAX_REQUEST_COUNT || (last && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); trace_block_plug(q); } -- cgit v1.2.3 From d278d4a8892f13b6a9eb6102b356402f0e062324 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:21:08 -0600 Subject: block: add code to track actual device queue depth For blk-mq, ->nr_requests does track queue depth, at least at init time. But for the older queue paths, it's simply a soft setting. On top of that, it's generally larger than the hardware setting on purpose, to allow backup of requests for merging. Fill a hole in struct request with a 'queue_depth' member, that drivers can call to more closely inform the block layer of the real queue depth. Signed-off-by: Jens Axboe Reviewed-by: Jan Kara --- block/blk-settings.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 55369a65dea2..9cf053759363 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -836,6 +836,18 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable) } EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); +/** + * blk_set_queue_depth - tell the block layer about the device queue depth + * @q: the request queue for the device + * @depth: queue depth + * + */ +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) +{ + q->queue_depth = depth; +} +EXPORT_SYMBOL(blk_set_queue_depth); + /** * blk_queue_write_cache - configure queue's write cache * @q: the request queue for the device -- cgit v1.2.3 From c02ebfdddbafa9a6a0f52fbd715e6bfa229af9d3 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 28 Sep 2016 00:24:24 -0300 Subject: blk-mq: Always schedule hctx->next_cpu Commit 0e87e58bf60e ("blk-mq: improve warning for running a queue on the wrong CPU") attempts to avoid triggering the WARN_ON in __blk_mq_run_hw_queue when the expected CPU is dead. Problem is, in the last batch execution before round robin, blk_mq_hctx_next_cpu can schedule a dead CPU and also update next_cpu to the next alive CPU in the mask, which will trigger the WARN_ON despite the previous workaround. The following patch fixes this scenario by always scheduling the value in hctx->next_cpu. This changes the moment when we round-robin the CPU running the hctx, but it really doesn't matter, since it still executes BLK_MQ_CPU_WORK_BATCH times in a row before switching to another CPU. 
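After the fix, the batching logic reads as a small round-robin state machine; a simplified sketch (struct cpu_batcher and pick_cpu() are illustrative stand-ins, not the kernel's types):

#include <linux/cpumask.h>

struct cpu_batcher {
	int next_cpu;
	int next_cpu_batch;
	const struct cpumask *mask;
};

/*
 * Always return the scheduled value in next_cpu. When the batch runs
 * out, advance to the next CPU in the mask first and schedule that one
 * right away. The pre-fix code returned the CPU it had saved before
 * the advance, which could be a CPU that had just gone offline and
 * would trip the WARN_ON in __blk_mq_run_hw_queue().
 */
static int pick_cpu(struct cpu_batcher *b, int batch)
{
	if (--b->next_cpu_batch <= 0) {
		int next = cpumask_next(b->next_cpu, b->mask);

		if (next >= nr_cpu_ids)
			next = cpumask_first(b->mask);
		b->next_cpu = next;
		b->next_cpu_batch = batch;
	}
	return b->next_cpu;
}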
Fixes: 0e87e58bf60e ("blk-mq: improve warning for running a queue on the wrong CPU") Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 077c2416a955..6f5cb3f3dcac 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -929,7 +929,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int cpu = hctx->next_cpu, next_cpu; + int next_cpu; next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); if (next_cpu >= nr_cpu_ids) @@ -937,8 +937,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; - - return cpu; } return hctx->next_cpu; -- cgit v1.2.3 From ae5b2ec8ad5e017126cd4552220f25ce8a6b92e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Nov 2016 19:39:28 -0700 Subject: block: set REQ_SYNC if we clear REQ_FUA|REQ_PREFLUSH If we insert a flush request, we clear REQ_PREFLUSH and/or REQ_FUA, depending on flush settings. Since op_is_sync() factors those flags in for deciding whether this request is sync or not, we should set REQ_SYNC to avoid screwing up this accounting. This should be less fragile. Reported-by: Logan Gunthorpe Fixes: b685d3d65ac ("block: treat REQ_FUA and REQ_PREFLUSH as synchronous") Signed-off-by: Jens Axboe --- block/blk-flush.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index c486b7aa62ee..1bdbb3d3e5f5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -395,6 +395,13 @@ void blk_insert_flush(struct request *rq) if (!(fflags & (1UL << QUEUE_FLAG_FUA))) rq->cmd_flags &= ~REQ_FUA; + /* + * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any + * of those flags, we have to set REQ_SYNC to avoid skewing + * the request accounting. + */ + rq->cmd_flags |= REQ_SYNC; + /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not -- cgit v1.2.3 From ebc4ff661fbe76781c6b16dfb7b754a5d5073f8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Nov 2016 11:16:37 -0500 Subject: block: cfq_cpd_alloc() should use @gfp cfq_cpd_alloc() which is the cpd_alloc_fn implementation for cfq was incorrectly hard coding GFP_KERNEL instead of using the mask specified through the @gfp parameter. This currently doesn't cause any actual issues because all current callers specify GFP_KERNEL. Fix it. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter Fixes: e4a9bde9589f ("blkcg: replace blkcg_policy->cpd_size with ->cpd_alloc/free_fn() methods") Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dcbed8c9c82c..61010511c5a0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1586,7 +1586,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) { struct cfq_group_data *cgd; - cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + cgd = kzalloc(sizeof(*cgd), gfp); if (!cgd) return NULL; return &cgd->cpd; -- cgit v1.2.3 From cf43e6be865a582ba66ee4747ae27a0513f6bba1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Nov 2016 21:32:37 -0700 Subject: block: add scalable completion tracking of requests For legacy block, we simply track them in the request queue. 
For blk-mq, we track them on a per-sw queue basis, which we can then sum up through the hardware queues and finally to a per device state. The stats are tracked in, roughly, 0.1s interval windows. Add sysfs files to display the stats. The feature is off by default, to avoid any extra overhead. In-kernel users of it can turn it on by setting QUEUE_FLAG_STATS in the queue flags. We currently don't turn it on if someone just reads any of the stats files, that is something we could add as well. Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-core.c | 14 ++- block/blk-mq-sysfs.c | 47 ++++++++++ block/blk-mq.c | 25 ++++++ block/blk-mq.h | 3 + block/blk-stat.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-stat.h | 42 +++++++++ block/blk-sysfs.c | 26 ++++++ 8 files changed, 404 insertions(+), 3 deletions(-) create mode 100644 block/blk-stat.c create mode 100644 block/blk-stat.h (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 934dac73fb37..2528c596f7ec 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index 2deca48a4a05..216372b01624 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2464,6 +2464,11 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); + if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { + blk_stat_set_issue_time(&req->issue_stat); + req->rq_flags |= RQF_STATS; + } + /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. 
@@ -2683,8 +2688,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ void blk_finish_request(struct request *req, int error) { + struct request_queue *q = req->q; + + if (req->rq_flags & RQF_STATS) + blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); + if (req->rq_flags & RQF_QUEUED) - blk_queue_end_tag(req->q, req); + blk_queue_end_tag(q, req); BUG_ON(blk_queued_rq(req)); @@ -2704,7 +2714,7 @@ void blk_finish_request(struct request *req, int error) if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); - __blk_put_request(req->q, req); + __blk_put_request(q, req); } } EXPORT_SYMBOL(blk_finish_request); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 01fb455d3377..eacd3af72099 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + unsigned int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } +} + +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t count) +{ + blk_mq_stat_clear(hctx); + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_stat_init(&stat[BLK_STAT_READ]); + blk_stat_init(&stat[BLK_STAT_WRITE]); + + blk_hctx_stat_get(hctx, stat); + + ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); + ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); + return ret; +} + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { .show = blk_mq_hw_sysfs_poll_show, .store = blk_mq_hw_sysfs_poll_store, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, + .show = blk_mq_hw_sysfs_stat_show, + .store = blk_mq_hw_sysfs_stat_store, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, + &blk_mq_hw_sysfs_stat.attr, NULL, }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 6f5cb3f3dcac..19795886d46e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -403,10 +404,27 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + if (rq->rq_flags & RQF_STATS) { + /* + * We could rq->mq_ctx here, but there's less of a risk + * of races if we have the completion event add the stats + * to the local software queue. 
+ */ + struct blk_mq_ctx *ctx; + + ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); + blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); + } +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -450,6 +468,11 @@ void blk_mq_start_request(struct request *rq) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + blk_stat_set_issue_time(&rq->issue_stat); + rq->rq_flags |= RQF_STATS; + } + blk_add_timer(rq); /* @@ -1784,6 +1807,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[BLK_STAT_READ]); + blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) diff --git a/block/blk-mq.h b/block/blk-mq.h index ac772dac7ce8..b444370ae05b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -18,6 +20,7 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 000000000000..688c958367ee --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,248 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include + +#include "blk-stat.h" +#include "blk-mq.h" + +static void blk_stat_flush_batch(struct blk_rq_stat *stat) +{ + const s32 nr_batch = READ_ONCE(stat->nr_batch); + const s32 nr_samples = READ_ONCE(stat->nr_batch); + + if (!nr_batch) + return; + if (!nr_samples) + stat->mean = div64_s64(stat->batch, nr_batch); + else { + stat->mean = div64_s64((stat->mean * nr_samples) + + stat->batch, + nr_batch + nr_samples); + } + + stat->nr_samples += nr_batch; + stat->nr_batch = stat->batch = 0; +} + +static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + blk_stat_flush_batch(src); + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + uint64_t latest = 0; + int i, j, nr; + + blk_stat_init(&dst[BLK_STAT_READ]); + blk_stat_init(&dst[BLK_STAT_WRITE]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + if (newest > latest) + latest = newest; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + 
blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. + */ + } while (!nr); + + dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], + sizeof(struct blk_rq_stat)); + memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], + sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; + stat->time = time_now & BLK_STAT_NSEC_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) +{ + return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); +} + +bool blk_stat_is_current(struct blk_rq_stat *stat) +{ + return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 now, value; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + if (!__blk_stat_is_current(stat, now)) + __blk_stat_init(stat, now); + + value = now - blk_stat_time(&rq->issue_stat); + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); + + stat->batch += value; + stat->nr_batch++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + } + } else { + blk_stat_init(&q->rq_stats[BLK_STAT_READ]); + blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]); + } +} + +void blk_stat_set_issue_time(struct blk_issue_stat *stat) +{ + stat->time = (stat->time & BLK_STAT_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); +} + +/* + * Enable stat tracking, return whether it was enabled + */ +bool 
blk_stat_enable(struct request_queue *q) +{ + if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + return false; + } + + return true; +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 000000000000..a2050a0a5314 --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,42 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) + +/* + * Upper 3 bits can be used elsewhere + */ +#define BLK_STAT_RES_BITS 3 +#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) +#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) +#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK + +enum { + BLK_STAT_READ = 0, + BLK_STAT_WRITE, +}; + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *); +void blk_stat_init(struct blk_rq_stat *); +bool blk_stat_is_current(struct blk_rq_stat *); +void blk_stat_set_issue_time(struct blk_issue_stat *); +bool blk_stat_enable(struct request_queue *); + +static inline u64 __blk_stat_time(u64 time) +{ + return time & BLK_STAT_TIME_MASK; +} + +static inline u64 blk_stat_time(struct blk_issue_stat *stat) +{ + return __blk_stat_time(stat->time); +} + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 488c2e28feb8..9cdb7247727a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -401,6 +401,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); + ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -553,6 +573,11 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -582,6 +607,7 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_dax_entry.attr, + &queue_stats_entry.attr, NULL, }; -- cgit v1.2.3 From e34cbd307477ae07c5d8a8d0bd15e65a9ddaba5c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Nov 2016 12:36:15 -0700 Subject: blk-wbt: add general throttling mechanism We can hook this up to the block layer, to help throttle buffered writes. 
wbt registers a few trace points that can be used to track what is happening in the system: wbt_lat: 259:0: latency 2446318 wbt_stat: 259:0: rmean=2446318, rmin=2446318, rmax=2446318, rsamples=1, wmean=518866, wmin=15522, wmax=5330353, wsamples=57 wbt_step: 259:0: step down: step=1, window=72727272, background=8, normal=16, max=32 This shows a sync issue event (wbt_lat) that exceeded it's time. wbt_stat dumps the current read/write stats for that window, and wbt_step shows a step down event where we now scale back writes. Each trace includes the device, 259:0 in this case. Signed-off-by: Jens Axboe --- block/Makefile | 1 + block/blk-wbt.c | 735 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-wbt.h | 165 +++++++++++++ 3 files changed, 901 insertions(+) create mode 100644 block/blk-wbt.c create mode 100644 block/blk-wbt.h (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 2528c596f7ec..a827f988c4e6 100644 --- a/block/Makefile +++ b/block/Makefile @@ -24,3 +24,4 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o +obj-$(CONFIG_BLK_WBT) += blk-wbt.o diff --git a/block/blk-wbt.c b/block/blk-wbt.c new file mode 100644 index 000000000000..889c17ff8503 --- /dev/null +++ b/block/blk-wbt.c @@ -0,0 +1,735 @@ +/* + * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * - If we're only doing writes, allow the scaling step to go negative. This + * will temporarily boost write performance, snapping back to a stable + * scaling step of 0 if reads show up or the heavy writers finish. Unlike + * positive scaling steps where we shrink the monitoring window, a negative + * scaling step retains the default step==0 window size. + * + * Copyright (C) 2016 Jens Axboe + * + */ +#include +#include +#include +#include +#include + +#include "blk-wbt.h" + +#define CREATE_TRACE_POINTS +#include + +enum { + /* + * Default setting, we'll scale up (to 75% of QD max) or down (min 1) + * from here depending on device stats + */ + RWB_DEF_DEPTH = 16, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet this minimum + */ + RWB_MIN_WRITE_SAMPLES = 3, + + /* + * If we have this number of consecutive windows with not enough + * information to scale up or down, scale up. + */ + RWB_UNKNOWN_BUMP = 5, +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. 
+ */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +/* + * If a task was rate throttled in balance_dirty_pages() within the last + * second or so, use that to indicate a higher cleaning rate. + */ +static bool wb_recent_wait(struct rq_wb *rwb) +{ + struct bdi_writeback *wb = &rwb->bdi->wb; + + return time_before(jiffies, wb->dirty_sleep + HZ); +} + +static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd) +{ + return &rwb->rq_wait[is_kswapd]; +} + +static void rwb_wake_all(struct rq_wb *rwb) +{ + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + struct rq_wait *rqw = &rwb->rq_wait[i]; + + if (waitqueue_active(&rqw->wait)) + wake_up_all(&rqw->wait); + } +} + +void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) +{ + struct rq_wait *rqw; + int inflight, limit; + + if (!(wb_acct & WBT_TRACKED)) + return; + + rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD); + inflight = atomic_dec_return(&rqw->inflight); + + /* + * wbt got disabled with IO in flight. Wake up any potential + * waiters, we don't have to do more than that. + */ + if (unlikely(!rwb_enabled(rwb))) { + rwb_wake_all(rwb); + return; + } + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (rwb->wc && !wb_recent_wait(rwb)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. + */ + if (inflight && inflight >= limit) + return; + + if (waitqueue_active(&rqw->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_all(&rqw->wait); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb) + return; + + if (!wbt_is_tracked(stat)) { + if (rwb->sync_cookie == stat) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + if (wbt_is_read(stat)) + wb_timestamp(rwb, &rwb->last_comp); + wbt_clear_state(stat); + } else { + WARN_ON_ONCE(stat == rwb->sync_cookie); + __wbt_done(rwb, wbt_stat_to_mask(stat)); + wbt_clear_state(stat); + } +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +static bool calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + bool ret = false; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return false; + } + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rwb->queue_depth == 1) { + if (rwb->scale_step > 0) + rwb->wb_max = rwb->wb_normal = 1; + else { + rwb->wb_max = rwb->wb_normal = 2; + ret = true; + } + rwb->wb_background = 1; + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. 
If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); + if (rwb->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); + else if (rwb->scale_step < 0) { + unsigned int maxd = 3 * rwb->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rwb->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + /* + * Set our max/normal/bg queue depths based on how far + * we have scaled down (->scale_step). + */ + rwb->wb_max = depth; + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; + } + + return ret; +} + +static bool inline stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[0].nr_samples >= 1 && + stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK = 1, + LAT_UNKNOWN, + LAT_UNKNOWN_WRITES, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + u64 thislat; + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. wbt works off completion latencies, + * but for a flooded device, a single sync IO can take a long time + * to complete after being issued. If this time exceeds our + * monitoring window AND we didn't see any other completions in that + * window, then count that sync IO as a violation of the latency. + */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { + trace_wbt_lat(rwb->bdi, thislat); + return LAT_EXCEEDED; + } + + /* + * No read/write mix, if stat isn't valid + */ + if (!stat_sample_valid(stat)) { + /* + * If we had writes in this stat window and the window is + * current, we're only doing writes. If a task recently + * waited or still has writes in flights, consider us doing + * just writes as well. + */ + if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) || + wb_recent_wait(rwb) || wbt_inflight(rwb)) + return LAT_UNKNOWN_WRITES; + return LAT_UNKNOWN; + } + + /* + * If the 'min' latency exceeds our target, step down. 
+ */ + if (stat[0].min > rwb->min_lat_nsec) { + trace_wbt_lat(rwb->bdi, stat[0].min); + trace_wbt_stat(rwb->bdi, stat); + return LAT_EXCEEDED; + } + + if (rwb->scale_step) + trace_wbt_stat(rwb->bdi, stat); + + return LAT_OK; +} + +static int latency_exceeded(struct rq_wb *rwb) +{ + struct blk_rq_stat stat[2]; + + rwb->stat_ops->get(rwb->ops_data, stat); + return __latency_exceeded(rwb, stat); +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rwb->wb_max); +} + +static void scale_up(struct rq_wb *rwb) +{ + /* + * Hit max in previous round, stop here + */ + if (rwb->scaled_max) + return; + + rwb->scale_step--; + rwb->unknown_cnt = 0; + rwb->stat_ops->clear(rwb->ops_data); + + rwb->scaled_max = calc_wb_limits(rwb); + + rwb_wake_all(rwb); + + rwb_trace_step(rwb, "step up"); +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +static void scale_down(struct rq_wb *rwb, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rwb->wb_max == 1) + return; + + if (rwb->scale_step < 0 && hard_throttle) + rwb->scale_step = 0; + else + rwb->scale_step++; + + rwb->scaled_max = false; + rwb->unknown_cnt = 0; + rwb->stat_ops->clear(rwb->ops_data); + calc_wb_limits(rwb); + rwb_trace_step(rwb, "step down"); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + unsigned long expires; + + if (rwb->scale_step > 0) { + /* + * We should speed this up, using some variant of a fast + * integer inverse square root calculation. Since we only do + * this for every window expiration, it's not a huge deal, + * though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rwb->scale_step + 1) << 8)); + } else { + /* + * For step < 0, we don't want to increase/decrease the + * window size. + */ + rwb->cur_win_nsec = rwb->win_nsec; + } + + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); + mod_timer(&rwb->window_timer, expires); +} + +static void wb_timer_fn(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + unsigned int inflight = wbt_inflight(rwb); + int status; + + status = latency_exceeded(rwb); + + trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight); + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. + */ + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb, true); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN_WRITES: + /* + * We started a the center step, but don't have a valid + * read/write sample, but we do have writes going on. + * Allow step to go negative, to increase write perf. + */ + scale_up(rwb); + break; + case LAT_UNKNOWN: + if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) + break; + /* + * We get here when previously scaled reduced depth, and we + * currently don't have a valid read/write sample. For that + * case, slowly return to center state (step == 0). 
+ */ + if (rwb->scale_step > 0) + scale_up(rwb); + else if (rwb->scale_step < 0) + scale_down(rwb, false); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || inflight) + rwb_arm_timer(rwb); +} + +void wbt_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + rwb->scaled_max = false; + calc_wb_limits(rwb); + + rwb_wake_all(rwb); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If this is + * kswapd trying to free memory, or REQ_SYNC is set, set, then + * it's WB_SYNC_ALL writeback, and we'll use the max limit for + * that. If the write is marked as a background write, then use + * the idle limit, or go to normal if we haven't had competing + * IO for a bit. + */ + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + limit = rwb->wb_max; + else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, + wait_queue_t *wait, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); + return true; + } + + /* + * If the waitqueue is already active and we are not the next + * in line to be woken up, wait for our turn. + */ + if (waitqueue_active(&rqw->wait) && + rqw->wait.task_list.next != &wait->task_list) + return false; + + return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) +{ + struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd()); + DEFINE_WAIT(wait); + + if (may_queue(rwb, rqw, &wait, rw)) + return; + + do { + prepare_to_wait_exclusive(&rqw->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rqw, &wait, rw)) + break; + + if (lock) + spin_unlock_irq(lock); + + io_schedule(); + + if (lock) + spin_lock_irq(lock); + } while (1); + + finish_wait(&rqw->wait, &wait); +} + +static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +{ + const int op = bio_op(bio); + + /* + * If not a WRITE (or a discard), do nothing + */ + if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) + return false; + + return true; +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. 
+ */ +unsigned int wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +{ + unsigned int ret = 0; + + if (!rwb_enabled(rwb)) + return 0; + + if (bio_op(bio) == REQ_OP_READ) + ret = WBT_READ; + + if (!wbt_should_throttle(rwb, bio)) { + if (ret & WBT_READ) + wb_timestamp(rwb, &rwb->last_issue); + return ret; + } + + __wbt_wait(rwb, bio->bi_opf, lock); + + if (!timer_pending(&rwb->window_timer)) + rwb_arm_timer(rwb); + + if (current_is_kswapd()) + ret |= WBT_KSWAPD; + + return ret | WBT_TRACKED; +} + +void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + + /* + * Track sync issue, in case it takes a long time to complete. Allows + * us to react quicker, if a sync IO takes a long time to complete. + * Note that this is just a hint. 'stat' can go away when the + * request completes, so it's important we never dereference it. We + * only use the address to compare with, which is why we store the + * sync_issue time locally. + */ + if (wbt_is_read(stat) && !rwb->sync_issue) { + rwb->sync_cookie = stat; + rwb->sync_issue = blk_stat_time(stat); + } +} + +void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + if (stat == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ + if (rwb) { + rwb->queue_depth = depth; + wbt_update_limits(rwb); + } +} + +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) +{ + if (rwb) + rwb->wc = write_cache_on; +} + +void wbt_disable(struct rq_wb *rwb) +{ + if (rwb) { + del_timer_sync(&rwb->window_timer); + rwb->win_nsec = rwb->min_lat_nsec = 0; + wbt_update_limits(rwb); + } +} +EXPORT_SYMBOL_GPL(wbt_disable); + +int wbt_init(struct request_queue *q, struct wb_stat_ops *ops) +{ + struct rq_wb *rwb; + int i; + + /* + * For now, we depend on the stats window being larger than + * our monitoring window. Ensure that this isn't inadvertently + * violated. 
+ */ + BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); + BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); + + if (!ops->get || !ops->is_current || !ops->clear) + return -EINVAL; + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return -ENOMEM; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + atomic_set(&rwb->rq_wait[i].inflight, 0); + init_waitqueue_head(&rwb->rq_wait[i].wait); + } + + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); + rwb->wc = 1; + rwb->queue_depth = RWB_DEF_DEPTH; + rwb->last_comp = rwb->last_issue = jiffies; + rwb->bdi = &q->backing_dev_info; + rwb->win_nsec = RWB_WINDOW_NSEC; + rwb->stat_ops = ops; + rwb->ops_data = q; + wbt_update_limits(rwb); + + /* + * Assign rwb, and turn on stats tracking for this queue + */ + q->rq_wb = rwb; + blk_stat_enable(q); + + if (blk_queue_nonrot(q)) + rwb->min_lat_nsec = 2000000ULL; + else + rwb->min_lat_nsec = 75000000ULL; + + wbt_set_queue_depth(rwb, blk_queue_depth(q)); + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + + return 0; +} + +void wbt_exit(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb) { + del_timer_sync(&rwb->window_timer); + q->rq_wb = NULL; + kfree(rwb); + } +} diff --git a/block/blk-wbt.h b/block/blk-wbt.h new file mode 100644 index 000000000000..e57700337c26 --- /dev/null +++ b/block/blk-wbt.h @@ -0,0 +1,165 @@ +#ifndef WB_THROTTLE_H +#define WB_THROTTLE_H + +#include +#include +#include +#include +#include + +#include "blk-stat.h" + +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + + WBT_NR_BITS = 3, /* number of bits */ +}; + +enum { + WBT_NUM_RWQ = 2, +}; + +static inline void wbt_clear_state(struct blk_issue_stat *stat) +{ + stat->time &= BLK_STAT_TIME_MASK; +} + +static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) +{ + return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; +} + +static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) +{ + stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; +} + +static inline bool wbt_is_tracked(struct blk_issue_stat *stat) +{ + return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; +} + +static inline bool wbt_is_read(struct blk_issue_stat *stat) +{ + return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; +} + +struct wb_stat_ops { + void (*get)(void *, struct blk_rq_stat *); + bool (*is_current)(struct blk_rq_stat *); + void (*clear)(void *); +}; + +struct rq_wait { + wait_queue_head_t wait; + atomic_t inflight; +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + unsigned int wb_max; /* max throughput writeback */ + int scale_step; + bool scaled_max; + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. 
+ */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct timer_list window_timer; + + s64 sync_issue; + void *sync_cookie; + + unsigned int wc; + unsigned int queue_depth; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct backing_dev_info *bdi; + struct rq_wait rq_wait[WBT_NUM_RWQ]; + + struct wb_stat_ops *stat_ops; + void *ops_data; +}; + +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + +struct backing_dev_info; + +#ifdef CONFIG_BLK_WBT + +void __wbt_done(struct rq_wb *, enum wbt_flags); +void wbt_done(struct rq_wb *, struct blk_issue_stat *); +enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); +int wbt_init(struct request_queue *, struct wb_stat_ops *); +void wbt_exit(struct request_queue *); +void wbt_update_limits(struct rq_wb *); +void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); +void wbt_issue(struct rq_wb *, struct blk_issue_stat *); +void wbt_disable(struct rq_wb *); + +void wbt_set_queue_depth(struct rq_wb *, unsigned int); +void wbt_set_write_cache(struct rq_wb *, bool); + +#else + +static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) +{ +} +static inline void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, + spinlock_t *lock) +{ + return 0; +} +static inline int wbt_init(struct request_queue *q, struct wb_stat_ops *ops) +{ + return -EINVAL; +} +static inline void wbt_exit(struct request_queue *q) +{ +} +static inline void wbt_update_limits(struct rq_wb *rwb) +{ +} +static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ +} +static inline void wbt_disable(struct rq_wb *rwb) +{ +} +static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ +} +static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) +{ +} + +#endif /* CONFIG_BLK_WBT */ + +#endif -- cgit v1.2.3 From 87760e5eef359788047d6fd54fc12eec74ce0d27 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Nov 2016 12:38:14 -0700 Subject: block: hook up writeback throttling Enable throttling of buffered writeback to make it a lot more smooth, and has way less impact on other system activity. Background writeback should be, by definition, background activity. The fact that we flush huge bundles of it at the time means that it potentially has heavy impacts on foreground workloads, which isn't ideal. We can't easily limit the sizes of writes that we do, since that would impact file system layout in the presence of delayed allocation. So just throttle back buffered writeback, unless someone is waiting for it. The algorithm for when to throttle takes its inspiration in the CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors the minimum latencies of requests over a window of time. In that window of time, if the minimum latency of any request exceeds a given target, then a scale count is incremented and the queue depth is shrunk. The next monitoring window is shrunk accordingly. 
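As a rough feel for those numbers (an editorial sketch, not part of the patch series): assuming a device queue depth of at least RWB_DEF_DEPTH (16) and the default 100 msec window from blk-wbt.c above, each positive scale step roughly halves the allowed write depth and shrinks the monitoring window by a factor of sqrt(step + 1):

    /*
     * Standalone userspace sketch mirroring calc_wb_limits() (scale_step > 0)
     * and rwb_arm_timer() from blk-wbt.c above. Build with: cc sketch.c -lm
     */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            const unsigned int def_depth = 16;      /* RWB_DEF_DEPTH */
            const double win_msec = 100.0;          /* RWB_WINDOW_NSEC, in msec */
            int step;

            for (step = 0; step <= 4; step++) {
                    /* max/normal/background depths, as in calc_wb_limits() */
                    unsigned int max = 1 + ((def_depth - 1) >> step);
                    unsigned int normal = (max + 1) / 2;
                    unsigned int background = (max + 3) / 4;
                    /* window scaling, as in rwb_arm_timer(): win / sqrt(step + 1) */
                    double win = win_msec / sqrt(step + 1);

                    printf("step=%d max=%u normal=%u background=%u window=%.0f msec\n",
                           step, max, normal, background, win);
            }
            return 0;
    }
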
Unlike CoDel, if we hit a window that exhibits good behavior, then we simply increment the scale count and re-calculate the limits for that scale value. This prevents us from oscillating between a close-to-ideal value and max all the time, instead remaining in the windows where we get good behavior. Unlike CoDel, blk-wb allows the scale count to to negative. This happens if we primarily have writes going on. Unlike positive scale counts, this doesn't change the size of the monitoring window. When the heavy writers finish, blk-bw quickly snaps back to it's stable state of a zero scale count. The patch registers a sysfs entry, 'wb_lat_usec'. This sets the latency target to me met. It defaults to 2 msec for non-rotational storage, and 75 msec for rotational storage. Setting this value to '0' disables blk-wb. Generally, a user would not have to touch this setting. We don't enable WBT on devices that are managed with CFQ, and have a non-root block cgroup attached. If we have a proportional share setup on this particular disk, then the wbt throttling will interfere with that. We don't have a strong need for wbt for that case, since we will rely on CFQ doing that for us. Signed-off-by: Jens Axboe --- block/Kconfig | 26 ++++++++++++++++ block/blk-core.c | 17 ++++++++-- block/blk-mq.c | 26 ++++++++++++++-- block/blk-settings.c | 4 +++ block/blk-sysfs.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++ block/cfq-iosched.c | 14 +++++++++ 6 files changed, 171 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index 3a024440a669..8bf114a3858a 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -121,6 +121,32 @@ config BLK_CMDLINE_PARSER See Documentation/block/cmdline-partition.txt for more information. +config BLK_WBT + bool "Enable support for block device writeback throttling" + default n + ---help--- + Enabling this option enables the block layer to throttle buffered + background writeback from the VM, making it more smooth and having + less impact on foreground operations. The throttling is done + dynamically on an algorithm loosely based on CoDel, factoring in + the realtime performance of the disk. + +config BLK_WBT_SQ + bool "Single queue writeback throttling" + default n + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on legacy single queue devices + +config BLK_WBT_MQ + bool "Multiqueue writeback throttling" + default y + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on multiqueue devices. + Multiqueue currently doesn't have support for IO scheduling, + enabling this option is recommended. 
+ menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/blk-core.c b/block/blk-core.c index 216372b01624..59f8129a4295 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wbt.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -882,6 +883,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, fail: blk_free_flush_queue(q->fq); + wbt_exit(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1344,6 +1346,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); @@ -1436,6 +1439,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + wbt_done(q->rq_wb, &req->issue_stat); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools @@ -1663,6 +1668,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int el_ret, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; + unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1715,17 +1721,22 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: + wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); + /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + __wbt_done(q->rq_wb, wb_acct); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + wbt_track(&req->issue_stat, wb_acct); + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). 
@@ -2467,6 +2478,7 @@ void blk_start_request(struct request *req) if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { blk_stat_set_issue_time(&req->issue_stat); req->rq_flags |= RQF_STATS; + wbt_issue(req->q->rq_wb, &req->issue_stat); } /* @@ -2708,9 +2720,10 @@ void blk_finish_request(struct request *req, int error) blk_account_io_done(req); - if (req->end_io) + if (req->end_io) { + wbt_done(req->q->rq_wb, &req->issue_stat); req->end_io(req, error); - else { + } else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); diff --git a/block/blk-mq.c b/block/blk-mq.c index 19795886d46e..d180c989a0e5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -31,6 +31,7 @@ #include "blk-mq.h" #include "blk-mq-tag.h" #include "blk-stat.h" +#include "blk-wbt.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -326,6 +327,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + wbt_done(q->rq_wb, &rq->issue_stat); rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -354,6 +357,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) blk_account_io_done(rq); if (rq->end_io) { + wbt_done(rq->q->rq_wb, &rq->issue_stat); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -471,6 +475,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { blk_stat_set_issue_time(&rq->issue_stat); rq->rq_flags |= RQF_STATS; + wbt_issue(q->rq_wb, &rq->issue_stat); } blk_add_timer(rq); @@ -508,6 +513,7 @@ static void __blk_mq_requeue_request(struct request *rq) struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -1339,6 +1345,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1353,9 +1360,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; + } + + wbt_track(&rq->issue_stat, wb_acct); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1439,6 +1452,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) struct blk_mq_alloc_data data; struct request *rq; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1455,9 +1469,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; + } + + wbt_track(&rq->issue_stat, wb_acct); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -2139,6 +2159,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + wbt_exit(q); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); diff --git a/block/blk-settings.c b/block/blk-settings.c index 9cf053759363..c7ccabc0ec3e 100644 --- 
a/block/blk-settings.c +++ b/block/blk-settings.c @@ -13,6 +13,7 @@ #include #include "blk.h" +#include "blk-wbt.h" unsigned long blk_max_low_pfn; EXPORT_SYMBOL(blk_max_low_pfn); @@ -845,6 +846,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; + wbt_set_queue_depth(q->rq_wb, depth); } EXPORT_SYMBOL(blk_set_queue_depth); @@ -868,6 +870,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) else queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); + + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9cdb7247727a..9262d2d60a09 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wbt.h" struct queue_sysfs_entry { struct attribute attr; @@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } +static ssize_t queue_var_store64(u64 *var, const char *page) +{ + int err; + u64 v; + + err = kstrtou64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -364,6 +378,32 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->min_lat_nsec = val * 1000ULL; + wbt_update_limits(q->rq_wb); + return count; +} + static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -578,6 +618,12 @@ static struct queue_sysfs_entry queue_stats_entry = { .show = queue_stats_show, }; +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -608,6 +654,7 @@ static struct attribute *default_attrs[] = { &queue_wc_entry.attr, &queue_dax_entry.attr, &queue_stats_entry.attr, + &queue_wb_lat_entry.attr, NULL, }; @@ -682,6 +729,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); + wbt_exit(q); bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); @@ -722,6 +770,44 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) +{ + blk_queue_stat_get(data, stat); +} + +static void blk_wb_stat_clear(void *data) +{ + blk_stat_clear(data); +} + +static bool blk_wb_stat_is_current(struct blk_rq_stat *stat) +{ + return blk_stat_is_current(stat); +} + +static struct wb_stat_ops wb_stat_ops = { + .get = blk_wb_stat_get, + .is_current = blk_wb_stat_is_current, + .clear = blk_wb_stat_clear, +}; + +static void blk_wb_init(struct request_queue *q) +{ +#ifndef CONFIG_BLK_WBT_MQ + if (q->mq_ops) + return; +#endif +#ifndef CONFIG_BLK_WBT_SQ + 
if (q->request_fn) + return; +#endif + + /* + * If this fails, we don't get throttling + */ + wbt_init(q, &wb_stat_ops); +} + int blk_register_queue(struct gendisk *disk) { int ret; @@ -761,6 +847,8 @@ int blk_register_queue(struct gendisk *disk) if (q->mq_ops) blk_mq_register_dev(dev, q); + blk_wb_init(q); + if (!q->request_fn) return 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 61010511c5a0..e280d08ef6d7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -16,6 +16,7 @@ #include #include #include "blk.h" +#include "blk-wbt.h" /* * tunables @@ -3762,9 +3763,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; uint64_t serial_nr; + bool nonroot_cg; rcu_read_lock(); serial_nr = bio_blkcg(bio)->css.serial_nr; + nonroot_cg = bio_blkcg(bio) != &blkcg_root; rcu_read_unlock(); /* @@ -3774,6 +3777,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; + /* + * If we have a non-root cgroup, we can depend on that to + * do proper throttling of writes. Turn off wbt for that + * case. + */ + if (nonroot_cg) { + struct request_queue *q = cfqd->queue; + + wbt_disable(q->rq_wb); + } + /* * Drop reference to queues. New queues will be assigned in new * group upon arrival of fresh requests. -- cgit v1.2.3 From 066a4a73cee9a44a906b98825e70c47de5bd8b5c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Nov 2016 12:24:46 -0700 Subject: blk-mq: blk_mq_try_issue_directly() should lookup hardware queue A previous commit changed this to pass in the hardware queue, but it was using the wrong hardware queue. Hence a request that was allocated on one hardware queue ended up being issued on another one, and that caused IO timeouts and oopses on some drivers. Since the request holds hardware queue private resources, like a tag, we can't just issue it on a different hardware queue. 
Fixes: 2253efc850c4 ("blk-mq: Move more code into blk_mq_direct_issue_request()") Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d180c989a0e5..77110aed24ea 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1291,11 +1291,11 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } -static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) +static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) { int ret; struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, @@ -1414,11 +1414,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + blk_mq_try_issue_directly(old_rq, &cookie); rcu_read_unlock(); } else { srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); - blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + blk_mq_try_issue_directly(old_rq, &cookie); srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); } goto done; -- cgit v1.2.3 From bbd7bb7017d5c2b1e75f3818b4ce88fa58bb0eab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Nov 2016 09:34:34 -0600 Subject: block: move poll code to blk-mq The poll code is blk-mq specific, let's move it to blk-mq.c. This is a prep patch for improving the polling code. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- block/blk-core.c | 46 ---------------------------------------------- block/blk-mq.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 46 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 59f8129a4295..eea246567884 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3312,52 +3312,6 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); -bool blk_poll(struct request_queue *q, blk_qc_t cookie) -{ - struct blk_plug *plug; - long state; - unsigned int queue_num; - struct blk_mq_hw_ctx *hctx; - - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return false; - - queue_num = blk_qc_t_to_queue_num(cookie); - hctx = q->queue_hw_ctx[queue_num]; - hctx->poll_considered++; - - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - - state = current->state; - while (!need_resched()) { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); - if (ret > 0) { - hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; - } - - if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); - - if (current->state == TASK_RUNNING) - return true; - if (ret < 0) - break; - cpu_relax(); - } - - return false; -} -EXPORT_SYMBOL_GPL(blk_poll); - #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine diff --git a/block/blk-mq.c b/block/blk-mq.c index 77110aed24ea..ae8df5ec20d3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2461,6 +2461,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct request_queue *q = 
hctx->queue; + long state; + + hctx->poll_considered++; + + state = current->state; + while (!need_resched()) { + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, rq->tag); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_plug *plug; + struct request *rq; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + + return __blk_mq_poll(hctx, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_poll); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); -- cgit v1.2.3 From d8a0cbfd73cb7281120d1b49f90afeef26ad48a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Nov 2016 21:52:53 -0700 Subject: blk-wbt: store queue instead of bdi The bdi was a leftover from when the code was block layer agnostic. Now that we just support a block layer user, store the queue directly. Signed-off-by: Jens Axboe --- block/blk-wbt.c | 20 ++++++++++++-------- block/blk-wbt.h | 4 +--- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 889c17ff8503..4ab9cebc8003 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -96,7 +96,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->bdi->wb; + struct bdi_writeback *wb = &rwb->queue->backing_dev_info.wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -279,6 +279,7 @@ enum { static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { + struct backing_dev_info *bdi = &rwb->queue->backing_dev_info; u64 thislat; /* @@ -293,7 +294,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { - trace_wbt_lat(rwb->bdi, thislat); + trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } @@ -317,13 +318,13 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) * If the 'min' latency exceeds our target, step down. 
*/ if (stat[0].min > rwb->min_lat_nsec) { - trace_wbt_lat(rwb->bdi, stat[0].min); - trace_wbt_stat(rwb->bdi, stat); + trace_wbt_lat(bdi, stat[0].min); + trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } if (rwb->scale_step) - trace_wbt_stat(rwb->bdi, stat); + trace_wbt_stat(bdi, stat); return LAT_OK; } @@ -338,7 +339,9 @@ static int latency_exceeded(struct rq_wb *rwb) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec, + struct backing_dev_info *bdi = &rwb->queue->backing_dev_info; + + trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, rwb->wb_background, rwb->wb_normal, rwb->wb_max); } @@ -420,7 +423,8 @@ static void wb_timer_fn(unsigned long data) status = latency_exceeded(rwb); - trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight); + trace_wbt_timer(&rwb->queue->backing_dev_info, status, rwb->scale_step, + inflight); /* * If we exceeded the latency target, step down. If we did not, @@ -700,7 +704,7 @@ int wbt_init(struct request_queue *q, struct wb_stat_ops *ops) rwb->wc = 1; rwb->queue_depth = RWB_DEF_DEPTH; rwb->last_comp = rwb->last_issue = jiffies; - rwb->bdi = &q->backing_dev_info; + rwb->queue = q; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->stat_ops = ops; rwb->ops_data = q; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index e57700337c26..09c61a3f8295 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -87,7 +87,7 @@ struct rq_wb { unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; - struct backing_dev_info *bdi; + struct request_queue *queue; struct rq_wait rq_wait[WBT_NUM_RWQ]; struct wb_stat_ops *stat_ops; @@ -104,8 +104,6 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) return ret; } -struct backing_dev_info; - #ifdef CONFIG_BLK_WBT void __wbt_done(struct rq_wb *, enum wbt_flags); -- cgit v1.2.3 From 8054b89f8fca75d514965ee627a15b47020d2053 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Nov 2016 21:50:51 -0700 Subject: blk-wbt: remove stat ops Again a leftover from when the throttling code was generic. Now that we just have the block user, get rid of the stat ops and indirections. 
Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 23 +---------------------- block/blk-wbt.c | 15 +++++---------- block/blk-wbt.h | 13 ++----------- 3 files changed, 8 insertions(+), 43 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9262d2d60a09..415e764807d0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -770,27 +770,6 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; -static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) -{ - blk_queue_stat_get(data, stat); -} - -static void blk_wb_stat_clear(void *data) -{ - blk_stat_clear(data); -} - -static bool blk_wb_stat_is_current(struct blk_rq_stat *stat) -{ - return blk_stat_is_current(stat); -} - -static struct wb_stat_ops wb_stat_ops = { - .get = blk_wb_stat_get, - .is_current = blk_wb_stat_is_current, - .clear = blk_wb_stat_clear, -}; - static void blk_wb_init(struct request_queue *q) { #ifndef CONFIG_BLK_WBT_MQ @@ -805,7 +784,7 @@ static void blk_wb_init(struct request_queue *q) /* * If this fails, we don't get throttling */ - wbt_init(q, &wb_stat_ops); + wbt_init(q); } int blk_register_queue(struct gendisk *disk) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4ab9cebc8003..f6ec7e587fa6 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -308,7 +308,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) * waited or still has writes in flights, consider us doing * just writes as well. */ - if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) || + if ((stat[1].nr_samples && blk_stat_is_current(stat)) || wb_recent_wait(rwb) || wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; @@ -333,7 +333,7 @@ static int latency_exceeded(struct rq_wb *rwb) { struct blk_rq_stat stat[2]; - rwb->stat_ops->get(rwb->ops_data, stat); + blk_queue_stat_get(rwb->queue, stat); return __latency_exceeded(rwb, stat); } @@ -355,7 +355,7 @@ static void scale_up(struct rq_wb *rwb) rwb->scale_step--; rwb->unknown_cnt = 0; - rwb->stat_ops->clear(rwb->ops_data); + blk_stat_clear(rwb->queue); rwb->scaled_max = calc_wb_limits(rwb); @@ -385,7 +385,7 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle) rwb->scaled_max = false; rwb->unknown_cnt = 0; - rwb->stat_ops->clear(rwb->ops_data); + blk_stat_clear(rwb->queue); calc_wb_limits(rwb); rwb_trace_step(rwb, "step down"); } @@ -675,7 +675,7 @@ void wbt_disable(struct rq_wb *rwb) } EXPORT_SYMBOL_GPL(wbt_disable); -int wbt_init(struct request_queue *q, struct wb_stat_ops *ops) +int wbt_init(struct request_queue *q) { struct rq_wb *rwb; int i; @@ -688,9 +688,6 @@ int wbt_init(struct request_queue *q, struct wb_stat_ops *ops) BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); - if (!ops->get || !ops->is_current || !ops->clear) - return -EINVAL; - rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; @@ -706,8 +703,6 @@ int wbt_init(struct request_queue *q, struct wb_stat_ops *ops) rwb->last_comp = rwb->last_issue = jiffies; rwb->queue = q; rwb->win_nsec = RWB_WINDOW_NSEC; - rwb->stat_ops = ops; - rwb->ops_data = q; wbt_update_limits(rwb); /* diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 09c61a3f8295..44dc2173dc1f 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -46,12 +46,6 @@ static inline bool wbt_is_read(struct blk_issue_stat *stat) return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; } -struct wb_stat_ops { - void (*get)(void *, struct blk_rq_stat *); - bool (*is_current)(struct blk_rq_stat *); - void (*clear)(void 
*); -}; - struct rq_wait { wait_queue_head_t wait; atomic_t inflight; @@ -89,9 +83,6 @@ struct rq_wb { unsigned long min_lat_nsec; struct request_queue *queue; struct rq_wait rq_wait[WBT_NUM_RWQ]; - - struct wb_stat_ops *stat_ops; - void *ops_data; }; static inline unsigned int wbt_inflight(struct rq_wb *rwb) @@ -109,7 +100,7 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb) void __wbt_done(struct rq_wb *, enum wbt_flags); void wbt_done(struct rq_wb *, struct blk_issue_stat *); enum wbt_flags wbt_wait(struct rq_wb *, struct bio *, spinlock_t *); -int wbt_init(struct request_queue *, struct wb_stat_ops *); +int wbt_init(struct request_queue *); void wbt_exit(struct request_queue *); void wbt_update_limits(struct rq_wb *); void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); @@ -132,7 +123,7 @@ static inline enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, { return 0; } -static inline int wbt_init(struct request_queue *q, struct wb_stat_ops *ops) +static inline int wbt_init(struct request_queue *q) { return -EINVAL; } -- cgit v1.2.3 From 382cf633edcb0371a6dd506653014897c4ac2a4d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Nov 2016 08:23:53 -0700 Subject: blk-wbt: use BLK_STAT_{READ,WRITE} instead of 0/1 Since we have proper enums for the stats directions, use them. Signed-off-by: Jens Axboe --- block/blk-wbt.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index f6ec7e587fa6..20712f0db6ea 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -255,8 +255,8 @@ static bool inline stat_sample_valid(struct blk_rq_stat *stat) * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state. */ - return stat[0].nr_samples >= 1 && - stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES; + return stat[BLK_STAT_READ].nr_samples >= 1 && + stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; } static u64 rwb_sync_issue_lat(struct rq_wb *rwb) @@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) */ thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec || - (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) { + (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED; } @@ -308,7 +308,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) * waited or still has writes in flights, consider us doing * just writes as well. */ - if ((stat[1].nr_samples && blk_stat_is_current(stat)) || + if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || wb_recent_wait(rwb) || wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; @@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) /* * If the 'min' latency exceeds our target, step down. */ - if (stat[0].min > rwb->min_lat_nsec) { - trace_wbt_lat(bdi, stat[0].min); + if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { + trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); trace_wbt_stat(bdi, stat); return LAT_EXCEEDED; } -- cgit v1.2.3 From dbb3ab03563cbf0880b1801a238b3e3964e364a1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 25 Sep 2016 19:54:38 -0700 Subject: bsg: Add sparse annotations to bsg_request_fn() Avoid that sparse complains about unbalanced lock actions. 
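For reference, the annotation pattern is roughly as in the sketch below (illustrative only, not taken from the patch): a function that is entered with the queue lock held, drops it to do work that may sleep, and retakes it before returning. The paired __releases()/__acquires() annotations tell sparse that the unlock/relock inside the body is intentional, so a checker run such as 'make C=2 block/bsg-lib.o' stops warning about it.

    #include <linux/blkdev.h>
    #include <linux/spinlock.h>

    /* Illustrative sketch, not from the patch */
    static void example_request_fn(struct request_queue *q)
            __releases(q->queue_lock)
            __acquires(q->queue_lock)
    {
            /* caller holds q->queue_lock; drop it around the sleeping work */
            spin_unlock_irq(q->queue_lock);

            /* ... hand requests off to the driver, possibly sleeping ... */

            spin_lock_irq(q->queue_lock);
    }
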
Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bsg-lib.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 650f427d915b..b2a61e3ecb14 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -161,6 +161,8 @@ failjob_rls_job: * Drivers/subsys should pass this to the queue init function. */ void bsg_request_fn(struct request_queue *q) + __releases(q->queue_lock) + __acquires(q->queue_lock) { struct device *dev = q->queuedata; struct request *req; -- cgit v1.2.3 From 0a6219a95f0b0690fb7094acb26002e7a4791197 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Nov 2016 18:07:05 +0800 Subject: block: deal with stale req count of plug list In both legacy and mq path, req count of plug list is computed before allocating request, so the number can be stale when falling back to slept allocation, also the new introduced wbt can sleep too. This patch deals with the case by checking if plug list becomes empty, and fixes the KASAN report of 'BUG: KASAN: stack-out-of-bounds' which is introduced by Shaohua's patches of dispatching big request. Fixes: 600271d900002(blk-mq: immediately dispatch big size request) Fixes: 50d24c34403c6(block: immediately dispatch big size request) Cc: Shaohua Li Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++++- block/blk-mq.c | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index eea246567884..473dd698effd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1753,8 +1753,11 @@ get_rq: /* * If this is the first request added after a plug, fire * of a plug trace. + * + * @request_count may become stale because of schedule + * out, so check plug list again. */ - if (!request_count) + if (!request_count || list_empty(&plug->list)) trace_block_plug(q); else { struct request *last = list_entry_rq(plug->list.prev); diff --git a/block/blk-mq.c b/block/blk-mq.c index ae8df5ec20d3..f39e69c732cc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1497,6 +1497,13 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) struct request *last = NULL; blk_mq_bio_to_request(rq, bio); + + /* + * @request_count may become stale because of schedule + * out, so check the list again. + */ + if (list_empty(&plug->mq_list)) + request_count = 0; if (!request_count) trace_block_plug(q); else -- cgit v1.2.3 From 4121d385f1457d9beb2067d4b5b4659ef3e6c316 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 16 Nov 2016 16:29:57 +0100 Subject: blk-wbt: fix old-style function declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The newly added driver causes a harmless warning in some configurations: block/blk-wbt.c:250:1: error: ‘inline’ is not at beginning of declaration [-Werror=old-style-declaration] static bool inline stat_sample_valid(struct blk_rq_stat *stat) This makes it use the expected format for the declaration. 
Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- block/blk-wbt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 20712f0db6ea..9f97594e68ce 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -247,7 +247,7 @@ static bool calc_wb_limits(struct rq_wb *rwb) return ret; } -static bool inline stat_sample_valid(struct blk_rq_stat *stat) +static inline bool stat_sample_valid(struct blk_rq_stat *stat) { /* * We need at least one read sample, and a minimum of -- cgit v1.2.3 From 06426adf072bca62ac31ea396ff2159a34f276c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:01:59 -0700 Subject: blk-mq: implement hybrid poll mode for sync O_DIRECT This patch enables a hybrid polling mode. Instead of polling after IO submission, we can induce an artificial delay, and then poll after that. For example, if the IO is presumed to complete in 8 usecs from now, we can sleep for 4 usecs, wake up, and then do our polling. This still puts a sleep/wakeup cycle in the IO path, but instead of the wakeup happening after the IO has completed, it'll happen before. With this hybrid scheme, we can achieve big latency reductions while still using the same (or less) amount of CPU. Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- block/blk-mq.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-sysfs.c | 29 +++++++++++++++++++++++++++++ block/blk.h | 1 + 3 files changed, 80 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f39e69c732cc..8cb248fb6a68 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -332,6 +332,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); blk_mq_put_tag(hctx, ctx, tag); blk_queue_exit(q); } @@ -2468,11 +2469,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, + struct request *rq) +{ + struct hrtimer_sleeper hs; + enum hrtimer_mode mode; + ktime_t kt; + + if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + return false; + + set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + + /* + * This will be replaced with the stats tracking code, using + * 'avg_completion_time / 2' as the pre-sleep target. + */ + kt = ktime_set(0, q->poll_nsec); + + mode = HRTIMER_MODE_REL; + hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&hs.timer, kt); + + hrtimer_init_sleeper(&hs, current); + do { + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + hrtimer_start_expires(&hs.timer, mode); + if (hs.task) + io_schedule(); + hrtimer_cancel(&hs.timer); + mode = HRTIMER_MODE_ABS; + } while (hs.task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&hs.timer); + return true; +} + static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct request_queue *q = hctx->queue; long state; + /* + * If we sleep, have the caller restart the poll loop to reset + * the state. Like for the other success return cases, the + * caller is responsible for checking if the IO completed. If + * the IO isn't complete, we'll get called again and will go + * straight to the busy poll loop. 
+ */ + if (blk_mq_poll_hybrid_sleep(q, rq)) + return true; + hctx->poll_considered++; state = current->state; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 415e764807d0..dcdfcaa12653 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -350,6 +350,28 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->poll_nsec / 1000, page); +} + +static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_usec; + ssize_t ret; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + ret = queue_var_store(&poll_usec, page, count); + if (ret < 0) + return ret; + + q->poll_nsec = poll_usec * 1000; + return ret; +} + static ssize_t queue_poll_show(struct request_queue *q, char *page) { return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); @@ -602,6 +624,12 @@ static struct queue_sysfs_entry queue_poll_entry = { .store = queue_poll_store, }; +static struct queue_sysfs_entry queue_poll_delay_entry = { + .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_delay_show, + .store = queue_poll_delay_store, +}; + static struct queue_sysfs_entry queue_wc_entry = { .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, .show = queue_wc_show, @@ -655,6 +683,7 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_stats_entry.attr, &queue_wb_lat_entry.attr, + &queue_poll_delay_entry.attr, NULL, }; diff --git a/block/blk.h b/block/blk.h index aa132dea598c..041185e5f129 100644 --- a/block/blk.h +++ b/block/blk.h @@ -111,6 +111,7 @@ void blk_account_io_done(struct request *req); enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; /* -- cgit v1.2.3 From 64f1c21e86f7fe63337b5c23c129de3ec506431d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:03:03 -0700 Subject: blk-mq: make the polling code adaptive The previous commit introduced the hybrid sleep/poll mode. Take that one step further, and use the completion latencies to automatically sleep for half the mean completion time. This is a good approximation. This changes the 'io_poll_delay' sysfs file a bit to expose the various options. 
Depending on the value, the polling code will behave differently: -1 Never enter hybrid sleep mode 0 Use half of the completion mean for the sleep delay >0 Use this specific value as the sleep delay Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- block/blk-mq.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- block/blk-sysfs.c | 26 ++++++++++++++------- 2 files changed, 82 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8cb248fb6a68..9d4a1d630d0b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2132,6 +2132,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->nr_requests = set->queue_depth; + /* + * Default to classic polling + */ + q->poll_nsec = -1; + if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); @@ -2469,14 +2474,70 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static unsigned long blk_mq_poll_nsecs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct blk_rq_stat stat[2]; + unsigned long ret = 0; + + /* + * If stats collection isn't on, don't sleep but turn it on for + * future users + */ + if (!blk_stat_enable(q)) + return 0; + + /* + * We don't have to do this once per IO, should optimize this + * to just use the current window of stats until it changes + */ + memset(&stat, 0, sizeof(stat)); + blk_hctx_stat_get(hctx, stat); + + /* + * As an optimistic guess, use half of the mean service time + * for this type of request. We can (and should) make this smarter. + * For instance, if the completion latencies are tight, we can + * get closer than just half the mean. This is especially + * important on devices where the completion latencies are longer + * than ~10 usec. + */ + if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) + ret = (stat[BLK_STAT_READ].mean + 1) / 2; + else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) + ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; + + return ret; +} + static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, struct request *rq) { struct hrtimer_sleeper hs; enum hrtimer_mode mode; + unsigned int nsecs; ktime_t kt; - if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + return false; + + /* + * poll_nsec can be: + * + * -1: don't ever hybrid sleep + * 0: use half of prev avg + * >0: use this specific value + */ + if (q->poll_nsec == -1) + return false; + else if (q->poll_nsec > 0) + nsecs = q->poll_nsec; + else + nsecs = blk_mq_poll_nsecs(q, hctx, rq); + + if (!nsecs) return false; set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); @@ -2485,7 +2546,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, * This will be replaced with the stats tracking code, using * 'avg_completion_time / 2' as the pre-sleep target. */ - kt = ktime_set(0, q->poll_nsec); + kt = ktime_set(0, nsecs); mode = HRTIMER_MODE_REL; hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); @@ -2520,7 +2581,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) * the IO isn't complete, we'll get called again and will go * straight to the busy poll loop. 
*/ - if (blk_mq_poll_hybrid_sleep(q, rq)) + if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) return true; hctx->poll_considered++; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index dcdfcaa12653..1855c6770045 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -352,24 +352,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) { - return queue_var_show(q->poll_nsec / 1000, page); + int val; + + if (q->poll_nsec == -1) + val = -1; + else + val = q->poll_nsec / 1000; + + return sprintf(page, "%d\n", val); } static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, size_t count) { - unsigned long poll_usec; - ssize_t ret; + int err, val; if (!q->mq_ops || !q->mq_ops->poll) return -EINVAL; - ret = queue_var_store(&poll_usec, page, count); - if (ret < 0) - return ret; + err = kstrtoint(page, 10, &val); + if (err < 0) + return err; - q->poll_nsec = poll_usec * 1000; - return ret; + if (val == -1) + q->poll_nsec = -1; + else + q->poll_nsec = val * 1000; + + return count; } static ssize_t queue_poll_show(struct request_queue *q, char *page) -- cgit v1.2.3 From 778889d8412e36e666b1e4ce108373613c84b428 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Mon, 21 Nov 2016 15:52:23 -0600 Subject: block: apply blk_partition_remap to REQ_OP_ZONE_RESET If a ZBC device is partitioned and operations are performed on the partition the zone information is rebased to the partition, however the zone reset is not mapped from the partition to device as are other operations. This causes the API (report zones / reset zone) to be unbalanced in this regard. Checking for the zone reset op code explicitly will balance the API. Signed-off-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 473dd698effd..6c4a425690fc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1787,7 +1787,12 @@ static inline void blk_partition_remap(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; - if (bio_sectors(bio) && bdev != bdev->bd_contains) { + /* + * Zone reset does not include bi_size so bio_sectors() is always 0. + * Include a test for the reset op code and perform the remap if needed. + */ + if (bdev != bdev->bd_contains && + (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)) { struct hd_struct *p = bdev->bd_part; bio->bi_iter.bi_sector += p->start_sect; -- cgit v1.2.3 From 3a83f4677539bce8eaa2bca9ee9c20e172d7ab04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 22 Nov 2016 08:57:21 -0700 Subject: block: bio: pass bvec table to bio_init() Some drivers often use external bvec table, so introduce this helper for this case. It is always safe to access the bio->bi_io_vec in this way for this case. After converting to this usage, it will becomes a bit easier to evaluate the remaining direct access to bio->bi_io_vec, so it can help to prepare for the following multipage bvec support. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Fixed up the new O_DIRECT cases. 
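To make the intended usage concrete, here is a hypothetical driver-side sketch (the my_dev structure and MY_MAX_VECS constant are invented for illustration): the driver embeds a bio together with a small bvec table in its own object and hands both to the new bio_init(), instead of poking bi_io_vec and bi_max_vecs by hand.

#include <linux/bio.h>

#define MY_MAX_VECS	4

struct my_dev {
	struct bio	bio;
	struct bio_vec	bvecs[MY_MAX_VECS];
};

static void my_dev_prep_bio(struct my_dev *dev)
{
	/* sets up bi_io_vec/bi_max_vecs along with the usual bio fields */
	bio_init(&dev->bio, dev->bvecs, MY_MAX_VECS);
}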
Signed-off-by: Jens Axboe --- block/bio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 2cf6ebabc68c..de257ced69b1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -270,11 +270,15 @@ static void bio_free(struct bio *bio) } } -void bio_init(struct bio *bio) +void bio_init(struct bio *bio, struct bio_vec *table, + unsigned short max_vecs) { memset(bio, 0, sizeof(*bio)); atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + + bio->bi_io_vec = table; + bio->bi_max_vecs = max_vecs; } EXPORT_SYMBOL(bio_init); @@ -480,7 +484,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) return NULL; bio = p + front_pad; - bio_init(bio); + bio_init(bio, NULL, 0); if (nr_iovecs > inline_vecs) { unsigned long idx = 0; -- cgit v1.2.3 From e00f4f4d0ff7e13b9115428a245b49108d625f09 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2016 18:03:32 -0500 Subject: block,blkcg: use __GFP_NOWARN for best-effort allocations in blkcg blkcg allocates some per-cgroup data structures with GFP_NOWAIT and when that fails falls back to operations which aren't specific to the cgroup. Occassional failures are expected under pressure and falling back to non-cgroup operation is the right thing to do. Unfortunately, I forgot to add __GFP_NOWARN to these allocations and these expected failures end up creating a lot of noise. Add __GFP_NOWARN. Signed-off-by: Tejun Heo Reported-by: Marc MERLIN Reported-by: Vlastimil Babka Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++---- block/cfq-iosched.c | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b08ccbb9393a..8ba0af780e88 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -185,7 +185,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, } wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_NOWAIT); + blkcg->css.id, + GFP_NOWAIT | __GFP_NOWARN); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -193,7 +194,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -1022,7 +1023,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1240,7 +1241,7 @@ pd_prealloc: if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node); if (!pd) swap(pd, pd_prealloc); if (!pd) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e280d08ef6d7..9894dc985e09 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3859,7 +3859,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqq = kmem_cache_alloc_node(cfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, cfqd->queue->node); if (!cfqq) { cfqq = &cfqd->oom_cfqq; -- cgit v1.2.3 From 80e091d10e8bf7b801d634ea8870b9e907314424 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Nov 2016 09:22:47 -0700 Subject: 
blk-wbt: allow reset of default latency through sysfs Allow a write of '-1' to reset the default latency target for a given device. This removes knowledge of the different default settings for rotational vs non-rotational from user space. Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 22 +++++++++++++++------- block/blk-wbt.c | 17 +++++++++++++---- block/blk-wbt.h | 6 ++++++ 3 files changed, 34 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1855c6770045..f0ca569e276b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -42,12 +42,12 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_var_store64(u64 *var, const char *page) +static ssize_t queue_var_store64(s64 *var, const char *page) { int err; - u64 v; + s64 v; - err = kstrtou64(page, 10, &v); + err = kstrtos64(page, 10, &v); if (err < 0) return err; @@ -421,18 +421,26 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, size_t count) { + struct rq_wb *rwb; ssize_t ret; - u64 val; + s64 val; - if (!q->rq_wb) + rwb = q->rq_wb; + if (!rwb) return -EINVAL; ret = queue_var_store64(&val, page); if (ret < 0) return ret; - q->rq_wb->min_lat_nsec = val * 1000ULL; - wbt_update_limits(q->rq_wb); + if (val == -1) + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + else if (val >= 0) + rwb->min_lat_nsec = val * 1000ULL; + else + return -EINVAL; + + wbt_update_limits(rwb); return count; } diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 9f97594e68ce..92df2f7c5af1 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -675,6 +675,18 @@ void wbt_disable(struct rq_wb *rwb) } EXPORT_SYMBOL_GPL(wbt_disable); +u64 wbt_default_latency_nsec(struct request_queue *q) +{ + /* + * We default to 2msec for non-rotational storage, and 75msec + * for rotational storage. + */ + if (blk_queue_nonrot(q)) + return 2000000ULL; + else + return 75000000ULL; +} + int wbt_init(struct request_queue *q) { struct rq_wb *rwb; @@ -711,10 +723,7 @@ int wbt_init(struct request_queue *q) q->rq_wb = rwb; blk_stat_enable(q); - if (blk_queue_nonrot(q)) - rwb->min_lat_nsec = 2000000ULL; - else - rwb->min_lat_nsec = 75000000ULL; + rwb->min_lat_nsec = wbt_default_latency_nsec(q); wbt_set_queue_depth(rwb, blk_queue_depth(q)); wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 44dc2173dc1f..9dfc88ad7f30 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -110,6 +110,8 @@ void wbt_disable(struct rq_wb *); void wbt_set_queue_depth(struct rq_wb *, unsigned int); void wbt_set_write_cache(struct rq_wb *, bool); +u64 wbt_default_latency_nsec(struct request_queue *); + #else static inline void __wbt_done(struct rq_wb *rwb, enum wbt_flags flags) @@ -148,6 +150,10 @@ static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) static inline void wbt_set_write_cache(struct rq_wb *rwb, bool wc) { } +static inline u64 wbt_default_latency_nsec(struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_WBT */ -- cgit v1.2.3 From fa224eed2b5e0f2f9a57281e9dc733c843d590ad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Nov 2016 09:25:50 -0700 Subject: blk-wbt: cleanup disable-by-default for CFQ Make it clear that we are disabling wbt for the specified queued, if it was enabled by default. 
This is in preparation for allowing users to re-enable wbt, and not have it disabled automatically again. Signed-off-by: Jens Axboe --- block/blk-wbt.c | 10 ++++++++-- block/blk-wbt.h | 4 ++-- block/cfq-iosched.c | 9 +++------ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 92df2f7c5af1..7c0e618d6e7d 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -665,15 +665,21 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) rwb->wc = write_cache_on; } -void wbt_disable(struct rq_wb *rwb) + /* + * Disable wbt, if enabled by default. Only called from CFQ, if we have + * cgroups enabled + */ +void wbt_disable_default(struct request_queue *q) { + struct rq_wb *rwb = q->rq_wb; + if (rwb) { del_timer_sync(&rwb->window_timer); rwb->win_nsec = rwb->min_lat_nsec = 0; wbt_update_limits(rwb); } } -EXPORT_SYMBOL_GPL(wbt_disable); +EXPORT_SYMBOL_GPL(wbt_disable_default); u64 wbt_default_latency_nsec(struct request_queue *q) { diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 9dfc88ad7f30..8f485f8e1baf 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -105,7 +105,7 @@ void wbt_exit(struct request_queue *); void wbt_update_limits(struct rq_wb *); void wbt_requeue(struct rq_wb *, struct blk_issue_stat *); void wbt_issue(struct rq_wb *, struct blk_issue_stat *); -void wbt_disable(struct rq_wb *); +void wbt_disable_default(struct request_queue *); void wbt_set_queue_depth(struct rq_wb *, unsigned int); void wbt_set_write_cache(struct rq_wb *, bool); @@ -141,7 +141,7 @@ static inline void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) { } -static inline void wbt_disable(struct rq_wb *rwb) +static inline void wbt_disable_default(struct request_queue *q) { } static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9894dc985e09..c73a6fcaeb9d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3780,13 +3780,10 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) /* * If we have a non-root cgroup, we can depend on that to * do proper throttling of writes. Turn off wbt for that - * case. + * case, if it was enabled by default. */ - if (nonroot_cg) { - struct request_queue *q = cfqd->queue; - - wbt_disable(q->rq_wb); - } + if (nonroot_cg) + wbt_disable_default(cfqd->queue); /* * Drop reference to queues. New queues will be assigned in new -- cgit v1.2.3 From d62118b6dd99b8f64350206a6ea6996083b28c9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Nov 2016 09:40:34 -0700 Subject: blk-wbt: allow wbt to be enabled always through sysfs Currently there's no way to enable wbt if it's not enabled in the kernel config by default for a device. Allow a write to the 'wbt_lat_usec' queue sysfs file to enable wbt. This is useful for both the kernel config case, but also if the device is CFQ managed and it was turned off by default. 
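As a usage illustration (not from the patch), a small user-space program could enable throttling by writing a latency target, in microseconds, to the queue attribute; writing -1 restores the per-device default. The device name sdb and the 75000 usec target are arbitrary, and the attribute is assumed to be exposed as wbt_lat_usec:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/block/sdb/queue/wbt_lat_usec";
	const char *val = "75000";	/* write "-1" to restore the default */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}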
Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 22 ++++++++++++++++------ block/blk-wbt.c | 3 ++- block/blk-wbt.h | 11 +++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f0ca569e276b..a97841491769 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -425,20 +425,30 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, ssize_t ret; s64 val; - rwb = q->rq_wb; - if (!rwb) - return -EINVAL; - ret = queue_var_store64(&val, page); if (ret < 0) return ret; + if (val < -1) + return -EINVAL; + + rwb = q->rq_wb; + if (!rwb) { + ret = wbt_init(q); + if (ret) + return ret; + + rwb = q->rq_wb; + if (!rwb) + return -EINVAL; + } if (val == -1) rwb->min_lat_nsec = wbt_default_latency_nsec(q); else if (val >= 0) rwb->min_lat_nsec = val * 1000ULL; - else - return -EINVAL; + + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + rwb->enable_state = WBT_STATE_ON_MANUAL; wbt_update_limits(rwb); return count; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 7c0e618d6e7d..b8647343141f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -673,7 +673,7 @@ void wbt_disable_default(struct request_queue *q) { struct rq_wb *rwb = q->rq_wb; - if (rwb) { + if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { del_timer_sync(&rwb->window_timer); rwb->win_nsec = rwb->min_lat_nsec = 0; wbt_update_limits(rwb); @@ -721,6 +721,7 @@ int wbt_init(struct request_queue *q) rwb->last_comp = rwb->last_issue = jiffies; rwb->queue = q; rwb->win_nsec = RWB_WINDOW_NSEC; + rwb->enable_state = WBT_STATE_ON_DEFAULT; wbt_update_limits(rwb); /* diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 8f485f8e1baf..65f1de519f67 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -21,6 +21,15 @@ enum { WBT_NUM_RWQ = 2, }; +/* + * Enable states. Either off, or on by default (done at init time), + * or on through manual setup in sysfs. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, + WBT_STATE_ON_MANUAL = 2, +}; + static inline void wbt_clear_state(struct blk_issue_stat *stat) { stat->time &= BLK_STAT_TIME_MASK; @@ -61,6 +70,8 @@ struct rq_wb { int scale_step; bool scaled_max; + short enable_state; /* WBT_STATE_* */ + /* * Number of consecutive periods where we don't have enough * information to make a firm scale up/down decision. -- cgit v1.2.3 From 415d3dab964c1a9bc6ceb8941bd4dbe4fbe36a09 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 28 Nov 2016 15:01:48 -0200 Subject: blk-mq: Drop explicit timeout sync in hotplug After commit 287922eb0b18 ("block: defer timeouts to a workqueue"), deleting the timeout work after freezing the queue shouldn't be necessary, since the synchronization is already enforced by the acquisition of a q_usage_counter reference in blk_mq_timeout_work. 
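A minimal sketch (not taken from the patch) of the guard that provides that synchronization: the timeout worker refuses to run unless it can take a q_usage_counter reference, and a queue that has been frozen for reinit will not hand one out, which is why the explicit del_timer_sync() removed below is redundant.

#include <linux/blkdev.h>

static void timeout_work_sketch(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, timeout_work);

	/* a frozen queue refuses new references, so simply bail out */
	if (!percpu_ref_tryget(&q->q_usage_counter))
		return;

	/* ... scan the hardware queues for timed-out requests ... */

	blk_queue_exit(q);	/* drop the reference taken above */
}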
Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9d4a1d630d0b..bac12caece06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2220,16 +2220,9 @@ static void blk_mq_queue_reinit_work(void) */ list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_start(q); - list_for_each_entry(q, &all_q_list, all_q_node) { + list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_wait(q); - /* - * timeout handler can't touch hw queue during the - * reinitialization - */ - del_timer_sync(&q->timeout); - } - list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_queue_reinit(q, &cpuhp_online_new); -- cgit v1.2.3 From b02d8aaea12463f573c5ef485c638ab12628dc5e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 1 Dec 2016 16:00:25 +0900 Subject: block: Check partition alignment on zoned block devices Both blkdev_report_zones and blkdev_reset_zones can operate on a partition of a zoned block device. However, the first and last zones reported for a partition make sense only if the partition start sector and size are aligned on the device zone size. The same applies for zone reset. Resetting the first or the last zone of a partition straddling zones may impact neighboring partitions. Finally, if a partition start sector is not at the beginning of a sequential zone, it will be impossible to write to the first sectors of the partition on a host-managed device. Avoid all these problems and incoherencies by ignoring partitions that are not zone aligned. Note: Even with CONFIG_BLK_DEV_ZONED disabled, bdev_is_zoned() will report the correct disk zoning type (host-aware, host-managed or none) but bdev_zone_size() will always return 0 for zoned block devices (i.e. the zone size is unknown). So test this as a way to ensure that a zoned block device is being handled as such. As a result, for a host-aware devices, unaligned zone partitions will be accepted with CONFIG_BLK_DEV_ZONED disabled. That is, the disk will be treated as a regular block device (as it should). If zoned block device support is enabled, only aligned partitions will be accepted. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/partition-generic.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'block') diff --git a/block/partition-generic.c b/block/partition-generic.c index 71d9ed9df8da..d7beb6bbbf66 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -430,6 +430,56 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } +static bool part_zone_aligned(struct gendisk *disk, + struct block_device *bdev, + sector_t from, sector_t size) +{ + unsigned int zone_size = bdev_zone_size(bdev); + + /* + * If this function is called, then the disk is a zoned block device + * (host-aware or host-managed). This can be detected even if the + * zoned block device support is disabled (CONFIG_BLK_DEV_ZONED not + * set). In this case, however, only host-aware devices will be seen + * as a block device is not created for host-managed devices. Without + * zoned block device support, host-aware drives can still be used as + * regular block devices (no zone operation) and their zone size will + * be reported as 0. Allow this case. 
+ */ + if (!zone_size) + return true; + + /* + * Check partition start and size alignement. If the drive has a + * smaller last runt zone, ignore it and allow the partition to + * use it. Check the zone size too: it should be a power of 2 number + * of sectors. + */ + if (WARN_ON_ONCE(!is_power_of_2(zone_size))) { + u32 rem; + + div_u64_rem(from, zone_size, &rem); + if (rem) + return false; + if ((from + size) < get_capacity(disk)) { + div_u64_rem(size, zone_size, &rem); + if (rem) + return false; + } + + } else { + + if (from & (zone_size - 1)) + return false; + if ((from + size) < get_capacity(disk) && + (size & (zone_size - 1))) + return false; + + } + + return true; +} + int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state = NULL; @@ -529,6 +579,21 @@ rescan: } } + /* + * On a zoned block device, partitions should be aligned on the + * device zone size (i.e. zone boundary crossing not allowed). + * Otherwise, resetting the write pointer of the last zone of + * one partition may impact the following partition. + */ + if (bdev_is_zoned(bdev) && + !part_zone_aligned(disk, bdev, from, size)) { + printk(KERN_WARNING + "%s: p%d start %llu+%llu is not zone aligned\n", + disk->disk_name, p, (unsigned long long) from, + (unsigned long long) size); + continue; + } + part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); -- cgit v1.2.3 From e73c23ff736e1ea371dfa419d7bf8e77ee53044a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:58 -0800 Subject: block: add async variant of blkdev_issue_zeroout Similar to __blkdev_issue_discard this variant allows submitting the final bio asynchronously and chaining multiple ranges into a single completion. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-lib.c | 115 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 18abda862915..bfb28b03765e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -137,24 +137,24 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, EXPORT_SYMBOL(blkdev_issue_discard); /** - * blkdev_issue_write_same - queue a write same operation + * __blkdev_issue_write_same - generate number of bios with same page * @bdev: target blockdev * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @page: page containing data to write + * @biop: pointer to anchor bio * * Description: - * Issue a write same request for the sectors in question. + * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. 
*/ -int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, - struct page *page) +static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page, + struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); unsigned int max_write_same_sectors; - struct bio *bio = NULL; - int ret = 0; + struct bio *bio = *biop; sector_t bs_mask; if (!q) @@ -164,6 +164,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if ((sector | nr_sects) & bs_mask) return -EINVAL; + if (!bdev_write_same(bdev)) + return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = UINT_MAX >> 9; @@ -185,32 +188,63 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } + cond_resched(); } - if (bio) { + *biop = bio; + return 0; +} + +/** + * blkdev_issue_write_same - queue a write same operation + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data + * + * Description: + * Issue a write same request for the sectors in question. + */ +int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, + struct page *page) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, + &bio); + if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); /** - * blkdev_issue_zeroout - generate number of zero filed write bios + * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * @discard: discard flag * * Description: * Generate and issue number of bios with zerofiled pages. 
*/ - -static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard) { int ret; - struct bio *bio = NULL; + int bi_size = 0; + struct bio *bio = *biop; unsigned int sz; sector_t bs_mask; @@ -218,6 +252,19 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if ((sector | nr_sects) & bs_mask) return -EINVAL; + if (discard) { + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, + BLKDEV_DISCARD_ZERO, biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + } + + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0), biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + + ret = 0; while (nr_sects != 0) { bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), gfp_mask); @@ -227,21 +274,20 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); - ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); - nr_sects -= ret >> 9; - sector += ret >> 9; - if (ret < (sz << 9)) + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < (sz << 9)) break; } + cond_resched(); } - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; - } - return 0; + *biop = bio; +out: + return ret; } +EXPORT_SYMBOL(__blkdev_issue_zeroout); /** * blkdev_issue_zeroout - zero-fill a block range @@ -263,21 +309,22 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * clearing the block range. Otherwise the zeroing will be performed * using regular WRITE calls. */ - int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard) { - if (discard) { - if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, - BLKDEV_DISCARD_ZERO)) - return 0; - } + int ret; + struct bio *bio = NULL; + struct blk_plug plug; - if (bdev_write_same(bdev) && - blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, - ZERO_PAGE(0)) == 0) - return 0; + blk_start_plug(&plug); + ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, + &bio, discard); + if (ret == 0 && bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); - return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); + return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); -- cgit v1.2.3 From a6f0788ec2881ac14e97ff7fa6a78a807f87b5ba Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:59 -0800 Subject: block: add support for REQ_OP_WRITE_ZEROES This adds a new block layer operation to zero out a range of LBAs. This allows to implement zeroing for devices that don't use either discard with a predictable zero pattern or WRITE SAME of zeroes. The prominent example of that is NVMe with the Write Zeroes command, but in the future, this should also help with improving the way zeroing discards work. For this operation, suitable entry is exported in sysfs which indicate the number of maximum bytes allowed in one write zeroes operation by the device. 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 1 + block/blk-core.c | 4 ++++ block/blk-lib.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk-merge.c | 17 +++++++++++---- block/blk-settings.c | 17 +++++++++++++++ block/blk-sysfs.c | 11 ++++++++++ block/blk-wbt.c | 5 +++-- 7 files changed, 105 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index de257ced69b1..83db1f37fd0b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -674,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: break; case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; diff --git a/block/blk-core.c b/block/blk-core.c index 6c4a425690fc..3f2eb8d80189 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1950,6 +1950,10 @@ generic_make_request_checks(struct bio *bio) if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; + case REQ_OP_WRITE_ZEROES: + if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + goto not_supported; + break; default: break; } diff --git a/block/blk-lib.c b/block/blk-lib.c index bfb28b03765e..510a6fb15318 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,6 +226,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_write_same); +/** + * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * + * Description: + * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages. + */ +static int __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct bio *bio = *biop; + unsigned int max_write_zeroes_sectors; + struct request_queue *q = bdev_get_queue(bdev); + + if (!q) + return -ENXIO; + + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ + max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + + if (max_write_zeroes_sectors == 0) + return -EOPNOTSUPP; + + while (nr_sects) { + bio = next_bio(bio, 0, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); + + if (nr_sects > max_write_zeroes_sectors) { + bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; + nr_sects -= max_write_zeroes_sectors; + sector += max_write_zeroes_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, goto out; } + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, + biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, ZERO_PAGE(0), biop); if (ret == 0 || (ret && ret != -EOPNOTSUPP)) @@ -304,8 +358,8 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); * the discard request fail, if the discard flag is not set, or if * discard_zeroes_data is not supported, this function will resort to * zeroing the blocks manually, thus provisioning (allocating, - * anchoring) them. 
If the block device supports the WRITE SAME command - * blkdev_issue_zeroout() will use it to optimize the process of + * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME + * command(s), blkdev_issue_zeroout() will use it to optimize the process of * clearing the block range. Otherwise the zeroing will be performed * using regular WRITE calls. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index fda6a12fc776..cf2848cb91d8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, case REQ_OP_SECURE_ERASE: split = blk_bio_discard_split(q, *bio, bs, &nsegs); break; + case REQ_OP_WRITE_ZEROES: + split = NULL; + nsegs = (*bio)->bi_phys_segments; + break; case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); break; @@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, * This should probably be returning 0, but blk_add_request_payload() * (Christoph!!!!) */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: return 1; + default: + break; + } fbio = bio; cluster = blk_queue_cluster(q); @@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: /* * This is a hack - drivers should be neither modifying the * biovec, nor relying on bi_vcnt - but because of diff --git a/block/blk-settings.c b/block/blk-settings.c index c7ccabc0ec3e..8a2bc124a684 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; + lim->max_write_zeroes_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -132,6 +133,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; + lim->max_write_zeroes_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -299,6 +301,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_same_sectors); +/** + * blk_queue_max_write_zeroes_sectors - set max sectors for a single + * write zeroes + * @q: the request queue for the device + * @max_write_zeroes_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_zeroes_sectors) +{ + q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); + /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device @@ -527,6 +542,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); + t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, + b->max_write_zeroes_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff 
--git a/block/blk-sysfs.c b/block/blk-sysfs.c index a97841491769..706b27bd73a1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -211,6 +211,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_same_sectors << 9); } +static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_zeroes_sectors << 9); +} static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) @@ -611,6 +616,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = { .show = queue_write_same_max_show, }; +static struct queue_sysfs_entry queue_write_zeroes_max_entry = { + .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO }, + .show = queue_write_zeroes_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_show_nonrot, @@ -700,6 +710,7 @@ static struct attribute *default_attrs[] = { &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, + &queue_write_zeroes_max_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nomerges_entry.attr, diff --git a/block/blk-wbt.c b/block/blk-wbt.c index b8647343141f..d500e43da5d9 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) const int op = bio_op(bio); /* - * If not a WRITE (or a discard), do nothing + * If not a WRITE (or a discard or write zeroes), do nothing */ - if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) + if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_OP_WRITE_ZEROES)) return false; /* -- cgit v1.2.3 From e0c723000966ae285295caaa8cda914dfa177fa4 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Thu, 1 Dec 2016 08:36:16 -0700 Subject: block: factor out req_set_nomerge Factor out common code for setting REQ_NOMERGE flag which is being used out at certain places and make it a helper instead, req_set_nomerge(). Signed-off-by: Ritesh Harjani Get rid of the inline. 
Signed-off-by: Jens Axboe --- block/blk-merge.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index cf2848cb91d8..1002afdfee99 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -501,6 +501,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); +static void req_set_nomerge(struct request_queue *q, struct request *req) +{ + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; +} + static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) @@ -521,9 +528,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, return 1; no_merge: - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } @@ -537,9 +542,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, blk_rq_pos(req))) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(req->biotail, BIO_SEG_VALID)) @@ -561,9 +564,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { - req->cmd_flags |= REQ_NOMERGE; - if (req == q->last_merge) - q->last_merge = NULL; + req_set_nomerge(q, req); return 0; } if (!bio_flagged(bio, BIO_SEG_VALID)) -- cgit v1.2.3 From 209200efa3db3d09380ee7c796efa73d900d5f3a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 2 Dec 2016 17:13:09 -0800 Subject: blk-stat: fix a typo Signed-off-by: Shaohua Li Fixes: cf43e6be865a ("block: add scalable completion tracking of requests") Signed-off-by: Jens Axboe --- block/blk-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-stat.c b/block/blk-stat.c index 688c958367ee..4d0118568727 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -12,7 +12,7 @@ static void blk_stat_flush_batch(struct blk_rq_stat *stat) { const s32 nr_batch = READ_ONCE(stat->nr_batch); - const s32 nr_samples = READ_ONCE(stat->nr_batch); + const s32 nr_samples = READ_ONCE(stat->nr_samples); if (!nr_batch) return; -- cgit v1.2.3 From 58886785db318588f95c8036abb2a47016c1f14c Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Sun, 4 Dec 2016 14:56:39 +0100 Subject: block: fix unintended fallthrough in generic_make_request_checks() Since commit e73c23ff736e ("block: add async variant of blkdev_issue_zeroout") messages like the following show up: EXT4-fs (dm-1): Delayed block allocation failed for inode 2368848 at logical offset 0 with max blocks 1 with error 95 EXT4-fs (dm-1): This should not happen!! Data will be lost Due to the following fallthrough introduced with commit 2d253440b5af ("block: Define zoned block device operations"), generic_make_request_checks() would accept a REQ_OP_WRITE_SAME bio only if the block device supports "write same" *and* is a zoned one: switch (bio_op(bio)) { [...] case REQ_OP_WRITE_SAME: if (!bdev_write_same(bio->bi_bdev)) goto not_supported; case REQ_OP_ZONE_REPORT: case REQ_OP_ZONE_RESET: if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; [...] 
} Thus, although the bio setup as done by __blkdev_issue_write_same() from commit e73c23ff736e ("block: add async variant of blkdev_issue_zeroout") would succeed, its actual submission would not, resulting in the EOPNOTSUPP == 95. Fix this by removing the fallthrough which, due to the lack of an explicit comment, seems to be unintended anyway. Fixes: e73c23ff736e ("block: add async variant of blkdev_issue_zeroout") Fixes: 2d253440b5af ("block: Define zoned block device operations") Signed-off-by: Nicolai Stange Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3f2eb8d80189..4b7ec5958055 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1945,6 +1945,7 @@ generic_make_request_checks(struct bio *bio) case REQ_OP_WRITE_SAME: if (!bdev_write_same(bio->bi_bdev)) goto not_supported; + break; case REQ_OP_ZONE_REPORT: case REQ_OP_ZONE_RESET: if (!bdev_is_zoned(bio->bi_bdev)) -- cgit v1.2.3 From 6e85eaf3078bcc8552ca32a0938dbf7d2b495af0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Dec 2016 20:00:14 -0700 Subject: blk-mq: blk_account_io_start() takes a bool Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index bac12caece06..90db5b490df9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1237,7 +1237,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { init_request_from_bio(rq, bio); - blk_account_io_start(rq, 1); + blk_account_io_start(rq, true); } static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) -- cgit v1.2.3 From be07e14f96e3121483339a64d917fddb3b86ba98 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Dec 2016 14:19:06 +0100 Subject: blk-wbt: don't throttle discard or write zeroes Both of these are metadata only commands that are not issued by the writeback code and not directly relevant to the writeback bandwith. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-wbt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index d500e43da5d9..6e82769f4042 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -575,10 +575,9 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) const int op = bio_op(bio); /* - * If not a WRITE (or a discard or write zeroes), do nothing + * If not a WRITE, do nothing */ - if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD || - op == REQ_OP_WRITE_ZEROES)) + if (op != REQ_OP_WRITE) return false; /* -- cgit v1.2.3 From f9d03f96b988002027d4b28ea1b7a24729a4c9b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Dec 2016 15:20:32 -0700 Subject: block: improve handling of the magic discard payload Instead of allocating a single unused biovec for discard requests, send them down without any payload. Instead we allow the driver to add a "special" payload using a biovec embedded into struct request (unioned over other fields never used while in the driver), and overloading the number of segments for this case. 
This has a couple of advantages: - we don't have to allocate the bio_vec - the amount of special casing for discard requests in the block layer is significantly reduced - using this same scheme for other request types is trivial, which will be important for implementing the new WRITE_ZEROES op on devices where it actually requires a payload (e.g. SCSI) - we can get rid of playing games with the request length, as we'll never touch it and completions will work just fine - it will allow us to support ranged discard operations in the future by merging non-contiguous discard bios into a single request - last but not least it removes a lot of code This patch is the common base for my WIP series for ranges discards and to remove discard_zeroes_data in favor of always using REQ_OP_WRITE_ZEROES, so it would be good to get it in quickly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 10 +--------- block/blk-core.c | 34 ++-------------------------------- block/blk-lib.c | 2 +- block/blk-merge.c | 53 +++++++++++++++++------------------------------------ 4 files changed, 21 insertions(+), 78 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 83db1f37fd0b..2b375020fc49 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1840,15 +1840,7 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); - /* - * Discards need a mutable bio_vec to accommodate the payload - * required by the DSM TRIM and UNMAP commands. - */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - split = bio_clone_bioset(bio, gfp, bs); - else - split = bio_clone_fast(bio, gfp, bs); - + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 4b7ec5958055..bd642a43b98b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1475,38 +1475,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -/** - * blk_add_request_payload - add a payload to a request - * @rq: request to update - * @page: page backing the payload - * @offset: offset in page - * @len: length of the payload. - * - * This allows to later add a payload to an already submitted request by - * a block driver. The driver needs to take care of freeing the payload - * itself. - * - * Note that this is a quite horrible hack and nothing but handling of - * discard requests should ever use it. 
- */ -void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len) -{ - struct bio *bio = rq->bio; - - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = offset; - bio->bi_io_vec->bv_len = len; - - bio->bi_iter.bi_size = len; - bio->bi_vcnt = 1; - bio->bi_phys_segments = 1; - - rq->__data_len = rq->resid_len = len; - rq->nr_phys_segments = 1; -} -EXPORT_SYMBOL_GPL(blk_add_request_payload); - bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) { @@ -2642,6 +2610,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) return false; } + WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD); + req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ diff --git a/block/blk-lib.c b/block/blk-lib.c index 510a6fb15318..ed89c8f4b2a0 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -80,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; } - bio = next_bio(bio, 1, gfp_mask); + bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); diff --git a/block/blk-merge.c b/block/blk-merge.c index 1002afdfee99..182398cb1524 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -241,18 +241,13 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (!bio) return 0; - /* - * This should probably be returning 0, but blk_add_request_payload() - * (Christoph!!!!) - */ switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; - default: - break; } fbio = bio; @@ -410,39 +405,21 @@ new_segment: *bvprv = *bvec; } +static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv, + struct scatterlist *sglist, struct scatterlist **sg) +{ + *sg = sglist; + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + return 1; +} + static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; - int nsegs, cluster; - - nsegs = 0; - cluster = blk_queue_cluster(q); - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - /* - * This is a hack - drivers should be neither modifying the - * biovec, nor relying on bi_vcnt - but because of - * blk_add_request_payload(), a discard bio may or may not have - * a payload we need to set up here (thank you Christoph) and - * bi_vcnt is really the only way of telling if we need to. 
- */ - if (!bio->bi_vcnt) - return 0; - /* Fall through */ - case REQ_OP_WRITE_SAME: - *sg = sglist; - bvec = bio_iovec(bio); - sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); - return 1; - default: - break; - } + int cluster = blk_queue_cluster(q), nsegs = 0; for_each_bio(bio) bio_for_each_segment(bvec, bio, iter) @@ -462,7 +439,11 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sg = NULL; int nsegs = 0; - if (rq->bio) + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg); + else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) + nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg); + else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); if (unlikely(rq->rq_flags & RQF_COPY_USER) && @@ -495,7 +476,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, * Something must have been wrong if the figured number of * segment is bigger than number of req's physical segments */ - WARN_ON(nsegs > rq->nr_phys_segments); + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs; } -- cgit v1.2.3 From ae911c5e796d51cb2d1ed3a55e73b9cc88d176cf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Dec 2016 13:19:30 -0700 Subject: blk-mq: add blk_mq_start_stopped_hw_queue() We have a variant for all hardware queues, but not one for a single hardware queue. Signed-off-by: Jens Axboe Reviewed-by: Hannes Reinecke --- block/blk-mq.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 90db5b490df9..c993476b630f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1064,18 +1064,23 @@ void blk_mq_start_hw_queues(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_start_hw_queues); +void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + if (!blk_mq_hctx_stopped(hctx)) + return; + + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + blk_mq_run_hw_queue(hctx, async); +} +EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); + void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - if (!blk_mq_hctx_stopped(hctx)) - continue; - - clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, async); - } + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_stopped_hw_queue(hctx, async); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); -- cgit v1.2.3 From f04c3df3efeca0d226aeff7ef595e12a0b807ab2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Dec 2016 08:41:17 -0700 Subject: blk-mq: abstract out blk_mq_dispatch_rq_list() helper Takes a list of requests, and dispatches it. Moves any residual requests to the dispatch list. Signed-off-by: Jens Axboe Reviewed-by: Hannes Reinecke --- block/blk-mq.c | 85 ++++++++++++++++++++++++++++++++-------------------------- block/blk-mq.h | 1 + 2 files changed, 48 insertions(+), 38 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c993476b630f..d79fdc11b1ee 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -821,41 +821,13 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } -/* - * Run this hardware queue, pulling any software queues mapped to it in. - * Note that this function currently has various problems around ordering - * of IO. 
In particular, we'd like FIFO behaviour on handling existing - * items on the hctx->dispatch list. Ignore that for now. - */ -static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct request_queue *q = hctx->queue; struct request *rq; - LIST_HEAD(rq_list); LIST_HEAD(driver_list); struct list_head *dptr; - int queued; - - if (unlikely(blk_mq_hctx_stopped(hctx))) - return; - - hctx->run++; - - /* - * Touch any software queue that has pending entries. - */ - flush_busy_ctxs(hctx, &rq_list); - - /* - * If we have previous entries on our dispatch list, grab them - * and stuff them at the front for more fair dispatch. - */ - if (!list_empty_careful(&hctx->dispatch)) { - spin_lock(&hctx->lock); - if (!list_empty(&hctx->dispatch)) - list_splice_init(&hctx->dispatch, &rq_list); - spin_unlock(&hctx->lock); - } + int queued, ret = BLK_MQ_RQ_QUEUE_OK; /* * Start off with dptr being NULL, so we start the first request @@ -867,16 +839,15 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) * Now process all the entries, sending them to the driver. */ queued = 0; - while (!list_empty(&rq_list)) { + while (!list_empty(list)) { struct blk_mq_queue_data bd; - int ret; - rq = list_first_entry(&rq_list, struct request, queuelist); + rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); bd.rq = rq; bd.list = dptr; - bd.last = list_empty(&rq_list); + bd.last = list_empty(list); ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { @@ -884,7 +855,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: - list_add(&rq->queuelist, &rq_list); + list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; default: @@ -902,7 +873,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) * We've done the first request. If we have more than 1 * left in the list, set dptr to defer issue. */ - if (!dptr && rq_list.next != rq_list.prev) + if (!dptr && list->next != list->prev) dptr = &driver_list; } @@ -912,10 +883,11 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ - if (!list_empty(&rq_list)) { + if (!list_empty(list)) { spin_lock(&hctx->lock); - list_splice(&rq_list, &hctx->dispatch); + list_splice(list, &hctx->dispatch); spin_unlock(&hctx->lock); + /* * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but * it's possible the queue is stopped and restarted again @@ -927,6 +899,43 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) **/ blk_mq_run_hw_queue(hctx, true); } + + return ret != BLK_MQ_RQ_QUEUE_BUSY; +} + +/* + * Run this hardware queue, pulling any software queues mapped to it in. + * Note that this function currently has various problems around ordering + * of IO. In particular, we'd like FIFO behaviour on handling existing + * items on the hctx->dispatch list. Ignore that for now. + */ +static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) +{ + LIST_HEAD(rq_list); + LIST_HEAD(driver_list); + + if (unlikely(blk_mq_hctx_stopped(hctx))) + return; + + hctx->run++; + + /* + * Touch any software queue that has pending entries. + */ + flush_busy_ctxs(hctx, &rq_list); + + /* + * If we have previous entries on our dispatch list, grab them + * and stuff them at the front for more fair dispatch. 
+ */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + blk_mq_dispatch_rq_list(hctx, &rq_list); } static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq.h b/block/blk-mq.h index b444370ae05b..3a54dd32a6fc 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -31,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); /* * CPU hotplug helpers -- cgit v1.2.3 From 70b3ea056f3074be6d9256c312b64c0d90a4a683 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Dec 2016 08:43:31 -0700 Subject: elevator: make the rqhash helpers exported Signed-off-by: Jens Axboe Reviewed-by: Hannes Reinecke --- block/elevator.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index a18a5db274e4..40f0c04e5ad3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -248,13 +248,13 @@ static inline void __elv_rqhash_del(struct request *rq) rq->rq_flags &= ~RQF_HASHED; } -static void elv_rqhash_del(struct request_queue *q, struct request *rq) +void elv_rqhash_del(struct request_queue *q, struct request *rq) { if (ELV_ON_HASH(rq)) __elv_rqhash_del(rq); } -static void elv_rqhash_add(struct request_queue *q, struct request *rq) +void elv_rqhash_add(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; @@ -263,13 +263,13 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq) rq->rq_flags |= RQF_HASHED; } -static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) +void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { __elv_rqhash_del(rq); elv_rqhash_add(q, rq); } -static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) +struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; struct hlist_node *next; -- cgit v1.2.3 From c8e52ba5e2d6df5acaaeedda20d74f4ec3adcc82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Dec 2016 15:53:18 -0700 Subject: blk-flush: run the queue when inserting blk-mq flush Currently we pass in to run the queue async, but don't flag the queue to be run. We don't need to run it async here, but we should run it. So fixup the parameters. Signed-off-by: Jens Axboe Reviewed-by: Hannes Reinecke --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 1bdbb3d3e5f5..27a42dab5a36 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -426,7 +426,7 @@ void blk_insert_flush(struct request *rq) if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { if (q->mq_ops) { - blk_mq_insert_request(rq, false, false, true); + blk_mq_insert_request(rq, false, true, false); } else list_add_tail(&rq->queuelist, &q->queue_head); return; -- cgit v1.2.3 From 7cd54aa8438947602cf68eda1db327822b9b8e6b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Dec 2016 13:08:35 -0700 Subject: blk-stat: fix a few cases of missing batch flushing Everytime we need to read ->nr_samples, we should have flushed the batch first. 
The non-mq read path also needs to flush the batch. Signed-off-by: Jens Axboe --- block/blk-stat.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-stat.c b/block/blk-stat.c index 4d0118568727..9b43efb8933f 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -64,6 +64,9 @@ static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) queue_for_each_hw_ctx(q, hctx, i) { hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); + blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + if (!ctx->stat[BLK_STAT_READ].nr_samples && !ctx->stat[BLK_STAT_WRITE].nr_samples) continue; @@ -111,6 +114,8 @@ void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) if (q->mq_ops) blk_mq_stat_get(q, dst); else { + blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]); + blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]); memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], sizeof(struct blk_rq_stat)); memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], @@ -128,6 +133,9 @@ void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) uint64_t newest = 0; hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]); + blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]); + if (!ctx->stat[BLK_STAT_READ].nr_samples && !ctx->stat[BLK_STAT_WRITE].nr_samples) continue; -- cgit v1.2.3
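For illustration, the special-payload scheme that replaces blk_add_request_payload() above works by storing a single bio_vec in the request itself and flagging it with RQF_SPECIAL_PAYLOAD; blk_rq_map_sg() then maps that vector instead of walking the bios. The sketch below is a hypothetical driver helper — the function name, its caller and the page handling are invented; only rq->special_vec, RQF_SPECIAL_PAYLOAD and the blk_rq_map_sg() behaviour come from the patches in this series.

#include <linux/blkdev.h>

/*
 * Hypothetical prep helper: attach a one-page payload (e.g. a DSM TRIM
 * or UNMAP parameter list) to an already prepared discard request.
 */
static void example_attach_discard_payload(struct request *rq,
					   struct page *page,
					   unsigned int len)
{
	/*
	 * Unlike the removed blk_add_request_payload(), this leaves the
	 * bio and the request length untouched, so completion accounting
	 * still sees the original discard size.
	 */
	rq->special_vec.bv_page = page;
	rq->special_vec.bv_offset = 0;
	rq->special_vec.bv_len = len;
	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
}

When the driver later calls blk_rq_map_sg(), the new RQF_SPECIAL_PAYLOAD branch maps rq->special_vec as a single scatterlist entry via __blk_bvec_map_sg(), and the segment-count check now goes through blk_rq_nr_phys_segments(), which accounts for the extra payload segment.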
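The per-hctx blk_mq_start_stopped_hw_queue() added above lets a driver restart only the hardware queue it previously stopped, rather than kicking every queue with blk_mq_start_stopped_hw_queues(). Below is a hedged sketch of that pattern built around an invented example_dev driver — the device structure, its resource check and the issue path are assumptions; the blk-mq calls and the BLK_MQ_RQ_QUEUE_* return codes are the existing interfaces used in the diffs above.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical per-device state; the resource accounting is a placeholder. */
struct example_dev {
	unsigned int free_slots;
};

static bool example_dev_has_room(struct example_dev *dev, struct request *rq)
{
	return dev->free_slots > 0;	/* placeholder resource check */
}

static void example_dev_issue(struct example_dev *dev, struct request *rq)
{
	dev->free_slots--;		/* hand the request to hardware here */
}

/* Hypothetical ->queue_rq() implementation that backs off when busy. */
static int example_queue_rq(struct blk_mq_hw_ctx *hctx,
			    const struct blk_mq_queue_data *bd)
{
	struct example_dev *dev = hctx->queue->queuedata;

	if (!example_dev_has_room(dev, bd->rq)) {
		/*
		 * Stop only this hardware queue; blk-mq keeps the request
		 * on the dispatch list and reruns the queue once it is
		 * restarted.
		 */
		blk_mq_stop_hw_queue(hctx);
		return BLK_MQ_RQ_QUEUE_BUSY;
	}

	example_dev_issue(dev, bd->rq);
	return BLK_MQ_RQ_QUEUE_OK;
}

/*
 * Hypothetical completion/interrupt path: once resources are available
 * again, restart just the queue that was stopped instead of all of them.
 */
static void example_dev_room_available(struct example_dev *dev,
				       struct blk_mq_hw_ctx *hctx)
{
	dev->free_slots++;
	blk_mq_start_stopped_hw_queue(hctx, true);
}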