From 4875253fddd7b6d322f028ad023d44b6efb7f73b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Mar 2017 08:56:07 -0700 Subject: blk-stat: move BLK_RQ_STAT_BATCH definition to blk-stat.c This is an implementation detail that no-one outside of blk-stat.c uses. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d703acb55d0f..e213c5e7500b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -287,8 +287,6 @@ struct blk_issue_stat { u64 time; }; -#define BLK_RQ_STAT_BATCH 64 - struct blk_rq_stat { s64 mean; u64 min; -- cgit v1.2.3 From 34dbad5d26e2f4b88e60f0e9ad03f99480802812 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Mar 2017 08:56:08 -0700 Subject: blk-stat: convert to callback-based statistics reporting Currently, statistics are gathered in ~0.13s windows, and users grab the statistics whenever they need them. This is not ideal for both in-tree users: 1. Writeback throttling wants its own dynamically sized window of statistics. Since the blk-stats statistics are reset after every window and the wbt windows don't line up with the blk-stats windows, wbt doesn't see every I/O. 2. Polling currently grabs the statistics on every I/O. Again, depending on how the window lines up, we may miss some I/Os. It's also unnecessary overhead to get the statistics on every I/O; the hybrid polling heuristic would be just as happy with the statistics from the previous full window. This reworks the blk-stats infrastructure to be callback-based: users register a callback that they want called at a given time with all of the statistics from the window during which the callback was active. Users can dynamically bucketize the statistics. wbt and polling both currently use read vs. write, but polling can be extended to further subdivide based on request size. The callbacks are kept on an RCU list, and each callback has percpu stats buffers. There will only be a few users, so the overhead on the I/O completion side is low. The stats flushing is also simplified considerably: since the timer function is responsible for clearing the statistics, we don't have to worry about stale statistics. wbt is a trivial conversion. After the conversion, the windowing problem mentioned above is fixed. For polling, we register an extra callback that caches the previous window's statistics in the struct request_queue for the hybrid polling heuristic to use. Since we no longer have a single stats buffer for the request queue, this also removes the sysfs and debugfs stats entries. To replace those, we add a debugfs entry for the poll statistics. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - include/linux/blkdev.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e213c5e7500b..270119a501fb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -294,7 +294,6 @@ struct blk_rq_stat { s32 nr_samples; s32 nr_batch; u64 batch; - s64 time; }; #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a7da607ca04..1a7dc42a8918 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -40,6 +40,8 @@ struct blkcg_gq; struct blk_flush_queue; struct pr_ops; struct rq_wb; +struct blk_queue_stats; +struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -388,6 +390,7 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct blk_queue_stats *stats; struct rq_wb *rq_wb; /* @@ -505,8 +508,6 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; - struct blk_rq_stat rq_stats[2]; - /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -516,6 +517,10 @@ struct request_queue { unsigned int rq_timeout; int poll_nsec; + + struct blk_stat_callback *poll_cb; + struct blk_rq_stat poll_stat[2]; + struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; @@ -611,6 +616,7 @@ struct request_queue { #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ +#define QUEUE_FLAG_POLL_STATS 29 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From b7d680d7bf584bce6023343304b819009a7c3336 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:54 +0100 Subject: bdi: Mark congested->bdi as internal congested->bdi pointer is used only to be able to remove congested structure from bdi->cgwb_congested_tree on structure release. Moreover the pointer can become NULL when we unregister the bdi. Rename the field to __bdi and add a comment to make it more explicit this is internal stuff of memcg writeback code and people should not use the field as such use will be likely race prone. We do not bother with converting congested->bdi to a proper refcounted reference. It will be slightly ugly to special-case bdi->wb.congested to avoid effectively a cyclic reference of bdi to itself and the reference gets cleared from bdi_unregister() making it impossible to reference a freed bdi. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ad955817916d..8fb3dcdebc80 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -54,7 +54,9 @@ struct bdi_writeback_congested { atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK - struct backing_dev_info *bdi; /* the associated bdi */ + struct backing_dev_info *__bdi; /* the associated bdi, set to NULL + * on bdi unregistration. For memcg-wb + * internal use only! */ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif -- cgit v1.2.3 From 5318ce7d46866e1dbc20ab9349b93753edba0b3e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:57 +0100 Subject: bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy() Currently we waited for all cgwbs to get freed in cgwb_bdi_destroy() which also means that writeback has been shutdown on them. Since this wait is going away, directly shutdown writeback on cgwbs from cgwb_bdi_destroy() to avoid live writeback structures after bdi_unregister() has finished. To make that safe with concurrent shutdown from cgwb_release_workfn(), we also have to make sure wb_shutdown() returns only after the bdi_writeback structure is really shutdown. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8fb3dcdebc80..8af720f22a2d 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; */ enum wb_state { WB_registered, /* bdi_register() was done */ + WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; -- cgit v1.2.3 From 4514451e79ae5baabb85d22ba3523602e59d5218 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:58 +0100 Subject: bdi: Do not wait for cgwbs release in bdi_unregister() Currently we wait for all cgwbs to get released in cgwb_bdi_destroy() (called from bdi_unregister()). That is however unnecessary now when cgwb->bdi is a proper refcounted reference (thus bdi cannot get released before all cgwbs are released) and when cgwb_bdi_destroy() shuts down writeback directly. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8af720f22a2d..e66d4722db8e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -164,7 +164,6 @@ struct backing_dev_info { #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ - atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #else struct bdi_writeback_congested *wb_congested; #endif -- cgit v1.2.3 From f759741d9d913eb57784a94b9bca78b376fc26a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:00 +0100 Subject: block: Fix oops in locked_inode_to_wb_and_lock_list() When block device is closed, we call inode_detach_wb() in __blkdev_put() which sets inode->i_wb to NULL. That is contrary to expectations that inode->i_wb stays valid once set during the whole inode's lifetime and leads to oops in wb_get() in locked_inode_to_wb_and_lock_list() because inode_to_wb() returned NULL. The reason why we called inode_detach_wb() is not valid anymore though. BDI is guaranteed to stay along until we call bdi_put() from bdev_evict_inode() so we can postpone calling inode_detach_wb() to that moment. Also add a warning to catch if someone uses inode_detach_wb() in a dangerous way. Reported-by: Thiago Jung Bauermann Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a3c0cbd7c888..d5815794416c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { + WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } -- cgit v1.2.3 From c70c176ff8c3ff0ac6ef9a831cd591ea9a66bd1a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:01 +0100 Subject: kobject: Export kobject_get_unless_zero() Make the function available for outside use and fortify it against NULL kobject. CC: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/kobject.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e6284591599e..ca85cb80e99a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name); extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); +extern struct kobject * __must_check kobject_get_unless_zero( + struct kobject *kobj); extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); -- cgit v1.2.3 From 7642747d674aff1f7cfe74ad9af7e9b12ab1d5ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Mar 2017 15:01:49 -0400 Subject: blk-mq: remove BLK_MQ_F_DEFER_ISSUE This flag was never used since it was introduced. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b296a9006117..5b3e201c8d4f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -152,7 +152,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, -- cgit v1.2.3 From 7a88fa191944589b2ed795bbed32ca6e9e2df31f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 Mar 2017 13:24:55 +0300 Subject: block: make nr_iovecs unsigned in bio_alloc_bioset() There isn't a bug here, but Smatch is not smart enough to know that "nr_iovecs" can't be negative so it complains about underflows. Really, it's slightly cleaner to make this parameter unsigned. Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e521194f6fc..4931756d86d9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); +extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); -- cgit v1.2.3 From 869ab90f0ae0002ce6e9d3a5c75156ae8de48ffc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 24 Mar 2017 18:03:48 -0700 Subject: block: constify struct blk_integrity_profile blk_integrity_profile's are never modified, so mark them 'const' so that they are placed in .rodata and benefit from memory protection. Signed-off-by: Eric Biggers Signed-off-by: Jens Axboe --- include/linux/genhd.h | 10 +++++----- include/linux/t10-pi.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 76f39754e7b0..9e11082c7f9b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,11 +159,11 @@ struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) struct blk_integrity { - struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 9fba9dd33544..9375d23a24e7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -34,9 +34,9 @@ struct t10_pi_tuple { }; -extern struct blk_integrity_profile t10_pi_type1_crc; -extern struct blk_integrity_profile t10_pi_type1_ip; -extern struct blk_integrity_profile t10_pi_type3_crc; -extern struct blk_integrity_profile t10_pi_type3_ip; +extern const struct blk_integrity_profile t10_pi_type1_crc; +extern const struct blk_integrity_profile t10_pi_type1_ip; +extern const struct blk_integrity_profile t10_pi_type3_crc; +extern const struct blk_integrity_profile t10_pi_type3_ip; #endif -- cgit v1.2.3 From 9e234eeafbe17e85908584392f249f0b329b8e1b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 10:51:41 -0700 Subject: blk-throttle: add a simple idle detection A cgroup gets assigned a low limit, but the cgroup could never dispatch enough IO to cross the low limit. In such case, the queue state machine will remain in LIMIT_LOW state and all other cgroups will be throttled according to low limit. This is unfair for other cgroups. We should treat the cgroup idle and upgrade the state machine to lower state. We also have a downgrade logic. If the state machine upgrades because of cgroup idle (real idle), the state machine will downgrade soon as the cgroup is below its low limit. This isn't what we want. A more complicated case is cgroup isn't idle when queue is in LIMIT_LOW. But when queue gets upgraded to lower state, other cgroups could dispatch more IO and this cgroup can't dispatch enough IO, so the cgroup is below its low limit and looks like idle (fake idle). In this case, the queue should downgrade soon. The key to determine if we should do downgrade is to detect if cgroup is truely idle. Unfortunately it's very hard to determine if a cgroup is real idle. This patch uses the 'think time check' idea from CFQ for the purpose. Please note, the idea doesn't work for all workloads. For example, a workload with io depth 8 has disk utilization 100%, hence think time is 0, eg, not idle. But the workload can run higher bandwidth with io depth 16. Compared to io depth 16, the io depth 8 workload is idle. We use the idea to roughly determine if a cgroup is idle. We treat a cgroup idle if its think time is above a threshold (by default 1ms for SSD and 100ms for HD). The idea is think time above the threshold will start to harm performance. HD is much slower so a longer think time is ok. The patch (and the latter patches) uses 'unsigned long' to track time. We convert 'ns' to 'us' with 'ns >> 10'. This is fast but loses precision, should not a big deal. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 270119a501fb..07a9e9607909 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -58,6 +58,9 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + void *bi_cg_private; +#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) -- cgit v1.2.3 From 88eeca495ba7de749ff253376ec6be19bb05368d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 15:19:41 -0700 Subject: block: track request size in blk_issue_stat Currently there is no way to know the request size when the request is finished. Next patch will need this info. We could add extra field to record the size, but blk_issue_stat has enough space to record it, so this patch just overloads blk_issue_stat. With this, we will have 49bits to track time, which still is very long time. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 07a9e9607909..3ad567347671 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -287,7 +287,7 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_issue_stat { - u64 time; + u64 stat; }; struct blk_rq_stat { -- cgit v1.2.3 From b9147dd1bae2b15d6931ecd42f8606c775fecbc9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 15:19:42 -0700 Subject: blk-throttle: add a mechanism to estimate IO latency User configures latency target, but the latency threshold for each request size isn't fixed. For a SSD, the IO latency highly depends on request size. To calculate latency threshold, we sample some data, eg, average latency for request size 4k, 8k, 16k, 32k .. 1M. The latency threshold of each request size will be the sample latency (I'll call it base latency) plus latency target. For example, the base latency for request size 4k is 80us and user configures latency target 60us. The 4k latency threshold will be 80 + 60 = 140us. To sample data, we calculate the order base 2 of rounded up IO sectors. If the IO size is bigger than 1M, it will be accounted as 1M. Since the calculation does round up, the base latency will be slightly smaller than actual value. Also if there isn't any IO dispatched for a specific IO size, we will use the base latency of smaller IO size for this IO size. But we shouldn't sample data at any time. The base latency is supposed to be latency where disk isn't congested, because we use latency threshold to schedule IOs between cgroups. If disk is congested, the latency is higher, using it for scheduling is meaningless. Hence we only do the sampling when block throttling is in the LOW limit, with assumption disk isn't congested in such state. If the assumption isn't true, eg, low limit is too high, calculated latency threshold will be higher. Hard disk is completely different. Latency depends on spindle seek instead of request size. Currently this feature is SSD only, we probably can use a fixed threshold like 4ms for hard disk though. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3ad567347671..67bcf8a5326e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,10 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +struct blk_issue_stat { + u64 stat; +}; + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -60,6 +64,7 @@ struct bio { struct cgroup_subsys_state *bi_css; #ifdef CONFIG_BLK_DEV_THROTTLING_LOW void *bi_cg_private; + struct blk_issue_stat bi_issue_stat; #endif #endif union { @@ -286,10 +291,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) return (cookie & BLK_QC_T_INTERNAL) != 0; } -struct blk_issue_stat { - u64 stat; -}; - struct blk_rq_stat { s64 mean; u64 min; -- cgit v1.2.3 From 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 9 Mar 2017 00:05:31 -0800 Subject: blkcg: allocate struct blkcg_gq outside request queue spinlock blkg_conf_prep() currently calls blkg_lookup_create() while holding request queue spinlock. This means allocating memory for struct blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM failures in call paths like below: pcpu_alloc+0x68f/0x710 __alloc_percpu_gfp+0xd/0x10 __percpu_counter_init+0x55/0xc0 cfq_pd_alloc+0x3b2/0x4e0 blkg_alloc+0x187/0x230 blkg_create+0x489/0x670 blkg_lookup_create+0x9a/0x230 blkg_conf_prep+0x1fb/0x240 __cfqg_set_weight_device.isra.105+0x5c/0x180 cfq_set_weight_on_dfl+0x69/0xc0 cgroup_file_write+0x39/0x1c0 kernfs_fop_write+0x13f/0x1d0 __vfs_write+0x23/0x120 vfs_write+0xc2/0x1f0 SyS_write+0x44/0xb0 entry_SYSCALL_64_fastpath+0x18/0xad In the code path above, percpu allocator cannot call vmalloc() due to queue spinlock. A failure in this call path gives grief to tools which are trying to configure io weights. We see occasional failures happen shortly after reboots even when system is not under any memory pressure. Machines with a lot of cpus are more vulnerable to this condition. Update blkg_create() function to temporarily drop the rcu and queue locks when it is allowed by gfp mask. Suggested-by: Tejun Heo Signed-off-by: Tahsin Erdogan Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 01b62e7bac74..955903a8f6cb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -172,7 +172,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); + struct request_queue *q, gfp_t gfp, + const struct blkcg_policy *pol); int blkcg_init_queue(struct request_queue *q); void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -694,7 +695,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); + blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN, + NULL); if (IS_ERR(blkg)) blkg = NULL; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From 1671d522cdd9933dee7dddfcf9f62c561283824a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 27 Mar 2017 20:06:57 +0800 Subject: block: rename blk_mq_freeze_queue_start() As the .q_usage_counter is used by both legacy and mq path, we need to block new I/O if queue becomes dead in blk_queue_enter(). So rename it and we can use this function in both paths. Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5b3e201c8d4f..ea2e9dcd3aef 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -243,7 +243,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); -void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -- cgit v1.2.3 From 334335d2f7a077a5ff561d86b0ad43bedd83ca05 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 28 Mar 2017 16:12:15 -0700 Subject: block: warn if sharing request queue across gendisks Now that the remaining drivers have been converted to one request queue per gendisk, let's warn if a request queue gets registered more than once. This will catch future drivers which might do it inadvertently or any old drivers that I may have missed. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a7dc42a8918..a2dc6b390d48 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -617,6 +617,7 @@ struct request_queue { #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_POLL_STATS 29 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 30 /* queue has been registered to a disk */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From d708f0d5026f48081debdd1c5b0a5636455a9589 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Mar 2017 11:25:48 -0600 Subject: Revert "blkcg: allocate struct blkcg_gq outside request queue spinlock" I inadvertently applied the v5 version of this patch, whereas the agreed upon version was v5. Revert this one so we can apply the right one. This reverts commit 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788. --- include/linux/blk-cgroup.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 955903a8f6cb..01b62e7bac74 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -172,8 +172,7 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, gfp_t gfp, - const struct blkcg_policy *pol); + struct request_queue *q); int blkcg_init_queue(struct request_queue *q); void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -695,8 +694,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN, - NULL); + blkg = blkg_lookup_create(blkcg, q); if (IS_ERR(blkg)) blkg = NULL; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From b1a951fe469eb51646bf2e6d2c5f4a19fe1d4627 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 5 Feb 2017 21:47:22 +0200 Subject: net/utils: generic inet_pton_with_scope helper Several locations in the stack need to handle ipv4/ipv6 (with scope) and port strings conversion to sockaddr. Add a helper that takes either AF_INET, AF_INET6 or AF_UNSPEC (for wildcard) to centralize this handling. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/inet.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inet.h b/include/linux/inet.h index 4cca05c9678e..636ebe87e6f8 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -43,6 +43,8 @@ #define _LINUX_INET_H #include +#include +#include /* * These mimic similar macros defined in user-space for inet_ntop(3). @@ -54,4 +56,8 @@ extern __be32 in_aton(const char *str); extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); + +extern int inet_pton_with_scope(struct net *net, unsigned short af, + const char *src, const char *port, struct sockaddr_storage *addr); + #endif /* _LINUX_INET_H */ -- cgit v1.2.3 From 0f222ccce359d21f927d07df2069e7029497b790 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 23 Mar 2017 20:41:22 -0700 Subject: nvme_fc: Sync FC-NVME header with standard Update FC-NVME definitions to match FC-NVME r1.14 (16-020vB) plus change voted in by 2/22 FC-NVME Adhoc (see HOSTID below). Includes the following: - Addition of "status_code" field to ERSP IU - Addition of FC-NVME LS RJT reason_codes and reason_explanations - CreateAssociation payload, HostID field shortened to 16 bytes Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/nvme-fc.h | 68 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 4b45226bd604..e997c4a49a88 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -16,8 +16,7 @@ */ /* - * This file contains definitions relative to FC-NVME r1.11 and a few - * newer items + * This file contains definitions relative to FC-NVME r1.14 (16-020vB). */ #ifndef _NVME_FC_H @@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu { #define NVME_FC_SIZEOF_ZEROS_RSP 12 +enum { + FCNVME_SC_SUCCESS = 0, + FCNVME_SC_INVALID_FIELD = 1, + FCNVME_SC_INVALID_CONNID = 2, +}; + struct nvme_fc_ersp_iu { - __u8 rsvd0[2]; + __u8 status_code; + __u8 rsvd1; __be16 iu_len; __be32 rsn; __be32 xfrd_len; @@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu { }; -/* FC-NVME r1.03/16-119v0 NVME Link Services */ +/* FC-NVME Link Services */ enum { FCNVME_LS_RSVD = 0, FCNVME_LS_RJT = 1, @@ -68,7 +74,7 @@ enum { FCNVME_LS_DISCONNECT = 5, }; -/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */ +/* FC-NVME Link Service Descriptors */ enum { FCNVME_LSDESC_RSVD = 0x0, FCNVME_LSDESC_RQST = 0x1, @@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz) return cpu_to_be32(sz - (2 * sizeof(u32))); } - struct fcnvme_ls_rqst_w0 { u8 ls_cmd; /* FCNVME_LS_xxx */ u8 zeros[3]; @@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst { __be32 rsvd12; }; +/* FC-NVME LS RJT reason_code values */ +enum fcnvme_ls_rjt_reason { + FCNVME_RJT_RC_NONE = 0, + /* no reason - not to be sent */ + + FCNVME_RJT_RC_INVAL = 0x01, + /* invalid NVMe_LS command code */ + + FCNVME_RJT_RC_LOGIC = 0x03, + /* logical error */ + + FCNVME_RJT_RC_UNAB = 0x09, + /* unable to perform command request */ + + FCNVME_RJT_RC_UNSUP = 0x0b, + /* command not supported */ + + FCNVME_RJT_RC_INPROG = 0x0e, + /* command already in progress */ + FCNVME_RJT_RC_INV_ASSOC = 0x40, + /* Invalid Association ID*/ + FCNVME_RJT_RC_INV_CONN = 0x41, + /* Invalid Connection ID*/ + + FCNVME_RJT_RC_VENDOR = 0xff, + /* vendor specific error */ +}; + +/* FC-NVME LS RJT reason_explanation values */ +enum fcnvme_ls_rjt_explan { + FCNVME_RJT_EXP_NONE = 0x00, + /* No additional explanation */ + + FCNVME_RJT_EXP_OXID_RXID = 0x17, + /* invalid OX_ID-RX_ID combination */ + + FCNVME_RJT_EXP_INSUF_RES = 0x29, + /* insufficient resources */ + + FCNVME_RJT_EXP_UNAB_DATA = 0x2a, + /* unable to supply requested data */ + + FCNVME_RJT_EXP_INV_LEN = 0x2d, + /* Invalid payload length */ +}; /* FCNVME_LSDESC_RJT */ struct fcnvme_lsdesc_rjt { @@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt { * Reject reason and explanaction codes are generic * to ELs's from LS-3. */ - u8 reason_code; - u8 reason_explanation; + u8 reason_code; /* fcnvme_ls_rjt_reason */ + u8 reason_explanation; /* fcnvme_ls_rjt_explan */ u8 vendor; __be32 rsvd12; }; -#define FCNVME_ASSOC_HOSTID_LEN 64 +#define FCNVME_ASSOC_HOSTID_LEN 16 #define FCNVME_ASSOC_HOSTNQN_LEN 256 #define FCNVME_ASSOC_SUBNQN_LEN 256 -- cgit v1.2.3 From 62eeacb0e04f2aff7099a7765f386bb7ba53d5e2 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 23 Mar 2017 20:41:27 -0700 Subject: nvme_fc: Clean up host fcpio done status handling As Dan Carpenter pointed out: mixing 16-bit nvme status with 32-bit error status from driver. Corrected comment on fcp request struct status field, and converted done routine to explicitly set nvme status codes for nvme status. Signed-off-by: James Smart Reported-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index f21471f7ee40..16eb264980c2 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir { * transferred. Should equal payload_length on success. * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. * @status: Completion status of the FCP operation. must be 0 upon success, - * NVME_SC_FC_xxx value upon failure. Note: this is NOT a - * reflection of the NVME CQE completion status. Only the status - * of the FCP operation at the NVME-FC level. + * negative errno value upon failure (ex: -EIO). Note: this is + * NOT a reflection of the NVME CQE completion status. Only the + * status of the FCP operation at the NVME-FC level. */ struct nvmefc_fcp_req { void *cmdaddr; -- cgit v1.2.3 From f2fbc9dd78970accd7649e8b87c7f00a0da0cdbc Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Apr 2017 08:39:18 -0700 Subject: blk-mq: Remove blk_mq_queue_data.list The block layer core sets blk_mq_queue_data.list but no block drivers read that member. Hence remove it and also the code that is used to set this member. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ea2e9dcd3aef..bdea90d75274 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -81,7 +81,6 @@ struct blk_mq_tag_set { struct blk_mq_queue_data { struct request *rq; - struct list_head *list; bool last; }; -- cgit v1.2.3 From 64c7f1d1572cacadfc0a4ca5a937aeffa486de58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:18:12 +0200 Subject: block, scsi: move the retries field to struct scsi_request Instead of bloating the generic struct request with it. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a2dc6b390d48..ce6f9a6534c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -224,7 +224,6 @@ struct request { unsigned long deadline; struct list_head timeout_list; unsigned int timeout; - int retries; /* * completion callback. -- cgit v1.2.3 From 1dd5198b2df913aec9b77c14529f9ff1b6d33e30 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Apr 2017 12:16:38 -0600 Subject: block: move timeout field in struct request to pack better After commit 64c7f1d1572c, we went from 1 to 2 holes in my test setup. If we move the timeout field a bit, we remove both of those holes and shrink struct request by 8 bytes. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ce6f9a6534c9..3cf241b0814d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -215,6 +215,8 @@ struct request { unsigned short ioprio; + unsigned int timeout; + void *special; /* opaque pointer available for LLD use */ int errors; @@ -223,7 +225,6 @@ struct request { unsigned long deadline; struct list_head timeout_list; - unsigned int timeout; /* * completion callback. -- cgit v1.2.3 From dbde775cdbf5e401b8739f30c87d1af12c0028db Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 11:10:44 +1000 Subject: block: simple improvements for bio->flags The comment for the 'flags' field of 'bio' mentions "command" which is no longer stored there, and doesn't mention the bvec pool number, which is. BIO_RESET_BITS is set in such a way that it would need to be updated if new bits were added, which is easy to miss. BVEC_POOL_BITS is larger than needed. The BVEC_POOL_IDX() ranges from 0 to 6, so 3 bits are sufficient. This patch make improvements in each of these areas. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 67bcf8a5326e..1ebbc289b642 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -33,7 +33,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, command, etc */ + unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; struct bvec_iter bi_iter; @@ -110,12 +110,7 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. */ - -/* - * Flags starting here get preserved by bio_reset() - this includes - * BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS 10 +/* See BVEC_POOL_OFFSET below before adding new flags */ /* * We support 6 different bvec pools, the last one is magic in that it @@ -125,13 +120,22 @@ struct bio { #define BVEC_POOL_MAX (BVEC_POOL_NR - 1) /* - * Top 4 bits of bio flags indicate the pool the bvecs came from. We add + * Top 3 bits of bio flags indicate the pool the bvecs came from. We add * 1 to the actual index so that 0 indicates that there are no bvecs to be * freed. */ -#define BVEC_POOL_BITS (4) +#define BVEC_POOL_BITS (3) #define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) +#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) +# error "BVEC_POOL_BITS is too small" +#endif + +/* + * Flags starting here get preserved by bio_reset() - this includes + * only BVEC_POOL_IDX() + */ +#define BIO_RESET_BITS BVEC_POOL_OFFSET /* * Operations and flags common to the bio and request structures. -- cgit v1.2.3 From fbbaf700e7b163a0f1704b2d542ee28be11fce21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 09:40:52 -0600 Subject: block: trace completion of all bios. Currently only dm and md/raid5 bios trigger trace_block_bio_complete(). Now that we have bio_chain() and bio_inc_remaining(), it is not possible, in general, for a driver to know when the bio is really complete. Only bio_endio() knows that. So move the trace_block_bio_complete() call to bio_endio(). Now trace_block_bio_complete() pairs with trace_block_bio_queue(). Any bio for which a 'queue' event is traced, will subsequently generate a 'complete' event. There are a few cases where completion tracing is not wanted. 1/ If blk_update_request() has already generated a completion trace event at the 'request' level, there is no point generating one at the bio level too. In this case the bi_sector and bi_size will have changed, so the bio level event would be wrong 2/ If the bio hasn't actually been queued yet, but is being aborted early, then a trace event could be confusing. Some filesystems call bio_endio() but do not want tracing. 3/ The bio_integrity code interposes itself by replacing bi_end_io, then restoring it and calling bio_endio() again. This would produce two identical trace events if left like that. To handle these, we introduce a flag BIO_TRACE_COMPLETION and only produce the trace event when this is set. We address point 1 above by clearing the flag in blk_update_request(). We address point 2 above by only setting the flag when generic_make_request() is called. We address point 3 above by clearing the flag after generating a completion event. When bio_split() is used on a bio, particularly in blk_queue_split(), there is an extra complication. A new bio is split off the front, and may be handle directly without going through generic_make_request(). The old bio, which has been advanced, is passed to generic_make_request(), so it will trigger a trace event a second time. Probably the best result when a split happens is to see a single 'queue' event for the whole bio, then multiple 'complete' events - one for each component. To achieve this was can: - copy the BIO_TRACE_COMPLETION flag to the new bio in bio_split() - avoid generating a 'queue' event if BIO_TRACE_COMPLETION is already set. This way, the split-off bio won't create a queue event, the original won't either even if it re-submitted to generic_make_request(), but both will produce completion events, each for their own range. So if generic_make_request() is called (which generates a QUEUED event), then bi_endio() will create a single COMPLETE event for each range that the bio is split into, unless the driver has explicitly requested it not to. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ebbc289b642..72aa9519167e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -110,6 +110,8 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. */ +#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion + * of this bio. */ /* See BVEC_POOL_OFFSET below before adding new flags */ /* -- cgit v1.2.3 From ee056f98126170ca8b16b9a4a6e20aae7c5c184e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 5 Apr 2017 12:01:34 -0700 Subject: blk-mq-sched: provide hooks for initializing hardware queue data Schedulers need to be informed when a hardware queue is added or removed at runtime so they can allocate/free per-hardware queue data. So, replace the blk_mq_sched_init_hctx_data() helper, which only makes sense at init time, with .init_hctx() and .exit_hctx() hooks. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 22d39e8d4de1..b7ec315ee7e7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -93,6 +93,8 @@ struct blk_mq_hw_ctx; struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_type *); void (*exit_sched)(struct elevator_queue *); + int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); -- cgit v1.2.3 From 1d62ac13634840e02f9b20df9d8e21204f9ab8b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:00 +0200 Subject: block: renumber REQ_OP_WRITE_ZEROES Make life easy for implementations that needs to send a data buffer to the device (e.g. SCSI) by numbering it as a data out command. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 72aa9519167e..c5bae0a669d1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -174,7 +174,7 @@ enum req_opf { /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, /* write the zero filled sector many times */ - REQ_OP_WRITE_ZEROES = 8, + REQ_OP_WRITE_ZEROES = 9, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, -- cgit v1.2.3 From ac62d6208a7977107a47be4eb8566d6e5034b5f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:05 +0200 Subject: dm: support REQ_OP_WRITE_ZEROES Copy & paste from the REQ_OP_WRITE_SAME code. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/device-mapper.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7e6903866fd..3829bee2302a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -254,6 +254,12 @@ struct dm_target { */ unsigned num_write_same_bios; + /* + * The number of WRITE ZEROES bios that will be submitted to the target. + * The bio number can be accessed with dm_bio_get_target_bio_nr. + */ + unsigned num_write_zeroes_bios; + /* * The minimum number of extra bytes allocated in each io for the * target to use. -- cgit v1.2.3 From ee472d835c264a4cb77f8cf878603e1e40f3559e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:08 +0200 Subject: block: add a flags argument to (__)blkdev_issue_zeroout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turn the existing discard flag into a new BLKDEV_ZERO_UNMAP flag with similar semantics, but without referring to diѕcard. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d76bebbc632e..bd60f4401c9d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1336,23 +1336,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return bqt->tag_index[tag]; } +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); +extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ #define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); -extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page); + +#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ + extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, - bool discard); + unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, bool discard); + sector_t nr_sects, gfp_t gfp_mask, unsigned flags); + static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { @@ -1366,7 +1370,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask, true); + gfp_mask, 0); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); -- cgit v1.2.3 From d928be9f853b9755692d7e9aed402c1809a88e56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:09 +0200 Subject: block: add a REQ_NOUNMAP flag for REQ_OP_WRITE_ZEROES If this flag is set logical provisioning capable device should release space for the zeroed blocks if possible, if it is not set devices should keep the blocks anchored. Also remove an out of sync kerneldoc comment for a static function that would have become even more out of data with this change. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c5bae0a669d1..61339bc44400 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -201,6 +201,10 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + + /* command specific flags for REQ_OP_WRITE_ZEROES: */ + __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_NR_BITS, /* stops here */ }; @@ -218,6 +222,8 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From cb365b9675fda026caba4cb5df83292cb7c0811a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:10 +0200 Subject: block: add a new BLKDEV_ZERO_NOFALLBACK flag This avoids fallbacks to explicit zeroing in (__)blkdev_issue_zeroout if the caller doesn't want them. Also clean up the convoluted check for the return condition that this new flag is added to. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bd60f4401c9d..21a30f011674 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1350,6 +1350,7 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct bio **biop); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ +#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, -- cgit v1.2.3 From 48920ff2a5a940cd07d12cc79e4a2c75f1185aee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:23 +0200 Subject: block: remove the discard_zeroes_data flag Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we can kill this hack. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 --------------- include/linux/device-mapper.h | 5 ----- 2 files changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 21a30f011674..ec993573e0a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -339,7 +339,6 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char cluster; - unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; }; @@ -1341,7 +1340,6 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ -#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); @@ -1541,19 +1539,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev) return q->limits.discard_alignment; } -static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -{ - if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) - return 1; - - return 0; -} - -static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) -{ - return queue_discard_zeroes_data(bdev_get_queue(bdev)); -} - static inline unsigned int bdev_write_same(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3829bee2302a..c7ea33e38fb9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -296,11 +296,6 @@ struct dm_target { * on max_io_len boundary. */ bool split_discard_bios:1; - - /* - * Set if this target does not return zeroes on discarded blocks. - */ - bool discard_zeroes_data_unsupported:1; }; /* Each target can link one of these into the table */ -- cgit v1.2.3 From 84253394927c4352652d0b118ad9583f5646959b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Apr 2017 13:28:46 +0200 Subject: remove the mg_disk driver This drivers was added in 2008, but as far as a I can tell we never had a single platform that actually registered resources for the platform driver. It's also been unmaintained for a long time and apparently has a ATA mode that can be driven using the IDE/libata subsystem. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/mg_disk.h | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 include/linux/mg_disk.h (limited to 'include/linux') diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h deleted file mode 100644 index e11f4d9f1c2e..000000000000 --- a/include/linux/mg_disk.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * include/linux/mg_disk.c - * - * Private data for mflash platform driver - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __MG_DISK_H__ -#define __MG_DISK_H__ - -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" - -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - void *host; -}; - -#endif -- cgit v1.2.3 From c05e66733788118377c21a913c1bc7b64bccc167 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 00:59:58 -0700 Subject: sbitmap: add sbitmap_get_shallow() operation This operation supports the use case of limiting the number of bits that can be allocated for a given operation. Rather than setting aside some bits at the end of the bitmap, we can set aside bits in each word of the bitmap. This means we can keep the allocation hints spread out and support sbitmap_resize() nicely at the cost of lower granularity for the allowed depth. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index d4e0a204c118..a1904aadbc45 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -175,6 +175,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); */ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); +/** + * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, + * limiting the depth used from each word. + * @sb: Bitmap to allocate from. + * @alloc_hint: Hint for where to start searching for a free bit. + * @shallow_depth: The maximum number of bits to allocate from a single word. + * + * This rather specific operation allows for having multiple users with + * different allocation limits. E.g., there can be a high-priority class that + * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() + * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority + * class can only allocate half of the total bits in the bitmap, preventing it + * from starving out the high-priority class. + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, + unsigned long shallow_depth); + /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. @@ -325,6 +344,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); +/** + * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word, with preemption + * already disabled. + * @sbq: Bitmap queue to allocate from. + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth); + /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. @@ -345,6 +377,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, return nr; } +/** + * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word. + * @sbq: Bitmap queue to allocate from. + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to + * sbitmap_queue_clear()). + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int *cpu, + unsigned int shallow_depth) +{ + int nr; + + *cpu = get_cpu(); + nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); + put_cpu(); + return nr; +} + /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. -- cgit v1.2.3 From 5b72727299307e53888277729f980ab03264dac8 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 01:00:00 -0700 Subject: blk-mq: export helpers blk_mq_finish_request() is required for schedulers that define their own put_request(). blk_mq_run_hw_queue() is required for schedulers that hold back requests to be run later. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b90c3d5766cd..d75de612845d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -238,6 +238,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3 From c05f8525f67b7d6489b0502211d4ed35622d9beb Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 01:00:01 -0700 Subject: blk-mq-sched: make completed_request() callback more useful Currently, this callback is called right after put_request() and has no distinguishable purpose. Instead, let's call it before put_request() as soon as I/O has completed on the request, before we account it in blk-stat. With this, Kyber can enable stats when it sees a latency outlier and make sure the outlier gets accounted. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b7ec315ee7e7..3a216318ae73 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -106,7 +106,7 @@ struct elevator_mq_ops { void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); - void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); + void (*completed_request)(struct request *); void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); -- cgit v1.2.3 From 17912c49edfa6ab552329bf63d1b757eb874673b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Sat, 15 Apr 2017 20:55:37 +0200 Subject: lightnvm: submit erases using the I/O path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now erases have been submitted as synchronous commands through a dedicated erase function. In order to enable targets implementing asynchronous erases, refactor the erase path so that it uses the normal async I/O submission functions. If a target requires sync I/O, it can implement it internally. Also, adapt rrpc to use the new erase path. Signed-off-by: Javier González Fixed spelling error. Signed-off-by: Matias Bjørling Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ca45e4a088a9..e11163f9b3b7 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,7 +56,6 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -70,7 +69,6 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; - nvm_erase_blk_fn *erase_block; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -479,10 +477,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, +extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); -extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); -- cgit v1.2.3 From a7737f39c70d9c63ba530d6316724d7be67de541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Sat, 15 Apr 2017 20:55:38 +0200 Subject: lightnvm: rename scrambler controller hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the OCSSD 1.2 specification, the 0x200 hint enables the media scrambler for the read/write opcode, providing that the controller has been correctly configured by the firmware. Rename the macro to represent this meaning. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e11163f9b3b7..eff7d1f312a8 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -123,7 +123,7 @@ enum { /* NAND Access Modes */ NVM_IO_SUSPEND = 0x80, NVM_IO_SLC_MODE = 0x100, - NVM_IO_SCRAMBLE_DISABLE = 0x200, + NVM_IO_SCRAMBLE_ENABLE = 0x200, /* Block Types */ NVM_BLK_T_FREE = 0x0, -- cgit v1.2.3 From 4af3f75d7992dd0dc49da95fbc039fa3806fba4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Sat, 15 Apr 2017 20:55:45 +0200 Subject: lightnvm: allow to init targets on factory mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Target initialization has two responsibilities: creating the target partition and instantiating the target. This patch enables to create a factory partition (e.g., do not trigger recovery on the given target). This is useful for target development and for being able to restore the device state at any moment in time without requiring a full-device erase. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index eff7d1f312a8..7dfa56ebbc6d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -436,7 +436,8 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); +typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, + int flags); typedef void (nvm_tgt_exit_fn)(void *); typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); -- cgit v1.2.3 From e21b7a0b988772e82e7147e1c659a5afe2ae003c Mon Sep 17 00:00:00 2001 From: Arianna Avanzini Date: Wed, 12 Apr 2017 18:23:08 +0200 Subject: block, bfq: add full hierarchical scheduling and cgroups support Add complete support for full hierarchical scheduling, with a cgroups interface. Full hierarchical scheduling is implemented through the 'entity' abstraction: both bfq_queues, i.e., the internal BFQ queues associated with processes, and groups are represented in general by entities. Given the bfq_queues associated with the processes belonging to a given group, the entities representing these queues are sons of the entity representing the group. At higher levels, if a group, say G, contains other groups, then the entity representing G is the parent entity of the entities representing the groups in G. Hierarchical scheduling is performed as follows: if the timestamps of a leaf entity (i.e., of a bfq_queue) change, and such a change lets the entity become the next-to-serve entity for its parent entity, then the timestamps of the parent entity are recomputed as a function of the budget of its new next-to-serve leaf entity. If the parent entity belongs, in its turn, to a group, and its new timestamps let it become the next-to-serve for its parent entity, then the timestamps of the latter parent entity are recomputed as well, and so on. When a new bfq_queue must be set in service, the reverse path is followed: the next-to-serve highest-level entity is chosen, then its next-to-serve child entity, and so on, until the next-to-serve leaf entity is reached, and the bfq_queue that this entity represents is set in service. Writeback is accounted for on a per-group basis, i.e., for each group, the async I/O requests of the processes of the group are enqueued in a distinct bfq_queue, and the entity associated with this queue is a child of the entity associated with the group. Weights can be assigned explicitly to groups and processes through the cgroups interface, differently from what happens, for single processes, if the cgroups interface is not used (as explained in the description of the previous patch). In particular, since each node has a full scheduler, each group can be assigned its own weight. Signed-off-by: Fabio Checconi Signed-off-by: Paolo Valente Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ec993573e0a8..fe9c512cc6fa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -50,7 +50,7 @@ struct blk_stat_callback; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 2 +#define BLKCG_MAX_POLS 3 typedef void (rq_end_io_fn)(struct request *, int); -- cgit v1.2.3 From 314fe91b4a99949bb720501ba74d2228093bbf47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 12 Apr 2017 12:13:57 +0200 Subject: block: remove blk_end_request_err and __blk_end_request_err Both functions are entirely unused. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe9c512cc6fa..cca704c80b01 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1127,12 +1127,10 @@ extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); extern bool blk_end_request_cur(struct request *rq, int error); -extern bool blk_end_request_err(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void __blk_end_request_all(struct request *rq, int error); extern bool __blk_end_request_cur(struct request *rq, int error); -extern bool __blk_end_request_err(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); -- cgit v1.2.3 From fa1a15c08e23cb89c5837915b1989909bce47456 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 12 Apr 2017 12:13:58 +0200 Subject: block: remove blk_end_request_cur This function is not used anywhere in the kernel. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cca704c80b01..5b52b3d7818c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1126,7 +1126,6 @@ extern void blk_finish_request(struct request *rq, int error); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); -extern bool blk_end_request_cur(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void __blk_end_request_all(struct request *rq, int error); -- cgit v1.2.3 From da8d7f079b868ceab830309f80efc69d350576f3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 19 Apr 2017 14:01:24 -0700 Subject: block: Export blk_init_request_from_bio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export this function such that it becomes available to block drivers. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Matias Bjørling Cc: Adam Manzanares Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5b52b3d7818c..3470375952a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -924,6 +924,7 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); +extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); -- cgit v1.2.3 From 0be0dee64eacd950f8e4b6c45adb5a92392eaaaf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 19 Apr 2017 14:01:27 -0700 Subject: block: Inline blk_rq_set_prio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since only a single caller remains, inline blk_rq_set_prio(). Initialize req->ioprio even if no I/O priority has been set in the bio nor in the I/O context. Signed-off-by: Bart Van Assche Reviewed-by: Adam Manzanares Tested-by: Adam Manzanares Reviewed-by: Christoph Hellwig Cc: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3470375952a1..51c9e391798e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1087,20 +1087,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq) return nr_bios; } -/* - * blk_rq_set_prio - associate a request with prio from ioc - * @rq: request of interest - * @ioc: target iocontext - * - * Assocate request prio with ioc prio so request based drivers - * can leverage priority information. - */ -static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc) -{ - if (ioc) - rq->ioprio = ioc->ioprio; -} - /* * Request issue related functions. */ -- cgit v1.2.3 From baf7a616d537f577d33b7d9986f40532e2bd9f66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:25 +0200 Subject: bdi: Provide bdi_register_va() and bdi_alloc() Add function that registers bdi and takes va_list instead of variable number of arguments. Add bdi_alloc() as simple wrapper for NUMA-unaware users allocating BDI. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c52a48cb9a66..47a98e6e2a65 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -30,6 +30,8 @@ void bdi_put(struct backing_dev_info *bdi); __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); +int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, va_list args); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); @@ -37,6 +39,10 @@ void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_destroy(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); +static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) +{ + return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); +} void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); -- cgit v1.2.3 From fca39346a55bb7196888ffc77d9e3557340d1d0b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:28 +0200 Subject: fs: Provide infrastructure for dynamic BDIs in filesystems Provide helper functions for setting up dynamically allocated backing_dev_info structures for filesystems and cleaning them up on superblock destruction. CC: linux-mtd@lists.infradead.org CC: linux-nfs@vger.kernel.org CC: Petr Vandrovec CC: linux-nilfs@vger.kernel.org CC: cluster-devel@redhat.com CC: osd-dev@open-osd.org CC: codalist@coda.cs.cmu.edu CC: linux-afs@lists.infradead.org CC: ecryptfs@vger.kernel.org CC: linux-cifs@vger.kernel.org CC: ceph-devel@vger.kernel.org CC: linux-btrfs@vger.kernel.org CC: v9fs-developer@lists.sourceforge.net CC: lustre-devel@lists.lustre.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 +- include/linux/fs.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e66d4722db8e..866c433e7d32 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -146,7 +146,7 @@ struct backing_dev_info { congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ - char *name; + const char *name; struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..98cf14ea78c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1272,6 +1272,9 @@ struct mm_struct; /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +/* Temporary flag until all filesystems are converted to dynamic bdis */ +#define SB_I_DYNBDI 0x00000100 + /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ @@ -2121,6 +2124,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); +extern __printf(2, 3) +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); +extern int super_setup_bdi(struct super_block *sb); extern int current_umask(void); -- cgit v1.2.3 From fa06052d637bf3a76f18cd2304048b866af4096e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:37 +0200 Subject: mtd: Convert to dynamically allocated bdi infrastructure MTD already allocates backing_dev_info dynamically. Convert it to use generic infrastructure for this including proper refcounting. We drop mtd->backing_dev_info as its only use was to pass mtd_bdi pointer from one file into another and if we wanted to keep that in a clean way, we'd have to make mtd hold and drop bdi reference as needed which seems pointless for passing one global pointer... CC: David Woodhouse CC: Brian Norris CC: linux-mtd@lists.infradead.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/mtd/mtd.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eebdc63cf6af..79b176eca04a 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -334,11 +334,6 @@ struct mtd_info { int (*_get_device) (struct mtd_info *mtd); void (*_put_device) (struct mtd_info *mtd); - /* Backing device capabilities for this device - * - provides mmap capabilities - */ - struct backing_dev_info *backing_dev_info; - struct notifier_block reboot_notifier; /* default mode before reboot */ /* ECC status information */ -- cgit v1.2.3 From a5695a79088653c73c92ae8d48658cbc49f31884 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:38 +0200 Subject: coda: Convert to separately allocated bdi Allocate struct backing_dev_info separately instead of embedding it inside the superblock. This unifies handling of bdi among users. CC: Jan Harkes CC: coda@cs.cmu.edu CC: codalist@coda.cs.cmu.edu Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/coda_psdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 5b8721efa948..31e4e1f1547c 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -15,7 +15,6 @@ struct venus_comm { struct list_head vc_processing; int vc_inuse; struct super_block *vc_sb; - struct backing_dev_info bdi; struct mutex vc_mutex; }; -- cgit v1.2.3 From 0db10944a76ba09f37d43b99d0fe085a18307f22 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:45 +0200 Subject: nfs: Convert to separately allocated bdi Allocate struct backing_dev_info separately instead of embedding it inside the superblock. This unifies handling of bdi among users. CC: Anna Schumaker CC: linux-nfs@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Acked-by: Trond Myklebust Signed-off-by: Jens Axboe --- include/linux/nfs_fs_sb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b34097c67848..e1502c55741e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -133,7 +133,6 @@ struct nfs_server { struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ - struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ int flags; /* various flags */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From c1844d536dafa5f2cddf4b4841a3634f80a27666 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:47 +0200 Subject: fs: Remove SB_I_DYNBDI flag Now that all bdi structures filesystems use are properly refcounted, we can remove the SB_I_DYNBDI flag. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98cf14ea78c0..30e5c14bd743 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1272,9 +1272,6 @@ struct mm_struct; /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -/* Temporary flag until all filesystems are converted to dynamic bdis */ -#define SB_I_DYNBDI 0x00000100 - /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ -- cgit v1.2.3 From 2e82b84c01d9438d86079980e22e036eee71e754 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:48 +0200 Subject: block: Remove unused functions Now that all backing_dev_info structure are allocated separately, we can drop some unused functions. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 47a98e6e2a65..aaeb2ec5d33c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,8 +17,6 @@ #include #include -int __must_check bdi_init(struct backing_dev_info *bdi); - static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); @@ -32,12 +30,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, const char *fmt, va_list args); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); -int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); -void bdi_destroy(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) { -- cgit v1.2.3 From 7c4cc30024946dae9530cd6dc0d8d4eb40fca173 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:49 +0200 Subject: bdi: Drop 'parent' argument from bdi_register[_va]() Drop 'parent' argument of bdi_register() and bdi_register_va(). It is always NULL. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aaeb2ec5d33c..557d84063934 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,11 +25,10 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) void bdi_put(struct backing_dev_info *bdi); -__printf(3, 4) -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...); -int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, va_list args); +__printf(2, 3) +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); +int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, + va_list args); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); -- cgit v1.2.3 From b7819b9259185dcdcc81eb32182a4dc13d695738 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:02:55 +0200 Subject: block: remove the blk_execute_rq return value The function only returns -EIO if rq->errors is non-zero, which is not very useful and lets a large number of callers ignore the return value. Just let the callers figure out their error themselves. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 51c9e391798e..e2064ed3c703 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -970,7 +970,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern int blk_execute_rq(struct request_queue *, struct gendisk *, +extern void blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -- cgit v1.2.3 From 17d5363b83f8c73ef9109f75a4a9b578f31d842f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:01 +0200 Subject: scsi: introduce a result field in struct scsi_request This passes on the scsi_cmnd result field to users of passthrough requests. Currently we abuse req->errors for this purpose, but that field will go away in its current form. Note that the old IDE code abuses the errors field in very creative ways and stores all kinds of different values in it. I didn't dare to touch this magic, so the abuses are brought forward 1:1. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/ide.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 2f51c1724b5a..6980ca322074 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -88,7 +88,7 @@ static inline bool ata_pm_request(struct request *rq) ide_req(rq)->type == ATA_PRIV_PM_RESUME); } -/* Error codes returned in rq->errors to the higher part of the driver. */ +/* Error codes returned in result to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, IDE_DRV_ERROR_FILEMARK = 102, -- cgit v1.2.3 From 08e0029aa2a4acdd365613ce88a1184e5351a8a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:09 +0200 Subject: blk-mq: remove the error argument to blk_mq_complete_request Now that all drivers that call blk_mq_complete_requests have a ->complete callback we can remove the direct call to blk_mq_end_request, as well as the error argument to blk_mq_complete_request. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d75de612845d..0c4dadb85f62 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -228,7 +228,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); -void blk_mq_complete_request(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); -- cgit v1.2.3 From e26738e037f34aedfe05e412f442833f44f4a6e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:11 +0200 Subject: block: add a error_count field to struct request This is for the legacy floppy and ataflop drivers that currently abuse ->errors for this purpose. It's stashed away in a union to not grow the struct size, the other fields are either used by modern drivers for different purposes or the I/O scheduler before queing the I/O to drivers. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e2064ed3c703..a3dcee624de3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -175,6 +175,7 @@ struct request { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; void *completion_data; + int error_count; /* for legacy drivers, don't use */ }; /* -- cgit v1.2.3 From caf7df12272118e0274c8353bcfeaf60c7743a47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:16 +0200 Subject: block: remove the errors field from struct request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Acked-by: Roger Pau Monné Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a3dcee624de3..6c4ab0d4a160 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -220,8 +220,6 @@ struct request { void *special; /* opaque pointer available for LLD use */ - int errors; - unsigned int extra_len; /* length of alignment and padding */ unsigned long deadline; -- cgit v1.2.3 From 0206319fdfee7c36b97aa6c0561bab206132f813 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Thu, 20 Apr 2017 16:59:11 -0600 Subject: blk-mq: Fix poll_stat for new size-based bucketing. Fixes an issue where the size of the poll_stat array in request_queue does not match the size expected by the new size based bucketing for IO completion polling. Fixes: 720b8ccc4500 ("blk-mq: Add a polling specific stats function") Signed-off-by: Stephen Bates Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6c4ab0d4a160..6c247861cb66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -46,6 +46,9 @@ struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ +/* Must be consisitent with blk_mq_poll_stats_bkt() */ +#define BLK_MQ_POLL_STATS_BKTS 16 + /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. @@ -517,7 +520,7 @@ struct request_queue { int poll_nsec; struct blk_stat_callback *poll_cb; - struct blk_rq_stat poll_stat[2]; + struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; struct timer_list timeout; struct work_struct timeout_work; -- cgit v1.2.3 From f9f38e33389c019ec880f6825119c94867c1fde0 Mon Sep 17 00:00:00 2001 From: Helen Koike Date: Mon, 10 Apr 2017 12:51:07 -0300 Subject: nvme: improve performance for virtual NVMe devices This change provides a mechanism to reduce the number of MMIO doorbell writes for the NVMe driver. When running in a virtualized environment like QEMU, the cost of an MMIO is quite hefy here. The main idea for the patch is provide the device two memory location locations: 1) to store the doorbell values so they can be lookup without the doorbell MMIO write 2) to store an event index. I believe the doorbell value is obvious, the event index not so much. Similar to the virtio specification, the virtual device can tell the driver (guest OS) not to write MMIO unless you are writing past this value. FYI: doorbell values are written by the nvme driver (guest OS) and the event index is written by the virtual device (host OS). The patch implements a new admin command that will communicate where these two memory locations reside. If the command fails, the nvme driver will work as before without any optimizations. Contributions: Eric Northup Frank Swiderski Ted Tso Keith Busch Just to give an idea on the performance boost with the vendor extension: Running fio [1], a stock NVMe driver I get about 200K read IOPs with my vendor patch I get about 1000K read IOPs. This was running with a null device i.e. the backing device simply returned success on every read IO request. [1] Running on a 4 core machine: fio --time_based --name=benchmark --runtime=30 --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32 --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4 --rw=randread --blocksize=4k --randrepeat=false Signed-off-by: Rob Nelson [mlin: port for upstream] Signed-off-by: Ming Lin [koike: updated for upstream] Signed-off-by: Helen Koike Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9061780b141f..b625bacf37ef 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -245,6 +245,7 @@ enum { NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7, }; struct nvme_lbaf { @@ -603,6 +604,7 @@ enum nvme_admin_opcode { nvme_admin_download_fw = 0x11, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, + nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, @@ -874,6 +876,16 @@ struct nvmf_property_get_command { __u8 resv4[16]; }; +struct nvme_dbbuf { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __u32 rsvd12[6]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -893,6 +905,7 @@ struct nvme_command { struct nvmf_connect_command connect; struct nvmf_property_set_command prop_set; struct nvmf_property_get_command prop_get; + struct nvme_dbbuf dbbuf; }; }; -- cgit v1.2.3 From 39498faef7c02f9f6de4060ccdc7e8975a6e690b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 11 Apr 2017 11:32:28 -0700 Subject: nvmet_fc: add target feature flags for upcall isr contexts Two new feature flags were added to control whether upcalls to the transport result in context switches or stay in the calling context. NVMET_FCTGTFEAT_CMD_IN_ISR: By default, if the flag is not set, the transport assumes the lldd is in a non-isr context and in the cpu context it should be for the io queue. As such, the cmd handler is called directly in the calling context. If the flag is set, indicating the upcall is an isr context, the transport mandates a transition to a workqueue. The workqueue assigned to the queue is used for the context. NVMET_FCTGTFEAT_OPDONE_IN_ISR By default, if the flag is not set, the transport assumes the lldd is in a non-isr context and in the cpu context it should be for the io queue. As such, the fcp operation done callback is called directly in the calling context. If the flag is set, indicating the upcall is an isr context, the transport mandates a transition to a workqueue. The workqueue assigned to the queue is used for the context. Updated lpfc for flags Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- include/linux/nvme-fc-driver.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 16eb264980c2..d70a9c98bc23 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -655,6 +655,22 @@ enum { * on. The transport should pick a cpu to schedule the work * on. */ + NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2), + /* Bit 2: When 0, the LLDD is calling the cmd rcv handler + * in a non-isr context, allowing the transport to finish + * op completion in the calling context. When 1, the LLDD + * is calling the cmd rcv handler in an ISR context, + * requiring the transport to transition to a workqueue + * for op completion. + */ + NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3), + /* Bit 3: When 0, the LLDD is calling the op done handler + * in a non-isr context, allowing the transport to finish + * op completion in the calling context. When 1, the LLDD + * is calling the op done handler in an ISR context, + * requiring the transport to transition to a workqueue + * for op completion. + */ }; -- cgit v1.2.3 From 19b58d9473e8e3d38e7f3602a07c8febfbd07bc1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 11 Apr 2017 11:32:29 -0700 Subject: nvmet_fc: add req_release to lldd api With the advent of the opdone calls changing context, the lldd can no longer assume that once the op->done call returns for RSP operations that the request struct is no longer being accessed. As such, revise the lldd api for a req_release callback that the transport will call when the job is complete. This will also be used with abort cases. Fixed text in api header for change in io complete semantics. Revised lpfc to support the new req_release api. Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- include/linux/nvme-fc-driver.h | 57 ++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index d70a9c98bc23..d98ddb2feabc 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -741,12 +741,12 @@ struct nvmet_fc_target_port { * be freed/released. * Entrypoint is Mandatory. * - * @fcp_op: Called to perform a data transfer, transmit a response, or - * abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same - * LLDD-supplied exchange structure specified in the - * nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received. - * The op field in the structure shall indicate the operation for - * the LLDD to perform relative to the io. + * @fcp_op: Called to perform a data transfer or transmit a response. + * The nvmefc_tgt_fcp_req structure is the same LLDD-supplied + * exchange structure specified in the nvmet_fc_rcv_fcp_req() call + * made when the FCP CMD IU was received. The op field in the + * structure shall indicate the operation for the LLDD to perform + * relative to the io. * NVMET_FCOP_READDATA operation: the LLDD is to send the * payload data (described by sglist) to the host in 1 or * more FC sequences (preferrably 1). Note: the fc-nvme layer @@ -768,29 +768,35 @@ struct nvmet_fc_target_port { * successfully, the LLDD is to update the nvmefc_tgt_fcp_req * transferred_length field and may subsequently transmit the * FCP_RSP iu payload (described by rspbuf, rspdma, rsplen). - * The LLDD is to await FCP_CONF reception to confirm the RSP - * reception by the host. The LLDD may retramsit the FCP_RSP iu - * if necessary per FC-NVME. Upon reception of FCP_CONF, or upon - * FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req - * fcp_error field and consider the operation complete.. + * If FCP_CONF is supported, the LLDD is to await FCP_CONF + * reception to confirm the RSP reception by the host. The LLDD + * may retramsit the FCP_RSP iu if necessary per FC-NVME. Upon + * transmission of the FCP_RSP iu if FCP_CONF is not supported, + * or upon success/failure of FCP_CONF if it is supported, the + * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and + * consider the operation complete. * NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload - * (described by rspbuf, rspdma, rsplen). The LLDD is to await - * FCP_CONF reception to confirm the RSP reception by the host. - * The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME. - * Upon reception of FCP_CONF, or upon FCP_CONF failure, the + * (described by rspbuf, rspdma, rsplen). If FCP_CONF is + * supported, the LLDD is to await FCP_CONF reception to confirm + * the RSP reception by the host. The LLDD may retramsit the + * FCP_RSP iu if FCP_CONF is not received per FC-NVME. Upon + * transmission of the FCP_RSP iu if FCP_CONF is not supported, + * or upon success/failure of FCP_CONF if it is supported, the * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and - * consider the operation complete.. + * consider the operation complete. * NVMET_FCOP_ABORT: the LLDD is to terminate the exchange * corresponding to the fcp operation. The LLDD shall send * ABTS and follow FC exchange abort-multi rules, including * ABTS retries and possible logout. * Upon completing the indicated operation, the LLDD is to set the * status fields for the operation (tranferred_length and fcp_error - * status) in the request, then all the "done" routine - * indicated in the fcp request. Upon return from the "done" - * routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation - * the fc-nvme layer will not longer reference the fcp request, - * allowing the LLDD to free/release the fcp request. + * status) in the request, then call the "done" routine + * indicated in the fcp request. After the operation completes, + * regardless of whether the FCP_RSP iu was successfully transmit, + * the LLDD-supplied exchange structure must remain valid until the + * transport calls the fcp_req_release() callback to return ownership + * of the exchange structure back to the LLDD so that it may be used + * for another fcp command. * Note: when calling the done routine for READDATA or WRITEDATA * operations, the fc-nvme layer may immediate convert, in the same * thread and before returning to the LLDD, the fcp operation to @@ -802,6 +808,11 @@ struct nvmet_fc_target_port { * Returns 0 on success, - on failure (Ex: -EIO) * Entrypoint is Mandatory. * + * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req + * to the LLDD after all operations on the fcp operation are complete. + * This may be due to the command completing or upon completion of + * abort cleanup. + * * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -836,7 +847,9 @@ struct nvmet_fc_target_template { int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_ls_req *tls_req); int (*fcp_op)(struct nvmet_fc_target_port *tgtport, - struct nvmefc_tgt_fcp_req *); + struct nvmefc_tgt_fcp_req *fcpreq); + void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); u32 max_hw_queues; u16 max_sgl_segments; -- cgit v1.2.3 From a97ec51b37efacb84f286979876675a8143035b0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 11 Apr 2017 11:32:31 -0700 Subject: nvmet_fc: Rework target side abort handling target transport: ---------------------- There are cases when there is a need to abort in-progress target operations (writedata) so that controller termination or errors can clean up. That can't happen currently as the abort is another target op type, so it can't be used till the running one finishes (and it may not). Solve by removing the abort op type and creating a separate downcall from the transport to the lldd to request an io to be aborted. The transport will abort ios on queue teardown or io errors. In general the transport tries to call the lldd abort only when the io state is idle. Meaning: ops that transmit data (readdata or rsp) will always finish their transmit (or the lldd will see a state on the link or initiator port that fails the transmit) and the done call for the operation will occur. The transport will wait for the op done upcall before calling the abort function, and as the io is idle, the io can be cleaned up immediately after the abort call; Similarly, ios that are not waiting for data or transmitting data must be in the nvmet layer being processed. The transport will wait for the nvmet layer completion before calling the abort function, and as the io is idle, the io can be cleaned up immediately after the abort call; As for ops that are waiting for data (writedata), they may be outstanding indefinitely if the lldd doesn't see a condition where the initiatior port or link is bad. In those cases, the transport will call the abort function and wait for the lldd's op done upcall for the operation, where it will then clean up the io. Additionally, if a lldd receives an ABTS and matches it to an outstanding request in the transport, A new new transport upcall was created to abort the outstanding request in the transport. The transport expects any outstanding op call (readdata or writedata) will completed by the lldd and the operation upcall made. The transport doesn't act on the reported abort (e.g. clean up the io) until an op done upcall occurs, a new op is attempted, or the nvmet layer completes the io processing. fcloop: ---------------------- Updated to support the new target apis. On fcp io aborts from the initiator, the loopback context is updated to NULL out the half that has completed. The initiator side is immediately called after the abort request with an io completion (abort status). On fcp io aborts from the target, the io is stopped and the initiator side sees it as an aborted io. Target side ops, perhaps in progress while the initiator side is done, continue but noop the data movement as there's no structure on the initiator side to reference. patch also contains: ---------------------- Revised lpfc to support the new abort api commonized rsp buffer syncing and nulling of private data based on calling paths. errors in op done calls don't take action on the fod. They're bad operations which implies the fod may be bad. Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- include/linux/nvme-fc-driver.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index d98ddb2feabc..0db37158a61d 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -533,9 +533,6 @@ enum { * rsp as well */ NVMET_FCOP_RSP = 4, /* send rsp frame */ - NVMET_FCOP_ABORT = 5, /* abort exchange via ABTS */ - NVMET_FCOP_BA_ACC = 6, /* send BA_ACC */ - NVMET_FCOP_BA_RJT = 7, /* send BA_RJT */ }; /** @@ -572,8 +569,6 @@ enum { * upon compeletion of the operation. The nvmet-fc layer will also set a * private pointer for its own use in the done routine. * - * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !! - * * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op * entrypoint. * @op: Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx) @@ -784,10 +779,6 @@ struct nvmet_fc_target_port { * or upon success/failure of FCP_CONF if it is supported, the * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and * consider the operation complete. - * NVMET_FCOP_ABORT: the LLDD is to terminate the exchange - * corresponding to the fcp operation. The LLDD shall send - * ABTS and follow FC exchange abort-multi rules, including - * ABTS retries and possible logout. * Upon completing the indicated operation, the LLDD is to set the * status fields for the operation (tranferred_length and fcp_error * status) in the request, then call the "done" routine @@ -808,6 +799,17 @@ struct nvmet_fc_target_port { * Returns 0 on success, - on failure (Ex: -EIO) * Entrypoint is Mandatory. * + * @fcp_abort: Called by the transport to abort an active command. + * The command may be in-between operations (nothing active in LLDD) + * or may have an active WRITEDATA operation pending. The LLDD is to + * initiate the ABTS process for the command and return from the + * callback. The ABTS does not need to be complete on the command. + * The fcp_abort callback inherently cannot fail. After the + * fcp_abort() callback completes, the transport will wait for any + * outstanding operation (if there was one) to complete, then will + * call the fcp_req_release() callback to return the command's + * exchange context back to the LLDD. + * * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req * to the LLDD after all operations on the fcp operation are complete. * This may be due to the command completing or upon completion of @@ -848,6 +850,8 @@ struct nvmet_fc_target_template { struct nvmefc_tgt_ls_req *tls_req); int (*fcp_op)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); + void (*fcp_abort)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); @@ -877,4 +881,7 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq, void *cmdiubuf, u32 cmdiubuf_len); +void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); + #endif /* _NVME_FC_DRIVER_H */ -- cgit v1.2.3 From 19b7ccf8651df09d274671b53039c672a52ad84d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 18 Apr 2017 18:43:20 +0200 Subject: block: get rid of blk_integrity_revalidate() Commit 25520d55cdb6 ("block: Inline blk_integrity in struct gendisk") introduced blk_integrity_revalidate(), which seems to assume ownership of the stable pages flag and unilaterally clears it if no blk_integrity profile is registered: if (bi->profile) disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; It's called from revalidate_disk() and rescan_partitions(), making it impossible to enable stable pages for drivers that support partitions and don't use blk_integrity: while the call in revalidate_disk() can be trivially worked around (see zram, which doesn't support partitions and hence gets away with zram_revalidate_disk()), rescan_partitions() can be triggered from userspace at any time. This breaks rbd, where the ceph messenger is responsible for generating/verifying CRCs. Since blk_integrity_{un,}register() "must" be used for (un)registering the integrity profile with the block layer, move BDI_CAP_STABLE_WRITES setting there. This way drivers that call blk_integrity_register() and use integrity infrastructure won't interfere with drivers that don't but still want stable pages. Fixes: 25520d55cdb6 ("block: Inline blk_integrity in struct gendisk") Cc: "Martin K. Petersen" Cc: Christoph Hellwig Cc: Mike Snitzer Cc: stable@vger.kernel.org # 4.4+, needs backporting Tested-by: Dan Williams Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9e11082c7f9b..acff9437e5c3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -722,11 +722,9 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #if defined(CONFIG_BLK_DEV_INTEGRITY) extern void blk_integrity_add(struct gendisk *); extern void blk_integrity_del(struct gendisk *); -extern void blk_integrity_revalidate(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline void blk_integrity_add(struct gendisk *disk) { } static inline void blk_integrity_del(struct gendisk *disk) { } -static inline void blk_integrity_revalidate(struct gendisk *disk) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #else /* CONFIG_BLOCK */ -- cgit v1.2.3 From 2836ee4b1acbe7b396219d0677426885f14cd792 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 26 Apr 2017 13:47:56 -0700 Subject: blk-mq: Add blk_mq_ops.show_rq() This new callback function will be used in the next patch to show more information about SCSI requests. Signed-off-by: Bart Van Assche Reviewed-by: Omar Sandoval Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0c4dadb85f62..32bd8eb5ba67 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -142,6 +142,14 @@ struct blk_mq_ops { reinit_request_fn *reinit_request; map_queues_fn *map_queues; + +#ifdef CONFIG_BLK_DEBUG_FS + /* + * Used by the debugfs implementation to show driver-specific + * information about a request. + */ + void (*show_rq)(struct seq_file *m, struct request *rq); +#endif }; enum { -- cgit v1.2.3 From 9f993737906b30d7b2454a38637d1f70ffd60f2f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2017 09:54:54 -0600 Subject: blk-mq: unify hctx delayed_run_work and run_work They serve the exact same purpose. Get rid of the non-delayed work variant, and just run it without delay for the normal case. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 32bd8eb5ba67..c7cc90328426 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -15,7 +15,7 @@ struct blk_mq_hw_ctx { unsigned long state; /* BLK_MQ_S_* flags */ } ____cacheline_aligned_in_smp; - struct work_struct run_work; + struct delayed_work run_work; cpumask_var_t cpumask; int next_cpu; int next_cpu_batch; @@ -51,7 +51,6 @@ struct blk_mq_hw_ctx { atomic_t nr_active; - struct delayed_work delayed_run_work; struct delayed_work delay_work; struct hlist_node cpuhp_dead; -- cgit v1.2.3 From 818cd1cbaa7b00bbc35452a76bebc681a65f1912 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2017 09:54:55 -0600 Subject: block: add kblock_mod_delayed_work_on() This modifies (or adds, if not currently pending) an existing delayed work item. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6c247861cb66..d098c66b3ab0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1685,6 +1685,7 @@ int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); +int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* -- cgit v1.2.3 From 21c6e939a9f6bb06fe616a87defec0f92a7c3df0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2017 09:54:56 -0600 Subject: blk-mq: unify hctx delay_work and run_work The only difference between ->run_work and ->delay_work, is that the latter is used to defer running a queue. This is done by marking the queue stopped, and scheduling ->delay_work to run sometime in the future. While the queue is stopped, direct runs or runs through ->run_work will not run the queue. If we combine the handlers, then we need to handle two things: 1) If a delayed/stopped run is scheduled, then we should not run the queue before that has been completed. 2) If a queue is delayed/stopped, the handler needs to restart the queue. Normally a run of a queue with the stopped bit set would be a no-op. Case 1 is handled by modifying a currently pending queue run to the deadline set by the caller of blk_mq_delay_queue(). Subsequent attempts to queue a queue run will find the work item already pending, and direct runs will see a stopped queue as before. Case 2 is handled by adding a new bit, BLK_MQ_S_START_ON_RUN, that tells the work handler that it should clear a stopped queue and run the handler. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c7cc90328426..f3e5e1de1bdb 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,8 +51,6 @@ struct blk_mq_hw_ctx { atomic_t nr_active; - struct delayed_work delay_work; - struct hlist_node cpuhp_dead; struct kobject kobj; @@ -168,6 +166,7 @@ enum { BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_S_TAG_WAITING = 3, + BLK_MQ_S_START_ON_RUN = 4, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3