From 797476b88bde2a6001f9552f383f147e58c1a330 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 18 Oct 2016 15:40:29 +0900 Subject: block: Add 'zoned' queue limit Add the zoned queue limit to indicate the zoning model of a block device. Defined values are 0 (BLK_ZONED_NONE) for regular block devices, 1 (BLK_ZONED_HA) for host-aware zoned block devices and 2 (BLK_ZONED_HM) for host-managed zoned block devices. The standard-defined drive-managed model is not defined here since these block devices do not provide any command for accessing zone information. Drive-managed devices will be reported as BLK_ZONED_NONE. The helper functions blk_queue_zoned_model and bdev_zoned_model return the zoned limit, and the functions blk_queue_is_zoned and bdev_is_zoned return a boolean for callers to test if a block device is zoned. The zoned attribute is also exported as a string to applications via sysfs. BLK_ZONED_NONE shows as "none", BLK_ZONED_HA as "host-aware" and BLK_ZONED_HM as "host-managed". Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c47c358ba052..f19e16bb43d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -261,6 +261,15 @@ struct blk_queue_tag { #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) +/* + * Zoned block device models (zoned limit). + */ +enum blk_zoned_model { + BLK_ZONED_NONE, /* Regular block device */ + BLK_ZONED_HA, /* Host-aware zoned block device */ + BLK_ZONED_HM, /* Host-managed zoned block device */ +}; + struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; @@ -290,6 +299,7 @@ struct queue_limits { unsigned char cluster; unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; + enum blk_zoned_model zoned; }; struct request_queue { @@ -627,6 +637,23 @@ static inline unsigned int blk_queue_cluster(struct request_queue *q) return q->limits.cluster; } +static inline enum blk_zoned_model +blk_queue_zoned_model(struct request_queue *q) +{ + return q->limits.zoned; +} + +static inline bool blk_queue_is_zoned(struct request_queue *q) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + case BLK_ZONED_HM: + return true; + default: + return false; + } +} + /* * We regard a request as sync, if either a read or a sync write */ @@ -1354,6 +1381,26 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zoned_model(q); + + return BLK_ZONED_NONE; +} + +static inline bool bdev_is_zoned(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_is_zoned(q); + + return false; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ?
q->dma_alignment : 511; -- cgit v1.2.3
From 2d253440b5afb128d22ccdae812dde9ba77a2cca Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 18 Oct 2016 15:40:32 +0900 Subject: block: Define zoned block device operations Define REQ_OP_ZONE_REPORT and REQ_OP_ZONE_RESET for handling zones of host-managed and host-aware zoned block devices. With these two new operations, the total number of operations defined reaches 8 and still fits within the 3-bit definition of REQ_OP_BITS. Signed-off-by: Shaun Tancheff Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cd395ecec99d..dd50dce89a80 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -243,6 +243,8 @@ enum req_op { REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ REQ_OP_WRITE_SAME, /* write same block many times */ REQ_OP_FLUSH, /* request for cache flush */ + REQ_OP_ZONE_REPORT, /* Get zone information */ + REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ }; #define REQ_OP_BITS 3 -- cgit v1.2.3
From 6a0cb1bc106fc07ce0443303bcdb7f7da5131e5c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Oct 2016 15:40:33 +0900 Subject: block: Implement support for zoned block devices Implement zoned block device zone information reporting and reset. Zone information is reported as struct blk_zone. This implementation does not differentiate between host-aware and host-managed device models and is valid for both. Two functions are provided: blkdev_report_zones for discovering the zone configuration of a zoned block device, and blkdev_reset_zones for resetting the write pointer of sequential zones. The helper functions blk_queue_zone_size and bdev_zone_size are also provided for, as the names suggest, obtaining the zone size (in 512B sectors) of the zones of the device. Signed-off-by: Hannes Reinecke [Damien: * Removed the zone cache * Implement report zones operation based on earlier proposal by Shaun Tancheff ] Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f19e16bb43d1..252043f7cd2c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -302,6 +303,21 @@ struct queue_limits { enum blk_zoned_model zoned; }; +#ifdef CONFIG_BLK_DEV_ZONED + +struct blk_zone_report_hdr { + unsigned int nr_zones; + u8 padding[60]; +}; + +extern int blkdev_report_zones(struct block_device *bdev, + sector_t sector, struct blk_zone *zones, + unsigned int *nr_zones, gfp_t gfp_mask); +extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, + sector_t nr_sectors, gfp_t gfp_mask); + +#endif /* CONFIG_BLK_DEV_ZONED */ + struct request_queue { /* * Together with queue_head for cacheline sharing @@ -654,6 +670,11 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } } +static inline unsigned int blk_queue_zone_size(struct request_queue *q) +{ + return blk_queue_is_zoned(q) ?
q->limits.chunk_sectors : 0; +} + /* * We regard a request as sync, if either a read or a sync write */ @@ -1401,6 +1422,16 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } +static inline unsigned int bdev_zone_size(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zone_size(q); + + return 0; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3
From 3ed05a987e0f63b21e634101e0b460d32f3581c3 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 18 Oct 2016 15:40:35 +0900 Subject: blk-zoned: implement ioctls Add the new BLKREPORTZONE and BLKRESETZONE ioctls for, respectively, obtaining the zone configuration of a zoned block device and resetting the write pointer of sequential zones of a zoned block device. The BLKREPORTZONE ioctl maps directly to a single call of the function blkdev_report_zones. The zone information result is passed as an array of struct blk_zone identical to the structure used internally for processing the REQ_OP_ZONE_REPORT operation. The BLKRESETZONE ioctl maps to a single call of the blkdev_reset_zones function. Signed-off-by: Shaun Tancheff Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 252043f7cd2c..90097dd8b8ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -316,6 +316,27 @@ extern int blkdev_report_zones(struct block_device *bdev, extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); + +#else /* CONFIG_BLK_DEV_ZONED */ + +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + +static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_DEV_ZONED */ struct request_queue { -- cgit v1.2.3
From c4aebd0332da831a3403faf2035af45059ab6b7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:09 +0200 Subject: block: remove bio_is_rw With the addition of the zoned operations the tests in this function became incorrect. But I think it's much better to just open code the allowed operations in the only caller anyway.
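For reference, a caller that still wants the old semantics can open-code the test with the two helpers that remain in bio.h; a minimal sketch (the function name is hypothetical, not the actual open-coded caller):

/*
 * Hypothetical open-coded equivalent of the removed bio_is_rw():
 * a bio is "rw" only if it carries data and its iterator actually advances.
 */
static inline bool my_bio_is_rw(struct bio *bio)
{
	return bio_has_data(bio) && !bio_no_advance_iter(bio);
}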
Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/bio.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 97cb48f03dc7..87ce64dafb93 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -83,17 +83,6 @@ static inline bool bio_no_advance_iter(struct bio *bio) bio_op(bio) == REQ_OP_WRITE_SAME; } -static inline bool bio_is_rw(struct bio *bio) -{ - if (!bio_has_data(bio)) - return false; - - if (bio_no_advance_iter(bio)) - return false; - - return true; -} - static inline bool bio_mergeable(struct bio *bio) { if (bio->bi_opf & REQ_NOMERGE_FLAGS) -- cgit v1.2.3
From bd1c1c21741cbd6e894960bcbc8b36f719590064 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:10 +0200 Subject: block: REQ_NOMERGE is common to the bio and request So move it into the common section of the request flags. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dd50dce89a80..b54142534793 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -158,6 +158,7 @@ enum rq_flag_bits { __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ + __REQ_NOMERGE, /* don't touch this for merging */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ @@ -171,7 +172,6 @@ enum rq_flag_bits { /* request only flags */ __REQ_SORTED, /* elevator knows about this request */ __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ - __REQ_NOMERGE, /* don't touch this for merging */ __REQ_STARTED, /* drive already may have started this one */ __REQ_DONTPREP, /* don't call prep for this one */ __REQ_QUEUED, /* uses queueing */ -- cgit v1.2.3
From 188bd2b16b3c6ea87a90df20f33db0adcdb75f0c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:11 +0200 Subject: block: move REQ_RAHEAD to common flags The information that an I/O is a read-ahead can be useful for drivers. In fact the NVMe driver already checks it, even if it won't ever be set at the moment. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b54142534793..44f9bca332e5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -163,9 +163,9 @@ enum rq_flag_bits { __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ + __REQ_RAHEAD, /* read ahead, can fail anytime */ /* bio only flags */ - __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again.
*/ @@ -205,7 +205,7 @@ enum rq_flag_bits { (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) #define REQ_COMMON_MASK \ (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE) + REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_RAHEAD) #define REQ_CLONE_MASK REQ_COMMON_MASK /* This mask is used for both bio and request merge checking */ -- cgit v1.2.3
From 8d2bbd4c8236e9e38e6b36ac9e2c54fdcfe5b335 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:12 +0200 Subject: block: replace REQ_THROTTLED with a bio flag It's the last bio-only REQ_* flag, and we have space for it in the bio bi_flags field. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 44f9bca332e5..6df722de2e22 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -119,6 +119,8 @@ struct bio { #define BIO_QUIET 6 /* Make BIO Quiet */ #define BIO_CHAIN 7 /* chained bio, ->bi_remaining in effect */ #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ +#define BIO_THROTTLED 9 /* This bio has already been subjected to + * throttling rules. Don't do it again. */ /* * Flags starting here get preserved by bio_reset() - this includes @@ -165,10 +167,6 @@ enum rq_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - /* bio only flags */ - __REQ_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ - /* request only flags */ __REQ_SORTED, /* elevator knows about this request */ __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ @@ -213,8 +211,6 @@ enum rq_flag_bits { (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_THROTTLED (1ULL << __REQ_THROTTLED) - #define REQ_SORTED (1ULL << __REQ_SORTED) #define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER) #define REQ_FUA (1ULL << __REQ_FUA) -- cgit v1.2.3
From e806402130c9c494e22c73ae9ead4e79d2a5811c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:13 +0200 Subject: block: split out request-only flags into a new namespace A lot of the REQ_* flags are only used on struct requests, and only of use to the block layer and a few drivers that dig into struct request internals. This patch adds a new req_flags_t rq_flags field to struct request for them, and thus dramatically shrinks the number of common request flags. It also removes the unfortunate situation where we have to fit the fields from the same enum into 32 bits for struct bio and 64 bits for struct request.
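To make the split concrete, a hedged sketch of what a driver's prepare path might look like afterwards, using the RQF_* namespace introduced in the blkdev.h hunk below (my_driver_prep and my_driver_do_fua are hypothetical names):

/*
 * Sketch (hypothetical driver code): request-private state now lives in
 * rq->rq_flags (RQF_*), while I/O semantics such as FUA stay in the
 * shared cmd_flags (REQ_*).
 */
static void my_driver_prep(struct request *rq)
{
	if (rq->rq_flags & RQF_DONTPREP)	/* already prepared */
		return;
	if (rq->cmd_flags & REQ_FUA)		/* common flag, also valid on bios */
		my_driver_do_fua(rq);		/* hypothetical helper */
	rq->rq_flags |= RQF_DONTPREP;
}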
Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 39 +------------------------------------ include/linux/blkdev.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 49 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6df722de2e22..ec69a8fe3b29 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -167,26 +167,6 @@ enum rq_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - /* request only flags */ - __REQ_SORTED, /* elevator knows about this request */ - __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ - __REQ_STARTED, /* drive already may have started this one */ - __REQ_DONTPREP, /* don't call prep for this one */ - __REQ_QUEUED, /* uses queueing */ - __REQ_ELVPRIV, /* elevator private data attached */ - __REQ_FAILED, /* set if the request failed */ - __REQ_QUIET, /* don't worry about errors */ - __REQ_PREEMPT, /* set for "ide_preempt" requests and also - for requests for which the SCSI "quiesce" - state must be ignored. */ - __REQ_ALLOCED, /* request came from our alloc pool */ - __REQ_COPY_USER, /* contains copies of user pages */ - __REQ_FLUSH_SEQ, /* request for flush sequence */ - __REQ_IO_STAT, /* account I/O stat */ - __REQ_MIXED_MERGE, /* merge of different types, fail separately */ - __REQ_PM, /* runtime pm request */ - __REQ_HASHED, /* on IO scheduler merge hash */ - __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ }; @@ -208,29 +188,12 @@ enum rq_flag_bits { /* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ) + (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_SORTED (1ULL << __REQ_SORTED) -#define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER) #define REQ_FUA (1ULL << __REQ_FUA) #define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_STARTED (1ULL << __REQ_STARTED) -#define REQ_DONTPREP (1ULL << __REQ_DONTPREP) -#define REQ_QUEUED (1ULL << __REQ_QUEUED) -#define REQ_ELVPRIV (1ULL << __REQ_ELVPRIV) -#define REQ_FAILED (1ULL << __REQ_FAILED) -#define REQ_QUIET (1ULL << __REQ_QUIET) -#define REQ_PREEMPT (1ULL << __REQ_PREEMPT) -#define REQ_ALLOCED (1ULL << __REQ_ALLOCED) -#define REQ_COPY_USER (1ULL << __REQ_COPY_USER) #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) -#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) -#define REQ_IO_STAT (1ULL << __REQ_IO_STAT) -#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) -#define REQ_PM (1ULL << __REQ_PM) -#define REQ_HASHED (1ULL << __REQ_HASHED) -#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) enum req_op { REQ_OP_READ, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 90097dd8b8ed..b4415feac679 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -78,6 +78,50 @@ enum rq_cmd_type_bits { REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; +/* + * request flags */ +typedef __u32 __bitwise req_flags_t; + +/* elevator knows about this request */ +#define RQF_SORTED ((__force req_flags_t)(1 << 0)) +/* drive already may have started this one */ +#define RQF_STARTED ((__force req_flags_t)(1 << 1)) +/* uses tagged queueing */ +#define RQF_QUEUED ((__force req_flags_t)(1 << 2)) +/* may not be passed by ioscheduler */ +#define RQF_SOFTBARRIER ((__force 
req_flags_t)(1 << 3)) /* request for flush sequence */ #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) /* merge of different types, fail separately */ #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) /* track inflight for MQ */ #define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) /* don't call prep for this one */ #define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) /* set for "ide_preempt" requests and also for requests for which the SCSI "quiesce" state must be ignored. */ #define RQF_PREEMPT ((__force req_flags_t)(1 << 8)) /* contains copies of user pages */ #define RQF_COPY_USER ((__force req_flags_t)(1 << 9)) /* vaguely specified driver internal error. Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ #define RQF_QUIET ((__force req_flags_t)(1 << 11)) /* elevator private data attached */ #define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) /* account I/O stat */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) /* request came from our alloc pool */ #define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) /* runtime pm request */ #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ #define RQF_HASHED ((__force req_flags_t)(1 << 16)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ) #define BLK_MAX_CDB 16 /* @@ -99,6 +143,7 @@ struct request { int cpu; unsigned cmd_type; u64 cmd_flags; + req_flags_t rq_flags; unsigned long atomic_flags; /* the following two fields are internal, NEVER access directly */ @@ -648,7 +693,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) REQ_FAILFAST_DRIVER)) #define blk_account_rq(rq) \ - (((rq)->cmd_flags & REQ_STARTED) && \ + (((rq)->rq_flags & RQF_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) @@ -740,6 +785,8 @@ static inline bool rq_mergeable(struct request *rq) if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; + if (rq->rq_flags & RQF_NOMERGE_FLAGS) + return false; return true; } -- cgit v1.2.3
From ef295ecf090d3e86e5b742fc6ab34f1122a43773 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2016 08:48:16 -0600 Subject: block: better op and flags encoding Now that we don't need the common flags to overflow outside the range of a 32-bit type we can encode them the same way for both the bio and request fields. In addition, this allows us to place the operation first (and make some room for more ops while we're at it) and to stop having to shift around the operation values. It also allows passing around only one value in the block layer instead of two (and eventually also in the file systems, but we can do that later) and thus clean up a lot of code. Last but not least this allows decreasing the size of the cmd_flags field in struct request to 32 bits. Various functions passing this value could also be updated, but I'd like to avoid the churn for now.
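With op and flags sharing one 32-bit value, decoding either half becomes a plain mask, as the bio_op()/req_op() macros in the hunks below show; a small hedged sketch of a consumer (the function name is hypothetical):

/* Sketch: reading both halves of the unified op+flags encoding. */
static void my_inspect(struct bio *bio)
{
	unsigned int op    = bio->bi_opf & REQ_OP_MASK;		/* low 8 bits: REQ_OP_* */
	unsigned int flags = bio->bi_opf & ~REQ_OP_MASK;	/* upper 24 bits: REQ_* */

	if (op == REQ_OP_WRITE && (flags & REQ_SYNC))
		pr_debug("synchronous write\n");
}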
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 11 +++--- include/linux/blk_types.h | 83 ++++++++++++++++++++------------------------ include/linux/blkdev.h | 26 ++------------ include/linux/blktrace_api.h | 2 +- include/linux/dm-io.h | 2 +- include/linux/elevator.h | 4 +-- 6 files changed, 48 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3bf5d33800ab..ddaf28d0988f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -581,15 +581,14 @@ static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The * caller is responsible for synchronizing calls to this function. */ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int op, int op_flags, uint64_t val) + unsigned int op, uint64_t val) { struct percpu_counter *cnt; @@ -600,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); - if (op_flags & REQ_SYNC) + if (op & REQ_SYNC) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; @@ -705,9 +704,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf, + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } rcu_read_unlock(); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ec69a8fe3b29..dca972d67548 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -88,24 +88,6 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) -#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) -#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) - -#define bio_set_op_attrs(bio, op, op_flags) do { \ - if (__builtin_constant_p(op)) \ - BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ - else \ - WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ - if (__builtin_constant_p(op_flags)) \ - BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - else \ - WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - (bio)->bi_opf = bio_flags(bio); \ - (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= (op_flags); \ -} while (0) - #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) /* @@ -147,26 +129,40 @@ struct bio { #endif /* CONFIG_BLOCK */ /* - * Request flags. For use in the cmd_flags field of struct request, and in - * bi_opf of struct bio. Note that some flags are only valid in either one. + * Operations and flags common to the bio and request structures. + * We use 8 bits for encoding the operation, and the remaining 24 for flags. 
*/ -enum rq_flag_bits { - /* common flags */ - __REQ_FAILFAST_DEV, /* no driver retries of device errors */ +#define REQ_OP_BITS 8 +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) +#define REQ_FLAG_BITS 24 + +enum req_opf { + REQ_OP_READ, + REQ_OP_WRITE, + REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ + REQ_OP_WRITE_SAME, /* write same block many times */ + REQ_OP_FLUSH, /* request for cache flush */ + REQ_OP_ZONE_REPORT, /* Get zone information */ + REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ + + REQ_OP_LAST, +}; + +enum req_flag_bits { + __REQ_FAILFAST_DEV = /* no driver retries of device errors */ + REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - __REQ_NOMERGE, /* don't touch this for merging */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - __REQ_NR_BITS, /* stops here */ }; @@ -176,37 +172,32 @@ enum rq_flag_bits { #define REQ_SYNC (1ULL << __REQ_SYNC) #define REQ_META (1ULL << __REQ_META) #define REQ_PRIO (1ULL << __REQ_PRIO) +#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) #define REQ_NOIDLE (1ULL << __REQ_NOIDLE) #define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY) +#define REQ_FUA (1ULL << __REQ_FUA) +#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -#define REQ_COMMON_MASK \ - (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_RAHEAD) -#define REQ_CLONE_MASK REQ_COMMON_MASK -/* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) -#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_FUA (1ULL << __REQ_FUA) -#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define bio_op(bio) \ + ((bio)->bi_opf & REQ_OP_MASK) +#define req_op(req) \ + ((req)->cmd_flags & REQ_OP_MASK) -enum req_op { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ - REQ_OP_ZONE_REPORT, /* Get zone information */ - REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ -}; +/* obsolete, don't use in new code */ +#define bio_set_op_attrs(bio, op, op_flags) \ + ((bio)->bi_opf |= (op | op_flags)) -#define REQ_OP_BITS 3 +static inline bool op_is_sync(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC); +} typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b4415feac679..8396da2bb698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -142,7 +142,7 @@ struct request { int cpu; unsigned cmd_type; - u64 cmd_flags; + unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; unsigned long atomic_flags; @@ -244,20 +244,6 @@ struct request { struct request *next_rq; }; -#define 
REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS) -#define req_op(req) ((req)->cmd_flags >> REQ_OP_SHIFT) - -#define req_set_op(req, op) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (req)->cmd_flags &= ((1ULL << REQ_OP_SHIFT) - 1); \ - (req)->cmd_flags |= ((u64) (op) << REQ_OP_SHIFT); \ -} while (0) - -#define req_set_op_attrs(req, op, flags) do { \ - req_set_op(req, op); \ - (req)->cmd_flags |= flags; \ -} while (0) - static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -741,17 +727,9 @@ static inline unsigned int blk_queue_zone_size(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -/* - * We regard a request as sync, if either a read or a sync write - */ -static inline bool rw_is_sync(int op, unsigned int rw_flags) -{ - return op == REQ_OP_READ || (rw_flags & REQ_SYNC); -} - static inline bool rq_is_sync(struct request *rq) { - return rw_is_sync(req_op(rq), rq->cmd_flags); + return op_is_sync(rq->cmd_flags); } static inline bool blk_rl_full(struct request_list *rl, bool sync) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cceb72f9e29f..e417f080219a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq) } extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes); +extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h index b91b023deffb..a52c6580cc9a 100644 --- a/include/linux/dm-io.h +++ b/include/linux/dm-io.h @@ -58,7 +58,7 @@ struct dm_io_notify { struct dm_io_client; struct dm_io_request { int bi_op; /* REQ_OP */ - int bi_op_flags; /* rq_flag_bits */ + int bi_op_flags; /* req_flag_bits */ struct dm_io_memory mem; /* Memory to use for io */ struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */ struct dm_io_client *client; /* Client memory handler */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index e7f358d2e5fc..f219c9aed360 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -30,7 +30,7 @@ typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_may_queue_fn) (struct request_queue *, int, int); +typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); @@ -139,7 +139,7 @@ extern struct request *elv_former_request(struct request_queue *, struct request extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); extern void elv_unregister_queue(struct request_queue *q); -extern int elv_may_queue(struct request_queue *, int, int); +extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, struct bio *bio, gfp_t gfp_mask); -- cgit v1.2.3 From 87374179c535a98337569904727aa02f960fe79e Mon Sep 17 00:00:00 2001 From: 
Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:15 +0200 Subject: block: add a proper block layer data direction encoding Currently the block layer op_is_write, bio_data_dir and rq_data_dir helpers treat every operation that is not a READ as a data out operation. This worked for a surprisingly long time, but the new REQ_OP_ZONE_REPORT operation actually adds a second operation that reads data from the device. Surprisingly nothing critical relied on this direction, but this might be a good opportunity to properly fix this issue up. We take a little inspiration and use the least significant bit of the operation number to encode the data direction, which just requires us to renumber the operations to fit this scheme. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 38 ++++++++++++++++++++++++++++++-------- include/linux/fs.h | 5 ----- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dca972d67548..3fa62cabe8d2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -131,20 +131,37 @@ struct bio { /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. + * + * The least significant bit of the operation number indicates the data + * transfer direction: + * + * - if the least significant bit is set transfers are TO the device + * - if the least significant bit is not set transfers are FROM the device + * + * If an operation does not transfer data the least significant bit has no + * meaning. */ #define REQ_OP_BITS 8 #define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) #define REQ_FLAG_BITS 24 enum req_opf { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ - REQ_OP_ZONE_REPORT, /* Get zone information */ - REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ + /* read sectors from the device */ + REQ_OP_READ = 0, + /* write sectors to the device */ + REQ_OP_WRITE = 1, + /* flush the volatile write cache */ + REQ_OP_FLUSH = 2, + /* discard sectors */ + REQ_OP_DISCARD = 3, + /* get zone information */ + REQ_OP_ZONE_REPORT = 4, + /* securely erase sectors */ + REQ_OP_SECURE_ERASE = 5, + /* reset a zone write pointer */ + REQ_OP_ZONE_RESET = 6, + /* write the same sector many times */ + REQ_OP_WRITE_SAME = 7, REQ_OP_LAST, }; @@ -194,6 +211,11 @@ enum req_flag_bits { #define bio_set_op_attrs(bio, op, op_flags) \ ((bio)->bi_opf |= (op | op_flags)) +static inline bool op_is_write(unsigned int op) +{ + return (op & 1); +} + static inline bool op_is_sync(unsigned int op) { return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 16d2b6e874d6..e3e878f12b25 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2499,11 +2499,6 @@ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); #ifdef CONFIG_BLOCK -static inline bool op_is_write(unsigned int op) -{ - return op == REQ_OP_READ ?
false : true; -} - /* * return data direction, READ or WRITE */ -- cgit v1.2.3 From d71d9ae14a0942fae519d890a743b12679e3d153 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:03 -0600 Subject: blk-cgroup: use op_is_sync to check for synchronous requests Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index ddaf28d0988f..01b62e7bac74 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -599,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); - if (op & REQ_SYNC) + if (op_is_sync(op)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; -- cgit v1.2.3 From 6f6b29171a192e84b666c816e49d2175afbbb09f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:07 -0600 Subject: block: don't use REQ_SYNC in the READ_SYNC definition Reads are synchronous per definition, don't add another flag for it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e3e878f12b25..5e0078fceed7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -196,7 +196,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define READ REQ_OP_READ #define WRITE REQ_OP_WRITE -#define READ_SYNC REQ_SYNC +#define READ_SYNC 0 #define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) #define WRITE_ODIRECT REQ_SYNC #define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH) -- cgit v1.2.3 From b685d3d65ac791406e0dfd8779cc9b3707fea5a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:08 -0600 Subject: block: treat REQ_FUA and REQ_PREFLUSH as synchronous MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of requiring everyone to specify the REQ_SYNC flag aѕ well. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 8 +++++++- include/linux/fs.h | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3fa62cabe8d2..107d23d18096 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -216,9 +216,15 @@ static inline bool op_is_write(unsigned int op) return (op & 1); } +/* + * Reads are always treated as synchronous, as are requests with the FUA or + * PREFLUSH flag. Other operations may be marked as synchronous using the + * REQ_SYNC flag. 
+ */ static inline bool op_is_sync(unsigned int op) { - return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC); + return (op & REQ_OP_MASK) == REQ_OP_READ || + (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)); } typedef unsigned int blk_qc_t; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5e0078fceed7..ccedccb28ec8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -199,9 +199,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define READ_SYNC 0 #define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) #define WRITE_ODIRECT REQ_SYNC -#define WRITE_FLUSH (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH) -#define WRITE_FUA (REQ_SYNC | REQ_NOIDLE | REQ_FUA) -#define WRITE_FLUSH_FUA (REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA) +#define WRITE_FLUSH (REQ_NOIDLE | REQ_PREFLUSH) +#define WRITE_FUA (REQ_NOIDLE | REQ_FUA) +#define WRITE_FLUSH_FUA (REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA) /* * Attribute flags. These should be or-ed together to figure out what * has been changed! -- cgit v1.2.3
From a2b809672ee6fcb4d5756ea815725b3dbaea654e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:09 -0600 Subject: block: replace REQ_NOIDLE with REQ_IDLE Noidle should be the default for writes as seen by all the compound definitions in fs.h using it. In fact only direct I/O really should be using NOIDLE, so turn the whole flag around to get the defaults right, which will make our life much easier especially once the WRITE_* defines go away. This assumes all the existing "raw" users of REQ_SYNC for writes want noidle behavior, which seems to be spot on from a quick audit. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 ++-- include/linux/fs.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 107d23d18096..63b750a3b165 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -175,7 +175,7 @@ enum req_flag_bits { __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ __REQ_NOMERGE, /* don't touch this for merging */ - __REQ_NOIDLE, /* don't anticipate more IO after this one */ + __REQ_IDLE, /* anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ @@ -190,7 +190,7 @@ enum req_flag_bits { #define REQ_META (1ULL << __REQ_META) #define REQ_PRIO (1ULL << __REQ_PRIO) #define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_NOIDLE (1ULL << __REQ_NOIDLE) +#define REQ_IDLE (1ULL << __REQ_IDLE) #define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY) #define REQ_FUA (1ULL << __REQ_FUA) #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) diff --git a/include/linux/fs.h b/include/linux/fs.h index ccedccb28ec8..46a74209917f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -197,11 +197,11 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define WRITE REQ_OP_WRITE #define READ_SYNC 0 -#define WRITE_SYNC (REQ_SYNC | REQ_NOIDLE) -#define WRITE_ODIRECT REQ_SYNC -#define WRITE_FLUSH (REQ_NOIDLE | REQ_PREFLUSH) -#define WRITE_FUA (REQ_NOIDLE | REQ_FUA) -#define WRITE_FLUSH_FUA (REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA) +#define WRITE_SYNC REQ_SYNC +#define WRITE_ODIRECT (REQ_SYNC | REQ_IDLE) +#define WRITE_FLUSH REQ_PREFLUSH +#define WRITE_FUA REQ_FUA +#define WRITE_FLUSH_FUA (REQ_PREFLUSH | REQ_FUA) /* * Attribute flags.
These should be or-ed together to figure out what -- cgit v1.2.3 From 70fd76140a6cb63262bd47b68d57b42e889c10ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:10 -0600 Subject: block,fs: use REQ_* flags directly Remove the WRITE_* and READ_SYNC wrappers, and just use the flags directly. Where applicable this also drops usage of the bio_set_op_attrs wrapper. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/fs.h | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 46a74209917f..7a1b78ab7c15 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -151,58 +151,11 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, */ #define CHECK_IOVEC_ONLY -1 -/* - * The below are the various read and write flags that we support. Some of - * them include behavioral modifiers that send information down to the - * block layer and IO scheduler. They should be used along with a req_op. - * Terminology: - * - * The block layer uses device plugging to defer IO a little bit, in - * the hope that we will see more IO very shortly. This increases - * coalescing of adjacent IO and thus reduces the number of IOs we - * have to send to the device. It also allows for better queuing, - * if the IO isn't mergeable. If the caller is going to be waiting - * for the IO, then he must ensure that the device is unplugged so - * that the IO is dispatched to the driver. - * - * All IO is handled async in Linux. This is fine for background - * writes, but for reads or writes that someone waits for completion - * on, we want to notify the block layer and IO scheduler so that they - * know about it. That allows them to make better scheduling - * decisions. So when the below references 'sync' and 'async', it - * is referencing this priority hint. - * - * With that in mind, the available types are: - * - * READ A normal read operation. Device will be plugged. - * READ_SYNC A synchronous read. Device is not plugged, caller can - * immediately wait on this read without caring about - * unplugging. - * WRITE A normal async write. Device will be plugged. - * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down - * the hint that someone will be waiting on this IO - * shortly. The write equivalent of READ_SYNC. - * WRITE_ODIRECT Special case write for O_DIRECT only. - * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. - * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on - * non-volatile media on completion. - * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded - * by a cache flush and data is guaranteed to be on - * non-volatile media on completion. - * - */ #define RW_MASK REQ_OP_WRITE #define READ REQ_OP_READ #define WRITE REQ_OP_WRITE -#define READ_SYNC 0 -#define WRITE_SYNC REQ_SYNC -#define WRITE_ODIRECT (REQ_SYNC | REQ_IDLE) -#define WRITE_FLUSH REQ_PREFLUSH -#define WRITE_FUA REQ_FUA -#define WRITE_FLUSH_FUA (REQ_PREFLUSH | REQ_FUA) - /* * Attribute flags. These should be or-ed together to figure out what * has been changed! 
-- cgit v1.2.3 From d38499530e5f170d30f32d3841fade204e63081d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:11 -0600 Subject: fs: decouple READ and WRITE from the block layer ops Move READ and WRITE to kernel.h and don't define them in terms of block layer ops; they are our generic data direction indicators these days and have no more resemblance with the block layer ops. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ++++++ include/linux/fs.h | 13 ------------- include/linux/kernel.h | 4 ++++ include/linux/uio.h | 2 +- 4 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 87ce64dafb93..fe9a17017608 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -62,6 +62,12 @@ #define bio_sectors(bio) ((bio)->bi_iter.bi_size >> 9) #define bio_end_sector(bio) ((bio)->bi_iter.bi_sector + bio_sectors((bio))) +/* + * Return the data direction, READ or WRITE. + */ +#define bio_data_dir(bio) \ + (op_is_write(bio_op(bio)) ? WRITE : READ) + /* * Check whether this bio carries any data or not. A NULL bio is allowed. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a1b78ab7c15..0ad36e0c7fa7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -151,11 +151,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, */ #define CHECK_IOVEC_ONLY -1 -#define RW_MASK REQ_OP_WRITE - -#define READ REQ_OP_READ -#define WRITE REQ_OP_WRITE - /* * Attribute flags. These should be or-ed together to figure out what * has been changed! @@ -2452,14 +2447,6 @@ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); #ifdef CONFIG_BLOCK -/* - * return data direction, READ or WRITE - */ -static inline int bio_data_dir(struct bio *bio) -{ - return op_is_write(bio_op(bio)) ? WRITE : READ; -} - extern void check_disk_size_change(struct gendisk *disk, struct block_device *bdev); extern int revalidate_disk(struct gendisk *); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bc6ed52a39b9..01b6b460c34d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -50,6 +50,10 @@ #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) #define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) +/* generic data direction definitions */ +#define READ 0 +#define WRITE 1 + #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) #define u64_to_user_ptr(x) ( \ diff --git a/include/linux/uio.h b/include/linux/uio.h index 6e22b544d039..d5aba1512b8b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -125,7 +125,7 @@ static inline bool iter_is_iovec(const struct iov_iter *i) * * The ?: is just for type safety. */ -#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & RW_MASK) +#define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & (READ | WRITE)) /* * Cap the iov_iter by given limit; note that the second argument is -- cgit v1.2.3 From 1e3914d4cf4e14653b7917b0e965217465cb7a9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:12 -0600 Subject: block, fs: move submit_bio to bio.h This is where all the other bio operations live, so users must include bio.h anyway. 
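With the direction helpers and submit_bio() now both in bio.h, a block consumer needs only that one header; a minimal hedged sketch (my_submit is a hypothetical name):

#include <linux/bio.h>

/* Sketch: classify and submit a bio using bio.h alone. */
static blk_qc_t my_submit(struct bio *bio)
{
	if (bio_data_dir(bio) == WRITE)		/* READ/WRITE now come from kernel.h */
		bio->bi_opf |= REQ_SYNC;	/* e.g. treat our writes as synchronous */
	return submit_bio(bio);
}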
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ include/linux/fs.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index fe9a17017608..5c604b4914bf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -404,6 +404,8 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } +extern blk_qc_t submit_bio(struct bio *); + extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ad36e0c7fa7..5b0a9b77534d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2717,7 +2717,6 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); #ifdef CONFIG_BLOCK -extern blk_qc_t submit_bio(struct bio *); extern int bdev_read_only(struct block_device *); #endif extern int set_blocksize(struct block_device *, int); -- cgit v1.2.3
From 2f8b544477e627a42e66902e948d87f86554aeca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:13 -0600 Subject: block,fs: untangle fs.h and blk_types.h Nothing in fs.h should require blk_types.h to be included. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- include/linux/swap.h | 1 + include/linux/writeback.h | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b0a9b77534d..8533e9d59c29 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -38,6 +37,7 @@ struct backing_dev_info; struct bdi_writeback; +struct bio; struct export_operations; struct hd_geometry; struct iovec; diff --git a/include/linux/swap.h b/include/linux/swap.h index a56523cefb9b..3a6aebc23001 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -11,6 +11,7 @@ #include #include #include +#include #include struct notifier_block; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 797100e10010..e4c38703bf4e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -10,6 +10,8 @@ #include #include +struct bio; + DECLARE_PER_CPU(int, dirty_throttle_leaks); /* -- cgit v1.2.3
From 9f08217120568afdfb59973a89a675e649c0096d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:15 -0600 Subject: ceph: don't include blk_types.h in messenger.h The file only needs the struct bvec_iter declaration, which is available from bvec.h. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8dbd7879fdc6..67bcef2ecddb 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -1,7 +1,7 @@ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H -#include +#include #include #include #include -- cgit v1.2.3
From be297968da22cf40c9c419df51e71ba8856a2ec2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:16 -0600 Subject: mm: only include blk_types in swap.h if CONFIG_SWAP is enabled It's only needed for the CONFIG_SWAP-only use of bio_end_io_t. Because CONFIG_SWAP implies CONFIG_BLOCK, this will allow us to drop some ifdefs in blk_types.h.
Instead we'll need to add a few explicit includes that were implicit before, though. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/swap.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3a6aebc23001..bfee1af1f54f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -11,7 +11,6 @@ #include #include #include -#include #include struct notifier_block; @@ -352,6 +351,9 @@ extern int kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_SWAP + +#include /* for bio_end_io_t */ + /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); -- cgit v1.2.3 From 7281b4526cefc898d180850b54d1369f38c6b202 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:17 -0600 Subject: block: remove the CONFIG_BLOCK ifdef in blk_types.h Now that we have a separate header for struct bio_vec there is absolutely no excuse for including this header from non-block I/O code. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 63b750a3b165..bb921028e7c5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,7 +17,6 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); -#ifdef CONFIG_BLOCK /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -126,8 +125,6 @@ struct bio { #define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) -#endif /* CONFIG_BLOCK */ - /* * Operations and flags common to the bio and request structures. * We use 8 bits for encoding the operation, and the remaining 24 for flags. -- cgit v1.2.3 From 1d796d6a9641fbfcd90fcfaf6fb4894a13d0304f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Nov 2016 09:52:57 -0600 Subject: block: add REQ_BACKGROUND This adds a new request flag, REQ_BACKGROUND, that callers can use to tell the block layer that this is background (non-urgent) IO. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- include/linux/blk_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index bb921028e7c5..562ac46cb790 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -177,6 +177,7 @@ enum req_flag_bits { __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ + __REQ_BACKGROUND, /* background IO */ __REQ_NR_BITS, /* stops here */ }; @@ -192,6 +193,7 @@ enum req_flag_bits { #define REQ_FUA (1ULL << __REQ_FUA) #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) +#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From 7637241e651ec36e409412869f986dd5f097735f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Nov 2016 10:00:38 -0600 Subject: writeback: add wbc_to_write_flags() Add wbc_to_write_flags(), which returns the write modifier flags to use, based on a struct writeback_control. 
No functional changes in this patch, but it prepares us for factoring other wbc fields for write type. Signed-off-by: Jens Axboe Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig --- include/linux/writeback.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e4c38703bf4e..50c96ee8108f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -9,6 +9,7 @@ #include #include #include +#include struct bio; @@ -102,6 +103,14 @@ struct writeback_control { #endif }; +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return REQ_SYNC; + + return 0; +} + /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global -- cgit v1.2.3
From 13edd5e7315a26b448c5f7f33fc7721b1e0c17ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Nov 2016 10:01:35 -0600 Subject: writeback: mark background writeback as such If we're doing background-type writes, then use the appropriate background write flags for that. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- include/linux/writeback.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 50c96ee8108f..c78f9f0920b5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -107,6 +107,8 @@ static inline int wbc_to_write_flags(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) return REQ_SYNC; + else if (wbc->for_kupdate || wbc->for_background) + return REQ_BACKGROUND; return 0; } -- cgit v1.2.3
From 2cefe4dbaadf83b236caab46705b4b5a4958e3b6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 31 Oct 2016 11:59:24 -0600 Subject: block: add bio_iov_iter_get_pages() This is a helper that pins down a range from an iov_iter and adds it to a bio without requiring a separate memory allocation for the page array. It will be used for upcoming direct I/O implementations for block devices and iomap based file systems. Signed-off-by: Kent Overstreet [hch: ported to the iov_iter interface, renamed and added comments. All blame should be directed to me and all fame should go to Kent after this!] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 5c604b4914bf..d367cd37a7f7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -427,6 +427,7 @@ void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, const struct iov_iter *, gfp_t); -- cgit v1.2.3
From fd00144301d64f1742541a3c5e64cd1c51f39c55 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:19:37 -0700 Subject: blk-mq: Introduce blk_mq_queue_stopped() The function blk_queue_stopped() allows one to test whether or not a traditional request queue has been stopped. Introduce a helper function that allows block drivers to easily query whether or not one or more hardware contexts of a blk-mq queue have been stopped.
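A hedged sketch of how a driver might use the new helper (the function name is hypothetical; blk_mq_queue_stopped() and blk_mq_start_hw_queues() are existing blk-mq.h declarations):

/*
 * Sketch (hypothetical driver code): restart dispatch only if at least
 * one hardware context of this queue is currently stopped.
 */
static void my_driver_resume(struct request_queue *q)
{
	if (blk_mq_queue_stopped(q))
		blk_mq_start_hw_queues(q);
}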
Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 535ab2e13d2e..aa930009fcd3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -223,6 +223,7 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq, int error); +bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); -- cgit v1.2.3 From 9b7dd572cc439fa92e120290eb74d0295567c5a0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:20:49 -0700 Subject: blk-mq: Remove blk_mq_cancel_requeue_work() Since blk_mq_requeue_work() no longer restarts stopped queues, canceling requeue work is no longer needed to prevent a stopped queue from being restarted. Hence remove this function. Signed-off-by: Bart Van Assche Cc: Mike Snitzer Cc: Keith Busch Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index aa930009fcd3..a85a20f80aaa 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -217,7 +217,6 @@ void __blk_mq_end_request(struct request *rq, int error); void blk_mq_requeue_request(struct request *rq); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); -void blk_mq_cancel_requeue_work(struct request_queue *q); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); -- cgit v1.2.3 From 6a83e74d214a47a1371cd2e6a783264fcba7d428 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 2 Nov 2016 10:09:51 -0600 Subject: blk-mq: Introduce blk_mq_quiesce_queue() blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations have finished. This function does *not* wait until all outstanding requests have finished (this means invocation of request.end_io()). The algorithm used by blk_mq_quiesce_queue() is as follows: * Hold either an RCU read lock or an SRCU read lock around .queue_rq() calls. The former is used if .queue_rq() does not block and the latter if .queue_rq() may block. * blk_mq_quiesce_queue() first calls blk_mq_stop_hw_queues() followed by synchronize_srcu() or synchronize_rcu(). The latter call waits for .queue_rq() invocations that started before blk_mq_quiesce_queue() was called. * The blk_mq_hctx_stopped() calls that control whether or not .queue_rq() will be called are made with the (S)RCU read lock held. This is necessary to avoid race conditions against blk_mq_quiesce_queue().
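Reduced to a sketch (simplified from the blk-mq implementation this describes), the two sides of the scheme look like:

    /* Dispatch side: .queue_rq() runs under the (S)RCU read lock. */
    srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
    if (!blk_mq_hctx_stopped(hctx))
            ret = q->mq_ops->queue_rq(hctx, &bd);
    srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);

    /* Quiesce side: stop the queues, then wait out the readers. */
    blk_mq_stop_hw_queues(q);
    synchronize_srcu(&hctx->queue_rq_srcu);
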
Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +++ include/linux/blkdev.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a85a20f80aaa..ed20ac74c62a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -3,6 +3,7 @@ #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -35,6 +36,8 @@ struct blk_mq_hw_ctx { struct blk_mq_tags *tags; + struct srcu_struct queue_rq_srcu; + unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8396da2bb698..13d893a69b46 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -918,6 +918,7 @@ extern void __blk_run_queue(struct request_queue *q); extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); +extern void blk_mq_quiesce_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); -- cgit v1.2.3 From 2b053aca76b48e681be57b34ca3a8c2c10b275c5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 28 Oct 2016 17:21:41 -0700 Subject: blk-mq: Add a kick_requeue_list argument to blk_mq_requeue_request() Most blk_mq_requeue_request() and blk_mq_add_to_requeue_list() calls are followed by kicking the requeue list. Hence add an argument to these two functions that allows callers to kick the requeue list. This was proposed by Christoph Hellwig. Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Cc: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ed20ac74c62a..35a0af5ede6d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -218,8 +218,9 @@ void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, int error); void __blk_mq_end_request(struct request *rq, int error); -void blk_mq_requeue_request(struct request *rq); -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); +void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); -- cgit v1.2.3 From 50d24c34403c62ad29e8b6db559d491bae20b4b7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Nov 2016 17:03:53 -0700 Subject: block: immediately dispatch big size request Currently the block plug holds up to 16 non-mergeable requests. This makes sense if the request size is small, eg, to reduce lock contention. But if the request size is big enough, we don't need to worry about lock contention. Holding such a request makes no sense and it lowers disk utilization. In practice, this improves throughput by 10% for my raid5 sequential write workload. The size (128k) is arbitrary right now, but it makes sure lock contention is small.
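A simplified sketch of the resulting check at plug time, using the BLK_PLUG_FLUSH_SIZE definition added below ('last' being the most recently plugged request):

    /* Simplified: flush the plug once it is full, or once the last
     * queued request is big enough to be worth dispatching now.
     */
    if (request_count >= BLK_MAX_REQUEST_COUNT ||
        (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE))
            blk_flush_plug_list(plug, false);
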
This probably could be more intelligent, eg, checking the average size of the requests held. Since this is mainly for sequential IO, it's probably not worth it. V2: check the last request instead of the first request, so as long as there is one big size request we flush the plug. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13d893a69b46..9189a2d5c392 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1173,6 +1173,7 @@ struct blk_plug { struct list_head cb_list; /* md requires an unplug callback */ }; #define BLK_MAX_REQUEST_COUNT 16 +#define BLK_PLUG_FLUSH_SIZE (128 * 1024) struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); -- cgit v1.2.3 From d278d4a8892f13b6a9eb6102b356402f0e062324 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:21:08 -0600 Subject: block: add code to track actual device queue depth For blk-mq, ->nr_requests does track queue depth, at least at init time. But for the older queue paths, it's simply a soft setting. On top of that, it's generally larger than the hardware setting on purpose, to allow backup of requests for merging. Fill a hole in struct request_queue with a 'queue_depth' member that drivers can set, via the new blk_set_queue_depth() helper, to more closely inform the block layer of the real queue depth. Signed-off-by: Jens Axboe Reviewed-by: Jan Kara --- include/linux/blkdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9189a2d5c392..d364be6e6959 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -405,6 +405,8 @@ struct request_queue { struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; + unsigned int queue_depth; + /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; @@ -777,6 +779,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) return false; } +static inline unsigned int blk_queue_depth(struct request_queue *q) +{ + if (q->queue_depth) + return q->queue_depth; + + return q->nr_requests; +} + /* * q->prep_rq_fn return values */ @@ -1094,6 +1104,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, -- cgit v1.2.3 From b57d74aff9ab92fbfb7c197c384d1adfa2827b2e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Sep 2016 10:20:33 -0600 Subject: writeback: track if we're sleeping on progress in balance_dirty_pages() Note in the bdi_writeback structure whenever a task ends up sleeping waiting for progress. We can use that information in the lower layers to increase the priority of writes.
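The bookkeeping itself is tiny; a sketch of the balance_dirty_pages() side (simplified, with the pause computation elided):

    /* Record that a dirtier is about to sleep waiting on writeback
     * progress, then actually sleep.
     */
    wb->dirty_sleep = jiffies;
    io_schedule_timeout(pause);
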
Signed-off-by: Jens Axboe Reviewed-by: Jan Kara --- include/linux/backing-dev-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c357f27d5483..dc5f76d7f648 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -116,6 +116,8 @@ struct bdi_writeback { struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + unsigned long dirty_sleep; /* last wait */ + struct list_head bdi_node; /* anchored at bdi->wb_list */ #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From d49187e97e94e2eb613cb6fed810356972077cc3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 10 Nov 2016 07:32:33 -0800 Subject: nvme: introduce struct nvme_request This adds a shared per-request structure for all NVMe I/O. This structure is embedded as the first member in all NVMe transport drivers' request private data and allows implementing common functionality between the drivers. The first use is to replace the current abuse of the SCSI command passthrough fields in struct request for the NVMe command passthrough, but it will grow a few more fields to allow implementing things like common abort handlers in the future. The passthrough commands are handled by having a pointer to the SQE (struct nvme_command) in struct nvme_request, and the union of the possible result fields, which had to be turned from an anonymous into a named union for that purpose. This avoids having to pass a reference to a full CQE around and thus makes checking the result a lot more lightweight. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/nvme.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7676557ce357..18ce9f7cc881 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -949,11 +949,11 @@ struct nvme_completion { /* * Used by Admin and Fabrics commands to return data: */ - union { - __le16 result16; - __le32 result; - __le64 result64; - }; + union nvme_result { + __le16 u16; + __le32 u32; + __le64 u64; + } result; __le16 sq_head; /* how much of this queue may be reclaimed */ __le16 sq_id; /* submission queue that generated this entry */ __u16 command_id; /* of the command which completed */ -- cgit v1.2.3 From cf43e6be865a582ba66ee4747ae27a0513f6bba1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Nov 2016 21:32:37 -0700 Subject: block: add scalable completion tracking of requests For legacy block, we simply track them in the request queue. For blk-mq, we track them on a per-sw queue basis, which we can then sum up through the hardware queues and finally to a per device state. The stats are tracked in, roughly, 0.1s interval windows. Add sysfs files to display the stats. The feature is off by default, to avoid any extra overhead. In-kernel users of it can turn it on by setting QUEUE_FLAG_STATS in the queue flags. We currently don't turn it on if someone just reads any of the stats files, that is something we could add as well.
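The per-window bookkeeping added below lends itself to batched accounting; one plausible way a completion sample could be folded into the window (illustrative, not the verbatim blk-stat code):

    /* Accumulate samples (a u64 'value') in 'batch' and only fold them
     * into the running mean every BLK_RQ_STAT_BATCH samples, keeping
     * the per-completion cost low.
     */
    stat->min = min(stat->min, value);
    stat->max = max(stat->max, value);
    stat->batch += value;
    if (++stat->nr_batch >= BLK_RQ_STAT_BATCH) {
            stat->mean = div64_s64(stat->mean * stat->nr_samples + stat->batch,
                                   stat->nr_samples + stat->nr_batch);
            stat->nr_samples += stat->nr_batch;
            stat->nr_batch = 0;
            stat->batch = 0;
    }
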
Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 16 ++++++++++++++++ include/linux/blkdev.h | 7 +++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 562ac46cb790..4d0044d09984 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -250,4 +250,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +struct blk_issue_stat { + u64 time; +}; + +#define BLK_RQ_STAT_BATCH 64 + +struct blk_rq_stat { + s64 mean; + u64 min; + u64 max; + s32 nr_samples; + s32 nr_batch; + u64 batch; + s64 time; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d364be6e6959..303723a2e5b8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -117,6 +117,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ #define RQF_HASHED ((__force req_flags_t)(1 << 16)) +/* IO stats tracking on */ +#define RQF_STATS ((__force req_flags_t)(1 << 17)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -197,6 +199,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + struct blk_issue_stat issue_stat; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -492,6 +495,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -585,6 +591,7 @@ struct request_queue { #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ +#define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From 87760e5eef359788047d6fd54fc12eec74ce0d27 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Nov 2016 12:38:14 -0700 Subject: block: hook up writeback throttling Enable throttling of buffered writeback to make it a lot smoother and have way less impact on other system activity. Background writeback should be, by definition, background activity. The fact that we flush huge bundles of it at a time means that it potentially has heavy impacts on foreground workloads, which isn't ideal. We can't easily limit the sizes of writes that we do, since that would impact file system layout in the presence of delayed allocation. So just throttle back buffered writeback, unless someone is waiting for it. The algorithm for when to throttle takes its inspiration from the CoDel network scheduling algorithm. Like CoDel, blk-wb monitors the minimum latencies of requests over a window of time. In that window of time, if the minimum latency of any request exceeds a given target, then a scale count is incremented and the queue depth is shrunk. The next monitoring window is shrunk accordingly. Unlike CoDel, if we hit a window that exhibits good behavior, then we simply increment the scale count and re-calculate the limits for that scale value. This prevents us from oscillating between a close-to-ideal value and max all the time, instead remaining in the windows where we get good behavior.
Unlike CoDel, blk-wb allows the scale count to go negative. This happens if we primarily have writes going on. Unlike positive scale counts, this doesn't change the size of the monitoring window. When the heavy writers finish, blk-wb quickly snaps back to its stable state of a zero scale count. The patch registers a sysfs entry, 'wb_lat_usec'. This sets the latency target to be met. It defaults to 2 msec for non-rotational storage, and 75 msec for rotational storage. Setting this value to '0' disables blk-wb. Generally, a user would not have to touch this setting. We don't enable WBT on devices that are managed with CFQ, and have a non-root block cgroup attached. If we have a proportional share setup on this particular disk, then the wbt throttling will interfere with that. We don't have a strong need for wbt for that case, since we will rely on CFQ doing that for us. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 303723a2e5b8..15da9e430f90 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -383,6 +384,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg * is used, root blkg allocates from @q->root_rl and all other -- cgit v1.2.3 From bbd7bb7017d5c2b1e75f3818b4ce88fa58bb0eab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Nov 2016 09:34:34 -0600 Subject: block: move poll code to blk-mq The poll code is blk-mq specific, let's move it to blk-mq.c. This is a prep patch for improving the polling code. Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 15da9e430f90..bab18ee5810d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -952,7 +952,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -bool blk_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { -- cgit v1.2.3 From 06426adf072bca62ac31ea396ff2159a34f276c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:01:59 -0700 Subject: blk-mq: implement hybrid poll mode for sync O_DIRECT This patch enables a hybrid polling mode. Instead of polling after IO submission, we can induce an artificial delay, and then poll after that. For example, if the IO is presumed to complete in 8 usecs from now, we can sleep for 4 usecs, wake up, and then do our polling. This still puts a sleep/wakeup cycle in the IO path, but instead of the wakeup happening after the IO has completed, it'll happen before. With this hybrid scheme, we can achieve big latency reductions while still using the same (or less) amount of CPU.
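A simplified sketch of the flow, using the poll_nsec knob added below (the real code lives in blk-mq.c):

    /* Sleep for the estimated fraction of the completion time, then
     * fall through to the normal poll loop.
     */
    if (q->poll_nsec > 0) {
            ktime_t kt = ktime_set(0, q->poll_nsec);

            set_current_state(TASK_UNINTERRUPTIBLE);
            schedule_hrtimeout(&kt, HRTIMER_MODE_REL);
    }
    /* ...now poll for the completion as before... */
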
Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bab18ee5810d..37ed4ea705c8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -509,6 +509,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; + unsigned int poll_nsec; struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; -- cgit v1.2.3 From 64f1c21e86f7fe63337b5c23c129de3ec506431d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:03:03 -0700 Subject: blk-mq: make the polling code adaptive The previous commit introduced the hybrid sleep/poll mode. Take that one step further, and use the completion latencies to automatically sleep for half the mean completion time. This is a good approximation. This changes the 'io_poll_delay' sysfs file a bit to expose the various options. Depending on the value, the polling code will behave differently: -1 Never enter hybrid sleep mode 0 Use half of the completion mean for the sleep delay >0 Use this specific value as the sleep delay Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 37ed4ea705c8..85699bc90a51 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -509,7 +509,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; - unsigned int poll_nsec; + int poll_nsec; struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; -- cgit v1.2.3 From 9a05e7541c39680d28ecf91892338e074738d5fd Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 18 Nov 2016 15:16:06 +0100 Subject: block: Change extern inline to static inline With compilers which follow the C99 standard (like modern versions of gcc and clang), "extern inline" does the opposite thing from older versions of gcc (emits code for an externally linkable version of the inline function). "static inline" does the intended behavior in all cases instead. Description taken from commit 6d91857d4826 ("staging, rtl8192e, LLVMLinux: Change extern inline to static inline"). 
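Concretely, for a header stub like the one touched here (illustration only, not from the patch):

    extern inline void stub(void) {}        /* C99: every includer emits an
                                             * external definition of stub(),
                                             * inviting duplicate symbols and
                                             * -Wmissing-prototypes warnings
                                             */
    static inline void stub(void) {}        /* each includer gets a private
                                             * copy, which is what a header
                                             * helper wants
                                             */
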
This also fixes the following GCC warning when building with CONFIG_PM disabled: ./include/linux/blkdev.h:1143:20: warning: no previous prototype for 'blk_set_runtime_active' [-Wmissing-prototypes] Fixes: d07ab6d11477 ("block: Add blk_set_runtime_active()") Reviewed-by: Mika Westerberg Signed-off-by: Tobias Klauser Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 85699bc90a51..541fdd8787a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1174,7 +1174,7 @@ static inline int blk_pre_runtime_suspend(struct request_queue *q) static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} static inline void blk_pre_runtime_resume(struct request_queue *q) {} static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} -extern inline void blk_set_runtime_active(struct request_queue *q) {} +static inline void blk_set_runtime_active(struct request_queue *q) {} #endif /* -- cgit v1.2.3 From 93c5bdf7ab71bbdae27f8f51fa175e06f000d69d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 21 Nov 2016 16:18:18 +0100 Subject: block: clear all of bi_opf in bio_set_op_attrs Since commit 87374179 ("block: add a proper block layer data direction encoding") we only OR the new op and flags into bi_opf in bio_set_op_attrs instead of clearing the old value. I've not seen any breakage with the new behavior, but it seems dangerous. Also convert it to an inline function to make the argument passing safer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4d0044d09984..f57458a6a93b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -207,8 +207,11 @@ enum req_flag_bits { ((req)->cmd_flags & REQ_OP_MASK) /* obsolete, don't use in new code */ -#define bio_set_op_attrs(bio, op, op_flags) \ - ((bio)->bi_opf |= (op | op_flags)) +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_opf = op | op_flags; +} static inline bool op_is_write(unsigned int op) { -- cgit v1.2.3 From 3a83f4677539bce8eaa2bca9ee9c20e172d7ab04 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 22 Nov 2016 08:57:21 -0700 Subject: block: bio: pass bvec table to bio_init() Some drivers use an external bvec table, so introduce this helper for that case. It is always safe to access bio->bi_io_vec in this way. After converting to this usage, it becomes a bit easier to evaluate the remaining direct accesses to bio->bi_io_vec, which helps prepare for the upcoming multipage bvec support. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Fixed up the new O_DIRECT cases.
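With the new signature (shown in the hunk below), a converted user would look roughly like this ('bdev', 'sector' and 'page' are assumed to exist in the surrounding code):

    /* Sketch: an on-stack bio wired to an external single-entry
     * bvec table.
     */
    struct bio bio;
    struct bio_vec bvec;

    bio_init(&bio, &bvec, 1);
    bio.bi_bdev = bdev;
    bio.bi_iter.bi_sector = sector;
    bio_add_page(&bio, page, PAGE_SIZE, 0);
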
Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index d367cd37a7f7..70a7244f08a7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -420,7 +420,8 @@ extern int bio_phys_segments(struct request_queue *, struct bio *); extern int submit_bio_wait(struct bio *bio); extern void bio_advance(struct bio *, unsigned); -extern void bio_init(struct bio *); +extern void bio_init(struct bio *bio, struct bio_vec *table, + unsigned short max_vecs); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); -- cgit v1.2.3 From 3dc87dd048dc442bab633e85bfb96c893612d765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 28 Nov 2016 22:38:53 +0100 Subject: nvme: lightnvm: attach lightnvm sysfs to nvme block device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, LBA read and write were not supported in the lightnvm specification. Now that the specification supports them, let's use the traditional NVMe gendisk, and attach the lightnvm sysfs geometry export. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index d190786e4ad8..fb2e601d674e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -352,8 +352,6 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; - struct device dev; - struct device *parent_dev; char name[DISK_NAME_LEN]; void *private_data; -- cgit v1.2.3 From bb3149792e0ed52cf5f457dda4c9bf9c5bda1542 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:38:54 +0100 Subject: lightnvm: enable to send hint to erase command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Erases might be subject to host hints. An example is multi-plane programming to erase blocks in parallel. Enable targets to specify this hint.
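A target-side caller would then pass the hint straight through; in this sketch the flags value is a stand-in for a real plane-programming hint:

    /* Illustrative target-side caller of the widened erase interface. */
    static int example_erase(struct nvm_dev *dev, struct nvm_block *blk)
    {
            int flags = 0;  /* e.g. encode a multi-plane erase hint here */

            return nvm_erase_blk(dev, blk, flags);
    }
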
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index fb2e601d674e..d87be02edc39 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -470,8 +470,7 @@ typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, - unsigned long); +typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int); typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); typedef int (nvmm_reserve_lun)(struct nvm_dev *, int); @@ -537,8 +536,8 @@ extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); -extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); +extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); +extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *, int); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); -- cgit v1.2.3 From a24ba4644b7ae5af3cd2eb6992c237cb4548c45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:38:56 +0100 Subject: lightnvm: export set bad block table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bad blocks should be managed by block owners. This would be either targets for data blocks or sysblk for system blocks. In order to support this, export two functions: one to mark a block as a specific type (e.g., bad block) and another to update the bad block table on the device. Move bad block management to rrpc.
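A sketch of how a block owner would use the two exported calls, taking the signatures from the hunk below (the type constants here are illustrative):

    /* Mark the block in the media manager, then persist the change in
     * the device's bad block table.
     */
    nvm_mark_blk(dev, ppa, NVM_BLK_ST_BAD);
    nvm_set_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD);
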
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index d87be02edc39..4480d1c6a1a5 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -423,6 +423,15 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, return ppa; } +static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) +{ + if (ppa_empty(ppa1) || ppa_empty(ppa2)) + return 0; + + return ((ppa1.g.ch == ppa2.g.ch) && (ppa1.g.lun == ppa2.g.lun) && + (ppa1.g.blk == ppa2.g.blk)); +} + static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) { return dev->lptbl[slc_pg]; @@ -528,7 +537,9 @@ extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); +extern void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); +extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, + int nr_ppas, int type); extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); -- cgit v1.2.3 From 402ab9a89d7b5bab08a5534027b39d80085ec19b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:38:57 +0100 Subject: lightnvm: add ECC error codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ECC error codes to enable the appropriate handling in the target. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4480d1c6a1a5..6b26a3289bce 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -107,6 +107,8 @@ enum { NVM_RSP_NOT_CHANGEABLE = 0x1, NVM_RSP_ERR_FAILWRITE = 0x40ff, NVM_RSP_ERR_EMPTYPAGE = 0x42ff, + NVM_RSP_ERR_FAILECC = 0x4281, + NVM_RSP_WARN_HIGHECC = 0x4700, /* Device opcodes */ NVM_OP_HBREAD = 0x02, -- cgit v1.2.3 From 7e4f64a9b3004ce592f21653c3b7781628862232 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:00 +0100 Subject: lightnvm: cleanup unused target operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup definition leftovers from old gennvm interface Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 6b26a3289bce..e598308882aa 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -477,9 +477,6 @@ typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *, struct nvm_lun *, unsigned long); typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *); -typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *); -typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *); -typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); 
typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int); typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); @@ -504,9 +501,6 @@ struct nvmm_type { /* Block administration callbacks */ nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; - nvmm_open_blk_fn *open_blk; - nvmm_close_blk_fn *close_blk; - nvmm_flush_blk_fn *flush_blk; nvmm_submit_io_fn *submit_io; nvmm_erase_blk_fn *erase_blk; -- cgit v1.2.3 From 0e5c3246dbb96b6870634e7d51b2490f05c976cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:01 +0100 Subject: lightnvm: make address conversion functions global MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Targets are assumed to use the same generic ppa format, where the address is partitioned on ch:lun:block:pg:pl:sec. Thus, make the function in charge of transforming the ppa address from a linear format to the generic one available to all targets. This function will be needed by the media manager in order to do target mapping translations when targets are divided across different physical partitions. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e598308882aa..98278a9fcb1f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -361,6 +361,36 @@ struct nvm_dev { spinlock_t lock; }; +static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, + struct ppa_addr r) +{ + struct ppa_addr l; + int secs, pgs, blks, luns; + sector_t ppa = r.ppa; + + l.ppa = 0; + + div_u64_rem(ppa, dev->sec_per_pg, &secs); + l.g.sec = secs; + + sector_div(ppa, dev->sec_per_pg); + div_u64_rem(ppa, dev->pgs_per_blk, &pgs); + l.g.pg = pgs; + + sector_div(ppa, dev->pgs_per_blk); + div_u64_rem(ppa, dev->blks_per_lun, &blks); + l.g.blk = blks; + + sector_div(ppa, dev->blks_per_lun); + div_u64_rem(ppa, dev->luns_per_chnl, &luns); + l.g.lun = luns; + + sector_div(ppa, dev->luns_per_chnl); + l.g.ch = ppa; + + return l; +} + static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, struct ppa_addr r) { -- cgit v1.2.3 From de93434fcf74d41754a48e45365a5914e00bc0be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:04 +0100 Subject: lightnvm: remove gen_lun abstraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gen_lun abstraction in the generic media manager was conceived on the assumption that a single target would be instantiated on top of it. This has complicated the target design needed to implement multiple instances. Remove this abstraction and move its logic to nvm_lun, which manages physical lun geometry and operations. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 98278a9fcb1f..33940bdc18a9 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -275,7 +275,17 @@ struct nvm_lun { spinlock_t lock; + /* lun block lists */ + struct list_head used_list; /* In-use blocks */ + struct list_head free_list; /* Not used blocks i.e. released + * and ready for use + */ + struct list_head bb_list; /* Bad blocks.
Mutually exclusive with + * free_list and used_list + */ unsigned int nr_free_blocks; /* Number of unused blocks */ + int reserved_blocks; + struct nvm_block *blocks; }; -- cgit v1.2.3 From 8176117b82e49e043d045f214ba7a892fba6b827 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:05 +0100 Subject: lightnvm: manage lun partitions internally in mm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LUNs are exclusively owned by targets implementing a block device FTL. At the moment, doing this reservation requires a 2-way callback between gennvm and the target. The reason behind this is that LUNs were not assumed to always be exclusively owned by targets. However, this design decision goes against I/O determinism QoS (two targets would mix I/O on the same parallel unit in the device). This patch makes LUN reservation part of target creation in the media manager. This ensures that LUNs are always exclusively owned by the target instantiated on top of them. LUN striping and/or sharing should be implemented on the target itself or the layers on top. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 33940bdc18a9..89c695483d55 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -210,6 +210,7 @@ struct nvm_id { struct nvm_target { struct list_head list; + struct list_head lun_list; struct nvm_dev *dev; struct nvm_tgt_type *type; struct gendisk *disk; @@ -273,6 +274,7 @@ struct nvm_lun { int lun_id; int chnl_id; + struct list_head list; spinlock_t lock; /* lun block lists */ @@ -521,8 +523,6 @@ typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int); typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); -typedef int (nvmm_reserve_lun)(struct nvm_dev *, int); -typedef void (nvmm_release_lun)(struct nvm_dev *, int); typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); @@ -550,8 +550,6 @@ struct nvmm_type { /* Configuration management */ nvmm_get_lun_fn *get_lun; - nvmm_reserve_lun *reserve_lun; - nvmm_release_lun *release_lun; /* Statistics */ nvmm_lun_info_print_fn *lun_info_print; -- cgit v1.2.3 From 8e79b5cb1d3b8eceaf6862995952dd4de431dd99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:06 +0100 Subject: lightnvm: move block provisioning to targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to naturally support multi-target instances on an Open-Channel SSD, targets should own the LUNs they get blocks from and manage provisioning internally. This is done in several steps. This patch moves the block provisioning inside the target and removes the get/put block interface from the media manager.
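Once the free lists live in the target, provisioning reduces to list manipulation on the LUNs the target owns; a sketch (locking elided, names illustrative):

    static struct nvm_block *tgt_get_blk(struct nvm_lun *lun)
    {
            struct nvm_block *blk;

            blk = list_first_entry_or_null(&lun->free_list,
                                           struct nvm_block, list);
            if (blk) {
                    /* Move the block to the in-use list and account for it. */
                    list_move_tail(&blk->list, &lun->used_list);
                    lun->nr_free_blocks--;
            }
            return blk;
    }
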
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 133 +++++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 89c695483d55..1f1588c2557e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -211,7 +211,7 @@ struct nvm_id { struct nvm_target { struct list_head list; struct list_head lun_list; - struct nvm_dev *dev; + struct nvm_tgt_dev *dev; struct nvm_tgt_type *type; struct gendisk *disk; }; @@ -286,7 +286,6 @@ struct nvm_lun { * free_list and used_list */ unsigned int nr_free_blocks; /* Number of unused blocks */ - int reserved_blocks; struct nvm_block *blocks; }; @@ -315,22 +314,12 @@ struct nvm_sb_info { struct ppa_addr fs_ppa; }; -struct nvm_dev { - struct nvm_dev_ops *ops; - - struct list_head devices; - - /* Media manager */ - struct nvmm_type *mt; - void *mp; - - /* System blocks */ - struct nvm_sb_info sb; - - /* Device information */ +/* Device generic information */ +struct nvm_geo { int nr_chnls; + int nr_luns; + int luns_per_chnl; /* -1 if channels are not symmetric */ int nr_planes; - int luns_per_chnl; int sec_per_pg; /* only sectors for a single page */ int pgs_per_blk; int blks_per_lun; @@ -350,14 +339,43 @@ struct nvm_dev { int sec_per_pl; /* all sectors across planes */ int sec_per_blk; int sec_per_lun; +}; + +struct nvm_tgt_dev { + /* Device information */ + struct nvm_geo geo; + + sector_t total_secs; + + struct nvm_id identity; + struct request_queue *q; + + struct nvmm_type *mt; + struct nvm_dev_ops *ops; + + void *parent; +}; + +struct nvm_dev { + struct nvm_dev_ops *ops; + + struct list_head devices; + + /* Media manager */ + struct nvmm_type *mt; + void *mp; + + /* System blocks */ + struct nvm_sb_info sb; + + /* Device information */ + struct nvm_geo geo; /* lower page table */ int lps_per_blk; int *lptbl; - unsigned long total_blocks; unsigned long total_secs; - int nr_luns; unsigned long *lun_map; void *dma_pool; @@ -373,7 +391,7 @@ struct nvm_dev { spinlock_t lock; }; -static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, +static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, struct ppa_addr r) { struct ppa_addr l; @@ -382,22 +400,22 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, l.ppa = 0; - div_u64_rem(ppa, dev->sec_per_pg, &secs); + div_u64_rem(ppa, geo->sec_per_pg, &secs); l.g.sec = secs; - sector_div(ppa, dev->sec_per_pg); - div_u64_rem(ppa, dev->pgs_per_blk, &pgs); + sector_div(ppa, geo->sec_per_pg); + div_u64_rem(ppa, geo->pgs_per_blk, &pgs); l.g.pg = pgs; - sector_div(ppa, dev->pgs_per_blk); - div_u64_rem(ppa, dev->blks_per_lun, &blks); + sector_div(ppa, geo->pgs_per_blk); + div_u64_rem(ppa, geo->blks_per_lun, &blks); l.g.blk = blks; - sector_div(ppa, dev->blks_per_lun); - div_u64_rem(ppa, dev->luns_per_chnl, &luns); + sector_div(ppa, geo->blks_per_lun); + div_u64_rem(ppa, geo->luns_per_chnl, &luns); l.g.lun = luns; - sector_div(ppa, dev->luns_per_chnl); + sector_div(ppa, geo->luns_per_chnl); l.g.ch = ppa; return l; @@ -406,14 +424,15 @@ static inline struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, struct ppa_addr r) { + struct nvm_geo *geo = &dev->geo; struct ppa_addr l; - l.ppa = ((u64)r.g.blk) << dev->ppaf.blk_offset; - l.ppa |= ((u64)r.g.pg) << dev->ppaf.pg_offset; - l.ppa 
|= ((u64)r.g.sec) << dev->ppaf.sect_offset; - l.ppa |= ((u64)r.g.pl) << dev->ppaf.pln_offset; - l.ppa |= ((u64)r.g.lun) << dev->ppaf.lun_offset; - l.ppa |= ((u64)r.g.ch) << dev->ppaf.ch_offset; + l.ppa = ((u64)r.g.blk) << geo->ppaf.blk_offset; + l.ppa |= ((u64)r.g.pg) << geo->ppaf.pg_offset; + l.ppa |= ((u64)r.g.sec) << geo->ppaf.sect_offset; + l.ppa |= ((u64)r.g.pl) << geo->ppaf.pln_offset; + l.ppa |= ((u64)r.g.lun) << geo->ppaf.lun_offset; + l.ppa |= ((u64)r.g.ch) << geo->ppaf.ch_offset; return l; } @@ -421,24 +440,25 @@ static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, struct ppa_addr r) { + struct nvm_geo *geo = &dev->geo; struct ppa_addr l; l.ppa = 0; /* * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. */ - l.g.blk = (r.ppa >> dev->ppaf.blk_offset) & - (((1 << dev->ppaf.blk_len) - 1)); - l.g.pg |= (r.ppa >> dev->ppaf.pg_offset) & - (((1 << dev->ppaf.pg_len) - 1)); - l.g.sec |= (r.ppa >> dev->ppaf.sect_offset) & - (((1 << dev->ppaf.sect_len) - 1)); - l.g.pl |= (r.ppa >> dev->ppaf.pln_offset) & - (((1 << dev->ppaf.pln_len) - 1)); - l.g.lun |= (r.ppa >> dev->ppaf.lun_offset) & - (((1 << dev->ppaf.lun_len) - 1)); - l.g.ch |= (r.ppa >> dev->ppaf.ch_offset) & - (((1 << dev->ppaf.ch_len) - 1)); + l.g.blk = (r.ppa >> geo->ppaf.blk_offset) & + (((1 << geo->ppaf.blk_len) - 1)); + l.g.pg |= (r.ppa >> geo->ppaf.pg_offset) & + (((1 << geo->ppaf.pg_len) - 1)); + l.g.sec |= (r.ppa >> geo->ppaf.sect_offset) & + (((1 << geo->ppaf.sect_len) - 1)); + l.g.pl |= (r.ppa >> geo->ppaf.pln_offset) & + (((1 << geo->ppaf.pln_len) - 1)); + l.g.lun |= (r.ppa >> geo->ppaf.lun_offset) & + (((1 << geo->ppaf.lun_len) - 1)); + l.g.ch |= (r.ppa >> geo->ppaf.ch_offset) & + (((1 << geo->ppaf.ch_len) - 1)); return l; } @@ -456,11 +476,12 @@ static inline void ppa_set_empty(struct ppa_addr *ppa_addr) static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, struct nvm_block *blk) { + struct nvm_geo *geo = &dev->geo; struct ppa_addr ppa; struct nvm_lun *lun = blk->lun; ppa.ppa = 0; - ppa.g.blk = blk->id % dev->blks_per_lun; + ppa.g.blk = blk->id % geo->blks_per_lun; ppa.g.lun = lun->lun_id; ppa.g.ch = lun->chnl_id; @@ -483,7 +504,8 @@ static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); +typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, int, + int); typedef void (nvm_tgt_exit_fn)(void *); struct nvm_tgt_type { @@ -516,9 +538,6 @@ typedef void (nvmm_unregister_fn)(struct nvm_dev *); typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); -typedef struct nvm_block *(nvmm_get_blk_fn)(struct nvm_dev *, - struct nvm_lun *, unsigned long); -typedef void (nvmm_put_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int); typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); @@ -538,10 +557,6 @@ struct nvmm_type { nvmm_create_tgt_fn *create_tgt; nvmm_remove_tgt_fn *remove_tgt; - /* Block administration callbacks */ - nvmm_get_blk_fn *get_blk; - nvmm_put_blk_fn *put_blk; - nvmm_submit_io_fn *submit_io; nvmm_erase_blk_fn *erase_blk; @@ -563,10 
+578,6 @@ struct nvmm_type { extern int nvm_register_mgr(struct nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); -extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, - unsigned long); -extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); - extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); @@ -611,10 +622,10 @@ extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); extern int nvm_dev_factory(struct nvm_dev *, int flags); -#define nvm_for_each_lun_ppa(dev, ppa, chid, lunid) \ - for ((chid) = 0, (ppa).ppa = 0; (chid) < (dev)->nr_chnls; \ +#define nvm_for_each_lun_ppa(geo, ppa, chid, lunid) \ + for ((chid) = 0, (ppa).ppa = 0; (chid) < (geo)->nr_chnls; \ (chid)++, (ppa).g.ch = (chid)) \ - for ((lunid) = 0; (lunid) < (dev)->luns_per_chnl; \ + for ((lunid) = 0; (lunid) < (geo)->luns_per_chnl; \ (lunid)++, (ppa).g.lun = (lunid)) #else /* CONFIG_NVM */ -- cgit v1.2.3 From 0ac4072eb10c9627415eb1ca511121156e20012c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:07 +0100 Subject: lightnvm: remove get_lun operation on gennvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since LUNs are managed internally on the target, there is no need for the media manager to implement a get_lun operation. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 1f1588c2557e..e56c35227249 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -504,8 +504,8 @@ static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, int, - int); +typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, + struct list_head *lun_list); typedef void (nvm_tgt_exit_fn)(void *); struct nvm_tgt_type { @@ -541,7 +541,6 @@ typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int); typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); -typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); @@ -562,9 +562,6 @@ struct nvmm_type { /* Bad block mgmt */ nvmm_mark_blk_fn *mark_blk; - /* Configuration management */ - nvmm_get_lun_fn *get_lun; - /* Statistics */ nvmm_lun_info_print_fn *lun_info_print; -- cgit v1.2.3 From eec44565e9ab13bbf5b48864a68871eabf1115c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3%A1lez?= Date: Mon, 28 Nov 2016 22:39:08 +0100 Subject: lightnvm: remove debug lun statistics from gennvm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since LUNs are managed internally on targets, the media manager has no access to the free LUN lists. Thus, debug functions that show LUN information on the device should not be implemented in the media manager, but rather in the target itself.
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e56c35227249..ed04fa642371 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -541,8 +541,6 @@ typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int); typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); -typedef void (nvmm_lun_info_print_fn)(struct nvm_dev *); - typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); @@ -562,9 +560,6 @@ struct nvmm_type { /* Bad block mgmt */ nvmm_mark_blk_fn *mark_blk; - /* Statistics */ - nvmm_lun_info_print_fn *lun_info_print; - nvmm_get_area_fn *get_area; nvmm_put_area_fn *put_area; -- cgit v1.2.3 From 2a02e627c245bfa987b97707123d7747d7b0e486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:09 +0100 Subject: lightnvm: eliminate nvm_block abstraction on mm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to naturally support multi-target instances on an Open-Channel SSD, targets should own the LUNs they get blocks from and manage provisioning internally. This is done in several steps. A part of this transformation is that targets manage their blocks internally. This patch eliminates the nvm_block abstraction and moves block management to the target logic. The rrpc target is transformed. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 48 ++---------------------------------------------- 1 file changed, 2 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ed04fa642371..cc210cc85c6d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -266,8 +266,6 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) return rqdata + 1; } -struct nvm_block; - struct nvm_lun { int id; @@ -275,19 +273,6 @@ struct nvm_lun { int chnl_id; struct list_head list; - spinlock_t lock; - - /* lun block lists */ - struct list_head used_list; /* In-use blocks */ - struct list_head free_list; /* Not used blocks i.e. released - * and ready for use - */ - struct list_head bb_list; /* Bad blocks. 
Mutually exclusive with - * free_list and used_list - */ - unsigned int nr_free_blocks; /* Number of unused blocks */ @@ -296,15 +281,6 @@ enum { NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; -struct nvm_block { - struct list_head list; - struct nvm_lun *lun; - unsigned long id; - - void *priv; - int state; -}; - /* system block cpu representation */ struct nvm_sb_info { unsigned long seqnr; @@ -473,21 +449,6 @@ static inline void ppa_set_empty(struct ppa_addr *ppa_addr) ppa_addr->ppa = ADDR_EMPTY; } -static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, - struct nvm_block *blk) -{ - struct nvm_geo *geo = &dev->geo; - struct ppa_addr ppa; - struct nvm_lun *lun = blk->lun; - - ppa.ppa = 0; - ppa.g.blk = blk->id % geo->blks_per_lun; - ppa.g.lun = lun->lun_id; - ppa.g.ch = lun->chnl_id; - - return ppa; -} - static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) { if (ppa_empty(ppa1) || ppa_empty(ppa2)) @@ -539,8 +500,7 @@ typedef void (nvmm_unregister_fn)(struct nvm_dev *); typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, int); -typedef void (nvmm_mark_blk_fn)(struct nvm_dev *, struct ppa_addr, int); +typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct ppa_addr *, int); typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); @@ -557,9 +517,6 @@ struct nvmm_type { nvmm_submit_io_fn *submit_io; nvmm_erase_blk_fn *erase_blk; - /* Bad block mgmt */ - nvmm_mark_blk_fn *mark_blk; - nvmm_get_area_fn *get_area; nvmm_put_area_fn *put_area; @@ -573,7 +530,6 @@ extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -extern void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas, int type); @@ -584,7 +540,7 @@ extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); -extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *, int); +extern int nvm_erase_blk(struct nvm_dev *, struct ppa_addr *, int); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); -- cgit v1.2.3 From 8e53624d44c1de31b1b0d4f500703669418a4c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:10 +0100 Subject: lightnvm: eliminate nvm_lun abstraction in mm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to naturally support multi-target instances on an Open-Channel SSD, targets should own the LUNs they get blocks from and manage provisioning internally. This is done in several steps. Since targets own the LUNs they are instantiated on top of and manage the free block list internally, there is no need for a LUN abstraction in the media manager.
LUNs are intrinsically managed as in the physical layout (ch:0,lun:0, ..., ch:0,lun:n, ch:1,lun:0, ch:1,lun:n, ..., ch:m,lun:0, ch:m,lun:n) and given to the targets based on the target creation ioctl. This simplifies LUN management and clears the path for a partition manager to sit directly underneath LightNVM targets. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index cc210cc85c6d..2222853ef969 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -47,6 +47,7 @@ struct ppa_addr { struct nvm_rq; struct nvm_id; struct nvm_dev; +struct nvm_tgt_dev; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); @@ -210,7 +211,6 @@ struct nvm_id { struct nvm_target { struct list_head list; - struct list_head lun_list; struct nvm_tgt_dev *dev; struct nvm_tgt_type *type; struct gendisk *disk; @@ -231,7 +231,7 @@ typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { struct nvm_tgt_instance *ins; - struct nvm_dev *dev; + struct nvm_tgt_dev *dev; struct bio *bio; @@ -266,15 +266,6 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) return rqdata + 1; } -struct nvm_lun { - int id; - - int lun_id; - int chnl_id; - - struct list_head list; -}; - enum { NVM_BLK_ST_FREE = 0x1, /* Free block */ NVM_BLK_ST_TGT = 0x2, /* Block in use by target */ @@ -321,6 +312,9 @@ struct nvm_tgt_dev { /* Device information */ struct nvm_geo geo; + /* Base ppas for target LUNs */ + struct ppa_addr *luns; + sector_t total_secs; struct nvm_id identity; @@ -330,6 +324,7 @@ struct nvm_tgt_dev { struct nvm_dev_ops *ops; void *parent; + void *map; }; struct nvm_dev { @@ -363,16 +358,18 @@ struct nvm_dev { char name[DISK_NAME_LEN]; void *private_data; + void *rmap; + struct mutex mlock; spinlock_t lock; }; static inline struct ppa_addr linear_to_generic_addr(struct nvm_geo *geo, - struct ppa_addr r) + u64 pba) { struct ppa_addr l; int secs, pgs, blks, luns; - sector_t ppa = r.ppa; + sector_t ppa = pba; l.ppa = 0; @@ -465,8 +462,7 @@ static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, - struct list_head *lun_list); +typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); typedef void (nvm_tgt_exit_fn)(void *); struct nvm_tgt_type { @@ -499,10 +495,11 @@ typedef void (nvmm_unregister_fn)(struct nvm_dev *); typedef int (nvmm_create_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_create *); typedef int (nvmm_remove_tgt_fn)(struct nvm_dev *, struct nvm_ioctl_remove *); -typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct ppa_addr *, int); +typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *); +typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int); typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); +typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int); struct nvmm_type { const char *name; @@ -520,6 +517,8 @@ struct nvmm_type { nvmm_get_area_fn *get_area; 
nvmm_put_area_fn *put_area; + nvmm_part_to_tgt_fn *part_to_tgt; + struct list_head list; }; @@ -533,14 +532,18 @@ extern void nvm_unregister(struct nvm_dev *); extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas, int type); -extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); +extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); -extern int nvm_erase_blk(struct nvm_dev *, struct ppa_addr *, int); +extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern int nvm_get_l2p_tbl(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, + void *); +extern int nvm_get_area(struct nvm_dev *, sector_t *, sector_t); +extern void nvm_put_area(struct nvm_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); -- cgit v1.2.3 From 959e911b31981b52ed3f3d6e351b107bcb9163ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:11 +0100 Subject: lightnvm: introduce helpers for generic ops in rrpc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid calling media manager and device-specific operations directly from rrpc. Create helper functions on lightnvm's core instead. Signed-off-by: Javier González Made it work with null_blk as well. Signed-off-by: Matias Bjørling Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 2222853ef969..99cd1e70e451 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -320,10 +320,7 @@ struct nvm_tgt_dev { struct nvm_id identity; struct request_queue *q; - struct nvmm_type *mt; - struct nvm_dev_ops *ops; - - void *parent; + struct nvm_dev *parent; void *map; }; -- cgit v1.2.3 From a279006afa3377493c4240395c70430f2a9b0d2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:12 +0100 Subject: lightnvm: introduce max_phys_sects helper function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Target devices do not have access to the device driver operations. Introduce a helper function that exposes the max. number of physical sectors supported by the underlying device. 
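As a rough usage sketch (not part of this patch; the target-side function name is made up), a target can use the helper to bound how many physical sectors it packs into a single nvm_rq:

	/* Hypothetical target code: reject requests that exceed the
	 * per-command limit of the underlying device.
	 */
	static int my_tgt_check_rq(struct nvm_tgt_dev *dev, struct nvm_rq *rqd)
	{
		if (rqd->nr_ppas > nvm_max_phys_sects(dev))
			return -EINVAL;

		return 0;
	}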
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 99cd1e70e451..96375317b479 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -529,6 +529,7 @@ extern void nvm_unregister(struct nvm_dev *); extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas, int type); +extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); -- cgit v1.2.3 From da2d7cb828ce2714c603827ac5a6e1c98a02e861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:13 +0100 Subject: lightnvm: use target nvm on target-specific ops. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On target-specific operations pass on nvm_tgt_dev instead of the generic nvm device. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 96375317b479..e76f9c4aa49b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -538,10 +538,10 @@ extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int, int); extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); -extern int nvm_get_l2p_tbl(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, +extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); -extern int nvm_get_area(struct nvm_dev *, sector_t *, sector_t); -extern void nvm_put_area(struct nvm_dev *, sector_t); +extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); +extern void nvm_put_area(struct nvm_tgt_dev *, sector_t); extern void nvm_end_io(struct nvm_rq *, int); extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); -- cgit v1.2.3 From 333ba053d145d6f9152f6b0311a345b876f0fed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Mon, 28 Nov 2016 22:39:14 +0100 Subject: lightnvm: transform target get/set bad block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since targets are given a virtual target device, it is necessary to translate all communication between targets and the backend device. Implement the translation layer for get/set bad block table. 
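As an illustrative sketch (hypothetical target code, not part of this patch, and assuming the NVM_BLK_T_GRWN_BAD block type defined elsewhere in this header), a target can mark a block bad in its own address space and let the core translate to device addressing:

	/* Hypothetical: ppa is target-relative; the translation layer
	 * maps it into the device address space before the bad block
	 * table is updated.
	 */
	static void my_tgt_mark_grown_bad(struct nvm_tgt_dev *dev,
					  struct ppa_addr ppa)
	{
		if (nvm_set_tgt_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD))
			pr_err("my_tgt: could not mark block bad\n");
	}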
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e76f9c4aa49b..7c273bbc5351 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -496,8 +496,15 @@ typedef int (nvmm_submit_io_fn)(struct nvm_tgt_dev *, struct nvm_rq *); typedef int (nvmm_erase_blk_fn)(struct nvm_tgt_dev *, struct ppa_addr *, int); typedef int (nvmm_get_area_fn)(struct nvm_dev *, sector_t *, sector_t); typedef void (nvmm_put_area_fn)(struct nvm_dev *, sector_t); +typedef struct ppa_addr (nvmm_trans_ppa_fn)(struct nvm_tgt_dev *, + struct ppa_addr, int); typedef void (nvmm_part_to_tgt_fn)(struct nvm_dev *, sector_t*, int); +enum { + TRANS_TGT_TO_DEV = 0x0, + TRANS_DEV_TO_TGT = 0x1, +}; + struct nvmm_type { const char *name; unsigned int version[3]; @@ -514,6 +521,7 @@ struct nvmm_type { nvmm_get_area_fn *get_area; nvmm_put_area_fn *put_area; + nvmm_trans_ppa_fn *trans_ppa; nvmm_part_to_tgt_fn *part_to_tgt; struct list_head list; @@ -526,9 +534,9 @@ extern struct nvm_dev *nvm_alloc_dev(int); extern int nvm_register(struct nvm_dev *); extern void nvm_unregister(struct nvm_dev *); -extern int nvm_set_bb_tbl(struct nvm_dev *dev, struct ppa_addr *ppas, - int nr_ppas, int type); - +extern int nvm_set_bb_tbl(struct nvm_dev *, struct ppa_addr *, int, int); +extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, + int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); @@ -549,6 +557,7 @@ extern int nvm_submit_ppa_list(struct nvm_dev *, struct ppa_addr *, int, int, int, void *, int); extern int nvm_bb_tbl_fold(struct nvm_dev *, u8 *, int); extern int nvm_get_bb_tbl(struct nvm_dev *, struct ppa_addr, u8 *); +extern int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr, u8 *); /* sysblk.c */ #define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ -- cgit v1.2.3 From e73c23ff736e1ea371dfa419d7bf8e77ee53044a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:58 -0800 Subject: block: add async variant of blkdev_issue_zeroout Similar to __blkdev_issue_discard this variant allows submitting the final bio asynchronously and chaining multiple ranges into a single completion. 
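A minimal usage sketch (the caller below is made up; only the declaration added by this patch is real): zero two discontiguous ranges, chain the bios into a single completion, then wait once:

	static int zero_two_ranges(struct block_device *bdev, gfp_t gfp)
	{
		struct bio *bio = NULL;
		int ret;

		/* each call appends to the same bio chain */
		ret = __blkdev_issue_zeroout(bdev, 0, 8, gfp, &bio, false);
		if (!ret)
			ret = __blkdev_issue_zeroout(bdev, 1024, 8, gfp,
						     &bio, false);
		if (!ret && bio) {
			/* one wait covers all chained ranges */
			ret = submit_bio_wait(bio);
			bio_put(bio);
		}
		return ret;
	}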
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 541fdd8787a5..7e9d8a0895be 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1269,6 +1269,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct bio **biop); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); +extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard); static inline int sb_issue_discard(struct super_block *sb, sector_t block, -- cgit v1.2.3 From a6f0788ec2881ac14e97ff7fa6a78a807f87b5ba Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:59 -0800 Subject: block: add support for REQ_OP_WRITE_ZEROES This adds a new block layer operation to zero out a range of LBAs. This allows zeroing to be implemented for devices that don't use either discard with a predictable zero pattern or WRITE SAME of zeroes. The prominent example of that is NVMe with the Write Zeroes command, but in the future, this should also help with improving the way zeroing discards work. For this operation, a suitable entry is exported in sysfs indicating the maximum number of bytes allowed in one write zeroes operation by the device. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 25 ++++++++++++++----------- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 70a7244f08a7..b15323934a29 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -76,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio) if (bio && bio->bi_iter.bi_size && bio_op(bio) != REQ_OP_DISCARD && - bio_op(bio) != REQ_OP_SECURE_ERASE) + bio_op(bio) != REQ_OP_SECURE_ERASE && + bio_op(bio) != REQ_OP_WRITE_ZEROES) return true; return false; @@ -86,7 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || - bio_op(bio) == REQ_OP_WRITE_SAME; + bio_op(bio) == REQ_OP_WRITE_SAME || + bio_op(bio) == REQ_OP_WRITE_ZEROES; } static inline bool bio_mergeable(struct bio *bio) @@ -188,18 +190,19 @@ static inline unsigned bio_segments(struct bio *bio) struct bvec_iter iter; /* - * We special case discard/write same, because they interpret bi_size - * differently: + * We special case discard/write same/write zeroes, because they + * interpret bi_size differently: */ - if (bio_op(bio) == REQ_OP_DISCARD) - return 1; - - if (bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: return 1; + default: + break; + } bio_for_each_segment(bv, bio, iter) segs++; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f57458a6a93b..519ea2c9df61 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -159,6 +159,8 @@ enum
req_opf { REQ_OP_ZONE_RESET = 6, /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, + /* write the zero filled sector many times */ + REQ_OP_WRITE_ZEROES = 8, REQ_OP_LAST, }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e9d8a0895be..ebeef2b79c5a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -323,6 +323,7 @@ struct queue_limits { unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; + unsigned int max_write_zeroes_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -774,6 +775,9 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_FLUSH) return false; + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) @@ -1004,6 +1008,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_SAME)) return q->limits.max_write_same_sectors; + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; + return q->limits.max_sectors; } @@ -1107,6 +1114,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_same_sectors(struct request_queue *q, unsigned int max_write_same_sectors); +extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, @@ -1475,6 +1484,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return q->limits.max_write_zeroes_sectors; + + return 0; +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 3b7c33b28a44d4621e365e090bf8bd332ab232a1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:29:00 -0800 Subject: nvme.h: add Write Zeroes definitions Add the command structure, optional command set support (ONCS) bit and a new error code for the Write Zeroes command. 
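As a sketch of how a driver might fill the new structure (the helper name is hypothetical, and the opcode value 0x08 and the 0's-based block count come from the NVMe specification, not from this patch):

	static void my_setup_write_zeroes(struct nvme_command *c, u32 nsid,
					  u64 slba, u16 nr_blocks)
	{
		memset(c, 0, sizeof(*c));
		c->write_zeroes.opcode = 0x08;	/* Write Zeroes, per spec */
		c->write_zeroes.nsid = cpu_to_le32(nsid);
		c->write_zeroes.slba = cpu_to_le64(slba);
		/* the NLB field is 0's based */
		c->write_zeroes.length = cpu_to_le16(nr_blocks - 1);
	}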
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 18ce9f7cc881..0df9466a7c38 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -237,6 +237,7 @@ enum { NVME_CTRL_ONCS_COMPARE = 1 << 0, NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, + NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, }; @@ -543,6 +544,23 @@ struct nvme_dsm_range { __le64 slba; }; +struct nvme_write_zeroes_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -839,6 +857,7 @@ struct nvme_command { struct nvme_download_firmware dlfw; struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; + struct nvme_write_zeroes_cmd write_zeroes; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; struct nvmf_common_command fabrics; @@ -918,6 +937,7 @@ enum { NVME_SC_BAD_ATTRIBUTES = 0x180, NVME_SC_INVALID_PI = 0x181, NVME_SC_READ_ONLY = 0x182, + NVME_SC_ONCS_NOT_SUPPORTED = 0x183, /* * I/O Command Set Specific - Fabrics commands: -- cgit v1.2.3 From a317178e36b52f5f24ad226a77403eeea5ac05c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 21 Oct 2016 23:51:54 +0300 Subject: parser: add u64 number parser Will be used by the nvme-fabrics FC transport in parsing options Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- include/linux/parser.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/parser.h b/include/linux/parser.h index 39d5b7955b23..884c1e6eb3fe 100644 --- a/include/linux/parser.h +++ b/include/linux/parser.h @@ -27,6 +27,7 @@ typedef struct { int match_token(char *, const match_table_t table, substring_t args[]); int match_int(substring_t *, int *result); +int match_u64(substring_t *, u64 *result); int match_octal(substring_t *, int *result); int match_hex(substring_t *, int *result); bool match_wildcard(const char *pattern, const char *str); -- cgit v1.2.3 From cba3bdfd2e89edd706e2c40dfad914aca663b6ac Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 2 Dec 2016 00:28:39 -0800 Subject: nvme-fabrics: Add FC transport error codes to nvme.h Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Jay Freyensee Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0df9466a7c38..5ac1f57226f4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -963,6 +963,19 @@ enum { NVME_SC_ACCESS_DENIED = 0x286, NVME_SC_DNR = 0x4000, + + + /* + * FC Transport-specific error status values for NVME commands + * + * Transport-specific status code values must be in the range 0xB0..0xBF + */ + + /* Generic FC failure - catchall */ + NVME_SC_FC_TRANSPORT_ERROR = 0x00B0, + + /* I/O failure due to FC ABTS'd */ + NVME_SC_FC_TRANSPORT_ABORTED = 0x00B1, }; struct nvme_completion { -- cgit v1.2.3 From b1ad1475b447a7668ac8bfad77277c4405941883 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 2 Dec 2016 00:28:40 -0800 Subject: nvme-fabrics: Add FC 
transport FC-NVME definitions - Formats for Cmd, Data, Rsp IUs - Formats FC-4 LS definitions - Add to MAINTAINERS file Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Jay Freyensee Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc.h | 268 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 include/linux/nvme-fc.h (limited to 'include/linux') diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h new file mode 100644 index 000000000000..4b45226bd604 --- /dev/null +++ b/include/linux/nvme-fc.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2016 Avago Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful. + * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, + * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO + * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID. + * See the GNU General Public License for more details, a copy of which + * can be found in the file COPYING included with this package + * + */ + +/* + * This file contains definitions relative to FC-NVME r1.11 and a few + * newer items + */ + +#ifndef _NVME_FC_H +#define _NVME_FC_H 1 + + +#define NVME_CMD_SCSI_ID 0xFD +#define NVME_CMD_FC_ID FC_TYPE_NVME + +/* FC-NVME Cmd IU Flags */ +#define FCNVME_CMD_FLAGS_DIRMASK 0x03 +#define FCNVME_CMD_FLAGS_WRITE 0x01 +#define FCNVME_CMD_FLAGS_READ 0x02 + +struct nvme_fc_cmd_iu { + __u8 scsi_id; + __u8 fc_id; + __be16 iu_len; + __u8 rsvd4[3]; + __u8 flags; + __be64 connection_id; + __be32 csn; + __be32 data_len; + struct nvme_command sqe; + __be32 rsvd88[2]; +}; + +#define NVME_FC_SIZEOF_ZEROS_RSP 12 + +struct nvme_fc_ersp_iu { + __u8 rsvd0[2]; + __be16 iu_len; + __be32 rsn; + __be32 xfrd_len; + __be32 rsvd12; + struct nvme_completion cqe; + /* for now - no additional payload */ +}; + + +/* FC-NVME r1.03/16-119v0 NVME Link Services */ +enum { + FCNVME_LS_RSVD = 0, + FCNVME_LS_RJT = 1, + FCNVME_LS_ACC = 2, + FCNVME_LS_CREATE_ASSOCIATION = 3, + FCNVME_LS_CREATE_CONNECTION = 4, + FCNVME_LS_DISCONNECT = 5, +}; + +/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */ +enum { + FCNVME_LSDESC_RSVD = 0x0, + FCNVME_LSDESC_RQST = 0x1, + FCNVME_LSDESC_RJT = 0x2, + FCNVME_LSDESC_CREATE_ASSOC_CMD = 0x3, + FCNVME_LSDESC_CREATE_CONN_CMD = 0x4, + FCNVME_LSDESC_DISCONN_CMD = 0x5, + FCNVME_LSDESC_CONN_ID = 0x6, + FCNVME_LSDESC_ASSOC_ID = 0x7, +}; + + +/* ********** start of Link Service Descriptors ********** */ + + +/* + * fills in length of a descriptor. Structure minus descriptor header + */ +static inline __be32 fcnvme_lsdesc_len(size_t sz) +{ + return cpu_to_be32(sz - (2 * sizeof(u32))); +} + + +struct fcnvme_ls_rqst_w0 { + u8 ls_cmd; /* FCNVME_LS_xxx */ + u8 zeros[3]; +}; + +/* FCNVME_LSDESC_RQST */ +struct fcnvme_lsdesc_rqst { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + struct fcnvme_ls_rqst_w0 w0; + __be32 rsvd12; +}; + + + + +/* FCNVME_LSDESC_RJT */ +struct fcnvme_lsdesc_rjt { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + u8 rsvd8; + + /* + * Reject reason and explanation codes are generic + * to ELS's from LS-3.
+ */ + u8 reason_code; + u8 reason_explanation; + + u8 vendor; + __be32 rsvd12; +}; + + +#define FCNVME_ASSOC_HOSTID_LEN 64 +#define FCNVME_ASSOC_HOSTNQN_LEN 256 +#define FCNVME_ASSOC_SUBNQN_LEN 256 + +/* FCNVME_LSDESC_CREATE_ASSOC_CMD */ +struct fcnvme_lsdesc_cr_assoc_cmd { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be16 ersp_ratio; + __be16 rsvd10; + __be32 rsvd12[9]; + __be16 cntlid; + __be16 sqsize; + __be32 rsvd52; + u8 hostid[FCNVME_ASSOC_HOSTID_LEN]; + u8 hostnqn[FCNVME_ASSOC_HOSTNQN_LEN]; + u8 subnqn[FCNVME_ASSOC_SUBNQN_LEN]; + u8 rsvd632[384]; +}; + +/* FCNVME_LSDESC_CREATE_CONN_CMD */ +struct fcnvme_lsdesc_cr_conn_cmd { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be16 ersp_ratio; + __be16 rsvd10; + __be32 rsvd12[9]; + __be16 qid; + __be16 sqsize; + __be32 rsvd52; +}; + +/* Disconnect Scope Values */ +enum { + FCNVME_DISCONN_ASSOCIATION = 0, + FCNVME_DISCONN_CONNECTION = 1, +}; + +/* FCNVME_LSDESC_DISCONN_CMD */ +struct fcnvme_lsdesc_disconn_cmd { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + u8 rsvd8[3]; + /* note: scope is really a 1 bit field */ + u8 scope; /* FCNVME_DISCONN_xxx */ + __be32 rsvd12; + __be64 id; +}; + +/* FCNVME_LSDESC_CONN_ID */ +struct fcnvme_lsdesc_conn_id { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be64 connection_id; +}; + +/* FCNVME_LSDESC_ASSOC_ID */ +struct fcnvme_lsdesc_assoc_id { + __be32 desc_tag; /* FCNVME_LSDESC_xxx */ + __be32 desc_len; + __be64 association_id; +}; + +/* r_ctl values */ +enum { + FCNVME_RS_RCTL_DATA = 1, + FCNVME_RS_RCTL_XFER_RDY = 5, + FCNVME_RS_RCTL_RSP = 8, +}; + + +/* ********** start of Link Services ********** */ + + +/* FCNVME_LS_RJT */ +struct fcnvme_ls_rjt { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_rqst rqst; + struct fcnvme_lsdesc_rjt rjt; +}; + +/* FCNVME_LS_ACC */ +struct fcnvme_ls_acc_hdr { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_rqst rqst; + /* Followed by cmd-specific ACC descriptors, see next definitions */ +}; + +/* FCNVME_LS_CREATE_ASSOCIATION */ +struct fcnvme_ls_cr_assoc_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_cr_assoc_cmd assoc_cmd; +}; + +struct fcnvme_ls_cr_assoc_acc { + struct fcnvme_ls_acc_hdr hdr; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_conn_id connectid; +}; + + +/* FCNVME_LS_CREATE_CONNECTION */ +struct fcnvme_ls_cr_conn_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_cr_conn_cmd connect_cmd; +}; + +struct fcnvme_ls_cr_conn_acc { + struct fcnvme_ls_acc_hdr hdr; + struct fcnvme_lsdesc_conn_id connectid; +}; + +/* FCNVME_LS_DISCONNECT */ +struct fcnvme_ls_disconnect_rqst { + struct fcnvme_ls_rqst_w0 w0; + __be32 desc_list_len; + struct fcnvme_lsdesc_assoc_id associd; + struct fcnvme_lsdesc_disconn_cmd discon_cmd; +}; + +struct fcnvme_ls_disconnect_acc { + struct fcnvme_ls_acc_hdr hdr; +}; + + +/* + * Yet to be defined in FC-NVME: + */ +#define NVME_FC_CONNECT_TIMEOUT_SEC 2 /* 2 seconds */ +#define NVME_FC_LS_TIMEOUT_SEC 2 /* 2 seconds */ +#define NVME_FC_TGTOP_TIMEOUT_SEC 2 /* 2 seconds */ + + +#endif /* _NVME_FC_H */ -- cgit v1.2.3 From d6d20012e116904065d192be6146040c99c03c3c Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 2 Dec 2016 00:28:41 -0800 Subject: nvme-fabrics: Add FC transport LLDD api definitions Host: - LLDD registration with the host transport - registering host ports (local ports) 
and target ports seen on fabric (remote ports) - Data structures and call points for FC-4 LS's and FCP IO requests Target: - LLDD registration with the target transport - registering nvme subsystem ports (target ports) - Data structures and call points for reception of FC-4 LS's and FCP IO requests, and callbacks to perform data and rsp transfers for the io. Add to MAINTAINERS file Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Jay Freyensee Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 851 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 851 insertions(+) create mode 100644 include/linux/nvme-fc-driver.h (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h new file mode 100644 index 000000000000..f21471f7ee40 --- /dev/null +++ b/include/linux/nvme-fc-driver.h @@ -0,0 +1,851 @@ +/* + * Copyright (c) 2016, Avago Technologies + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVME_FC_DRIVER_H +#define _NVME_FC_DRIVER_H 1 + + +/* + * ********************** LLDD FC-NVME Host API ******************** + * + * For FC LLDD's that are the NVME Host role. + * + * ****************************************************************** + */ + + + +/* FC Port role bitmask - can merge with FC Port Roles in fc transport */ +#define FC_PORT_ROLE_NVME_INITIATOR 0x10 +#define FC_PORT_ROLE_NVME_TARGET 0x11 +#define FC_PORT_ROLE_NVME_DISCOVERY 0x12 + + +/** + * struct nvme_fc_port_info - port-specific ids and FC connection-specific + * data element used during NVME Host role + * registrations + * + * Static fields describing the port being registered: + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @port_role: What NVME roles are supported (see FC_PORT_ROLE_xxx) + * + * Initialization values for dynamic port fields: + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + */ +struct nvme_fc_port_info { + u64 node_name; + u64 port_name; + u32 port_role; + u32 port_id; +}; + + +/** + * struct nvmefc_ls_req - Request structure passed from NVME-FC transport + * to LLDD in order to perform a NVME FC-4 LS + * request and obtain a response. + * + * Values set by the NVME-FC layer prior to calling the LLDD ls_req + * entrypoint. + * @rqstaddr: pointer to request buffer + * @rqstdma: PCI DMA address of request buffer + * @rqstlen: Length, in bytes, of request buffer + * @rspaddr: pointer to response buffer + * @rspdma: PCI DMA address of response buffer + * @rsplen: Length, in bytes, of response buffer + * @timeout: Maximum amount of time, in seconds, to wait for the LS response. + * If timeout exceeded, LLDD to abort LS exchange and complete + * LS request with error status. + * @private: pointer to memory allocated alongside the ls request structure + * that is specifically for the LLDD to use while processing the + * request. The length of the buffer corresponds to the + * lsrqst_priv_sz value specified in the nvme_fc_port_template + * supplied by the LLDD. 
+ * @done: The callback routine the LLDD is to invoke upon completion of + * the LS request. req argument is the pointer to the original LS + * request structure. Status argument must be 0 upon success, a + * negative errno on failure (example: -ENXIO). + */ +struct nvmefc_ls_req { + void *rqstaddr; + dma_addr_t rqstdma; + u32 rqstlen; + void *rspaddr; + dma_addr_t rspdma; + u32 rsplen; + u32 timeout; + + void *private; + + void (*done)(struct nvmefc_ls_req *req, int status); + +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +enum nvmefc_fcp_datadir { + NVMEFC_FCP_NODATA, /* payload_length and sg_cnt will be zero */ + NVMEFC_FCP_WRITE, + NVMEFC_FCP_READ, +}; + + +#define NVME_FC_MAX_SEGMENTS 256 + +/** + * struct nvmefc_fcp_req - Request structure passed from NVME-FC transport + * to LLDD in order to perform a NVME FCP IO operation. + * + * Values set by the NVME-FC layer prior to calling the LLDD fcp_io + * entrypoint. + * @cmdaddr: pointer to the FCP CMD IU buffer + * @rspaddr: pointer to the FCP RSP IU buffer + * @cmddma: PCI DMA address of the FCP CMD IU buffer + * @rspdma: PCI DMA address of the FCP RSP IU buffer + * @cmdlen: Length, in bytes, of the FCP CMD IU buffer + * @rsplen: Length, in bytes, of the FCP RSP IU buffer + * @payload_length: Length of DATA_IN or DATA_OUT payload data to transfer + * @sg_table: scatter/gather structure for payload data + * @first_sgl: memory for 1st scatter/gather list segment for payload data + * @sg_cnt: number of elements in the scatter/gather list + * @io_dir: direction of the FCP request (see NVMEFC_FCP_xxx) + * @sqid: The nvme SQID the command is being issued on + * @done: The callback routine the LLDD is to invoke upon completion of + * the FCP operation. req argument is the pointer to the original + * FCP IO operation. + * @private: pointer to memory allocated alongside the FCP operation + * request structure that is specifically for the LLDD to use + * while processing the operation. The length of the buffer + * corresponds to the fcprqst_priv_sz value specified in the + * nvme_fc_port_template supplied by the LLDD. + * + * Values set by the LLDD indicating completion status of the FCP operation. + * Must be set prior to calling the done() callback. + * @transferred_length: amount of payload data, in bytes, that were + * transferred. Should equal payload_length on success. + * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. + * @status: Completion status of the FCP operation. must be 0 upon success, + * NVME_SC_FC_xxx value upon failure. Note: this is NOT a + * reflection of the NVME CQE completion status. Only the status + * of the FCP operation at the NVME-FC level. + */ +struct nvmefc_fcp_req { + void *cmdaddr; + void *rspaddr; + dma_addr_t cmddma; + dma_addr_t rspdma; + u16 cmdlen; + u16 rsplen; + + u32 payload_length; + struct sg_table sg_table; + struct scatterlist *first_sgl; + int sg_cnt; + enum nvmefc_fcp_datadir io_dir; + + __le16 sqid; + + void (*done)(struct nvmefc_fcp_req *req); + + void *private; + + u32 transferred_length; + u16 rcv_rsplen; + u32 status; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/* + * Direct copy of fc_port_state enum. 
For later merging + */ +enum nvme_fc_obj_state { + FC_OBJSTATE_UNKNOWN, + FC_OBJSTATE_NOTPRESENT, + FC_OBJSTATE_ONLINE, + FC_OBJSTATE_OFFLINE, /* User has taken Port Offline */ + FC_OBJSTATE_BLOCKED, + FC_OBJSTATE_BYPASSED, + FC_OBJSTATE_DIAGNOSTICS, + FC_OBJSTATE_LINKDOWN, + FC_OBJSTATE_ERROR, + FC_OBJSTATE_LOOPBACK, + FC_OBJSTATE_DELETED, +}; + + +/** + * struct nvme_fc_local_port - structure used between NVME-FC transport and + * a LLDD to reference a local NVME host port. + * Allocated/created by the nvme_fc_register_localport() + * transport interface. + * + * Fields with static values for the port. Initialized by the + * port_info struct supplied to the registration call. + * @port_num: NVME-FC transport host port number + * @port_role: NVME roles are supported on the port (see FC_PORT_ROLE_xxx) + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @private: pointer to memory allocated alongside the local port + * structure that is specifically for the LLDD to use. + * The length of the buffer corresponds to the local_priv_sz + * value specified in the nvme_fc_port_template supplied by + * the LLDD. + * + * Fields with dynamic values. Values may change based on link state. LLDD + * may reference fields directly to change them. Initialized by the + * port_info struct supplied to the registration call. + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + * @port_state: Operational state of the port. + */ +struct nvme_fc_local_port { + /* static/read-only fields */ + u32 port_num; + u32 port_role; + u64 node_name; + u64 port_name; + + void *private; + + /* dynamic fields */ + u32 port_id; + enum nvme_fc_obj_state port_state; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/** + * struct nvme_fc_remote_port - structure used between NVME-FC transport and + * a LLDD to reference a remote NVME subsystem port. + * Allocated/created by the nvme_fc_register_remoteport() + * transport interface. + * + * Fields with static values for the port. Initialized by the + * port_info struct supplied to the registration call. + * @port_num: NVME-FC transport remote subsystem port number + * @port_role: NVME roles are supported on the port (see FC_PORT_ROLE_xxx) + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @localport: pointer to the NVME-FC local host port the subsystem is + * connected to. + * @private: pointer to memory allocated alongside the remote port + * structure that is specifically for the LLDD to use. + * The length of the buffer corresponds to the remote_priv_sz + * value specified in the nvme_fc_port_template supplied by + * the LLDD. + * + * Fields with dynamic values. Values may change based on link or login + * state. LLDD may reference fields directly to change them. Initialized by + * the port_info struct supplied to the registration call. + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + * @port_state: Operational state of the remote port. Valid values are + * ONLINE or UNKNOWN.
+ */ +struct nvme_fc_remote_port { + /* static fields */ + u32 port_num; + u32 port_role; + u64 node_name; + u64 port_name; + + struct nvme_fc_local_port *localport; + + void *private; + + /* dynamic fields */ + u32 port_id; + enum nvme_fc_obj_state port_state; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/** + * struct nvme_fc_port_template - structure containing static entrypoints and + * operational parameters for an LLDD that supports NVME host + * behavior. Passed by reference in port registrations. + * NVME-FC transport remembers template reference and may + * access it during runtime operation. + * + * Host/Initiator Transport Entrypoints/Parameters: + * + * @localport_delete: The LLDD initiates deletion of a localport via + * nvme_fc_deregister_localport(). However, the teardown is + * asynchronous. This routine is called upon the completion of the + * teardown to inform the LLDD that the localport has been deleted. + * Entrypoint is Mandatory. + * + * @remoteport_delete: The LLDD initiates deletion of a remoteport via + * nvme_fc_deregister_remoteport(). However, the teardown is + * asynchronous. This routine is called upon the completion of the + * teardown to inform the LLDD that the remoteport has been deleted. + * Entrypoint is Mandatory. + * + * @create_queue: Upon creating a host<->controller association, queues are + * created such that they can be affinitized to cpus/cores. This + * callback into the LLDD notifies it that a controller queue is being + * created. The LLDD may choose to allocate an associated hw queue + * or map it onto a shared hw queue. Upon return from the call, the + * LLDD specifies a handle that will be given back to it for any + * command that is posted to the controller queue. The handle can + * be used by the LLDD to map quickly to the proper hw queue for + * command execution. The mask of cpu's that will map to this queue + * at the block-level is also passed in. The LLDD should use the + * queue id and/or cpu masks to ensure proper affinitization of the + * controller queue to the hw queue. + * Entrypoint is Optional. + * + * @delete_queue: This is the inverse of the create_queue. During + * host<->controller association teardown, this routine is called + * when a controller queue is being terminated. Any association with + * a hw queue should be terminated. If there is a unique hw queue, the + * hw queue should be torn down. + * Entrypoint is Optional. + * + * @poll_queue: Called to poll for the completion of an io on a blk queue. + * Entrypoint is Optional. + * + * @ls_req: Called to issue a FC-NVME FC-4 LS service request. + * The nvme_fc_ls_req structure will fully describe the buffers for + * the request payload and where to place the response payload. The + * LLDD is to allocate an exchange, issue the LS request, obtain the + * LS response, and call the "done" routine specified in the request + * structure (argument to done is the ls request structure itself). + * Entrypoint is Mandatory. + * + * @fcp_io: called to issue a FC-NVME I/O request. The I/O may be for + * an admin queue or an i/o queue. The nvmefc_fcp_req structure will + * fully describe the io: the buffer containing the FC-NVME CMD IU + * (which contains the SQE), the sg list for the payload if applicable, + * and the buffer to place the FC-NVME RSP IU into.
The LLDD will + * complete the i/o, indicating the amount of data transferred or + * any transport error, and call the "done" routine specified in the + * request structure (argument to done is the fcp request structure + * itself). + * Entrypoint is Mandatory. + * + * @ls_abort: called to request the LLDD to abort the indicated ls request. + * The call may return before the abort has completed. After aborting + * the request, the LLDD must still call the ls request done routine + * indicating an FC transport Aborted status. + * Entrypoint is Mandatory. + * + * @fcp_abort: called to request the LLDD to abort the indicated fcp request. + * The call may return before the abort has completed. After aborting + * the request, the LLDD must still call the fcp request done routine + * indicating an FC transport Aborted status. + * Entrypoint is Mandatory. + * + * @max_hw_queues: indicates the maximum number of hw queues the LLDD + * supports for cpu affinitization. + * Value is Mandatory. Must be at least 1. + * + * @max_sgl_segments: indicates the maximum number of sgl segments supported + * by the LLDD + * Value is Mandatory. Must be at least 1. Recommend at least 256. + * + * @max_dif_sgl_segments: indicates the maximum number of sgl segments + * supported by the LLDD for DIF operations. + * Value is Mandatory. Must be at least 1. Recommend at least 256. + * + * @dma_boundary: indicates the dma address boundary where dma mappings + * will be split across. + * Value is Mandatory. Typical value is 0xFFFFFFFF to split across + * 4Gig address boundaries + * + * @local_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like the fc nvme layer to allocate on the LLDD's + * behalf whenever a localport is allocated. The additional memory + * area is solely for the use of the LLDD and its location is specified by + * the localport->private pointer. + * Value is Mandatory. Allowed to be zero. + * + * @remote_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like the fc nvme layer to allocate on the LLDD's + * behalf whenever a remoteport is allocated. The additional memory + * area is solely for the use of the LLDD and its location is specified by + * the remoteport->private pointer. + * Value is Mandatory. Allowed to be zero. + * + * @lsrqst_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like the fc nvme layer to allocate on the LLDD's + * behalf whenever a ls request structure is allocated. The additional + * memory area is solely for the use of the LLDD and its location is + * specified by the ls_request->private pointer. + * Value is Mandatory. Allowed to be zero. + * + * @fcprqst_priv_sz: The LLDD sets this field to the amount of additional + * memory that it would like the fc nvme layer to allocate on the LLDD's + * behalf whenever a fcp request structure is allocated. The additional + * memory area is solely for the use of the LLDD and its location is + * specified by the fcp_request->private pointer. + * Value is Mandatory. Allowed to be zero.
+ */ +struct nvme_fc_port_template { + /* initiator-based functions */ + void (*localport_delete)(struct nvme_fc_local_port *); + void (*remoteport_delete)(struct nvme_fc_remote_port *); + int (*create_queue)(struct nvme_fc_local_port *, + unsigned int qidx, u16 qsize, + void **handle); + void (*delete_queue)(struct nvme_fc_local_port *, + unsigned int qidx, void *handle); + void (*poll_queue)(struct nvme_fc_local_port *, void *handle); + int (*ls_req)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + struct nvmefc_ls_req *); + int (*fcp_io)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + void *hw_queue_handle, + struct nvmefc_fcp_req *); + void (*ls_abort)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + struct nvmefc_ls_req *); + void (*fcp_abort)(struct nvme_fc_local_port *, + struct nvme_fc_remote_port *, + void *hw_queue_handle, + struct nvmefc_fcp_req *); + + u32 max_hw_queues; + u16 max_sgl_segments; + u16 max_dif_sgl_segments; + u64 dma_boundary; + + /* sizes of additional private data for data structures */ + u32 local_priv_sz; + u32 remote_priv_sz; + u32 lsrqst_priv_sz; + u32 fcprqst_priv_sz; +}; + + +/* + * Initiator/Host functions + */ + +int nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *template, + struct device *dev, + struct nvme_fc_local_port **lport_p); + +int nvme_fc_unregister_localport(struct nvme_fc_local_port *localport); + +int nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, + struct nvme_fc_port_info *pinfo, + struct nvme_fc_remote_port **rport_p); + +int nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *remoteport); + + + +/* + * *************** LLDD FC-NVME Target/Subsystem API *************** + * + * For FC LLDD's that are the NVME Subsystem role + * + * ****************************************************************** + */ + +/** + * struct nvmet_fc_port_info - port-specific ids and FC connection-specific + * data element used during NVME Subsystem role + * registrations + * + * Static fields describing the port being registered: + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * + * Initialization values for dynamic port fields: + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + */ +struct nvmet_fc_port_info { + u64 node_name; + u64 port_name; + u32 port_id; +}; + + +/** + * struct nvmefc_tgt_ls_req - Structure used between LLDD and NVMET-FC + * layer to represent the exchange context for + * a FC-NVME Link Service (LS). + * + * The structure is allocated by the LLDD whenever a LS Request is received + * from the FC link. The address of the structure is passed to the nvmet-fc + * layer via the nvmet_fc_rcv_ls_req() call. The address of the structure + * will be passed back to the LLDD when the response is to be transmit. + * The LLDD is to use the address to map back to the LLDD exchange structure + * which maintains information such as the targetport the LS was received + * on, the remote FC NVME initiator that sent the LS, and any FC exchange + * context. Upon completion of the LS response transmit, the address of the + * structure will be passed back to the LS rsp done() routine, allowing the + * nvmet-fc layer to release dma resources. Upon completion of the done() + * routine, no further access will be made by the nvmet-fc layer and the + * LLDD can de-allocate the structure. 
+ * + * Field initialization: + * At the time of the nvmet_fc_rcv_ls_req() call, there is no content that + * is valid in the structure. + * + * When the structure is used for the LLDD->xmt_ls_rsp() call, the nvmet-fc + * layer will fully set the fields in order to specify the response + * payload buffer and its length as well as the done routine to be called + * upon compeletion of the transmit. The nvmet-fc layer will also set a + * private pointer for its own use in the done routine. + * + * Values set by the NVMET-FC layer prior to calling the LLDD xmt_ls_rsp + * entrypoint. + * @rspbuf: pointer to the LS response buffer + * @rspdma: PCI DMA address of the LS response buffer + * @rsplen: Length, in bytes, of the LS response buffer + * @done: The callback routine the LLDD is to invoke upon completion of + * transmitting the LS response. req argument is the pointer to + * the original ls request. + * @nvmet_fc_private: pointer to an internal NVMET-FC layer structure used + * as part of the NVMET-FC processing. The LLDD is not to access + * this pointer. + */ +struct nvmefc_tgt_ls_req { + void *rspbuf; + dma_addr_t rspdma; + u16 rsplen; + + void (*done)(struct nvmefc_tgt_ls_req *req); + void *nvmet_fc_private; /* LLDD is not to access !! */ +}; + +/* Operations that NVME-FC layer may request the LLDD to perform for FCP */ +enum { + NVMET_FCOP_READDATA = 1, /* xmt data to initiator */ + NVMET_FCOP_WRITEDATA = 2, /* xmt data from initiator */ + NVMET_FCOP_READDATA_RSP = 3, /* xmt data to initiator and send + * rsp as well + */ + NVMET_FCOP_RSP = 4, /* send rsp frame */ + NVMET_FCOP_ABORT = 5, /* abort exchange via ABTS */ + NVMET_FCOP_BA_ACC = 6, /* send BA_ACC */ + NVMET_FCOP_BA_RJT = 7, /* send BA_RJT */ +}; + +/** + * struct nvmefc_tgt_fcp_req - Structure used between LLDD and NVMET-FC + * layer to represent the exchange context and + * the specific FC-NVME IU operation(s) to perform + * for a FC-NVME FCP IO. + * + * Structure used between LLDD and nvmet-fc layer to represent the exchange + * context for a FC-NVME FCP I/O operation (e.g. a nvme sqe, the sqe-related + * memory transfers, and its assocated cqe transfer). + * + * The structure is allocated by the LLDD whenever a FCP CMD IU is received + * from the FC link. The address of the structure is passed to the nvmet-fc + * layer via the nvmet_fc_rcv_fcp_req() call. The address of the structure + * will be passed back to the LLDD for the data operations and transmit of + * the response. The LLDD is to use the address to map back to the LLDD + * exchange structure which maintains information such as the targetport + * the FCP I/O was received on, the remote FC NVME initiator that sent the + * FCP I/O, and any FC exchange context. Upon completion of the FCP target + * operation, the address of the structure will be passed back to the FCP + * op done() routine, allowing the nvmet-fc layer to release dma resources. + * Upon completion of the done() routine for either RSP or ABORT ops, no + * further access will be made by the nvmet-fc layer and the LLDD can + * de-allocate the structure. + * + * Field initialization: + * At the time of the nvmet_fc_rcv_fcp_req() call, there is no content that + * is valid in the structure. + * + * When the structure is used for an FCP target operation, the nvmet-fc + * layer will fully set the fields in order to specify the scattergather + * list, the transfer length, as well as the done routine to be called + * upon compeletion of the operation. 
The nvmet-fc layer will also set a + * private pointer for its own use in the done routine. + * + * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !! + * + * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op + * entrypoint. + * @op: Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx) + * @hwqid: Specifies the hw queue index (0..N-1, where N is the + * max_hw_queues value from the LLD's nvmet_fc_target_template) + * that the operation is to use. + * @offset: Indicates the DATA_OUT/DATA_IN payload offset to be tranferred. + * Field is only valid on WRITEDATA, READDATA, or READDATA_RSP ops. + * @timeout: amount of time, in seconds, to wait for a response from the NVME + * host. A value of 0 is an infinite wait. + * Valid only for the following ops: + * WRITEDATA: caps the wait for data reception + * READDATA_RSP & RSP: caps wait for FCP_CONF reception (if used) + * @transfer_length: the length, in bytes, of the DATA_OUT or DATA_IN payload + * that is to be transferred. + * Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops. + * @ba_rjt: Contains the BA_RJT payload that is to be transferred. + * Valid only for the NVMET_FCOP_BA_RJT op. + * @sg: Scatter/gather list for the DATA_OUT/DATA_IN payload data. + * Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops. + * @sg_cnt: Number of valid entries in the scatter/gather list. + * Valid only for the WRITEDATA, READDATA, or READDATA_RSP ops. + * @rspaddr: pointer to the FCP RSP IU buffer to be transmit + * Used by RSP and READDATA_RSP ops + * @rspdma: PCI DMA address of the FCP RSP IU buffer + * Used by RSP and READDATA_RSP ops + * @rsplen: Length, in bytes, of the FCP RSP IU buffer + * Used by RSP and READDATA_RSP ops + * @done: The callback routine the LLDD is to invoke upon completion of + * the operation. req argument is the pointer to the original + * FCP subsystem op request. + * @nvmet_fc_private: pointer to an internal NVMET-FC layer structure used + * as part of the NVMET-FC processing. The LLDD is not to + * reference this field. + * + * Values set by the LLDD indicating completion status of the FCP operation. + * Must be set prior to calling the done() callback. + * @transferred_length: amount of DATA_OUT payload data received by a + * a WRITEDATA operation. If not a WRITEDATA operation, value must + * be set to 0. Should equal transfer_length on success. + * @fcp_error: status of the FCP operation. Must be 0 on success; on failure + * must be a NVME_SC_FC_xxxx value. + */ +struct nvmefc_tgt_fcp_req { + u8 op; + u16 hwqid; + u32 offset; + u32 timeout; + u32 transfer_length; + struct fc_ba_rjt ba_rjt; + struct scatterlist sg[NVME_FC_MAX_SEGMENTS]; + int sg_cnt; + void *rspaddr; + dma_addr_t rspdma; + u16 rsplen; + + void (*done)(struct nvmefc_tgt_fcp_req *); + + void *nvmet_fc_private; /* LLDD is not to access !! */ + + u32 transferred_length; + int fcp_error; +}; + + +/* Target Features (Bit fields) LLDD supports */ +enum { + NVMET_FCTGTFEAT_READDATA_RSP = (1 << 0), + /* Bit 0: supports the NVMET_FCPOP_READDATA_RSP op, which + * sends (the last) Read Data sequence followed by the RSP + * sequence in one LLDD operation. Errors during Data + * sequence transmit must not allow RSP sequence to be sent. + */ + NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED = (1 << 1), + /* Bit 1: When 0, the LLDD will deliver FCP CMD + * on the CPU it should be affinitized to. Thus work will + * be scheduled on the cpu received on. When 1, the LLDD + * may not deliver the CMD on the CPU it should be worked + * on. 
The transport should pick a cpu to schedule the work + * on. + */ +}; + + +/** + * struct nvmet_fc_target_port - structure used between NVME-FC transport and + * a LLDD to reference a local NVME subsystem port. + * Allocated/created by the nvme_fc_register_targetport() + * transport interface. + * + * Fields with static values for the port. Initialized by the + * port_info struct supplied to the registration call. + * @port_num: NVME-FC transport subsytem port number + * @node_name: FC WWNN for the port + * @port_name: FC WWPN for the port + * @private: pointer to memory allocated alongside the local port + * structure that is specifically for the LLDD to use. + * The length of the buffer corresponds to the target_priv_sz + * value specified in the nvme_fc_target_template supplied by + * the LLDD. + * + * Fields with dynamic values. Values may change base on link state. LLDD + * may reference fields directly to change them. Initialized by the + * port_info struct supplied to the registration call. + * @port_id: FC N_Port_ID currently assigned the port. Upper 8 bits must + * be set to 0. + * @port_state: Operational state of the port. + */ +struct nvmet_fc_target_port { + /* static/read-only fields */ + u32 port_num; + u64 node_name; + u64 port_name; + + void *private; + + /* dynamic fields */ + u32 port_id; + enum nvme_fc_obj_state port_state; +} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ + + +/** + * struct nvmet_fc_target_template - structure containing static entrypoints + * and operational parameters for an LLDD that supports NVME + * subsystem behavior. Passed by reference in port + * registrations. NVME-FC transport remembers template + * reference and may access it during runtime operation. + * + * Subsystem/Target Transport Entrypoints/Parameters: + * + * @targetport_delete: The LLDD initiates deletion of a targetport via + * nvmet_fc_unregister_targetport(). However, the teardown is + * asynchronous. This routine is called upon the completion of the + * teardown to inform the LLDD that the targetport has been deleted. + * Entrypoint is Mandatory. + * + * @xmt_ls_rsp: Called to transmit the response to a FC-NVME FC-4 LS service. + * The nvmefc_tgt_ls_req structure is the same LLDD-supplied exchange + * structure specified in the nvmet_fc_rcv_ls_req() call made when + * the LS request was received. The structure will fully describe + * the buffers for the response payload and the dma address of the + * payload. The LLDD is to transmit the response (or return a non-zero + * errno status), and upon completion of the transmit, call the + * "done" routine specified in the nvmefc_tgt_ls_req structure + * (argument to done is the ls reqwuest structure itself). + * After calling the done routine, the LLDD shall consider the + * LS handling complete and the nvmefc_tgt_ls_req structure may + * be freed/released. + * Entrypoint is Mandatory. + * + * @fcp_op: Called to perform a data transfer, transmit a response, or + * abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same + * LLDD-supplied exchange structure specified in the + * nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received. + * The op field in the structure shall indicate the operation for + * the LLDD to perform relative to the io. + * NVMET_FCOP_READDATA operation: the LLDD is to send the + * payload data (described by sglist) to the host in 1 or + * more FC sequences (preferrably 1). 
+
+
+/**
+ * struct nvmet_fc_target_template - structure containing static entrypoints
+ *     and operational parameters for an LLDD that supports NVME
+ *     subsystem behavior. Passed by reference in port
+ *     registrations. NVME-FC transport remembers template
+ *     reference and may access it during runtime operation.
+ *
+ * Subsystem/Target Transport Entrypoints/Parameters:
+ *
+ * @targetport_delete: The LLDD initiates deletion of a targetport via
+ *     nvmet_fc_unregister_targetport(). However, the teardown is
+ *     asynchronous. This routine is called upon the completion of the
+ *     teardown to inform the LLDD that the targetport has been deleted.
+ *     Entrypoint is Mandatory.
+ *
+ * @xmt_ls_rsp: Called to transmit the response to an FC-NVME FC-4 LS service.
+ *     The nvmefc_tgt_ls_req structure is the same LLDD-supplied exchange
+ *     structure specified in the nvmet_fc_rcv_ls_req() call made when
+ *     the LS request was received. The structure will fully describe
+ *     the buffers for the response payload and the dma address of the
+ *     payload. The LLDD is to transmit the response (or return a non-zero
+ *     errno status), and upon completion of the transmit, call the
+ *     "done" routine specified in the nvmefc_tgt_ls_req structure
+ *     (argument to done is the ls request structure itself).
+ *     After calling the done routine, the LLDD shall consider the
+ *     LS handling complete and the nvmefc_tgt_ls_req structure may
+ *     be freed/released.
+ *     Entrypoint is Mandatory.
+ *
+ * @fcp_op: Called to perform a data transfer, transmit a response, or
+ *     abort an FCP operation. The nvmefc_tgt_fcp_req structure is the same
+ *     LLDD-supplied exchange structure specified in the
+ *     nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received.
+ *     The op field in the structure shall indicate the operation for
+ *     the LLDD to perform relative to the io:
+ *       NVMET_FCOP_READDATA operation: the LLDD is to send the
+ *         payload data (described by sglist) to the host in 1 or
+ *         more FC sequences (preferably 1). Note: the fc-nvme layer
+ *         may call the READDATA operation multiple times for longer
+ *         payloads.
+ *       NVMET_FCOP_WRITEDATA operation: the LLDD is to receive the
+ *         payload data (described by sglist) from the host via 1 or
+ *         more FC sequences (preferably 1). The LLDD is to generate
+ *         the XFER_RDY IU(s) corresponding to the data being requested.
+ *         Note: the FC-NVME layer may call the WRITEDATA operation
+ *         multiple times for longer payloads.
+ *       NVMET_FCOP_READDATA_RSP operation: the LLDD is to send the
+ *         payload data (described by sglist) to the host in 1 or
+ *         more FC sequences (preferably 1). If an error occurs during
+ *         payload data transmission, the LLDD is to set the
+ *         nvmefc_tgt_fcp_req fcp_error and transferred_length fields and
+ *         consider the operation complete. On error, the LLDD is to not
+ *         transmit the FCP_RSP iu. If all payload data is transferred
+ *         successfully, the LLDD is to update the nvmefc_tgt_fcp_req
+ *         transferred_length field and may subsequently transmit the
+ *         FCP_RSP iu payload (described by rspaddr, rspdma, rsplen).
+ *         The LLDD is to await FCP_CONF reception to confirm the RSP
+ *         reception by the host. The LLDD may retransmit the FCP_RSP iu
+ *         if necessary per FC-NVME. Upon reception of FCP_CONF, or upon
+ *         FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req
+ *         fcp_error field and consider the operation complete.
+ *       NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload
+ *         (described by rspaddr, rspdma, rsplen). The LLDD is to await
+ *         FCP_CONF reception to confirm the RSP reception by the host.
+ *         The LLDD may retransmit the FCP_RSP iu if necessary per FC-NVME.
+ *         Upon reception of FCP_CONF, or upon FCP_CONF failure, the
+ *         LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
+ *         consider the operation complete.
+ *       NVMET_FCOP_ABORT: the LLDD is to terminate the exchange
+ *         corresponding to the fcp operation. The LLDD shall send
+ *         ABTS and follow FC exchange abort-multi rules, including
+ *         ABTS retries and possible logout.
+ *     Upon completing the indicated operation, the LLDD is to set the
+ *     status fields for the operation (transferred_length and fcp_error
+ *     status) in the request, then call the "done" routine
+ *     indicated in the fcp request. Upon return from the "done"
+ *     routine for either an NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation,
+ *     the fc-nvme layer will no longer reference the fcp request,
+ *     allowing the LLDD to free/release the fcp request.
+ *     Note: when calling the done routine for READDATA or WRITEDATA
+ *     operations, the fc-nvme layer may immediately convert, in the same
+ *     thread and before returning to the LLDD, the fcp operation to
+ *     the next operation for the fcp io and call the LLDD's fcp_op
+ *     entrypoint again. If fields in the fcp request are to be accessed
+ *     after the done call, the LLDD should save their values prior to
+ *     calling the done routine, and inspect the saved values after the
+ *     done routine returns.
+ *     Returns 0 on success, -<errno> on failure (ex: -EIO)
+ *     Entrypoint is Mandatory.
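The op dispatch in an LLDD typically reduces to a switch over the NVMET_FCOP_xxx values. The following is a hedged sketch of such an entrypoint; my_hwq() and the my_hw_* helpers are hypothetical driver internals, and the NVMET_FCOP_BA_RJT case is omitted for brevity.

static int my_lldd_fcp_op(struct nvmet_fc_target_port *tgtport,
                          struct nvmefc_tgt_fcp_req *fcpreq)
{
    /* hwqid selects one of the LLDD's max_hw_queues queues */
    struct my_hw_queue *hwq = my_hwq(tgtport, fcpreq->hwqid);

    switch (fcpreq->op) {
    case NVMET_FCOP_WRITEDATA:
        /* generate XFER_RDY, then land transfer_length bytes into
         * fcpreq->sg at payload offset fcpreq->offset */
        return my_hw_recv_data(hwq, fcpreq);
    case NVMET_FCOP_READDATA:
    case NVMET_FCOP_READDATA_RSP:
        /* send the data; for READDATA_RSP also chain the FCP_RSP */
        return my_hw_send_data(hwq, fcpreq);
    case NVMET_FCOP_RSP:
        /* transmit the FCP_RSP IU at rspaddr/rspdma/rsplen */
        return my_hw_send_rsp(hwq, fcpreq);
    case NVMET_FCOP_ABORT:
        my_hw_abort_exchange(hwq, fcpreq);
        return 0;    /* an ABORT request must never fail */
    default:
        return -EINVAL;
    }
}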
+ *
+ * @max_hw_queues: indicates the maximum number of hw queues the LLDD
+ *     supports for cpu affinitization.
+ *     Value is Mandatory. Must be at least 1.
+ *
+ * @max_sgl_segments: indicates the maximum number of sgl segments supported
+ *     by the LLDD
+ *     Value is Mandatory. Must be at least 1. Recommend at least 256.
+ *
+ * @max_dif_sgl_segments: indicates the maximum number of sgl segments
+ *     supported by the LLDD for DIF operations.
+ *     Value is Mandatory. Must be at least 1. Recommend at least 256.
+ *
+ * @dma_boundary: indicates the dma address boundary across which dma
+ *     mappings will be split.
+ *     Value is Mandatory. Typical value is 0xFFFFFFFF to split across
+ *     4Gig address boundaries.
+ *
+ * @target_features: The LLDD sets bits in this field to correspond to
+ *     optional features that are supported by the LLDD.
+ *     Refer to the NVMET_FCTGTFEAT_xxx values.
+ *     Value is Mandatory. Allowed to be zero.
+ *
+ * @target_priv_sz: The LLDD sets this field to the amount of additional
+ *     memory that it would like the fc nvme layer to allocate on the
+ *     LLDD's behalf whenever a targetport is allocated. The additional
+ *     memory area is solely for the use of the LLDD and its location is
+ *     specified by the targetport->private pointer.
+ *     Value is Mandatory. Allowed to be zero.
+ */
+struct nvmet_fc_target_template {
+    void (*targetport_delete)(struct nvmet_fc_target_port *tgtport);
+    int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport,
+                struct nvmefc_tgt_ls_req *tls_req);
+    int (*fcp_op)(struct nvmet_fc_target_port *tgtport,
+                struct nvmefc_tgt_fcp_req *);
+
+    u32 max_hw_queues;
+    u16 max_sgl_segments;
+    u16 max_dif_sgl_segments;
+    u64 dma_boundary;
+
+    u32 target_features;
+
+    u32 target_priv_sz;
+};
+
+
+int nvmet_fc_register_targetport(struct nvmet_fc_port_info *portinfo,
+            struct nvmet_fc_target_template *template,
+            struct device *dev,
+            struct nvmet_fc_target_port **tgtport_p);
+
+int nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *tgtport);
+
+int nvmet_fc_rcv_ls_req(struct nvmet_fc_target_port *tgtport,
+            struct nvmefc_tgt_ls_req *lsreq,
+            void *lsreqbuf, u32 lsreqbuf_len);
+
+int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport,
+            struct nvmefc_tgt_fcp_req *fcpreq,
+            void *cmdiubuf, u32 cmdiubuf_len);
+
+#endif /* _NVME_FC_DRIVER_H */
-- 
cgit v1.2.3
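Putting the pieces together, a hedged registration sketch under stated assumptions: the nvmet_fc_port_info fields are assumed to be the WWNN/WWPN/N_Port_ID triple introduced earlier in this header, and my_targetport_delete, my_xmt_ls_rsp, struct my_tgtport_priv, my_tgtport_priv_init() and my_register() are illustrative names (my_lldd_fcp_op is the dispatch sketch shown above).

static struct nvmet_fc_target_template my_tgt_template = {
    .targetport_delete    = my_targetport_delete,
    .xmt_ls_rsp           = my_xmt_ls_rsp,
    .fcp_op               = my_lldd_fcp_op,
    .max_hw_queues        = 4,
    .max_sgl_segments     = 256,
    .max_dif_sgl_segments = 256,
    .dma_boundary         = 0xFFFFFFFF,    /* split at 4Gig boundaries */
    .target_features      = NVMET_FCTGTFEAT_READDATA_RSP,
    .target_priv_sz       = sizeof(struct my_tgtport_priv),
};

static struct nvmet_fc_target_port *my_register(struct device *dev,
                                                u64 wwnn, u64 wwpn,
                                                u32 nport_id)
{
    struct nvmet_fc_port_info pinfo = {
        .node_name = wwnn,
        .port_name = wwpn,
        .port_id   = nport_id,
    };
    struct nvmet_fc_target_port *tgtport;
    int ret;

    ret = nvmet_fc_register_targetport(&pinfo, &my_tgt_template,
                                       dev, &tgtport);
    if (ret)
        return ERR_PTR(ret);

    /* target_priv_sz bytes, private to the LLDD */
    my_tgtport_priv_init(tgtport->private);
    return tgtport;
}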
From f9d03f96b988002027d4b28ea1b7a24729a4c9b5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 8 Dec 2016 15:20:32 -0700
Subject: block: improve handling of the magic discard payload

Instead of allocating a single unused biovec for discard requests, send
them down without any payload. Instead we allow the driver to add a
"special" payload using a biovec embedded into struct request (unioned
over other fields never used while in the driver), and overloading the
number of segments for this case.

This has a couple of advantages:

 - we don't have to allocate the bio_vec
 - the amount of special casing for discard requests in the block
   layer is significantly reduced
 - using this same scheme for other request types is trivial, which
   will be important for implementing the new WRITE_ZEROES op on
   devices where it actually requires a payload (e.g. SCSI)
 - we can get rid of playing games with the request length, as we'll
   never touch it and completions will work just fine
 - it will allow us to support ranged discard operations in the future
   by merging non-contiguous discard bios into a single request
 - last but not least it removes a lot of code

This patch is the common base for my WIP series for ranged discards and
to remove discard_zeroes_data in favor of always using
REQ_OP_WRITE_ZEROES, so it would be good to get it in quickly.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 include/linux/bio.h    |  3 ++-
 include/linux/blkdev.h | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index b15323934a29..7cf8a6c70a3f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -197,8 +197,9 @@ static inline unsigned bio_segments(struct bio *bio)
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_SAME:
 	case REQ_OP_WRITE_ZEROES:
+		return 0;
+	case REQ_OP_WRITE_SAME:
 		return 1;
 	default:
 		break;

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ebeef2b79c5a..c5393766909d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,10 +120,13 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_HASHED		((__force req_flags_t)(1 << 16))
 /* IO stats tracking on */
 #define RQF_STATS		((__force req_flags_t)(1 << 17))
+/* Look at ->special_vec for the actual data payload instead of the
+   bio chain. */
+#define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
-	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ)
+	(RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
 
 #define BLK_MAX_CDB	16
 
@@ -175,6 +178,7 @@ struct request {
 	 */
 	union {
 		struct rb_node rb_node;	/* sort/lookup */
+		struct bio_vec special_vec;
 		void *completion_data;
 	};
 
@@ -909,8 +913,6 @@ extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern void blk_rq_set_block_pc(struct request *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
-extern void blk_add_request_payload(struct request *rq, struct page *page,
-		int offset, unsigned int len);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
@@ -1153,6 +1155,13 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
+static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
+{
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return 1;
+	return rq->nr_phys_segments;
+}
+
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
-- 
cgit v1.2.3
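For a driver, the practical consequence is that a request flagged RQF_SPECIAL_PAYLOAD carries exactly one segment in rq->special_vec rather than in its bio chain. A hedged sketch of a mapping path under that assumption follows; my_drv_map_request() is illustrative, not an existing API.

static int my_drv_map_request(struct request *rq, struct scatterlist *sgl)
{
    /* nr_phys_segments is overloaded: use the helper so a special
     * payload is counted as the single segment it is */
    unsigned short nseg = blk_rq_nr_phys_segments(rq);

    if (!nseg)
        return 0;    /* e.g. a flush: nothing to map */

    sg_init_table(sgl, nseg);

    if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
        /* driver-built payload, e.g. a discard descriptor page */
        sg_set_page(sgl, rq->special_vec.bv_page,
                    rq->special_vec.bv_len,
                    rq->special_vec.bv_offset);
        return 1;
    }

    return blk_rq_map_sg(rq->q, rq, sgl);
}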
From ae911c5e796d51cb2d1ed3a55e73b9cc88d176cf Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 8 Dec 2016 13:19:30 -0700
Subject: blk-mq: add blk_mq_start_stopped_hw_queue()

We have a variant for all hardware queues, but not one for a single
hardware queue.

Signed-off-by: Jens Axboe
Reviewed-by: Hannes Reinecke
---
 include/linux/blk-mq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 35a0af5ede6d..87e404aae267 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -231,6 +231,7 @@ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_stop_hw_queues(struct request_queue *q);
 void blk_mq_start_hw_queues(struct request_queue *q);
+void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-- 
cgit v1.2.3

From 70b3ea056f3074be6d9256c312b64c0d90a4a683 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 7 Dec 2016 08:43:31 -0700
Subject: elevator: make the rqhash helpers exported

Signed-off-by: Jens Axboe
Reviewed-by: Hannes Reinecke
---
 include/linux/elevator.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index f219c9aed360..b276e9ef0e0b 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -108,6 +108,11 @@ struct elevator_type
 
 #define ELV_HASH_BITS 6
 
+void elv_rqhash_del(struct request_queue *q, struct request *rq);
+void elv_rqhash_add(struct request_queue *q, struct request *rq);
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
+
 /*
  * each queue has an elevator_queue associated with it
 */
-- 
cgit v1.2.3
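The single-queue variant pairs naturally with blk_mq_stop_hw_queue() for per-queue flow control. A hedged sketch of that pairing, assuming a driver-local struct my_dev and my_hw_* helpers (none of which are kernel API):

static int my_drv_queue_rq(struct blk_mq_hw_ctx *hctx,
                           const struct blk_mq_queue_data *bd)
{
    struct my_dev *dev = hctx->driver_data;

    if (!my_hw_can_queue(dev)) {
        /* stop only this queue until resources return */
        blk_mq_stop_hw_queue(hctx);
        return BLK_MQ_RQ_QUEUE_BUSY;
    }
    return my_hw_submit(dev, bd->rq);    /* BLK_MQ_RQ_QUEUE_* value */
}

static void my_drv_completion_irq(struct my_dev *dev)
{
    my_hw_reap_completions(dev);
    /* restart the queue only if it was stopped; async=true avoids
     * running the queue from interrupt context */
    blk_mq_start_stopped_hw_queue(dev->hctx, true);
}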
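With the rqhash helpers exported, an I/O scheduler outside elevator.c can reuse the hash, which indexes requests by their end sector, for back-merge lookups. A hedged sketch follows; my_sched_find_back_merge() is illustrative, and a real scheduler would also verify the candidate with the usual elv_bio_merge_ok()-style checks.

static int my_sched_find_back_merge(struct request_queue *q,
                                    struct request **ret_rq,
                                    struct bio *bio)
{
    /* a back-merge candidate ends exactly where this bio begins */
    struct request *rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);

    if (rq) {
        *ret_rq = rq;
        return ELEVATOR_BACK_MERGE;
    }
    return ELEVATOR_NO_MERGE;
}

Once a merge grows a request, the scheduler would call elv_rqhash_reposition() so the hash again reflects the request's new end sector.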