summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/block/biodoc.txt4
-rw-r--r--MAINTAINERS1
-rw-r--r--block/Kconfig1
-rw-r--r--block/bio.c5
-rw-r--r--block/blk-core.c16
-rw-r--r--block/blk-mq-sysfs.c40
-rw-r--r--block/blk-mq-tag.c503
-rw-r--r--block/blk-mq-tag.h42
-rw-r--r--block/blk-mq.c183
-rw-r--r--block/blk-mq.h11
-rw-r--r--block/blk-sysfs.c4
-rw-r--r--block/cfq-iosched.c13
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c2
-rw-r--r--drivers/block/nbd.c411
-rw-r--r--drivers/block/null_blk.c128
-rw-r--r--drivers/lightnvm/Kconfig2
-rw-r--r--drivers/lightnvm/Makefile2
-rw-r--r--drivers/lightnvm/core.c55
-rw-r--r--drivers/lightnvm/lightnvm.h35
-rw-r--r--drivers/lightnvm/sysfs.c198
-rw-r--r--drivers/md/bcache/btree.c6
-rw-r--r--drivers/md/bcache/debug.c6
-rw-r--r--drivers/md/bcache/movinggc.c5
-rw-r--r--drivers/md/bcache/request.c9
-rw-r--r--drivers/md/bcache/writeback.c5
-rw-r--r--drivers/md/dm-crypt.c2
-rw-r--r--drivers/md/dm-log-writes.c6
-rw-r--r--drivers/md/dm-rq.c2
-rw-r--r--drivers/md/raid1.c8
-rw-r--r--drivers/nvme/host/core.c155
-rw-r--r--drivers/nvme/host/fabrics.c25
-rw-r--r--drivers/nvme/host/fabrics.h11
-rw-r--r--drivers/nvme/host/lightnvm.c33
-rw-r--r--drivers/nvme/host/nvme.h30
-rw-r--r--drivers/nvme/host/scsi.c80
-rw-r--r--drivers/nvme/target/admin-cmd.c88
-rw-r--r--drivers/nvme/target/io-cmd.c3
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/block_dev.c18
-rw-r--r--fs/btrfs/inode.c5
-rw-r--r--include/linux/bio.h3
-rw-r--r--include/linux/blk-cgroup.h2
-rw-r--r--include/linux/blk-mq.h28
-rw-r--r--include/linux/blk_types.h21
-rw-r--r--include/linux/blkdev.h4
-rw-r--r--include/linux/ioprio.h1
-rw-r--r--include/linux/lightnvm.h18
-rw-r--r--include/linux/sbitmap.h373
-rw-r--r--include/linux/workqueue.h1
-rw-r--r--kernel/workqueue.c40
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Makefile2
-rw-r--r--lib/sbitmap.c347
53 files changed, 1828 insertions, 1170 deletions
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index bcdb2b4c1f12..918e1e0d0e78 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -115,7 +115,7 @@ i. Per-queue limits/values exported to the generic layer by the driver
Various parameters that the generic i/o scheduler logic uses are set at
a per-queue level (e.g maximum request size, maximum number of segments in
-a scatter-gather list, hardsect size)
+a scatter-gather list, logical block size)
Some parameters that were earlier available as global arrays indexed by
major/minor are now directly associated with the queue. Some of these may
@@ -156,7 +156,7 @@ Some new queue property settings:
blk_queue_max_segment_size(q, max_seg_size)
Maximum size of a clustered segment, 64kB default.
- blk_queue_hardsect_size(q, hardsect_size)
+ blk_queue_logical_block_size(q, logical_block_size)
Lowest possible sector size that the hardware can operate
on, 512 bytes default.
diff --git a/MAINTAINERS b/MAINTAINERS
index 255655880881..fb4d381a6ecf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2472,6 +2472,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
S: Maintained
F: block/
F: kernel/trace/blktrace.c
+F: lib/sbitmap.c
BLOCK2MTD DRIVER
M: Joern Engel <joern@lazybastard.org>
diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..5136ad4bb6d5 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,6 +4,7 @@
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select SBITMAP
help
Provide block layer support for the kernel.
diff --git a/block/bio.c b/block/bio.c
index aa7354088008..db85c5753a76 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1068,7 +1068,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
return 0;
}
-static void bio_free_pages(struct bio *bio)
+void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
int i;
@@ -1076,6 +1076,7 @@ static void bio_free_pages(struct bio *bio)
bio_for_each_segment_all(bvec, bio, i)
__free_page(bvec->bv_page);
}
+EXPORT_SYMBOL(bio_free_pages);
/**
* bio_uncopy_user - finish previously mapped bio
@@ -1274,7 +1275,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
nr_pages += end - start;
/*
- * buffer must be aligned to at least hardsector size for now
+ * buffer must be aligned to at least logical block size for now
*/
if (uaddr & queue_dma_alignment(q))
return ERR_PTR(-EINVAL);
diff --git a/block/blk-core.c b/block/blk-core.c
index 36c7ac328d8c..14d7c0740dc0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -288,7 +288,7 @@ void blk_sync_queue(struct request_queue *q)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- cancel_delayed_work_sync(&hctx->run_work);
+ cancel_work_sync(&hctx->run_work);
cancel_delayed_work_sync(&hctx->delay_work);
}
} else {
@@ -3097,6 +3097,12 @@ int kblockd_schedule_work(struct work_struct *work)
}
EXPORT_SYMBOL(kblockd_schedule_work);
+int kblockd_schedule_work_on(int cpu, struct work_struct *work)
+{
+ return queue_work_on(cpu, kblockd_workqueue, work);
+}
+EXPORT_SYMBOL(kblockd_schedule_work_on);
+
int kblockd_schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
@@ -3301,19 +3307,23 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
{
struct blk_plug *plug;
long state;
+ unsigned int queue_num;
+ struct blk_mq_hw_ctx *hctx;
if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return false;
+ queue_num = blk_qc_t_to_queue_num(cookie);
+ hctx = q->queue_hw_ctx[queue_num];
+ hctx->poll_considered++;
+
plug = current->plug;
if (plug)
blk_flush_plug_list(plug, false);
state = current->state;
while (!need_resched()) {
- unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
- struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
int ret;
hctx->poll_invoked++;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index fe822aa5b8e4..01fb455d3377 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -176,7 +176,17 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page)
{
- return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success);
+ return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n",
+ hctx->poll_considered, hctx->poll_invoked,
+ hctx->poll_success);
+}
+
+static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx,
+ const char *page, size_t size)
+{
+ hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
+
+ return size;
}
static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
@@ -198,12 +208,14 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
- for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
- unsigned long d = 1U << (i - 1);
+ for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
+ unsigned int d = 1U << (i - 1);
- page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
+ page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]);
}
+ page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1),
+ hctx->dispatched[i]);
return page - start_page;
}
@@ -301,8 +313,9 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.show = blk_mq_hw_sysfs_cpus_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
- .attr = {.name = "io_poll", .mode = S_IRUGO },
+ .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO },
.show = blk_mq_hw_sysfs_poll_show,
+ .store = blk_mq_hw_sysfs_poll_store,
};
static struct attribute *default_hw_ctx_attrs[] = {
@@ -380,9 +393,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
return ret;
}
-static void __blk_mq_unregister_disk(struct gendisk *disk)
+static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
- struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int i, j;
@@ -400,15 +412,15 @@ static void __blk_mq_unregister_disk(struct gendisk *disk)
kobject_del(&q->mq_kobj);
kobject_put(&q->mq_kobj);
- kobject_put(&disk_to_dev(disk)->kobj);
+ kobject_put(&dev->kobj);
q->mq_sysfs_init_done = false;
}
-void blk_mq_unregister_disk(struct gendisk *disk)
+void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
blk_mq_disable_hotplug();
- __blk_mq_unregister_disk(disk);
+ __blk_mq_unregister_dev(dev, q);
blk_mq_enable_hotplug();
}
@@ -430,10 +442,8 @@ static void blk_mq_sysfs_init(struct request_queue *q)
}
}
-int blk_mq_register_disk(struct gendisk *disk)
+int blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
- struct device *dev = disk_to_dev(disk);
- struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
int ret, i;
@@ -454,7 +464,7 @@ int blk_mq_register_disk(struct gendisk *disk)
}
if (ret)
- __blk_mq_unregister_disk(disk);
+ __blk_mq_unregister_dev(dev, q);
else
q->mq_sysfs_init_done = true;
out:
@@ -462,7 +472,7 @@ out:
return ret;
}
-EXPORT_SYMBOL_GPL(blk_mq_register_disk);
+EXPORT_SYMBOL_GPL(blk_mq_register_dev);
void blk_mq_sysfs_unregister(struct request_queue *q)
{
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 729bac3a673b..cef618f6fc92 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,58 +1,24 @@
/*
- * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
- * over multiple cachelines to avoid ping-pong between multiple submitters
- * or submitter and completer. Uses rolling wakeups to avoid falling of
- * the scaling cliff when we run out of tags and have to start putting
- * submitters to sleep.
- *
- * Uses active queue tracking to support fairer distribution of tags
- * between multiple submitters when a shared tag map is used.
+ * Tag allocation using scalable bitmaps. Uses active queue tracking to support
+ * fairer distribution of tags between multiple submitters when a shared tag map
+ * is used.
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/random.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
-static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
-{
- int i;
-
- for (i = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
- int ret;
-
- ret = find_first_zero_bit(&bm->word, bm->depth);
- if (ret < bm->depth)
- return true;
- }
-
- return false;
-}
-
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
if (!tags)
return true;
- return bt_has_free_tags(&tags->bitmap_tags);
-}
-
-static inline int bt_index_inc(int index)
-{
- return (index + 1) & (BT_WAIT_QUEUES - 1);
-}
-
-static inline void bt_index_atomic_inc(atomic_t *index)
-{
- int old = atomic_read(index);
- int new = bt_index_inc(old);
- atomic_cmpxchg(index, old, new);
+ return sbitmap_any_bit_clear(&tags->bitmap_tags.sb);
}
/*
@@ -72,29 +38,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
- struct blk_mq_bitmap_tags *bt;
- int i, wake_index;
-
- /*
- * Make sure all changes prior to this are visible from other CPUs.
- */
- smp_mb();
- bt = &tags->bitmap_tags;
- wake_index = atomic_read(&bt->wake_index);
- for (i = 0; i < BT_WAIT_QUEUES; i++) {
- struct bt_wait_state *bs = &bt->bs[wake_index];
-
- if (waitqueue_active(&bs->wait))
- wake_up(&bs->wait);
-
- wake_index = bt_index_inc(wake_index);
- }
-
- if (include_reserve) {
- bt = &tags->breserved_tags;
- if (waitqueue_active(&bt->bs[0].wait))
- wake_up(&bt->bs[0].wait);
- }
+ sbitmap_queue_wake_all(&tags->bitmap_tags);
+ if (include_reserve)
+ sbitmap_queue_wake_all(&tags->breserved_tags);
}
/*
@@ -118,7 +64,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
* and attempt to provide a fair share of the tag depth for each of them.
*/
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_bitmap_tags *bt)
+ struct sbitmap_queue *bt)
{
unsigned int depth, users;
@@ -130,7 +76,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/*
* Don't try dividing an ant
*/
- if (bt->depth == 1)
+ if (bt->sb.depth == 1)
return true;
users = atomic_read(&hctx->tags->active_queues);
@@ -140,142 +86,36 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/*
* Allow at least some tags
*/
- depth = max((bt->depth + users - 1) / users, 4U);
+ depth = max((bt->sb.depth + users - 1) / users, 4U);
return atomic_read(&hctx->nr_active) < depth;
}
-static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag,
- bool nowrap)
-{
- int tag, org_last_tag = last_tag;
-
- while (1) {
- tag = find_next_zero_bit(&bm->word, bm->depth, last_tag);
- if (unlikely(tag >= bm->depth)) {
- /*
- * We started with an offset, and we didn't reset the
- * offset to 0 in a failure case, so start from 0 to
- * exhaust the map.
- */
- if (org_last_tag && last_tag && !nowrap) {
- last_tag = org_last_tag = 0;
- continue;
- }
- return -1;
- }
-
- if (!test_and_set_bit(tag, &bm->word))
- break;
-
- last_tag = tag + 1;
- if (last_tag >= bm->depth - 1)
- last_tag = 0;
- }
-
- return tag;
-}
-
-#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
-
-/*
- * Straight forward bitmap tag implementation, where each bit is a tag
- * (cleared == free, and set == busy). The small twist is using per-cpu
- * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
- * contexts. This enables us to drastically limit the space searched,
- * without dirtying an extra shared cacheline like we would if we stored
- * the cache value inside the shared blk_mq_bitmap_tags structure. On top
- * of that, each word of tags is in a separate cacheline. This means that
- * multiple users will tend to stick to different cachelines, at least
- * until the map is exhausted.
- */
-static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
- unsigned int *tag_cache, struct blk_mq_tags *tags)
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
{
- unsigned int last_tag, org_last_tag;
- int index, i, tag;
-
if (!hctx_may_queue(hctx, bt))
return -1;
-
- last_tag = org_last_tag = *tag_cache;
- index = TAG_TO_INDEX(bt, last_tag);
-
- for (i = 0; i < bt->map_nr; i++) {
- tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag),
- BT_ALLOC_RR(tags));
- if (tag != -1) {
- tag += (index << bt->bits_per_word);
- goto done;
- }
-
- /*
- * Jump to next index, and reset the last tag to be the
- * first tag of that index
- */
- index++;
- last_tag = (index << bt->bits_per_word);
-
- if (index >= bt->map_nr) {
- index = 0;
- last_tag = 0;
- }
- }
-
- *tag_cache = 0;
- return -1;
-
- /*
- * Only update the cache from the allocation path, if we ended
- * up using the specific cached tag.
- */
-done:
- if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) {
- last_tag = tag + 1;
- if (last_tag >= bt->depth - 1)
- last_tag = 0;
-
- *tag_cache = last_tag;
- }
-
- return tag;
+ return __sbitmap_queue_get(bt);
}
-static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
- struct blk_mq_hw_ctx *hctx)
+static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
+ struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags)
{
- struct bt_wait_state *bs;
- int wait_index;
-
- if (!hctx)
- return &bt->bs[0];
-
- wait_index = atomic_read(&hctx->wait_index);
- bs = &bt->bs[wait_index];
- bt_index_atomic_inc(&hctx->wait_index);
- return bs;
-}
-
-static int bt_get(struct blk_mq_alloc_data *data,
- struct blk_mq_bitmap_tags *bt,
- struct blk_mq_hw_ctx *hctx,
- unsigned int *last_tag, struct blk_mq_tags *tags)
-{
- struct bt_wait_state *bs;
+ struct sbq_wait_state *ws;
DEFINE_WAIT(wait);
int tag;
- tag = __bt_get(hctx, bt, last_tag, tags);
+ tag = __bt_get(hctx, bt);
if (tag != -1)
return tag;
if (data->flags & BLK_MQ_REQ_NOWAIT)
return -1;
- bs = bt_wait_ptr(bt, hctx);
+ ws = bt_wait_ptr(bt, hctx);
do {
- prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
- tag = __bt_get(hctx, bt, last_tag, tags);
+ tag = __bt_get(hctx, bt);
if (tag != -1)
break;
@@ -292,7 +132,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
* Retry tag allocation after running the hardware queue,
* as running the queue may also have found completions.
*/
- tag = __bt_get(hctx, bt, last_tag, tags);
+ tag = __bt_get(hctx, bt);
if (tag != -1)
break;
@@ -306,15 +146,14 @@ static int bt_get(struct blk_mq_alloc_data *data,
if (data->flags & BLK_MQ_REQ_RESERVED) {
bt = &data->hctx->tags->breserved_tags;
} else {
- last_tag = &data->ctx->last_tag;
hctx = data->hctx;
bt = &hctx->tags->bitmap_tags;
}
- finish_wait(&bs->wait, &wait);
- bs = bt_wait_ptr(bt, hctx);
+ finish_wait(&ws->wait, &wait);
+ ws = bt_wait_ptr(bt, hctx);
} while (1);
- finish_wait(&bs->wait, &wait);
+ finish_wait(&ws->wait, &wait);
return tag;
}
@@ -323,7 +162,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
int tag;
tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
- &data->ctx->last_tag, data->hctx->tags);
+ data->hctx->tags);
if (tag >= 0)
return tag + data->hctx->tags->nr_reserved_tags;
@@ -332,15 +171,15 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
{
- int tag, zero = 0;
+ int tag;
if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
return BLK_MQ_TAG_FAIL;
}
- tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero,
- data->hctx->tags);
+ tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
+ data->hctx->tags);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
@@ -354,55 +193,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
return __blk_mq_get_tag(data);
}
-static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
-{
- int i, wake_index;
-
- wake_index = atomic_read(&bt->wake_index);
- for (i = 0; i < BT_WAIT_QUEUES; i++) {
- struct bt_wait_state *bs = &bt->bs[wake_index];
-
- if (waitqueue_active(&bs->wait)) {
- int o = atomic_read(&bt->wake_index);
- if (wake_index != o)
- atomic_cmpxchg(&bt->wake_index, o, wake_index);
-
- return bs;
- }
-
- wake_index = bt_index_inc(wake_index);
- }
-
- return NULL;
-}
-
-static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
-{
- const int index = TAG_TO_INDEX(bt, tag);
- struct bt_wait_state *bs;
- int wait_cnt;
-
- clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
-
- /* Ensure that the wait list checks occur after clear_bit(). */
- smp_mb();
-
- bs = bt_wake_ptr(bt);
- if (!bs)
- return;
-
- wait_cnt = atomic_dec_return(&bs->wait_cnt);
- if (unlikely(wait_cnt < 0))
- wait_cnt = atomic_inc_return(&bs->wait_cnt);
- if (wait_cnt == 0) {
- atomic_add(bt->wake_cnt, &bs->wait_cnt);
- bt_index_atomic_inc(&bt->wake_index);
- wake_up(&bs->wait);
- }
-}
-
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
- unsigned int *last_tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ unsigned int tag)
{
struct blk_mq_tags *tags = hctx->tags;
@@ -410,67 +202,92 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags);
- bt_clear_tag(&tags->bitmap_tags, real_tag);
- if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
- *last_tag = real_tag;
+ sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
- bt_clear_tag(&tags->breserved_tags, tag);
+ sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
}
}
-static void bt_for_each(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_bitmap_tags *bt, unsigned int off,
- busy_iter_fn *fn, void *data, bool reserved)
+struct bt_iter_data {
+ struct blk_mq_hw_ctx *hctx;
+ busy_iter_fn *fn;
+ void *data;
+ bool reserved;
+};
+
+static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
+ struct bt_iter_data *iter_data = data;
+ struct blk_mq_hw_ctx *hctx = iter_data->hctx;
+ struct blk_mq_tags *tags = hctx->tags;
+ bool reserved = iter_data->reserved;
struct request *rq;
- int bit, i;
- for (i = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
+ if (!reserved)
+ bitnr += tags->nr_reserved_tags;
+ rq = tags->rqs[bitnr];
- for (bit = find_first_bit(&bm->word, bm->depth);
- bit < bm->depth;
- bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
- rq = hctx->tags->rqs[off + bit];
- if (rq->q == hctx->queue)
- fn(hctx, rq, data, reserved);
- }
+ if (rq->q == hctx->queue)
+ iter_data->fn(hctx, rq, iter_data->data, reserved);
+ return true;
+}
- off += (1 << bt->bits_per_word);
- }
+static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
+ busy_iter_fn *fn, void *data, bool reserved)
+{
+ struct bt_iter_data iter_data = {
+ .hctx = hctx,
+ .fn = fn,
+ .data = data,
+ .reserved = reserved,
+ };
+
+ sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}
-static void bt_tags_for_each(struct blk_mq_tags *tags,
- struct blk_mq_bitmap_tags *bt, unsigned int off,
- busy_tag_iter_fn *fn, void *data, bool reserved)
+struct bt_tags_iter_data {
+ struct blk_mq_tags *tags;
+ busy_tag_iter_fn *fn;
+ void *data;
+ bool reserved;
+};
+
+static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
+ struct bt_tags_iter_data *iter_data = data;
+ struct blk_mq_tags *tags = iter_data->tags;
+ bool reserved = iter_data->reserved;
struct request *rq;
- int bit, i;
- if (!tags->rqs)
- return;
- for (i = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
-
- for (bit = find_first_bit(&bm->word, bm->depth);
- bit < bm->depth;
- bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
- rq = tags->rqs[off + bit];
- fn(rq, data, reserved);
- }
+ if (!reserved)
+ bitnr += tags->nr_reserved_tags;
+ rq = tags->rqs[bitnr];
- off += (1 << bt->bits_per_word);
- }
+ iter_data->fn(rq, iter_data->data, reserved);
+ return true;
+}
+
+static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
+ busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+ struct bt_tags_iter_data iter_data = {
+ .tags = tags,
+ .fn = fn,
+ .data = data,
+ .reserved = reserved,
+ };
+
+ if (tags->rqs)
+ sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}
static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
busy_tag_iter_fn *fn, void *priv)
{
if (tags->nr_reserved_tags)
- bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
- bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
- false);
+ bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
+ bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
}
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
@@ -529,124 +346,40 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
continue;
if (tags->nr_reserved_tags)
- bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
- bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
- false);
+ bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
+ bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
}
}
-static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
+static unsigned int bt_unused_tags(const struct sbitmap_queue *bt)
{
- unsigned int i, used;
-
- for (i = 0, used = 0; i < bt->map_nr; i++) {
- struct blk_align_bitmap *bm = &bt->map[i];
-
- used += bitmap_weight(&bm->word, bm->depth);
- }
-
- return bt->depth - used;
+ return bt->sb.depth - sbitmap_weight(&bt->sb);
}
-static void bt_update_count(struct blk_mq_bitmap_tags *bt,
- unsigned int depth)
+static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
+ bool round_robin, int node)
{
- unsigned int tags_per_word = 1U << bt->bits_per_word;
- unsigned int map_depth = depth;
-
- if (depth) {
- int i;
-
- for (i = 0; i < bt->map_nr; i++) {
- bt->map[i].depth = min(map_depth, tags_per_word);
- map_depth -= bt->map[i].depth;
- }
- }
-
- bt->wake_cnt = BT_WAIT_BATCH;
- if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
- bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);
-
- bt->depth = depth;
-}
-
-static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
- int node, bool reserved)
-{
- int i;
-
- bt->bits_per_word = ilog2(BITS_PER_LONG);
-
- /*
- * Depth can be zero for reserved tags, that's not a failure
- * condition.
- */
- if (depth) {
- unsigned int nr, tags_per_word;
-
- tags_per_word = (1 << bt->bits_per_word);
-
- /*
- * If the tag space is small, shrink the number of tags
- * per word so we spread over a few cachelines, at least.
- * If less than 4 tags, just forget about it, it's not
- * going to work optimally anyway.
- */
- if (depth >= 4) {
- while (tags_per_word * 4 > depth) {
- bt->bits_per_word--;
- tags_per_word = (1 << bt->bits_per_word);
- }
- }
-
- nr = ALIGN(depth, tags_per_word) / tags_per_word;
- bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
- GFP_KERNEL, node);
- if (!bt->map)
- return -ENOMEM;
-
- bt->map_nr = nr;
- }
-
- bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
- if (!bt->bs) {
- kfree(bt->map);
- bt->map = NULL;
- return -ENOMEM;
- }
-
- bt_update_count(bt, depth);
-
- for (i = 0; i < BT_WAIT_QUEUES; i++) {
- init_waitqueue_head(&bt->bs[i].wait);
- atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
- }
-
- return 0;
-}
-
-static void bt_free(struct blk_mq_bitmap_tags *bt)
-{
- kfree(bt->map);
- kfree(bt->bs);
+ return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
+ node);
}
static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+ bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
- tags->alloc_policy = alloc_policy;
-
- if (bt_alloc(&tags->bitmap_tags, depth, node, false))
- goto enomem;
- if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
- goto enomem;
+ if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
+ goto free_tags;
+ if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
+ node))
+ goto free_bitmap_tags;
return tags;
-enomem:
- bt_free(&tags->bitmap_tags);
+free_bitmap_tags:
+ sbitmap_queue_free(&tags->bitmap_tags);
+free_tags:
kfree(tags);
return NULL;
}
@@ -679,19 +412,12 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
- bt_free(&tags->bitmap_tags);
- bt_free(&tags->breserved_tags);
+ sbitmap_queue_free(&tags->bitmap_tags);
+ sbitmap_queue_free(&tags->breserved_tags);
free_cpumask_var(tags->cpumask);
kfree(tags);
}
-void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
-{
- unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
-
- *tag = prandom_u32() % depth;
-}
-
int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
{
tdepth -= tags->nr_reserved_tags;
@@ -702,7 +428,8 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
* Don't need (or can't) update reserved tags here, they remain
* static and should never need resizing.
*/
- bt_update_count(&tags->bitmap_tags, tdepth);
+ sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+
blk_mq_tag_wakeup_all(tags, false);
return 0;
}
@@ -746,7 +473,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
"bits_per_word=%u\n",
tags->nr_tags, tags->nr_reserved_tags,
- tags->bitmap_tags.bits_per_word);
+ 1U << tags->bitmap_tags.sb.shift);
free = bt_unused_tags(&tags->bitmap_tags);
res = bt_unused_tags(&tags->breserved_tags);
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d468a79f2c4a..09f4cc0aaa84 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -3,31 +3,6 @@
#include "blk-mq.h"
-enum {
- BT_WAIT_QUEUES = 8,
- BT_WAIT_BATCH = 8,
-};
-
-struct bt_wait_state {
- atomic_t wait_cnt;
- wait_queue_head_t wait;
-} ____cacheline_aligned_in_smp;
-
-#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
-#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
-
-struct blk_mq_bitmap_tags {
- unsigned int depth;
- unsigned int wake_cnt;
- unsigned int bits_per_word;
-
- unsigned int map_nr;
- struct blk_align_bitmap *map;
-
- atomic_t wake_index;
- struct bt_wait_state *bs;
-};
-
/*
* Tag address space map.
*/
@@ -37,13 +12,12 @@ struct blk_mq_tags {
atomic_t active_queues;
- struct blk_mq_bitmap_tags bitmap_tags;
- struct blk_mq_bitmap_tags breserved_tags;
+ struct sbitmap_queue bitmap_tags;
+ struct sbitmap_queue breserved_tags;
struct request **rqs;
struct list_head page_list;
- int alloc_policy;
cpumask_var_t cpumask;
};
@@ -52,15 +26,23 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ unsigned int tag);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
-extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
+static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
+ struct blk_mq_hw_ctx *hctx)
+{
+ if (!hctx)
+ return &bt->ws[0];
+ return sbq_wait_ptr(bt, &hctx->wait_index);
+}
+
enum {
BLK_MQ_TAG_CACHE_MIN = 1,
BLK_MQ_TAG_CACHE_MAX = 64,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c207fa9870eb..dc5f47f60931 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -22,6 +22,7 @@
#include <linux/sched/sysctl.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
+#include <linux/prefetch.h>
#include <trace/events/block.h>
@@ -33,49 +34,28 @@
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
-
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
- unsigned int i;
-
- for (i = 0; i < hctx->ctx_map.size; i++)
- if (hctx->ctx_map.map[i].word)
- return true;
-
- return false;
-}
-
-static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx)
-{
- return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
+ return sbitmap_any_bit_set(&hctx->ctx_map);
}
-#define CTX_TO_BIT(hctx, ctx) \
- ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
-
/*
* Mark this ctx as having pending work in this hardware queue
*/
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
- struct blk_align_bitmap *bm = get_bm(hctx, ctx);
-
- if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
- set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+ if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
+ sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
}
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
- struct blk_align_bitmap *bm = get_bm(hctx, ctx);
-
- clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
+ sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
}
void blk_mq_freeze_queue_start(struct request_queue *q)
@@ -246,19 +226,9 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
ctx = blk_mq_get_ctx(q);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-
rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
- if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
- __blk_mq_run_hw_queue(hctx);
- blk_mq_put_ctx(ctx);
-
- ctx = blk_mq_get_ctx(q);
- hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, rw, 0);
- ctx = alloc_data.ctx;
- }
blk_mq_put_ctx(ctx);
+
if (!rq) {
blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
@@ -333,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
rq->cmd_flags = 0;
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- blk_mq_put_tag(hctx, tag, &ctx->last_tag);
+ blk_mq_put_tag(hctx, ctx, tag);
blk_queue_exit(q);
}
@@ -513,7 +483,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request);
static void blk_mq_requeue_work(struct work_struct *work)
{
struct request_queue *q =
- container_of(work, struct request_queue, requeue_work);
+ container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
struct request *rq, *next;
unsigned long flags;
@@ -568,16 +538,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
void blk_mq_cancel_requeue_work(struct request_queue *q)
{
- cancel_work_sync(&q->requeue_work);
+ cancel_delayed_work_sync(&q->requeue_work);
}
EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work);
void blk_mq_kick_requeue_list(struct request_queue *q)
{
- kblockd_schedule_work(&q->requeue_work);
+ kblockd_schedule_delayed_work(&q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
+void blk_mq_delay_kick_requeue_list(struct request_queue *q,
+ unsigned long msecs)
+{
+ kblockd_schedule_delayed_work(&q->requeue_work,
+ msecs_to_jiffies(msecs));
+}
+EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
+
void blk_mq_abort_requeue_list(struct request_queue *q)
{
unsigned long flags;
@@ -600,8 +578,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list);
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
- if (tag < tags->nr_tags)
+ if (tag < tags->nr_tags) {
+ prefetch(tags->rqs[tag]);
return tags->rqs[tag];
+ }
return NULL;
}
@@ -756,38 +736,44 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
return false;
}
+struct flush_busy_ctx_data {
+ struct blk_mq_hw_ctx *hctx;
+ struct list_head *list;
+};
+
+static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
+{
+ struct flush_busy_ctx_data *flush_data = data;
+ struct blk_mq_hw_ctx *hctx = flush_data->hctx;
+ struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+
+ sbitmap_clear_bit(sb, bitnr);
+ spin_lock(&ctx->lock);
+ list_splice_tail_init(&ctx->rq_list, flush_data->list);
+ spin_unlock(&ctx->lock);
+ return true;
+}
+
/*
* Process software queues that have been marked busy, splicing them
* to the for-dispatch
*/
static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
- struct blk_mq_ctx *ctx;
- int i;
-
- for (i = 0; i < hctx->ctx_map.size; i++) {
- struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
- unsigned int off, bit;
-
- if (!bm->word)
- continue;
+ struct flush_busy_ctx_data data = {
+ .hctx = hctx,
+ .list = list,
+ };
- bit = 0;
- off = i * hctx->ctx_map.bits_per_word;
- do {
- bit = find_next_bit(&bm->word, bm->depth, bit);
- if (bit >= bm->depth)
- break;
+ sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
+}
- ctx = hctx->ctxs[bit + off];
- clear_bit(bit, &bm->word);
- spin_lock(&ctx->lock);
- list_splice_tail_init(&ctx->rq_list, list);
- spin_unlock(&ctx->lock);
+static inline unsigned int queued_to_index(unsigned int queued)
+{
+ if (!queued)
+ return 0;
- bit++;
- } while (1);
- }
+ return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
/*
@@ -878,10 +864,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
dptr = &driver_list;
}
- if (!queued)
- hctx->dispatched[0]++;
- else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
- hctx->dispatched[ilog2(queued) + 1]++;
+ hctx->dispatched[queued_to_index(queued)]++;
/*
* Any items that need requeuing? Stuff them into hctx->dispatch,
@@ -937,7 +920,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
!blk_mq_hw_queue_mapped(hctx)))
return;
- if (!async) {
+ if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
int cpu = get_cpu();
if (cpumask_test_cpu(cpu, hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
@@ -948,8 +931,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
put_cpu();
}
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->run_work, 0);
+ kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
}
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
@@ -970,7 +952,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- cancel_delayed_work(&hctx->run_work);
+ cancel_work(&hctx->run_work);
cancel_delayed_work(&hctx->delay_work);
set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
@@ -1023,7 +1005,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
{
struct blk_mq_hw_ctx *hctx;
- hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
+ hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
__blk_mq_run_hw_queue(hctx);
}
@@ -1240,20 +1222,8 @@ static struct request *blk_mq_map_request(struct request_queue *q,
op_flags |= REQ_SYNC;
trace_block_getrq(q, bio, op);
- blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
+ blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
- if (unlikely(!rq)) {
- __blk_mq_run_hw_queue(hctx);
- blk_mq_put_ctx(ctx);
- trace_block_sleeprq(q, bio, op);
-
- ctx = blk_mq_get_ctx(q);
- hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, op, op_flags);
- ctx = alloc_data.ctx;
- hctx = alloc_data.hctx;
- }
hctx->queued++;
data->hctx = hctx;
@@ -1606,32 +1576,6 @@ fail:
return NULL;
}
-static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
-{
- kfree(bitmap->map);
-}
-
-static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
-{
- unsigned int bpw = 8, total, num_maps, i;
-
- bitmap->bits_per_word = bpw;
-
- num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
- bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
- GFP_KERNEL, node);
- if (!bitmap->map)
- return -ENOMEM;
-
- total = nr_cpu_ids;
- for (i = 0; i < num_maps; i++) {
- bitmap->map[i].depth = min(total, bitmap->bits_per_word);
- total -= bitmap->map[i].depth;
- }
-
- return 0;
-}
-
/*
* 'cpu' is going away. splice any existing rq_list entries from this
* software queue to the hw queue dispatch list, and ensure that it
@@ -1697,7 +1641,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
blk_free_flush_queue(hctx->fq);
- blk_mq_free_bitmap(&hctx->ctx_map);
+ sbitmap_free(&hctx->ctx_map);
}
static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -1734,7 +1678,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (node == NUMA_NO_NODE)
node = hctx->numa_node = set->numa_node;
- INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
+ INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
@@ -1757,7 +1701,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (!hctx->ctxs)
goto unregister_cpu_notifier;
- if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
+ if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
+ node))
goto free_ctxs;
hctx->nr_ctx = 0;
@@ -1784,7 +1729,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
free_bitmap:
- blk_mq_free_bitmap(&hctx->ctx_map);
+ sbitmap_free(&hctx->ctx_map);
free_ctxs:
kfree(hctx->ctxs);
unregister_cpu_notifier:
@@ -1860,8 +1805,6 @@ static void blk_mq_map_swqueue(struct request_queue *q,
mutex_unlock(&q->sysfs_lock);
queue_for_each_hw_ctx(q, hctx, i) {
- struct blk_mq_ctxmap *map = &hctx->ctx_map;
-
/*
* If no software queues are mapped to this hardware queue,
* disable it and free the request entries.
@@ -1887,7 +1830,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
* This is more accurate and more efficient than looping
* over all possibly mapped software queues.
*/
- map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word);
+ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/*
* Initialize batch roundrobin counts
@@ -2094,7 +2037,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->sg_reserved_size = INT_MAX;
- INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
+ INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9087b11037b7..9b15d2ef7f7b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -12,8 +12,6 @@ struct blk_mq_ctx {
unsigned int cpu;
unsigned int index_hw;
- unsigned int last_tag ____cacheline_aligned_in_smp;
-
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
@@ -63,15 +61,6 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q);
-/*
- * Basic implementation of sparser bitmap, allowing the user to spread
- * the bits over more cachelines.
- */
-struct blk_align_bitmap {
- unsigned long word;
- unsigned long depth;
-} ____cacheline_aligned_in_smp;
-
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f87a7e747d36..9cc8d7c5439a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -704,7 +704,7 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->kobj, KOBJ_ADD);
if (q->mq_ops)
- blk_mq_register_disk(disk);
+ blk_mq_register_dev(dev, q);
if (!q->request_fn)
return 0;
@@ -729,7 +729,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
if (q->mq_ops)
- blk_mq_unregister_disk(disk);
+ blk_mq_unregister_dev(disk_to_dev(disk), q);
if (q->request_fn)
elv_unregister_queue(q);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index cc2f6dbd4303..5e24d880306c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
if (ktime_get_ns() < rq->fifo_time)
rq = NULL;
- cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
return rq;
}
@@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
unsigned int max_dispatch;
+ if (cfq_cfqq_must_dispatch(cfqq))
+ return true;
+
/*
* Drain async requests before we start sync IO
*/
@@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
+ rq = cfq_check_fifo(cfqq);
+ if (rq)
+ cfq_mark_cfqq_must_dispatch(cfqq);
+
if (!cfq_may_dispatch(cfqd, cfqq))
return false;
/*
* follow expired path, else get first next available
*/
- rq = cfq_check_fifo(cfqq);
if (!rq)
rq = cfqq->next_rq;
+ else
+ cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
/*
* insert request into driver dispatch list
@@ -3989,7 +3996,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
* if the new request is sync, but the currently running queue is
* not, let the sync request have priority.
*/
- if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
+ if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
return true;
/*
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 2aca98e8e427..88c46853dbb5 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3686,7 +3686,7 @@ static int mtip_block_open(struct block_device *dev, fmode_t mode)
return -ENODEV;
}
-void mtip_block_release(struct gendisk *disk, fmode_t mode)
+static void mtip_block_release(struct gendisk *disk, fmode_t mode)
{
}
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a9e398019f38..ccfcfc11399a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -34,33 +34,29 @@
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>
+#include <linux/blk-mq.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <linux/nbd.h>
+#define NBD_TIMEDOUT 0
+#define NBD_DISCONNECT_REQUESTED 1
+
struct nbd_device {
u32 flags;
+ unsigned long runtime_flags;
struct socket * sock; /* If == NULL, device is not ready, yet */
int magic;
- spinlock_t queue_lock;
- struct list_head queue_head; /* Requests waiting result */
- struct request *active_req;
- wait_queue_head_t active_wq;
- struct list_head waiting_queue; /* Requests to be sent */
- wait_queue_head_t waiting_wq;
+ struct blk_mq_tag_set tag_set;
struct mutex tx_lock;
struct gendisk *disk;
int blksize;
loff_t bytesize;
- int xmit_timeout;
- bool timedout;
- bool disconnect; /* a disconnect has been requested by user */
- struct timer_list timeout_timer;
/* protects initialization and shutdown of the socket */
spinlock_t sock_lock;
struct task_struct *task_recv;
@@ -71,6 +67,11 @@ struct nbd_device {
#endif
};
+struct nbd_cmd {
+ struct nbd_device *nbd;
+ struct list_head list;
+};
+
#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif
@@ -83,18 +84,6 @@ static unsigned int nbds_max = 16;
static struct nbd_device *nbd_dev;
static int max_part;
-/*
- * Use just one lock (or at most 1 per NIC). Two arguments for this:
- * 1. Each NIC is essentially a synchronization point for all servers
- * accessed through that NIC so there's no need to have more locks
- * than NICs anyway.
- * 2. More locks lead to more "Dirty cache line bouncing" which will slow
- * down each lock to the point where they're actually slower than just
- * a single lock.
- * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
- */
-static DEFINE_SPINLOCK(nbd_lock);
-
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
return disk_to_dev(nbd->disk);
@@ -153,18 +142,16 @@ static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
return 0;
}
-static void nbd_end_request(struct nbd_device *nbd, struct request *req)
+static void nbd_end_request(struct nbd_cmd *cmd)
{
+ struct nbd_device *nbd = cmd->nbd;
+ struct request *req = blk_mq_rq_from_pdu(cmd);
int error = req->errors ? -EIO : 0;
- struct request_queue *q = req->q;
- unsigned long flags;
- dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
+ dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
error ? "failed" : "done");
- spin_lock_irqsave(q->queue_lock, flags);
- __blk_end_request_all(req, error);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ blk_mq_complete_request(req, error);
}
/*
@@ -172,40 +159,49 @@ static void nbd_end_request(struct nbd_device *nbd, struct request *req)
*/
static void sock_shutdown(struct nbd_device *nbd)
{
- spin_lock_irq(&nbd->sock_lock);
+ struct socket *sock;
+
+ spin_lock(&nbd->sock_lock);
if (!nbd->sock) {
spin_unlock_irq(&nbd->sock_lock);
return;
}
+ sock = nbd->sock;
dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
- kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
- sockfd_put(nbd->sock);
nbd->sock = NULL;
- spin_unlock_irq(&nbd->sock_lock);
+ spin_unlock(&nbd->sock_lock);
- del_timer(&nbd->timeout_timer);
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sockfd_put(sock);
}
-static void nbd_xmit_timeout(unsigned long arg)
+static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
+ bool reserved)
{
- struct nbd_device *nbd = (struct nbd_device *)arg;
- unsigned long flags;
-
- if (list_empty(&nbd->queue_head))
- return;
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+ struct nbd_device *nbd = cmd->nbd;
+ struct socket *sock = NULL;
- spin_lock_irqsave(&nbd->sock_lock, flags);
+ spin_lock(&nbd->sock_lock);
- nbd->timedout = true;
+ set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
- if (nbd->sock)
- kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+ if (nbd->sock) {
+ sock = nbd->sock;
+ get_file(sock->file);
+ }
- spin_unlock_irqrestore(&nbd->sock_lock, flags);
+ spin_unlock(&nbd->sock_lock);
+ if (sock) {
+ kernel_sock_shutdown(sock, SHUT_RDWR);
+ sockfd_put(sock);
+ }
+ req->errors++;
dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
+ return BLK_EH_HANDLED;
}
/*
@@ -255,9 +251,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
tsk_restore_flags(current, pflags, PF_MEMALLOC);
- if (!send && nbd->xmit_timeout)
- mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
-
return result;
}
@@ -273,8 +266,9 @@ static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
}
/* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *nbd, struct request *req)
+static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd)
{
+ struct request *req = blk_mq_rq_from_pdu(cmd);
int result, flags;
struct nbd_request request;
unsigned long size = blk_rq_bytes(req);
@@ -298,10 +292,10 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
request.len = htonl(size);
}
- memcpy(request.handle, &req, sizeof(req));
+ memcpy(request.handle, &req->tag, sizeof(req->tag));
dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
- req, nbdcmd_to_ascii(type),
+ cmd, nbdcmd_to_ascii(type),
(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
result = sock_xmit(nbd, 1, &request, sizeof(request),
(type == NBD_CMD_WRITE) ? MSG_MORE : 0);
@@ -323,7 +317,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
if (!rq_iter_last(bvec, iter))
flags = MSG_MORE;
dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
- req, bvec.bv_len);
+ cmd, bvec.bv_len);
result = sock_send_bvec(nbd, &bvec, flags);
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk),
@@ -336,29 +330,6 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
return 0;
}
-static struct request *nbd_find_request(struct nbd_device *nbd,
- struct request *xreq)
-{
- struct request *req, *tmp;
- int err;
-
- err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
- if (unlikely(err))
- return ERR_PTR(err);
-
- spin_lock(&nbd->queue_lock);
- list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
- if (req != xreq)
- continue;
- list_del_init(&req->queuelist);
- spin_unlock(&nbd->queue_lock);
- return req;
- }
- spin_unlock(&nbd->queue_lock);
-
- return ERR_PTR(-ENOENT);
-}
-
static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
{
int result;
@@ -370,11 +341,14 @@ static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
}
/* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *nbd)
+static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd)
{
int result;
struct nbd_reply reply;
- struct request *req;
+ struct nbd_cmd *cmd;
+ struct request *req = NULL;
+ u16 hwq;
+ int tag;
reply.magic = 0;
result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
@@ -390,25 +364,27 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
return ERR_PTR(-EPROTO);
}
- req = nbd_find_request(nbd, *(struct request **)reply.handle);
- if (IS_ERR(req)) {
- result = PTR_ERR(req);
- if (result != -ENOENT)
- return ERR_PTR(result);
+ memcpy(&tag, reply.handle, sizeof(int));
- dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
- reply.handle);
- return ERR_PTR(-EBADR);
+ hwq = blk_mq_unique_tag_to_hwq(tag);
+ if (hwq < nbd->tag_set.nr_hw_queues)
+ req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
+ blk_mq_unique_tag_to_tag(tag));
+ if (!req || !blk_mq_request_started(req)) {
+ dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
+ tag, req);
+ return ERR_PTR(-ENOENT);
}
+ cmd = blk_mq_rq_to_pdu(req);
if (ntohl(reply.error)) {
dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
ntohl(reply.error));
req->errors++;
- return req;
+ return cmd;
}
- dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
+ dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd);
if (rq_data_dir(req) != WRITE) {
struct req_iterator iter;
struct bio_vec bvec;
@@ -419,13 +395,13 @@ static struct request *nbd_read_stat(struct nbd_device *nbd)
dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
result);
req->errors++;
- return req;
+ return cmd;
}
dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
- req, bvec.bv_len);
+ cmd, bvec.bv_len);
}
}
- return req;
+ return cmd;
}
static ssize_t pid_show(struct device *dev,
@@ -444,7 +420,7 @@ static struct device_attribute pid_attr = {
static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
{
- struct request *req;
+ struct nbd_cmd *cmd;
int ret;
BUG_ON(nbd->magic != NBD_MAGIC);
@@ -460,13 +436,13 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
nbd_size_update(nbd, bdev);
while (1) {
- req = nbd_read_stat(nbd);
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
+ cmd = nbd_read_stat(nbd);
+ if (IS_ERR(cmd)) {
+ ret = PTR_ERR(cmd);
break;
}
- nbd_end_request(nbd, req);
+ nbd_end_request(cmd);
}
nbd_size_clear(nbd, bdev);
@@ -475,44 +451,37 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
return ret;
}
-static void nbd_clear_que(struct nbd_device *nbd)
+static void nbd_clear_req(struct request *req, void *data, bool reserved)
{
- struct request *req;
+ struct nbd_cmd *cmd;
+
+ if (!blk_mq_request_started(req))
+ return;
+ cmd = blk_mq_rq_to_pdu(req);
+ req->errors++;
+ nbd_end_request(cmd);
+}
+static void nbd_clear_que(struct nbd_device *nbd)
+{
BUG_ON(nbd->magic != NBD_MAGIC);
/*
* Because we have set nbd->sock to NULL under the tx_lock, all
- * modifications to the list must have completed by now. For
- * the same reason, the active_req must be NULL.
- *
- * As a consequence, we don't need to take the spin lock while
- * purging the list here.
+ * modifications to the list must have completed by now.
*/
BUG_ON(nbd->sock);
- BUG_ON(nbd->active_req);
- while (!list_empty(&nbd->queue_head)) {
- req = list_entry(nbd->queue_head.next, struct request,
- queuelist);
- list_del_init(&req->queuelist);
- req->errors++;
- nbd_end_request(nbd, req);
- }
-
- while (!list_empty(&nbd->waiting_queue)) {
- req = list_entry(nbd->waiting_queue.next, struct request,
- queuelist);
- list_del_init(&req->queuelist);
- req->errors++;
- nbd_end_request(nbd, req);
- }
+ blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}
-static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
+static void nbd_handle_cmd(struct nbd_cmd *cmd)
{
+ struct request *req = blk_mq_rq_from_pdu(cmd);
+ struct nbd_device *nbd = cmd->nbd;
+
if (req->cmd_type != REQ_TYPE_FS)
goto error_out;
@@ -526,6 +495,7 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
req->errors = 0;
mutex_lock(&nbd->tx_lock);
+ nbd->task_send = current;
if (unlikely(!nbd->sock)) {
mutex_unlock(&nbd->tx_lock);
dev_err(disk_to_dev(nbd->disk),
@@ -533,106 +503,30 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
goto error_out;
}
- nbd->active_req = req;
-
- if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
- mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);
-
- if (nbd_send_req(nbd, req) != 0) {
+ if (nbd_send_cmd(nbd, cmd) != 0) {
dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
req->errors++;
- nbd_end_request(nbd, req);
- } else {
- spin_lock(&nbd->queue_lock);
- list_add_tail(&req->queuelist, &nbd->queue_head);
- spin_unlock(&nbd->queue_lock);
+ nbd_end_request(cmd);
}
- nbd->active_req = NULL;
+ nbd->task_send = NULL;
mutex_unlock(&nbd->tx_lock);
- wake_up_all(&nbd->active_wq);
return;
error_out:
req->errors++;
- nbd_end_request(nbd, req);
+ nbd_end_request(cmd);
}
-static int nbd_thread_send(void *data)
+static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
{
- struct nbd_device *nbd = data;
- struct request *req;
-
- nbd->task_send = current;
-
- set_user_nice(current, MIN_NICE);
- while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
- /* wait for something to do */
- wait_event_interruptible(nbd->waiting_wq,
- kthread_should_stop() ||
- !list_empty(&nbd->waiting_queue));
-
- /* extract request */
- if (list_empty(&nbd->waiting_queue))
- continue;
-
- spin_lock_irq(&nbd->queue_lock);
- req = list_entry(nbd->waiting_queue.next, struct request,
- queuelist);
- list_del_init(&req->queuelist);
- spin_unlock_irq(&nbd->queue_lock);
-
- /* handle request */
- nbd_handle_req(nbd, req);
- }
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
- nbd->task_send = NULL;
-
- return 0;
-}
-
-/*
- * We always wait for result of write, for now. It would be nice to make it optional
- * in future
- * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
- * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
- */
-
-static void nbd_request_handler(struct request_queue *q)
- __releases(q->queue_lock) __acquires(q->queue_lock)
-{
- struct request *req;
-
- while ((req = blk_fetch_request(q)) != NULL) {
- struct nbd_device *nbd;
-
- spin_unlock_irq(q->queue_lock);
-
- nbd = req->rq_disk->private_data;
-
- BUG_ON(nbd->magic != NBD_MAGIC);
-
- dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
- req, req->cmd_type);
-
- if (unlikely(!nbd->sock)) {
- dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on closed socket\n");
- req->errors++;
- nbd_end_request(nbd, req);
- spin_lock_irq(q->queue_lock);
- continue;
- }
-
- spin_lock_irq(&nbd->queue_lock);
- list_add_tail(&req->queuelist, &nbd->waiting_queue);
- spin_unlock_irq(&nbd->queue_lock);
-
- wake_up(&nbd->waiting_wq);
-
- spin_lock_irq(q->queue_lock);
- }
+ blk_mq_start_request(bd->rq);
+ nbd_handle_cmd(cmd);
+ return BLK_MQ_RQ_QUEUE_OK;
}
static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
@@ -657,15 +551,13 @@ out:
/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
- nbd->disconnect = false;
- nbd->timedout = false;
+ nbd->runtime_flags = 0;
nbd->blksize = 1024;
nbd->bytesize = 0;
set_capacity(nbd->disk, 0);
nbd->flags = 0;
- nbd->xmit_timeout = 0;
+ nbd->tag_set.timeout = 0;
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
- del_timer_sync(&nbd->timeout_timer);
}
static void nbd_bdev_reset(struct block_device *bdev)
@@ -700,33 +592,37 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
{
switch (cmd) {
case NBD_DISCONNECT: {
- struct request sreq;
+ struct request *sreq;
dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
if (!nbd->sock)
return -EINVAL;
+ sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0);
+ if (!sreq)
+ return -ENOMEM;
+
mutex_unlock(&nbd->tx_lock);
fsync_bdev(bdev);
mutex_lock(&nbd->tx_lock);
- blk_rq_init(NULL, &sreq);
- sreq.cmd_type = REQ_TYPE_DRV_PRIV;
+ sreq->cmd_type = REQ_TYPE_DRV_PRIV;
/* Check again after getting mutex back. */
- if (!nbd->sock)
+ if (!nbd->sock) {
+ blk_mq_free_request(sreq);
return -EINVAL;
+ }
- nbd->disconnect = true;
+ set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags);
- nbd_send_req(nbd, &sreq);
+ nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq));
+ blk_mq_free_request(sreq);
return 0;
}
case NBD_CLEAR_SOCK:
sock_shutdown(nbd);
nbd_clear_que(nbd);
- BUG_ON(!list_empty(&nbd->queue_head));
- BUG_ON(!list_empty(&nbd->waiting_queue));
kill_bdev(bdev);
return 0;
@@ -758,13 +654,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return nbd_size_set(nbd, bdev, nbd->blksize, arg);
case NBD_SET_TIMEOUT:
- nbd->xmit_timeout = arg * HZ;
- if (arg)
- mod_timer(&nbd->timeout_timer,
- jiffies + nbd->xmit_timeout);
- else
- del_timer_sync(&nbd->timeout_timer);
-
+ nbd->tag_set.timeout = arg * HZ;
return 0;
case NBD_SET_FLAGS:
@@ -772,7 +662,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return 0;
case NBD_DO_IT: {
- struct task_struct *thread;
int error;
if (nbd->task_recv)
@@ -786,18 +675,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
nbd_parse_flags(nbd, bdev);
- thread = kthread_run(nbd_thread_send, nbd, "%s",
- nbd_name(nbd));
- if (IS_ERR(thread)) {
- mutex_lock(&nbd->tx_lock);
- nbd->task_recv = NULL;
- return PTR_ERR(thread);
- }
-
nbd_dev_dbg_init(nbd);
error = nbd_thread_recv(nbd, bdev);
nbd_dev_dbg_close(nbd);
- kthread_stop(thread);
mutex_lock(&nbd->tx_lock);
nbd->task_recv = NULL;
@@ -807,9 +687,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
kill_bdev(bdev);
nbd_bdev_reset(bdev);
- if (nbd->disconnect) /* user requested, ignore socket errors */
+ /* user requested, ignore socket errors */
+ if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
error = 0;
- if (nbd->timedout)
+ if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
error = -ETIMEDOUT;
nbd_reset(nbd);
@@ -825,10 +706,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return 0;
case NBD_PRINT_DEBUG:
- dev_info(disk_to_dev(nbd->disk),
- "next = %p, prev = %p, head = %p\n",
- nbd->queue_head.next, nbd->queue_head.prev,
- &nbd->queue_head);
+ /*
+ * For compatibility only, we no longer keep a list of
+ * outstanding requests.
+ */
return 0;
}
return -ENOTTY;
@@ -935,7 +816,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
- debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
+ debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
@@ -987,6 +868,24 @@ static void nbd_dbg_close(void)
#endif
+static int nbd_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ cmd->nbd = data;
+ INIT_LIST_HEAD(&cmd->list);
+ return 0;
+}
+
+static struct blk_mq_ops nbd_mq_ops = {
+ .queue_rq = nbd_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = nbd_init_request,
+ .timeout = nbd_xmit_timeout,
+};
+
/*
* And here should be modules and kernel interface
* (Just smiley confuses emacs :-)
@@ -1035,16 +934,34 @@ static int __init nbd_init(void)
if (!disk)
goto out;
nbd_dev[i].disk = disk;
+
+ nbd_dev[i].tag_set.ops = &nbd_mq_ops;
+ nbd_dev[i].tag_set.nr_hw_queues = 1;
+ nbd_dev[i].tag_set.queue_depth = 128;
+ nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE;
+ nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd);
+ nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
+ BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
+ nbd_dev[i].tag_set.driver_data = &nbd_dev[i];
+
+ err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set);
+ if (err) {
+ put_disk(disk);
+ goto out;
+ }
+
/*
* The new linux 2.5 block layer implementation requires
* every gendisk to have its very own request_queue struct.
* These structs are big so we dynamically allocate them.
*/
- disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
+ disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set);
if (!disk->queue) {
+ blk_mq_free_tag_set(&nbd_dev[i].tag_set);
put_disk(disk);
goto out;
}
+
/*
* Tell the block layer that we are not a rotational device
*/
@@ -1069,16 +986,8 @@ static int __init nbd_init(void)
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = nbd_dev[i].disk;
nbd_dev[i].magic = NBD_MAGIC;
- INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
- spin_lock_init(&nbd_dev[i].queue_lock);
spin_lock_init(&nbd_dev[i].sock_lock);
- INIT_LIST_HEAD(&nbd_dev[i].queue_head);
mutex_init(&nbd_dev[i].tx_lock);
- init_timer(&nbd_dev[i].timeout_timer);
- nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
- nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
- init_waitqueue_head(&nbd_dev[i].active_wq);
- init_waitqueue_head(&nbd_dev[i].waiting_wq);
disk->major = NBD_MAJOR;
disk->first_minor = i << part_shift;
disk->fops = &nbd_fops;
@@ -1091,6 +1000,7 @@ static int __init nbd_init(void)
return 0;
out:
while (i--) {
+ blk_mq_free_tag_set(&nbd_dev[i].tag_set);
blk_cleanup_queue(nbd_dev[i].disk->queue);
put_disk(nbd_dev[i].disk);
}
@@ -1110,6 +1020,7 @@ static void __exit nbd_cleanup(void)
if (disk) {
del_gendisk(disk);
blk_cleanup_queue(disk->queue);
+ blk_mq_free_tag_set(&nbd_dev[i].tag_set);
put_disk(disk);
}
}
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 75a7f88d6717..91e1de898daf 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -34,6 +34,7 @@ struct nullb {
unsigned int index;
struct request_queue *q;
struct gendisk *disk;
+ struct nvm_dev *ndev;
struct blk_mq_tag_set tag_set;
struct hrtimer timer;
unsigned int queue_depth;
@@ -414,23 +415,6 @@ static void cleanup_queues(struct nullb *nullb)
kfree(nullb->queues);
}
-static void null_del_dev(struct nullb *nullb)
-{
- list_del_init(&nullb->list);
-
- if (use_lightnvm)
- nvm_unregister(nullb->disk_name);
- else
- del_gendisk(nullb->disk);
- blk_cleanup_queue(nullb->q);
- if (queue_mode == NULL_Q_MQ)
- blk_mq_free_tag_set(&nullb->tag_set);
- if (!use_lightnvm)
- put_disk(nullb->disk);
- cleanup_queues(nullb);
- kfree(nullb);
-}
-
#ifdef CONFIG_NVM
static void null_lnvm_end_io(struct request *rq, int error)
@@ -564,10 +548,58 @@ static struct nvm_dev_ops null_lnvm_dev_ops = {
/* Simulate nvme protocol restriction */
.max_phys_sect = 64,
};
+
+static int null_nvm_register(struct nullb *nullb)
+{
+ struct nvm_dev *dev;
+ int rv;
+
+ dev = nvm_alloc_dev(0);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->q = nullb->q;
+ memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
+ dev->ops = &null_lnvm_dev_ops;
+
+ rv = nvm_register(dev);
+ if (rv) {
+ kfree(dev);
+ return rv;
+ }
+ nullb->ndev = dev;
+ return 0;
+}
+
+static void null_nvm_unregister(struct nullb *nullb)
+{
+ nvm_unregister(nullb->ndev);
+}
#else
-static struct nvm_dev_ops null_lnvm_dev_ops;
+static int null_nvm_register(struct nullb *nullb)
+{
+ return -EINVAL;
+}
+static void null_nvm_unregister(struct nullb *nullb) {}
#endif /* CONFIG_NVM */
+static void null_del_dev(struct nullb *nullb)
+{
+ list_del_init(&nullb->list);
+
+ if (use_lightnvm)
+ null_nvm_unregister(nullb);
+ else
+ del_gendisk(nullb->disk);
+ blk_cleanup_queue(nullb->q);
+ if (queue_mode == NULL_Q_MQ)
+ blk_mq_free_tag_set(&nullb->tag_set);
+ if (!use_lightnvm)
+ put_disk(nullb->disk);
+ cleanup_queues(nullb);
+ kfree(nullb);
+}
+
static int null_open(struct block_device *bdev, fmode_t mode)
{
return 0;
@@ -640,11 +672,32 @@ static int init_driver_queues(struct nullb *nullb)
return 0;
}
-static int null_add_dev(void)
+static int null_gendisk_register(struct nullb *nullb)
{
struct gendisk *disk;
- struct nullb *nullb;
sector_t size;
+
+ disk = nullb->disk = alloc_disk_node(1, home_node);
+ if (!disk)
+ return -ENOMEM;
+ size = gb * 1024 * 1024 * 1024ULL;
+ set_capacity(disk, size >> 9);
+
+ disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
+ disk->major = null_major;
+ disk->first_minor = nullb->index;
+ disk->fops = &null_fops;
+ disk->private_data = nullb;
+ disk->queue = nullb->q;
+ strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+
+ add_disk(disk);
+ return 0;
+}
+
+static int null_add_dev(void)
+{
+ struct nullb *nullb;
int rv;
nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
@@ -716,42 +769,19 @@ static int null_add_dev(void)
sprintf(nullb->disk_name, "nullb%d", nullb->index);
- if (use_lightnvm) {
- rv = nvm_register(nullb->q, nullb->disk_name,
- &null_lnvm_dev_ops);
- if (rv)
- goto out_cleanup_blk_queue;
- goto done;
- }
-
- disk = nullb->disk = alloc_disk_node(1, home_node);
- if (!disk) {
- rv = -ENOMEM;
- goto out_cleanup_lightnvm;
- }
- size = gb * 1024 * 1024 * 1024ULL;
- set_capacity(disk, size >> 9);
-
- disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
- disk->major = null_major;
- disk->first_minor = nullb->index;
- disk->fops = &null_fops;
- disk->private_data = nullb;
- disk->queue = nullb->q;
- strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+ if (use_lightnvm)
+ rv = null_nvm_register(nullb);
+ else
+ rv = null_gendisk_register(nullb);
- add_disk(disk);
+ if (rv)
+ goto out_cleanup_blk_queue;
-done:
mutex_lock(&lock);
list_add_tail(&nullb->list, &nullb_list);
mutex_unlock(&lock);
return 0;
-
-out_cleanup_lightnvm:
- if (use_lightnvm)
- nvm_unregister(nullb->disk_name);
out_cleanup_blk_queue:
blk_cleanup_queue(nullb->q);
out_cleanup_tags:
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 61c68a1f054a..2f5d5f4a4c75 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -4,7 +4,7 @@
menuconfig NVM
bool "Open-Channel SSD target support"
- depends on BLOCK
+ depends on BLOCK && HAS_DMA
help
Say Y here to get to enable Open-channel SSDs.
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index a7a0a22cf1a5..1f6b6521016a 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -2,6 +2,6 @@
# Makefile for Open-Channel SSDs.
#
-obj-$(CONFIG_NVM) := core.o sysblk.o
+obj-$(CONFIG_NVM) := core.o sysblk.o sysfs.o
obj-$(CONFIG_NVM_GENNVM) += gennvm.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index c784ddcd4405..1cac0f8bc0dc 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -27,6 +27,8 @@
#include <linux/lightnvm.h>
#include <linux/sched/sysctl.h>
+#include "lightnvm.h"
+
static LIST_HEAD(nvm_tgt_types);
static DECLARE_RWSEM(nvm_tgtt_lock);
static LIST_HEAD(nvm_mgrs);
@@ -581,6 +583,8 @@ static int nvm_core_init(struct nvm_dev *dev)
mutex_init(&dev->mlock);
spin_lock_init(&dev->lock);
+ blk_queue_logical_block_size(dev->q, dev->sec_size);
+
return 0;
err_fmtype:
kfree(dev->lun_map);
@@ -596,15 +600,19 @@ static void nvm_free_mgr(struct nvm_dev *dev)
dev->mt = NULL;
}
-static void nvm_free(struct nvm_dev *dev)
+void nvm_free(struct nvm_dev *dev)
{
if (!dev)
return;
nvm_free_mgr(dev);
+ if (dev->dma_pool)
+ dev->ops->destroy_dma_pool(dev->dma_pool);
+
kfree(dev->lptbl);
kfree(dev->lun_map);
+ kfree(dev);
}
static int nvm_init(struct nvm_dev *dev)
@@ -651,30 +659,19 @@ err:
static void nvm_exit(struct nvm_dev *dev)
{
- if (dev->dma_pool)
- dev->ops->destroy_dma_pool(dev->dma_pool);
- nvm_free(dev);
+ nvm_sysfs_unregister_dev(dev);
+}
- pr_info("nvm: successfully unloaded\n");
+struct nvm_dev *nvm_alloc_dev(int node)
+{
+ return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node);
}
+EXPORT_SYMBOL(nvm_alloc_dev);
-int nvm_register(struct request_queue *q, char *disk_name,
- struct nvm_dev_ops *ops)
+int nvm_register(struct nvm_dev *dev)
{
- struct nvm_dev *dev;
int ret;
- if (!ops->identity)
- return -EINVAL;
-
- dev = kzalloc(sizeof(struct nvm_dev), GFP_KERNEL);
- if (!dev)
- return -ENOMEM;
-
- dev->q = q;
- dev->ops = ops;
- strncpy(dev->name, disk_name, DISK_NAME_LEN);
-
ret = nvm_init(dev);
if (ret)
goto err_init;
@@ -694,6 +691,10 @@ int nvm_register(struct request_queue *q, char *disk_name,
}
}
+ ret = nvm_sysfs_register_dev(dev);
+ if (ret)
+ goto err_ppalist;
+
if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) {
ret = nvm_get_sysblock(dev, &dev->sb);
if (!ret)
@@ -710,31 +711,21 @@ int nvm_register(struct request_queue *q, char *disk_name,
up_write(&nvm_lock);
return 0;
+err_ppalist:
+ dev->ops->destroy_dma_pool(dev->dma_pool);
err_init:
kfree(dev->lun_map);
- kfree(dev);
return ret;
}
EXPORT_SYMBOL(nvm_register);
-void nvm_unregister(char *disk_name)
+void nvm_unregister(struct nvm_dev *dev)
{
- struct nvm_dev *dev;
-
down_write(&nvm_lock);
- dev = nvm_find_nvm_dev(disk_name);
- if (!dev) {
- pr_err("nvm: could not find device %s to unregister\n",
- disk_name);
- up_write(&nvm_lock);
- return;
- }
-
list_del(&dev->devices);
up_write(&nvm_lock);
nvm_exit(dev);
- kfree(dev);
}
EXPORT_SYMBOL(nvm_unregister);
diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h
new file mode 100644
index 000000000000..305c181509a6
--- /dev/null
+++ b/drivers/lightnvm/lightnvm.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2016 CNEX Labs. All rights reserved.
+ * Initial release: Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#ifndef LIGHTNVM_H
+#define LIGHTNVM_H
+
+#include <linux/lightnvm.h>
+
+/* core -> sysfs.c */
+int __must_check nvm_sysfs_register_dev(struct nvm_dev *);
+void nvm_sysfs_unregister_dev(struct nvm_dev *);
+int nvm_sysfs_register(void);
+void nvm_sysfs_unregister(void);
+
+/* sysfs > core */
+void nvm_free(struct nvm_dev *);
+
+#endif
diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c
new file mode 100644
index 000000000000..0338c27ab95a
--- /dev/null
+++ b/drivers/lightnvm/sysfs.c
@@ -0,0 +1,198 @@
+#include <linux/kernel.h>
+#include <linux/lightnvm.h>
+#include <linux/miscdevice.h>
+#include <linux/kobject.h>
+#include <linux/blk-mq.h>
+
+#include "lightnvm.h"
+
+static ssize_t nvm_dev_attr_show(struct device *dev,
+ struct device_attribute *dattr, char *page)
+{
+ struct nvm_dev *ndev = container_of(dev, struct nvm_dev, dev);
+ struct nvm_id *id = &ndev->identity;
+ struct nvm_id_group *grp = &id->groups[0];
+ struct attribute *attr = &dattr->attr;
+
+ if (strcmp(attr->name, "version") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id);
+ } else if (strcmp(attr->name, "vendor_opcode") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt);
+ } else if (strcmp(attr->name, "capabilities") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", id->cap);
+ } else if (strcmp(attr->name, "device_mode") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", id->dom);
+ } else if (strcmp(attr->name, "media_manager") == 0) {
+ if (!ndev->mt)
+ return scnprintf(page, PAGE_SIZE, "%s\n", "none");
+ return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name);
+ } else if (strcmp(attr->name, "ppa_format") == 0) {
+ return scnprintf(page, PAGE_SIZE,
+ "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+ id->ppaf.ch_offset, id->ppaf.ch_len,
+ id->ppaf.lun_offset, id->ppaf.lun_len,
+ id->ppaf.pln_offset, id->ppaf.pln_len,
+ id->ppaf.blk_offset, id->ppaf.blk_len,
+ id->ppaf.pg_offset, id->ppaf.pg_len,
+ id->ppaf.sect_offset, id->ppaf.sect_len);
+ } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype);
+ } else if (strcmp(attr->name, "flash_media_type") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype);
+ } else if (strcmp(attr->name, "num_channels") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch);
+ } else if (strcmp(attr->name, "num_luns") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun);
+ } else if (strcmp(attr->name, "num_planes") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
+ } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk);
+ } else if (strcmp(attr->name, "num_pages") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
+ } else if (strcmp(attr->name, "page_size") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz);
+ } else if (strcmp(attr->name, "hw_sector_size") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs);
+ } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos);
+ } else if (strcmp(attr->name, "read_typ") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt);
+ } else if (strcmp(attr->name, "read_max") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm);
+ } else if (strcmp(attr->name, "prog_typ") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt);
+ } else if (strcmp(attr->name, "prog_max") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm);
+ } else if (strcmp(attr->name, "erase_typ") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet);
+ } else if (strcmp(attr->name, "erase_max") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem);
+ } else if (strcmp(attr->name, "multiplane_modes") == 0) {
+ return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos);
+ } else if (strcmp(attr->name, "media_capabilities") == 0) {
+ return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap);
+ } else if (strcmp(attr->name, "max_phys_secs") == 0) {
+ return scnprintf(page, PAGE_SIZE, "%u\n",
+ ndev->ops->max_phys_sect);
+ } else {
+ return scnprintf(page,
+ PAGE_SIZE,
+ "Unhandled attr(%s) in `nvm_dev_attr_show`\n",
+ attr->name);
+ }
+}
+
+#define NVM_DEV_ATTR_RO(_name) \
+ DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL)
+
+static NVM_DEV_ATTR_RO(version);
+static NVM_DEV_ATTR_RO(vendor_opcode);
+static NVM_DEV_ATTR_RO(capabilities);
+static NVM_DEV_ATTR_RO(device_mode);
+static NVM_DEV_ATTR_RO(ppa_format);
+static NVM_DEV_ATTR_RO(media_manager);
+
+static NVM_DEV_ATTR_RO(media_type);
+static NVM_DEV_ATTR_RO(flash_media_type);
+static NVM_DEV_ATTR_RO(num_channels);
+static NVM_DEV_ATTR_RO(num_luns);
+static NVM_DEV_ATTR_RO(num_planes);
+static NVM_DEV_ATTR_RO(num_blocks);
+static NVM_DEV_ATTR_RO(num_pages);
+static NVM_DEV_ATTR_RO(page_size);
+static NVM_DEV_ATTR_RO(hw_sector_size);
+static NVM_DEV_ATTR_RO(oob_sector_size);
+static NVM_DEV_ATTR_RO(read_typ);
+static NVM_DEV_ATTR_RO(read_max);
+static NVM_DEV_ATTR_RO(prog_typ);
+static NVM_DEV_ATTR_RO(prog_max);
+static NVM_DEV_ATTR_RO(erase_typ);
+static NVM_DEV_ATTR_RO(erase_max);
+static NVM_DEV_ATTR_RO(multiplane_modes);
+static NVM_DEV_ATTR_RO(media_capabilities);
+static NVM_DEV_ATTR_RO(max_phys_secs);
+
+#define NVM_DEV_ATTR(_name) (dev_attr_##_name##)
+
+static struct attribute *nvm_dev_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_vendor_opcode.attr,
+ &dev_attr_capabilities.attr,
+ &dev_attr_device_mode.attr,
+ &dev_attr_media_manager.attr,
+
+ &dev_attr_ppa_format.attr,
+ &dev_attr_media_type.attr,
+ &dev_attr_flash_media_type.attr,
+ &dev_attr_num_channels.attr,
+ &dev_attr_num_luns.attr,
+ &dev_attr_num_planes.attr,
+ &dev_attr_num_blocks.attr,
+ &dev_attr_num_pages.attr,
+ &dev_attr_page_size.attr,
+ &dev_attr_hw_sector_size.attr,
+ &dev_attr_oob_sector_size.attr,
+ &dev_attr_read_typ.attr,
+ &dev_attr_read_max.attr,
+ &dev_attr_prog_typ.attr,
+ &dev_attr_prog_max.attr,
+ &dev_attr_erase_typ.attr,
+ &dev_attr_erase_max.attr,
+ &dev_attr_multiplane_modes.attr,
+ &dev_attr_media_capabilities.attr,
+ &dev_attr_max_phys_secs.attr,
+ NULL,
+};
+
+static struct attribute_group nvm_dev_attr_group = {
+ .name = "lightnvm",
+ .attrs = nvm_dev_attrs,
+};
+
+static const struct attribute_group *nvm_dev_attr_groups[] = {
+ &nvm_dev_attr_group,
+ NULL,
+};
+
+static void nvm_dev_release(struct device *device)
+{
+ struct nvm_dev *dev = container_of(device, struct nvm_dev, dev);
+ struct request_queue *q = dev->q;
+
+ pr_debug("nvm/sysfs: `nvm_dev_release`\n");
+
+ blk_mq_unregister_dev(device, q);
+
+ nvm_free(dev);
+}
+
+static struct device_type nvm_type = {
+ .name = "lightnvm",
+ .groups = nvm_dev_attr_groups,
+ .release = nvm_dev_release,
+};
+
+int nvm_sysfs_register_dev(struct nvm_dev *dev)
+{
+ int ret;
+
+ if (!dev->parent_dev)
+ return 0;
+
+ dev->dev.parent = dev->parent_dev;
+ dev_set_name(&dev->dev, "%s", dev->name);
+ dev->dev.type = &nvm_type;
+ device_initialize(&dev->dev);
+ ret = device_add(&dev->dev);
+
+ if (!ret)
+ blk_mq_register_dev(&dev->dev, dev->q);
+
+ return ret;
+}
+
+void nvm_sysfs_unregister_dev(struct nvm_dev *dev)
+{
+ if (dev && dev->parent_dev)
+ kobject_put(&dev->dev.kobj);
+}
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 76f7534d1dd1..81d3db40cd7b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -361,12 +361,8 @@ static void __btree_node_write_done(struct closure *cl)
static void btree_node_write_done(struct closure *cl)
{
struct btree *b = container_of(cl, struct btree, io);
- struct bio_vec *bv;
- int n;
-
- bio_for_each_segment_all(bv, b->bio, n)
- __free_page(bv->bv_page);
+ bio_free_pages(b->bio);
__btree_node_write_done(cl);
}
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c28df164701e..333a1e5f6ae6 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -107,9 +107,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
{
char name[BDEVNAME_SIZE];
struct bio *check;
- struct bio_vec bv, *bv2;
+ struct bio_vec bv;
struct bvec_iter iter;
- int i;
check = bio_clone(bio, GFP_NOIO);
if (!check)
@@ -136,8 +135,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
kunmap_atomic(p1);
}
- bio_for_each_segment_all(bv2, check, i)
- __free_page(bv2->bv_page);
+ bio_free_pages(check);
out_put:
bio_put(check);
}
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 1881319f2298..5c4bddecfaf0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -44,11 +44,8 @@ static void write_moving_finish(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct bio *bio = &io->bio.bio;
- struct bio_vec *bv;
- int i;
- bio_for_each_segment_all(bv, bio, i)
- __free_page(bv->bv_page);
+ bio_free_pages(bio);
if (io->op.replace_collision)
trace_bcache_gc_copy_collision(&io->w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4b177fe11ebb..40ffe5e424b3 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -694,13 +694,8 @@ static void cached_dev_cache_miss_done(struct closure *cl)
if (s->iop.replace_collision)
bch_mark_cache_miss_collision(s->iop.c, s->d);
- if (s->iop.bio) {
- int i;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, s->iop.bio, i)
- __free_page(bv->bv_page);
- }
+ if (s->iop.bio)
+ bio_free_pages(s->iop.bio);
cached_dev_bio_complete(cl);
}
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d9fd2a62e5f6..e51644e503a5 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -128,11 +128,8 @@ static void write_dirty_finish(struct closure *cl)
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
- struct bio_vec *bv;
- int i;
- bio_for_each_segment_all(bv, &io->bio, i)
- __free_page(bv->bv_page);
+ bio_free_pages(&io->bio);
/* This is kind of a dumb way of signalling errors. */
if (KEY_DIRTY(&w->key)) {
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 874295757caa..0448e7e35c8c 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
clone->bi_private = io;
clone->bi_end_io = crypt_endio;
clone->bi_bdev = cc->dev->bdev;
- bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf);
+ bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio));
}
static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 49e4d8d4558f..4dfe38655a49 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -149,8 +149,6 @@ static void put_io_block(struct log_writes_c *lc)
static void log_end_io(struct bio *bio)
{
struct log_writes_c *lc = bio->bi_private;
- struct bio_vec *bvec;
- int i;
if (bio->bi_error) {
unsigned long flags;
@@ -161,9 +159,7 @@ static void log_end_io(struct bio *bio)
spin_unlock_irqrestore(&lc->blocks_lock, flags);
}
- bio_for_each_segment_all(bvec, bio, i)
- __free_page(bvec->bv_page);
-
+ bio_free_pages(bio);
put_io_block(lc);
bio_put(bio);
}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 1ca7463e8bb2..ee48230a2952 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -955,7 +955,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
- blk_mq_register_disk(md->disk);
+ blk_mq_register_dev(disk_to_dev(md->disk), q);
return 0;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 21dc00eb1989..1961d827dbd1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -145,12 +145,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
return r1_bio;
out_free_pages:
- while (--j >= 0) {
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, r1_bio->bios[j], i)
- __free_page(bv->bv_page);
- }
+ while (--j >= 0)
+ bio_free_pages(r1_bio->bios[j]);
out_free_bio:
while (++j < pi->raid_disks)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2feacc70bf61..4669c052239e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -156,12 +156,14 @@ static void nvme_free_ns(struct kref *kref)
{
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
- if (ns->type == NVME_NS_LIGHTNVM)
- nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
+ if (ns->ndev)
+ nvme_nvm_unregister(ns);
- spin_lock(&dev_list_lock);
- ns->disk->private_data = NULL;
- spin_unlock(&dev_list_lock);
+ if (ns->disk) {
+ spin_lock(&dev_list_lock);
+ ns->disk->private_data = NULL;
+ spin_unlock(&dev_list_lock);
+ }
put_disk(ns->disk);
ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
@@ -597,7 +599,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
}
int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
- dma_addr_t dma_addr, u32 *result)
+ void *buffer, size_t buflen, u32 *result)
{
struct nvme_command c;
struct nvme_completion cqe;
@@ -606,10 +608,9 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_get_features;
c.features.nsid = cpu_to_le32(nsid);
- c.features.dptr.prp1 = cpu_to_le64(dma_addr);
c.features.fid = cpu_to_le32(fid);
- ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
+ ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, buffer, buflen, 0,
NVME_QID_ANY, 0, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(cqe.result);
@@ -617,7 +618,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
}
int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
- dma_addr_t dma_addr, u32 *result)
+ void *buffer, size_t buflen, u32 *result)
{
struct nvme_command c;
struct nvme_completion cqe;
@@ -625,12 +626,11 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
memset(&c, 0, sizeof(c));
c.features.opcode = nvme_admin_set_features;
- c.features.dptr.prp1 = cpu_to_le64(dma_addr);
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
- ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe,
+ buffer, buflen, 0, NVME_QID_ANY, 0, 0);
if (ret >= 0 && result)
*result = le32_to_cpu(cqe.result);
return ret;
@@ -664,7 +664,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
u32 result;
int status, nr_io_queues;
- status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
+ status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
&result);
if (status < 0)
return status;
@@ -888,42 +888,32 @@ static void nvme_config_discard(struct nvme_ns *ns)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}
-static int nvme_revalidate_disk(struct gendisk *disk)
+static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
{
- struct nvme_ns *ns = disk->private_data;
- struct nvme_id_ns *id;
- u8 lbaf, pi_type;
- u16 old_ms;
- unsigned short bs;
-
- if (test_bit(NVME_NS_DEAD, &ns->flags)) {
- set_capacity(disk, 0);
- return -ENODEV;
- }
- if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
- dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n",
- __func__);
- return -ENODEV;
- }
- if (id->ncap == 0) {
- kfree(id);
+ if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) {
+ dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__);
return -ENODEV;
}
- if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
- if (nvme_nvm_register(ns->queue, disk->disk_name)) {
- dev_warn(disk_to_dev(ns->disk),
- "%s: LightNVM init failure\n", __func__);
- kfree(id);
- return -ENODEV;
- }
- ns->type = NVME_NS_LIGHTNVM;
+ if ((*id)->ncap == 0) {
+ kfree(*id);
+ return -ENODEV;
}
if (ns->ctrl->vs >= NVME_VS(1, 1))
- memcpy(ns->eui, id->eui64, sizeof(ns->eui));
+ memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui));
if (ns->ctrl->vs >= NVME_VS(1, 2))
- memcpy(ns->uuid, id->nguid, sizeof(ns->uuid));
+ memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid));
+
+ return 0;
+}
+
+static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
+{
+ struct nvme_ns *ns = disk->private_data;
+ u8 lbaf, pi_type;
+ u16 old_ms;
+ unsigned short bs;
old_ms = ns->ms;
lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
@@ -962,8 +952,26 @@ static int nvme_revalidate_disk(struct gendisk *disk)
if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
blk_mq_unfreeze_queue(disk->queue);
+}
+
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_id_ns *id = NULL;
+ int ret;
+
+ if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+ set_capacity(disk, 0);
+ return -ENODEV;
+ }
+
+ ret = nvme_revalidate_ns(ns, &id);
+ if (ret)
+ return ret;
+ __nvme_revalidate_disk(disk, id);
kfree(id);
+
return 0;
}
@@ -1425,7 +1433,7 @@ static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
struct nvme_ctrl *ctrl = ns->ctrl;
int serial_len = sizeof(ctrl->serial);
int model_len = sizeof(ctrl->model);
@@ -1449,7 +1457,7 @@ static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL);
static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
return sprintf(buf, "%pU\n", ns->uuid);
}
static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
@@ -1457,7 +1465,7 @@ static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL);
static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
return sprintf(buf, "%8phd\n", ns->eui);
}
static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
@@ -1465,7 +1473,7 @@ static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL);
static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
return sprintf(buf, "%d\n", ns->ns_id);
}
static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL);
@@ -1482,7 +1490,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
struct attribute *a, int n)
{
struct device *dev = container_of(kobj, struct device, kobj);
- struct nvme_ns *ns = dev_to_disk(dev)->private_data;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
if (a == &dev_attr_uuid.attr) {
if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid)))
@@ -1642,6 +1650,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
struct gendisk *disk;
+ struct nvme_id_ns *id;
+ char disk_name[DISK_NAME_LEN];
int node = dev_to_node(ctrl->dev);
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
@@ -1659,34 +1669,49 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
- disk = alloc_disk_node(0, node);
- if (!disk)
- goto out_free_queue;
-
kref_init(&ns->kref);
ns->ns_id = nsid;
- ns->disk = disk;
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
-
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
- disk->fops = &nvme_fops;
- disk->private_data = ns;
- disk->queue = ns->queue;
- disk->flags = GENHD_FL_EXT_DEVT;
- sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
+ sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
+
+ if (nvme_revalidate_ns(ns, &id))
+ goto out_free_queue;
- if (nvme_revalidate_disk(ns->disk))
- goto out_free_disk;
+ if (nvme_nvm_ns_supported(ns, id)) {
+ if (nvme_nvm_register(ns, disk_name, node,
+ &nvme_ns_attr_group)) {
+ dev_warn(ctrl->dev, "%s: LightNVM init failure\n",
+ __func__);
+ goto out_free_id;
+ }
+ } else {
+ disk = alloc_disk_node(0, node);
+ if (!disk)
+ goto out_free_id;
+
+ disk->fops = &nvme_fops;
+ disk->private_data = ns;
+ disk->queue = ns->queue;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
+ ns->disk = disk;
+
+ __nvme_revalidate_disk(disk, id);
+ }
mutex_lock(&ctrl->namespaces_mutex);
list_add_tail(&ns->list, &ctrl->namespaces);
mutex_unlock(&ctrl->namespaces_mutex);
kref_get(&ctrl->kref);
- if (ns->type == NVME_NS_LIGHTNVM)
+
+ kfree(id);
+
+ if (ns->ndev)
return;
device_add_disk(ctrl->device, ns->disk);
@@ -1695,8 +1720,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
pr_warn("%s: failed to create sysfs group for identification\n",
ns->disk->disk_name);
return;
- out_free_disk:
- kfree(disk);
+ out_free_id:
+ kfree(id);
out_free_queue:
blk_cleanup_queue(ns->queue);
out_release_instance:
@@ -1710,7 +1735,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
- if (ns->disk->flags & GENHD_FL_UP) {
+ if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
if (blk_get_integrity(ns->disk))
blk_integrity_unregister(ns->disk);
sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
@@ -1733,7 +1758,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
- if (revalidate_disk(ns->disk))
+ if (ns->disk && revalidate_disk(ns->disk))
nvme_ns_remove(ns);
nvme_put_ns(ns);
} else
@@ -2038,7 +2063,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
* Revalidating a dead namespace sets capacity to 0. This will
* end buffered writers dirtying pages that can't be synced.
*/
- if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags))
revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 4eff49174466..5a3f008d3480 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -111,8 +111,19 @@ static void nvmf_host_put(struct nvmf_host *host)
*/
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
- return snprintf(buf, size, "traddr=%s,trsvcid=%s\n",
- ctrl->opts->traddr, ctrl->opts->trsvcid);
+ int len = 0;
+
+ if (ctrl->opts->mask & NVMF_OPT_TRADDR)
+ len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr);
+ if (ctrl->opts->mask & NVMF_OPT_TRSVCID)
+ len += snprintf(buf + len, size - len, "%strsvcid=%s",
+ (len) ? "," : "", ctrl->opts->trsvcid);
+ if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)
+ len += snprintf(buf + len, size - len, "%shost_traddr=%s",
+ (len) ? "," : "", ctrl->opts->host_traddr);
+ len += snprintf(buf + len, size - len, "\n");
+
+ return len;
}
EXPORT_SYMBOL_GPL(nvmf_get_address);
@@ -519,6 +530,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" },
{ NVMF_OPT_KATO, "keep_alive_tmo=%d" },
{ NVMF_OPT_HOSTNQN, "hostnqn=%s" },
+ { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
{ NVMF_OPT_ERR, NULL }
};
@@ -675,6 +687,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->reconnect_delay = token;
break;
+ case NVMF_OPT_HOST_TRADDR:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ opts->host_traddr = p;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -741,6 +761,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
kfree(opts->traddr);
kfree(opts->trsvcid);
kfree(opts->subsysnqn);
+ kfree(opts->host_traddr);
kfree(opts);
}
EXPORT_SYMBOL_GPL(nvmf_free_options);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 46e460aee52d..924145c979f1 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -52,6 +52,7 @@ enum {
NVMF_OPT_KATO = 1 << 7,
NVMF_OPT_HOSTNQN = 1 << 8,
NVMF_OPT_RECONNECT_DELAY = 1 << 9,
+ NVMF_OPT_HOST_TRADDR = 1 << 10,
};
/**
@@ -64,9 +65,12 @@ enum {
* being added.
* @subsysnqn: Hold the fully qualified NQN subystem name (format defined
* in the NVMe specification, "NVMe Qualified Names").
- * @traddr: network address that will be used by the host to communicate
- * to the added NVMe controller.
- * @trsvcid: network port used for host-controller communication.
+ * @traddr: The transport-specific TRADDR field for a port on the
+ * subsystem which is adding a controller.
+ * @trsvcid: The transport-specific TRSVCID field for a port on the
+ * subsystem which is adding a controller.
+ * @host_traddr: A transport-specific field identifying the NVME host port
+ * to use for the connection to the controller.
* @queue_size: Number of IO queue elements.
* @nr_io_queues: Number of controller IO queues that will be established.
* @reconnect_delay: Time between two consecutive reconnect attempts.
@@ -80,6 +84,7 @@ struct nvmf_ctrl_options {
char *subsysnqn;
char *traddr;
char *trsvcid;
+ char *host_traddr;
size_t queue_size;
unsigned int nr_io_queues;
unsigned int reconnect_delay;
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 63f483daf930..f5e3011e31fc 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -475,7 +475,7 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd,
if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
- rqd->bio->bi_iter.bi_sector));
+ rqd->bio->bi_iter.bi_sector));
}
static void nvme_nvm_end_io(struct request *rq, int error)
@@ -592,14 +592,37 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
.max_phys_sect = 64,
};
-int nvme_nvm_register(struct request_queue *q, char *disk_name)
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node,
+ const struct attribute_group *attrs)
{
- return nvm_register(q, disk_name, &nvme_nvm_dev_ops);
+ struct request_queue *q = ns->queue;
+ struct nvm_dev *dev;
+ int ret;
+
+ dev = nvm_alloc_dev(node);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->q = q;
+ memcpy(dev->name, disk_name, DISK_NAME_LEN);
+ dev->ops = &nvme_nvm_dev_ops;
+ dev->parent_dev = ns->ctrl->device;
+ dev->private_data = ns;
+ ns->ndev = dev;
+
+ ret = nvm_register(dev);
+
+ ns->lba_shift = ilog2(dev->sec_size) - 9;
+
+ if (sysfs_create_group(&dev->dev.kobj, attrs))
+ pr_warn("%s: failed to create sysfs group for identification\n",
+ disk_name);
+ return ret;
}
-void nvme_nvm_unregister(struct request_queue *q, char *disk_name)
+void nvme_nvm_unregister(struct nvme_ns *ns)
{
- nvm_unregister(disk_name);
+ nvm_unregister(ns->ndev);
}
/* move to shared place when used in multiple places. */
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ab18b78102bf..b0a9ec681685 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -18,6 +18,7 @@
#include <linux/pci.h>
#include <linux/kref.h>
#include <linux/blk-mq.h>
+#include <linux/lightnvm.h>
enum {
/*
@@ -154,6 +155,7 @@ struct nvme_ns {
struct nvme_ctrl *ctrl;
struct request_queue *queue;
struct gendisk *disk;
+ struct nvm_dev *ndev;
struct kref kref;
int instance;
@@ -165,7 +167,6 @@ struct nvme_ns {
u16 ms;
bool ext;
u8 pi_type;
- int type;
unsigned long flags;
#define NVME_NS_REMOVING 0
@@ -292,9 +293,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
struct nvme_id_ns **id);
int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log);
int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
- dma_addr_t dma_addr, u32 *result);
+ void *buffer, size_t buflen, u32 *result);
int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
- dma_addr_t dma_addr, u32 *result);
+ void *buffer, size_t buflen, u32 *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
@@ -307,20 +308,35 @@ int nvme_sg_get_version_num(int __user *ip);
#ifdef CONFIG_NVM
int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id);
-int nvme_nvm_register(struct request_queue *q, char *disk_name);
-void nvme_nvm_unregister(struct request_queue *q, char *disk_name);
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node,
+ const struct attribute_group *attrs);
+void nvme_nvm_unregister(struct nvme_ns *ns);
+
+static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
+{
+ if (dev->type->devnode)
+ return dev_to_disk(dev)->private_data;
+
+ return (container_of(dev, struct nvm_dev, dev))->private_data;
+}
#else
-static inline int nvme_nvm_register(struct request_queue *q, char *disk_name)
+static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
+ int node,
+ const struct attribute_group *attrs)
{
return 0;
}
-static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {};
+static inline void nvme_nvm_unregister(struct nvme_ns *ns) {};
static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
{
return 0;
}
+static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
+{
+ return dev_to_disk(dev)->private_data;
+}
#endif /* CONFIG_NVM */
int __init nvme_core_init(void);
diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c
index e947e298a737..c2a0a1c7d05d 100644
--- a/drivers/nvme/host/scsi.c
+++ b/drivers/nvme/host/scsi.c
@@ -72,15 +72,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */
#define ALL_LUNS_RETURNED 0x02
#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01
#define RESTRICTED_LUNS_RETURNED 0x00
-#define NVME_POWER_STATE_START_VALID 0x00
-#define NVME_POWER_STATE_ACTIVE 0x01
-#define NVME_POWER_STATE_IDLE 0x02
-#define NVME_POWER_STATE_STANDBY 0x03
-#define NVME_POWER_STATE_LU_CONTROL 0x07
-#define POWER_STATE_0 0
-#define POWER_STATE_1 1
-#define POWER_STATE_2 2
-#define POWER_STATE_3 3
#define DOWNLOAD_SAVE_ACTIVATE 0x05
#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E
#define ACTIVATE_DEFERRED_MICROCODE 0x0F
@@ -915,7 +906,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
kfree(smart_log);
/* Get Features for Temp Threshold */
- res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0,
+ res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0,
&feature_resp);
if (res != NVME_SC_SUCCESS)
temp_c_thresh = LOG_TEMP_UNKNOWN;
@@ -1048,7 +1039,7 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns,
if (len < MODE_PAGE_CACHING_LEN)
return -EINVAL;
- nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0,
+ nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0,
&feature_resp);
res = nvme_trans_status_code(hdr, nvme_sc);
if (res)
@@ -1229,64 +1220,6 @@ static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns,
/* Start Stop Unit Helper Functions */
-static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
- u8 pc, u8 pcmod, u8 start)
-{
- int res;
- int nvme_sc;
- struct nvme_id_ctrl *id_ctrl;
- int lowest_pow_st; /* max npss = lowest power consumption */
- unsigned ps_desired = 0;
-
- nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl);
- res = nvme_trans_status_code(hdr, nvme_sc);
- if (res)
- return res;
-
- lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1));
- kfree(id_ctrl);
-
- switch (pc) {
- case NVME_POWER_STATE_START_VALID:
- /* Action unspecified if POWER CONDITION MODIFIER != 0 */
- if (pcmod == 0 && start == 0x1)
- ps_desired = POWER_STATE_0;
- if (pcmod == 0 && start == 0x0)
- ps_desired = lowest_pow_st;
- break;
- case NVME_POWER_STATE_ACTIVE:
- /* Action unspecified if POWER CONDITION MODIFIER != 0 */
- if (pcmod == 0)
- ps_desired = POWER_STATE_0;
- break;
- case NVME_POWER_STATE_IDLE:
- /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */
- if (pcmod == 0x0)
- ps_desired = POWER_STATE_1;
- else if (pcmod == 0x1)
- ps_desired = POWER_STATE_2;
- else if (pcmod == 0x2)
- ps_desired = POWER_STATE_3;
- break;
- case NVME_POWER_STATE_STANDBY:
- /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */
- if (pcmod == 0x0)
- ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2));
- else if (pcmod == 0x1)
- ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1));
- break;
- case NVME_POWER_STATE_LU_CONTROL:
- default:
- res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
- ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
- SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
- break;
- }
- nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0,
- NULL);
- return nvme_trans_status_code(hdr, nvme_sc);
-}
-
static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
u8 buffer_id)
{
@@ -1395,7 +1328,7 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
case MODE_PAGE_CACHING:
dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0);
nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC,
- dword11, 0, NULL);
+ dword11, NULL, 0, NULL);
res = nvme_trans_status_code(hdr, nvme_sc);
break;
case MODE_PAGE_CONTROL:
@@ -2235,11 +2168,10 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
u8 *cmd)
{
- u8 immed, pcmod, pc, no_flush, start;
+ u8 immed, pcmod, no_flush, start;
immed = cmd[1] & 0x01;
pcmod = cmd[3] & 0x0f;
- pc = (cmd[4] & 0xf0) >> 4;
no_flush = cmd[4] & 0x04;
start = cmd[4] & 0x01;
@@ -2254,8 +2186,8 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
if (res)
return res;
}
- /* Setup the expected power state transition */
- return nvme_trans_power_state(ns, hdr, pc, pcmod, start);
+
+ return 0;
}
}
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 47c564b5a289..7ab9c9381b98 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -14,6 +14,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <generated/utsrelease.h>
+#include <asm/unaligned.h>
#include "nvmet.h"
u32 nvmet_get_log_page_len(struct nvme_command *cmd)
@@ -29,8 +30,84 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd)
return len;
}
+static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
+ struct nvme_smart_log *slog)
+{
+ u16 status;
+ struct nvmet_ns *ns;
+ u64 host_reads, host_writes, data_units_read, data_units_written;
+
+ status = NVME_SC_SUCCESS;
+ ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid);
+ if (!ns) {
+ status = NVME_SC_INVALID_NS;
+ pr_err("nvmet : Counld not find namespace id : %d\n",
+ le32_to_cpu(req->cmd->get_log_page.nsid));
+ goto out;
+ }
+
+ host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]);
+ data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]);
+ host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]);
+ data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
+
+ put_unaligned_le64(host_reads, &slog->host_reads[0]);
+ put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
+ put_unaligned_le64(host_writes, &slog->host_writes[0]);
+ put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
+ nvmet_put_namespace(ns);
+out:
+ return status;
+}
+
+static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
+ struct nvme_smart_log *slog)
+{
+ u16 status;
+ u64 host_reads = 0, host_writes = 0;
+ u64 data_units_read = 0, data_units_written = 0;
+ struct nvmet_ns *ns;
+ struct nvmet_ctrl *ctrl;
+
+ status = NVME_SC_SUCCESS;
+ ctrl = req->sq->ctrl;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
+ host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]);
+ data_units_read +=
+ part_stat_read(ns->bdev->bd_part, sectors[READ]);
+ host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
+ data_units_written +=
+ part_stat_read(ns->bdev->bd_part, sectors[WRITE]);
+
+ }
+ rcu_read_unlock();
+
+ put_unaligned_le64(host_reads, &slog->host_reads[0]);
+ put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
+ put_unaligned_le64(host_writes, &slog->host_writes[0]);
+ put_unaligned_le64(data_units_written, &slog->data_units_written[0]);
+
+ return status;
+}
+
+static u16 nvmet_get_smart_log(struct nvmet_req *req,
+ struct nvme_smart_log *slog)
+{
+ u16 status;
+
+ WARN_ON(req == NULL || slog == NULL);
+ if (req->cmd->get_log_page.nsid == 0xFFFFFFFF)
+ status = nvmet_get_smart_log_all(req, slog);
+ else
+ status = nvmet_get_smart_log_nsid(req, slog);
+ return status;
+}
+
static void nvmet_execute_get_log_page(struct nvmet_req *req)
{
+ struct nvme_smart_log *smart_log;
size_t data_len = nvmet_get_log_page_len(req->cmd);
void *buf;
u16 status = 0;
@@ -59,6 +136,16 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
* available (e.g. units or commands read/written) those aren't
* persistent over power loss.
*/
+ if (data_len != sizeof(*smart_log)) {
+ status = NVME_SC_INTERNAL;
+ goto err;
+ }
+ smart_log = buf;
+ status = nvmet_get_smart_log(req, smart_log);
+ if (status) {
+ memset(buf, '\0', data_len);
+ goto err;
+ }
break;
case 0x03:
/*
@@ -73,6 +160,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
status = nvmet_copy_to_sgl(req, 0, buf, data_len);
+err:
kfree(buf);
out:
nvmet_req_complete(req, status);
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 2cd069b691ae..4a96c2049b7b 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -58,6 +58,7 @@ static void nvmet_execute_rw(struct nvmet_req *req)
if (req->cmd->rw.opcode == nvme_cmd_write) {
op = REQ_OP_WRITE;
+ op_flags = WRITE_ODIRECT;
if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
op_flags |= REQ_FUA;
} else {
@@ -205,7 +206,7 @@ int nvmet_parse_io_cmd(struct nvmet_req *req)
return 0;
case nvme_cmd_dsm:
req->execute = nvmet_execute_dsm;
- req->data_len = le32_to_cpu(cmd->dsm.nr) *
+ req->data_len = le32_to_cpu(cmd->dsm.nr + 1) *
sizeof(struct nvme_dsm_range);
return 0;
default:
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 7da05b159ade..bfe9f9994935 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -789,7 +789,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
* Will be set to real fs blocksize later.
*
* Linux 2.4.10 and later refuse to read blocks smaller than
- * the hardsect size for the device. But we also need to read at
+ * the logical block size for the device. But we also need to read at
* least 1k to get the second 512 bytes of the volume.
* -WD 10-26-01
*/
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 08ae99343d92..376e4e426324 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -180,9 +180,6 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
- if (IS_DAX(inode))
- return dax_do_io(iocb, inode, iter, blkdev_get_block,
- NULL, DIO_SKIP_DIO_COUNT);
return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
blkdev_get_block, NULL, NULL,
DIO_SKIP_DIO_COUNT);
@@ -302,14 +299,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
error = sb->s_op->thaw_super(sb);
else
error = thaw_super(sb);
- if (error) {
+ if (error)
bdev->bd_fsfreeze_count++;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return error;
- }
out:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return 0;
+ return error;
}
EXPORT_SYMBOL(thaw_bdev);
@@ -1275,7 +1269,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- bdev->bd_inode->i_flags = 0;
if (!partno) {
ret = -ENXIO;
@@ -1303,11 +1296,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret) {
+ if (!ret)
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- if (!bdev_dax_capable(bdev))
- bdev->bd_inode->i_flags &= ~S_DAX;
- }
/*
* If the device is invalidated, rescan partition
@@ -1342,8 +1332,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
- if (!bdev_dax_capable(bdev))
- bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
if (bdev->bd_contains == bdev) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e6811c42e41e..ca01106795ea 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8412,7 +8412,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
if (!bio)
return -ENOMEM;
- bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
+ bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio));
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
@@ -8450,7 +8450,8 @@ next_block:
start_sector, GFP_NOFS);
if (!bio)
goto out_err;
- bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf);
+ bio_set_op_attrs(bio, bio_op(orig_bio),
+ bio_flags(orig_bio));
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 23ddf4b46a9b..97cb48f03dc7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -1,6 +1,4 @@
/*
- * 2.5 block I/O model
- *
* Copyright (C) 2001 Jens Axboe <axboe@suse.de>
*
* This program is free software; you can redistribute it and/or modify
@@ -461,6 +459,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
extern void bio_copy_data(struct bio *dst, struct bio *src);
extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
+extern void bio_free_pages(struct bio *bio);
extern struct bio *bio_copy_user_iov(struct request_queue *,
struct rq_map_data *,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 10648e300c93..cbdbf34de5b6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -45,7 +45,7 @@ struct blkcg {
spinlock_t lock;
struct radix_tree_root blkg_tree;
- struct blkcg_gq *blkg_hint;
+ struct blkcg_gq __rcu *blkg_hint;
struct hlist_head blkg_list;
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e43bbffb5b7a..5daa0ef756dd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -2,6 +2,7 @@
#define BLK_MQ_H
#include <linux/blkdev.h>
+#include <linux/sbitmap.h>
struct blk_mq_tags;
struct blk_flush_queue;
@@ -12,21 +13,14 @@ struct blk_mq_cpu_notifier {
int (*notify)(void *data, unsigned long action, unsigned int cpu);
};
-struct blk_mq_ctxmap {
- unsigned int size;
- unsigned int bits_per_word;
- struct blk_align_bitmap *map;
-};
-
struct blk_mq_hw_ctx {
struct {
spinlock_t lock;
struct list_head dispatch;
+ unsigned long state; /* BLK_MQ_S_* flags */
} ____cacheline_aligned_in_smp;
- unsigned long state; /* BLK_MQ_S_* flags */
- struct delayed_work run_work;
- struct delayed_work delay_work;
+ struct work_struct run_work;
cpumask_var_t cpumask;
int next_cpu;
int next_cpu_batch;
@@ -38,10 +32,10 @@ struct blk_mq_hw_ctx {
void *driver_data;
- struct blk_mq_ctxmap ctx_map;
+ struct sbitmap ctx_map;
- unsigned int nr_ctx;
struct blk_mq_ctx **ctxs;
+ unsigned int nr_ctx;
atomic_t wait_index;
@@ -49,7 +43,7 @@ struct blk_mq_hw_ctx {
unsigned long queued;
unsigned long run;
-#define BLK_MQ_MAX_DISPATCH_ORDER 10
+#define BLK_MQ_MAX_DISPATCH_ORDER 7
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
unsigned int numa_node;
@@ -57,9 +51,12 @@ struct blk_mq_hw_ctx {
atomic_t nr_active;
+ struct delayed_work delay_work;
+
struct blk_mq_cpu_notifier cpu_notifier;
struct kobject kobj;
+ unsigned long poll_considered;
unsigned long poll_invoked;
unsigned long poll_success;
};
@@ -158,6 +155,7 @@ enum {
BLK_MQ_F_TAG_SHARED = 1 << 1,
BLK_MQ_F_SG_MERGE = 1 << 2,
BLK_MQ_F_DEFER_ISSUE = 1 << 4,
+ BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
BLK_MQ_F_ALLOC_POLICY_BITS = 1,
@@ -178,8 +176,8 @@ enum {
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q);
-int blk_mq_register_disk(struct gendisk *);
-void blk_mq_unregister_disk(struct gendisk *);
+int blk_mq_register_dev(struct device *, struct request_queue *);
+void blk_mq_unregister_dev(struct device *, struct request_queue *);
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
@@ -221,7 +219,6 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
}
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
int blk_mq_request_started(struct request *rq);
void blk_mq_start_request(struct request *rq);
@@ -232,6 +229,7 @@ void blk_mq_requeue_request(struct request *rq);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
void blk_mq_cancel_requeue_work(struct request_queue *q);
void blk_mq_kick_requeue_list(struct request_queue *q);
+void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_abort_requeue_list(struct request_queue *q);
void blk_mq_complete_request(struct request *rq, int error);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 436f43f87da9..cd395ecec99d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -16,7 +16,6 @@ struct block_device;
struct io_context;
struct cgroup_subsys_state;
typedef void (bio_end_io_t) (struct bio *);
-typedef void (bio_destructor_t) (struct bio *);
#ifdef CONFIG_BLOCK
/*
@@ -89,14 +88,22 @@ struct bio {
struct bio_vec bi_inline_vecs[0];
};
-#define BIO_OP_SHIFT (8 * sizeof(unsigned int) - REQ_OP_BITS)
+#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS)
+#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1))
#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT)
-#define bio_set_op_attrs(bio, op, op_flags) do { \
- WARN_ON(op >= (1 << REQ_OP_BITS)); \
- (bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1); \
- (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \
- (bio)->bi_opf |= op_flags; \
+#define bio_set_op_attrs(bio, op, op_flags) do { \
+ if (__builtin_constant_p(op)) \
+ BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \
+ else \
+ WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \
+ if (__builtin_constant_p(op_flags)) \
+ BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \
+ else \
+ WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \
+ (bio)->bi_opf = bio_flags(bio); \
+ (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \
+ (bio)->bi_opf |= (op_flags); \
} while (0)
#define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e79055c8b577..c47c358ba052 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -449,7 +449,7 @@ struct request_queue {
struct list_head requeue_list;
spinlock_t requeue_lock;
- struct work_struct requeue_work;
+ struct delayed_work requeue_work;
struct mutex sysfs_lock;
@@ -1440,8 +1440,8 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
return bio_will_gap(req->q, bio, req->bio);
}
-struct work_struct;
int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_work_on(int cpu, struct work_struct *work);
int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index beb9ce1c2c23..8c1239020d79 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -7,7 +7,6 @@
/*
* Gives us 8 prio classes with 13-bits of data for each class
*/
-#define IOPRIO_BITS (16)
#define IOPRIO_CLASS_SHIFT (13)
#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ba78b8306674..d190786e4ad8 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -352,7 +352,10 @@ struct nvm_dev {
/* Backend device */
struct request_queue *q;
+ struct device dev;
+ struct device *parent_dev;
char name[DISK_NAME_LEN];
+ void *private_data;
struct mutex mlock;
spinlock_t lock;
@@ -524,9 +527,9 @@ extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *,
unsigned long);
extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *);
-extern int nvm_register(struct request_queue *, char *,
- struct nvm_dev_ops *);
-extern void nvm_unregister(char *);
+extern struct nvm_dev *nvm_alloc_dev(int);
+extern int nvm_register(struct nvm_dev *);
+extern void nvm_unregister(struct nvm_dev *);
void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type);
@@ -575,11 +578,14 @@ extern int nvm_dev_factory(struct nvm_dev *, int flags);
#else /* CONFIG_NVM */
struct nvm_dev_ops;
-static inline int nvm_register(struct request_queue *q, char *disk_name,
- struct nvm_dev_ops *ops)
+static inline struct nvm_dev *nvm_alloc_dev(int node)
+{
+ return ERR_PTR(-EINVAL);
+}
+static inline int nvm_register(struct nvm_dev *dev)
{
return -EINVAL;
}
-static inline void nvm_unregister(char *disk_name) {}
+static inline void nvm_unregister(struct nvm_dev *dev) {}
#endif /* CONFIG_NVM */
#endif /* LIGHTNVM.H */
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
new file mode 100644
index 000000000000..f017fd6e69c4
--- /dev/null
+++ b/include/linux/sbitmap.h
@@ -0,0 +1,373 @@
+/*
+ * Fast and scalable bitmaps.
+ *
+ * Copyright (C) 2016 Facebook
+ * Copyright (C) 2013-2014 Jens Axboe
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LINUX_SCALE_BITMAP_H
+#define __LINUX_SCALE_BITMAP_H
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+/**
+ * struct sbitmap_word - Word in a &struct sbitmap.
+ */
+struct sbitmap_word {
+ /**
+ * @word: The bitmap word itself.
+ */
+ unsigned long word;
+
+ /**
+ * @depth: Number of bits being used in @word.
+ */
+ unsigned long depth;
+} ____cacheline_aligned_in_smp;
+
+/**
+ * struct sbitmap - Scalable bitmap.
+ *
+ * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This
+ * trades off higher memory usage for better scalability.
+ */
+struct sbitmap {
+ /**
+ * @depth: Number of bits used in the whole bitmap.
+ */
+ unsigned int depth;
+
+ /**
+ * @shift: log2(number of bits used per word)
+ */
+ unsigned int shift;
+
+ /**
+ * @map_nr: Number of words (cachelines) being used for the bitmap.
+ */
+ unsigned int map_nr;
+
+ /**
+ * @map: Allocated bitmap.
+ */
+ struct sbitmap_word *map;
+};
+
+#define SBQ_WAIT_QUEUES 8
+#define SBQ_WAKE_BATCH 8
+
+/**
+ * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue.
+ */
+struct sbq_wait_state {
+ /**
+ * @wait_cnt: Number of frees remaining before we wake up.
+ */
+ atomic_t wait_cnt;
+
+ /**
+ * @wait: Wait queue.
+ */
+ wait_queue_head_t wait;
+} ____cacheline_aligned_in_smp;
+
+/**
+ * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free
+ * bits.
+ *
+ * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to
+ * avoid contention on the wait queue spinlock. This ensures that we don't hit a
+ * scalability wall when we run out of free bits and have to start putting tasks
+ * to sleep.
+ */
+struct sbitmap_queue {
+ /**
+ * @sb: Scalable bitmap.
+ */
+ struct sbitmap sb;
+
+ /*
+ * @alloc_hint: Cache of last successfully allocated or freed bit.
+ *
+ * This is per-cpu, which allows multiple users to stick to different
+ * cachelines until the map is exhausted.
+ */
+ unsigned int __percpu *alloc_hint;
+
+ /**
+ * @wake_batch: Number of bits which must be freed before we wake up any
+ * waiters.
+ */
+ unsigned int wake_batch;
+
+ /**
+ * @wake_index: Next wait queue in @ws to wake up.
+ */
+ atomic_t wake_index;
+
+ /**
+ * @ws: Wait queues.
+ */
+ struct sbq_wait_state *ws;
+
+ /**
+ * @round_robin: Allocate bits in strict round-robin order.
+ */
+ bool round_robin;
+};
+
+/**
+ * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node.
+ * @sb: Bitmap to initialize.
+ * @depth: Number of bits to allocate.
+ * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if
+ * given, a good default is chosen.
+ * @flags: Allocation flags.
+ * @node: Memory node to allocate on.
+ *
+ * Return: Zero on success or negative errno on failure.
+ */
+int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
+ gfp_t flags, int node);
+
+/**
+ * sbitmap_free() - Free memory used by a &struct sbitmap.
+ * @sb: Bitmap to free.
+ */
+static inline void sbitmap_free(struct sbitmap *sb)
+{
+ kfree(sb->map);
+ sb->map = NULL;
+}
+
+/**
+ * sbitmap_resize() - Resize a &struct sbitmap.
+ * @sb: Bitmap to resize.
+ * @depth: New number of bits to resize to.
+ *
+ * Doesn't reallocate anything. It's up to the caller to ensure that the new
+ * depth doesn't exceed the depth that the sb was initialized with.
+ */
+void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
+
+/**
+ * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
+ * @sb: Bitmap to allocate from.
+ * @alloc_hint: Hint for where to start searching for a free bit.
+ * @round_robin: If true, be stricter about allocation order; always allocate
+ * starting from the last allocated bit. This is less efficient
+ * than the default behavior (false).
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
+
+/**
+ * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
+ * @sb: Bitmap to check.
+ *
+ * Return: true if any bit in the bitmap is set, false otherwise.
+ */
+bool sbitmap_any_bit_set(const struct sbitmap *sb);
+
+/**
+ * sbitmap_any_bit_clear() - Check for an unset bit in a &struct
+ * sbitmap.
+ * @sb: Bitmap to check.
+ *
+ * Return: true if any bit in the bitmap is clear, false otherwise.
+ */
+bool sbitmap_any_bit_clear(const struct sbitmap *sb);
+
+typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);
+
+/**
+ * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
+ * @sb: Bitmap to iterate over.
+ * @fn: Callback. Should return true to continue or false to break early.
+ * @data: Pointer to pass to callback.
+ *
+ * This is inline even though it's non-trivial so that the function calls to the
+ * callback will hopefully get optimized away.
+ */
+static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
+ void *data)
+{
+ unsigned int i;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ struct sbitmap_word *word = &sb->map[i];
+ unsigned int off, nr;
+
+ if (!word->word)
+ continue;
+
+ nr = 0;
+ off = i << sb->shift;
+ while (1) {
+ nr = find_next_bit(&word->word, word->depth, nr);
+ if (nr >= word->depth)
+ break;
+
+ if (!fn(sb, off + nr, data))
+ return;
+
+ nr++;
+ }
+ }
+}
+
+#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
+#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))
+
+static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
+ unsigned int bitnr)
+{
+ return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word;
+}
+
+/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */
+
+static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+ set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
+}
+
+static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+ clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
+}
+
+static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+ return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
+}
+
+unsigned int sbitmap_weight(const struct sbitmap *sb);
+
+/**
+ * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
+ * memory node.
+ * @sbq: Bitmap queue to initialize.
+ * @depth: See sbitmap_init_node().
+ * @shift: See sbitmap_init_node().
+ * @round_robin: See sbitmap_get().
+ * @flags: Allocation flags.
+ * @node: Memory node to allocate on.
+ *
+ * Return: Zero on success or negative errno on failure.
+ */
+int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
+ int shift, bool round_robin, gfp_t flags, int node);
+
+/**
+ * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue.
+ *
+ * @sbq: Bitmap queue to free.
+ */
+static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
+{
+ kfree(sbq->ws);
+ free_percpu(sbq->alloc_hint);
+ sbitmap_free(&sbq->sb);
+}
+
+/**
+ * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
+ * @sbq: Bitmap queue to resize.
+ * @depth: New number of bits to resize to.
+ *
+ * Like sbitmap_resize(), this doesn't reallocate anything. It has to do
+ * some extra work on the &struct sbitmap_queue, so it's not safe to just
+ * resize the underlying &struct sbitmap.
+ */
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
+
+/**
+ * __sbitmap_queue_get() - Try to allocate a free bit from a &struct
+ * sbitmap_queue with preemption already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int __sbitmap_queue_get(struct sbitmap_queue *sbq);
+
+/**
+ * sbitmap_queue_get() - Try to allocate a free bit from a &struct
+ * sbitmap_queue.
+ * @sbq: Bitmap queue to allocate from.
+ * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
+ * sbitmap_queue_clear()).
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
+ unsigned int *cpu)
+{
+ int nr;
+
+ *cpu = get_cpu();
+ nr = __sbitmap_queue_get(sbq);
+ put_cpu();
+ return nr;
+}
+
+/**
+ * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
+ * &struct sbitmap_queue.
+ * @sbq: Bitmap to free from.
+ * @nr: Bit number to free.
+ * @cpu: CPU the bit was allocated on.
+ */
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
+ unsigned int cpu);
+
+static inline int sbq_index_inc(int index)
+{
+ return (index + 1) & (SBQ_WAIT_QUEUES - 1);
+}
+
+static inline void sbq_index_atomic_inc(atomic_t *index)
+{
+ int old = atomic_read(index);
+ int new = sbq_index_inc(old);
+ atomic_cmpxchg(index, old, new);
+}
+
+/**
+ * sbq_wait_ptr() - Get the next wait queue to use for a &struct
+ * sbitmap_queue.
+ * @sbq: Bitmap queue to wait on.
+ * @wait_index: A counter per "user" of @sbq.
+ */
+static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
+ atomic_t *wait_index)
+{
+ struct sbq_wait_state *ws;
+
+ ws = &sbq->ws[atomic_read(wait_index)];
+ sbq_index_atomic_inc(wait_index);
+ return ws;
+}
+
+/**
+ * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct
+ * sbitmap_queue.
+ * @sbq: Bitmap queue to wake up.
+ */
+void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
+
+#endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 26cc1df280d6..fc6e22186405 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -442,6 +442,7 @@ extern int schedule_on_each_cpu(work_func_t func);
int execute_in_process_context(work_func_t fn, struct execute_work *);
extern bool flush_work(struct work_struct *work);
+extern bool cancel_work(struct work_struct *work);
extern bool cancel_work_sync(struct work_struct *work);
extern bool flush_delayed_work(struct delayed_work *dwork);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ef071ca73fc3..bd81f0390277 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork)
}
EXPORT_SYMBOL(flush_delayed_work);
+static bool __cancel_work(struct work_struct *work, bool is_dwork)
+{
+ unsigned long flags;
+ int ret;
+
+ do {
+ ret = try_to_grab_pending(work, is_dwork, &flags);
+ } while (unlikely(ret == -EAGAIN));
+
+ if (unlikely(ret < 0))
+ return false;
+
+ set_work_pool_and_clear_pending(work, get_work_pool_id(work));
+ local_irq_restore(flags);
+ return ret;
+}
+
+/*
+ * See cancel_delayed_work()
+ */
+bool cancel_work(struct work_struct *work)
+{
+ return __cancel_work(work, false);
+}
+
/**
* cancel_delayed_work - cancel a delayed work
* @dwork: delayed_work to cancel
@@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work);
*/
bool cancel_delayed_work(struct delayed_work *dwork)
{
- unsigned long flags;
- int ret;
-
- do {
- ret = try_to_grab_pending(&dwork->work, true, &flags);
- } while (unlikely(ret == -EAGAIN));
-
- if (unlikely(ret < 0))
- return false;
-
- set_work_pool_and_clear_pending(&dwork->work,
- get_work_pool_id(&dwork->work));
- local_irq_restore(flags);
- return ret;
+ return __cancel_work(&dwork->work, true);
}
EXPORT_SYMBOL(cancel_delayed_work);
diff --git a/lib/Kconfig b/lib/Kconfig
index d79909dc01ec..942fb8091a86 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -550,4 +550,7 @@ config STACKDEPOT
bool
select STACKTRACE
+config SBITMAP
+ bool
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index df747e5eeb7a..f3ca8c0ab634 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -227,3 +227,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
obj-$(CONFIG_UBSAN) += ubsan.o
UBSAN_SANITIZE_ubsan.o := n
+
+obj-$(CONFIG_SBITMAP) += sbitmap.o
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
new file mode 100644
index 000000000000..2cecf05c82fd
--- /dev/null
+++ b/lib/sbitmap.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (C) 2016 Facebook
+ * Copyright (C) 2013-2014 Jens Axboe
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/random.h>
+#include <linux/sbitmap.h>
+
+int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
+ gfp_t flags, int node)
+{
+ unsigned int bits_per_word;
+ unsigned int i;
+
+ if (shift < 0) {
+ shift = ilog2(BITS_PER_LONG);
+ /*
+ * If the bitmap is small, shrink the number of bits per word so
+ * we spread over a few cachelines, at least. If less than 4
+ * bits, just forget about it, it's not going to work optimally
+ * anyway.
+ */
+ if (depth >= 4) {
+ while ((4U << shift) > depth)
+ shift--;
+ }
+ }
+ bits_per_word = 1U << shift;
+ if (bits_per_word > BITS_PER_LONG)
+ return -EINVAL;
+
+ sb->shift = shift;
+ sb->depth = depth;
+ sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+ if (depth == 0) {
+ sb->map = NULL;
+ return 0;
+ }
+
+ sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
+ if (!sb->map)
+ return -ENOMEM;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ sb->map[i].depth = min(depth, bits_per_word);
+ depth -= sb->map[i].depth;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_init_node);
+
+void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
+{
+ unsigned int bits_per_word = 1U << sb->shift;
+ unsigned int i;
+
+ sb->depth = depth;
+ sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+ for (i = 0; i < sb->map_nr; i++) {
+ sb->map[i].depth = min(depth, bits_per_word);
+ depth -= sb->map[i].depth;
+ }
+}
+EXPORT_SYMBOL_GPL(sbitmap_resize);
+
+static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
+ bool wrap)
+{
+ unsigned int orig_hint = hint;
+ int nr;
+
+ while (1) {
+ nr = find_next_zero_bit(&word->word, word->depth, hint);
+ if (unlikely(nr >= word->depth)) {
+ /*
+ * We started with an offset, and we didn't reset the
+ * offset to 0 in a failure case, so start from 0 to
+ * exhaust the map.
+ */
+ if (orig_hint && hint && wrap) {
+ hint = orig_hint = 0;
+ continue;
+ }
+ return -1;
+ }
+
+ if (!test_and_set_bit(nr, &word->word))
+ break;
+
+ hint = nr + 1;
+ if (hint >= word->depth - 1)
+ hint = 0;
+ }
+
+ return nr;
+}
+
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
+{
+ unsigned int i, index;
+ int nr = -1;
+
+ index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+ for (i = 0; i < sb->map_nr; i++) {
+ nr = __sbitmap_get_word(&sb->map[index],
+ SB_NR_TO_BIT(sb, alloc_hint),
+ !round_robin);
+ if (nr != -1) {
+ nr += index << sb->shift;
+ break;
+ }
+
+ /* Jump to next index. */
+ index++;
+ alloc_hint = index << sb->shift;
+
+ if (index >= sb->map_nr) {
+ index = 0;
+ alloc_hint = 0;
+ }
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get);
+
+bool sbitmap_any_bit_set(const struct sbitmap *sb)
+{
+ unsigned int i;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ if (sb->map[i].word)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);
+
+bool sbitmap_any_bit_clear(const struct sbitmap *sb)
+{
+ unsigned int i;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ const struct sbitmap_word *word = &sb->map[i];
+ unsigned long ret;
+
+ ret = find_first_zero_bit(&word->word, word->depth);
+ if (ret < word->depth)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
+
+unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+ unsigned int i, weight = 0;
+
+ for (i = 0; i < sb->map_nr; i++) {
+ const struct sbitmap_word *word = &sb->map[i];
+
+ weight += bitmap_weight(&word->word, word->depth);
+ }
+ return weight;
+}
+EXPORT_SYMBOL_GPL(sbitmap_weight);
+
+static unsigned int sbq_calc_wake_batch(unsigned int depth)
+{
+ unsigned int wake_batch;
+
+ /*
+ * For each batch, we wake up one queue. We need to make sure that our
+ * batch size is small enough that the full depth of the bitmap is
+ * enough to wake up all of the queues.
+ */
+ wake_batch = SBQ_WAKE_BATCH;
+ if (wake_batch > depth / SBQ_WAIT_QUEUES)
+ wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
+
+ return wake_batch;
+}
+
+int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
+ int shift, bool round_robin, gfp_t flags, int node)
+{
+ int ret;
+ int i;
+
+ ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node);
+ if (ret)
+ return ret;
+
+ sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
+ if (!sbq->alloc_hint) {
+ sbitmap_free(&sbq->sb);
+ return -ENOMEM;
+ }
+
+ if (depth && !round_robin) {
+ for_each_possible_cpu(i)
+ *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
+ }
+
+ sbq->wake_batch = sbq_calc_wake_batch(depth);
+ atomic_set(&sbq->wake_index, 0);
+
+ sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
+ if (!sbq->ws) {
+ free_percpu(sbq->alloc_hint);
+ sbitmap_free(&sbq->sb);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+ init_waitqueue_head(&sbq->ws[i].wait);
+ atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
+ }
+
+ sbq->round_robin = round_robin;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
+
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+{
+ sbq->wake_batch = sbq_calc_wake_batch(depth);
+ sbitmap_resize(&sbq->sb, depth);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
+
+int __sbitmap_queue_get(struct sbitmap_queue *sbq)
+{
+ unsigned int hint, depth;
+ int nr;
+
+ hint = this_cpu_read(*sbq->alloc_hint);
+ depth = READ_ONCE(sbq->sb.depth);
+ if (unlikely(hint >= depth)) {
+ hint = depth ? prandom_u32() % depth : 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+ nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin);
+
+ if (nr == -1) {
+ /* If the map is full, a hint won't do us much good. */
+ this_cpu_write(*sbq->alloc_hint, 0);
+ } else if (nr == hint || unlikely(sbq->round_robin)) {
+ /* Only update the hint if we used it. */
+ hint = nr + 1;
+ if (hint >= depth - 1)
+ hint = 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
+
+static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
+{
+ int i, wake_index;
+
+ wake_index = atomic_read(&sbq->wake_index);
+ for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+ struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+ if (waitqueue_active(&ws->wait)) {
+ int o = atomic_read(&sbq->wake_index);
+
+ if (wake_index != o)
+ atomic_cmpxchg(&sbq->wake_index, o, wake_index);
+ return ws;
+ }
+
+ wake_index = sbq_index_inc(wake_index);
+ }
+
+ return NULL;
+}
+
+static void sbq_wake_up(struct sbitmap_queue *sbq)
+{
+ struct sbq_wait_state *ws;
+ int wait_cnt;
+
+ /* Ensure that the wait list checks occur after clear_bit(). */
+ smp_mb();
+
+ ws = sbq_wake_ptr(sbq);
+ if (!ws)
+ return;
+
+ wait_cnt = atomic_dec_return(&ws->wait_cnt);
+ if (unlikely(wait_cnt < 0))
+ wait_cnt = atomic_inc_return(&ws->wait_cnt);
+ if (wait_cnt == 0) {
+ atomic_add(sbq->wake_batch, &ws->wait_cnt);
+ sbq_index_atomic_inc(&sbq->wake_index);
+ wake_up(&ws->wait);
+ }
+}
+
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
+ unsigned int cpu)
+{
+ sbitmap_clear_bit(&sbq->sb, nr);
+ sbq_wake_up(sbq);
+ if (likely(!sbq->round_robin && nr < sbq->sb.depth))
+ *per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
+
+void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
+{
+ int i, wake_index;
+
+ /*
+ * Make sure all changes prior to this are visible from other CPUs.
+ */
+ smp_mb();
+ wake_index = atomic_read(&sbq->wake_index);
+ for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+ struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+ if (waitqueue_active(&ws->wait))
+ wake_up(&ws->wait);
+
+ wake_index = sbq_index_inc(wake_index);
+ }
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);