summaryrefslogtreecommitdiffstats
path: root/drivers/nvme/host/tcp.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-06-02 15:37:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-06-02 15:37:03 -0700
commitbce159d734091fe31340976081577333f52a85e4 (patch)
tree8396be51e6703797a60aefb4992e729f327d27c2 /drivers/nvme/host/tcp.c
parent750a02ab8d3c49ca7d23102be90d3d1db19e2827 (diff)
parent0c8d3fceade2ab1bbac68bca013e62bfdb851d19 (diff)
downloadlinux-stable-bce159d734091fe31340976081577333f52a85e4.tar.gz
linux-stable-bce159d734091fe31340976081577333f52a85e4.tar.bz2
linux-stable-bce159d734091fe31340976081577333f52a85e4.zip
Merge tag 'for-5.8/drivers-2020-06-01' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe: "On top of the core changes, here are the block driver changes for this merge window: - NVMe changes: - NVMe over Fibre Channel protocol updates, which also reach over to drivers/scsi/lpfc (James Smart) - namespace revalidation support on the target (Anthony Iliopoulos) - gcc zero length array fix (Arnd Bergmann) - nvmet cleanups (Chaitanya Kulkarni) - misc cleanups and fixes (me, Keith Busch, Sagi Grimberg) - use a SRQ per completion vector (Max Gurtovoy) - fix handling of runtime changes to the queue count (Weiping Zhang) - t10 protection information support for nvme-rdma and nvmet-rdma (Israel Rukshin and Max Gurtovoy) - target side AEN improvements (Chaitanya Kulkarni) - various fixes and minor improvements all over, including the nvme part of the lpfc driver" - Floppy code cleanup series (Willy, Denis) - Floppy contention fix (Jiri) - Loop CONFIGURE support (Martijn) - bcache fixes/improvements (Coly, Joe, Colin) - q->queuedata cleanups (Christoph) - Get rid of ioctl_by_bdev (Christoph, Stefan) - md/raid5 allocation fixes (Coly) - zero length array fixes (Gustavo) - swim3 task state fix (Xu)" * tag 'for-5.8/drivers-2020-06-01' of git://git.kernel.dk/linux-block: (166 commits) bcache: configure the asynchronous registertion to be experimental bcache: asynchronous devices registration bcache: fix refcount underflow in bcache_device_free() bcache: Convert pr_<level> uses to a more typical style bcache: remove redundant variables i and n lpfc: Fix return value in __lpfc_nvme_ls_abort lpfc: fix axchg pointer reference after free and double frees lpfc: Fix pointer checks and comments in LS receive refactoring nvme: set dma alignment to qword nvmet: cleanups the loop in nvmet_async_events_process nvmet: fix memory leak when removing namespaces and controllers concurrently nvmet-rdma: add metadata/T10-PI support nvmet: add metadata support for block devices nvmet: add metadata/T10-PI support nvme: add Metadata Capabilities 
enumerations nvmet: rename nvmet_check_data_len to nvmet_check_transfer_len nvmet: rename nvmet_rw_len to nvmet_rw_data_len nvmet: add metadata characteristics for a namespace nvme-rdma: add metadata/T10-PI support nvme-rdma: introduce nvme_rdma_sgl structure ...
Diffstat (limited to 'drivers/nvme/host/tcp.c')
-rw-r--r--drivers/nvme/host/tcp.c64
1 files changed, 47 insertions, 17 deletions
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index c15a92163c1f..7c7c1886642f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -60,6 +60,7 @@ struct nvme_tcp_request {
enum nvme_tcp_queue_flags {
NVME_TCP_Q_ALLOCATED = 0,
NVME_TCP_Q_LIVE = 1,
+ NVME_TCP_Q_POLLING = 2,
};
enum nvme_tcp_recv_state {
@@ -75,6 +76,7 @@ struct nvme_tcp_queue {
int io_cpu;
spinlock_t lock;
+ struct mutex send_mutex;
struct list_head send_list;
/* recv state */
@@ -131,6 +133,7 @@ static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
static struct workqueue_struct *nvme_tcp_wq;
static struct blk_mq_ops nvme_tcp_mq_ops;
static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
@@ -257,15 +260,29 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
}
}
-static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
+ bool sync)
{
struct nvme_tcp_queue *queue = req->queue;
+ bool empty;
spin_lock(&queue->lock);
+ empty = list_empty(&queue->send_list) && !queue->request;
list_add_tail(&req->entry, &queue->send_list);
spin_unlock(&queue->lock);
- queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ /*
+ * If we are the first on the send_list, try to send directly;
+ * otherwise queue io_work. Also, only send directly if we are
+ * running on the queue's io_cpu, so we don't introduce contention.
+ */
+ if (queue->io_cpu == smp_processor_id() &&
+ sync && empty && mutex_trylock(&queue->send_mutex)) {
+ nvme_tcp_try_send(queue);
+ mutex_unlock(&queue->send_mutex);
+ } else {
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ }
}
static inline struct nvme_tcp_request *
@@ -578,7 +595,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, false);
return 0;
}
@@ -794,11 +811,12 @@ static void nvme_tcp_data_ready(struct sock *sk)
{
struct nvme_tcp_queue *queue;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
- if (likely(queue && queue->rd_enabled))
+ if (likely(queue && queue->rd_enabled) &&
+ !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void nvme_tcp_write_space(struct sock *sk)
@@ -867,7 +885,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
if (last && !queue->data_digest)
flags |= MSG_EOR;
else
- flags |= MSG_MORE;
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
/* can't zcopy slab pages */
if (unlikely(PageSlab(page))) {
@@ -906,11 +924,16 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
bool inline_data = nvme_tcp_has_inline_data(req);
- int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
+ int flags = MSG_DONTWAIT;
int ret;
+ if (inline_data)
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ else
+ flags |= MSG_EOR;
+
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
@@ -949,7 +972,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len,
- MSG_DONTWAIT | MSG_MORE);
+ MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
if (unlikely(ret <= 0))
return ret;
@@ -1063,11 +1086,14 @@ static void nvme_tcp_io_work(struct work_struct *w)
bool pending = false;
int result;
- result = nvme_tcp_try_send(queue);
- if (result > 0)
- pending = true;
- else if (unlikely(result < 0))
- break;
+ if (mutex_trylock(&queue->send_mutex)) {
+ result = nvme_tcp_try_send(queue);
+ mutex_unlock(&queue->send_mutex);
+ if (result > 0)
+ pending = true;
+ else if (unlikely(result < 0))
+ break;
+ }
result = nvme_tcp_try_recv(queue);
if (result > 0)
@@ -1319,6 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
queue->ctrl = ctrl;
INIT_LIST_HEAD(&queue->send_list);
spin_lock_init(&queue->lock);
+ mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
queue->queue_size = queue_size;
@@ -1543,6 +1570,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
set->numa_node = NUMA_NO_NODE;
+ set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = 1;
@@ -1554,7 +1582,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
set->numa_node = NUMA_NO_NODE;
- set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
@@ -2113,7 +2141,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req);
+ nvme_tcp_queue_request(&ctrl->async_req, true);
}
static enum blk_eh_timer_return
@@ -2244,7 +2272,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(rq);
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, true);
return BLK_STS_OK;
}
@@ -2302,9 +2330,11 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
+ set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
+ clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return queue->nr_cqe;
}