summaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-04-08 21:44:05 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-04-08 21:44:05 -0700
commitfcc95f06403c956e3f50ca4a82db12b66a3078e0 (patch)
tree74bb3e387b635601ba351f621c7eea575e185aac /drivers/block
parentc6b80eb89b55590b12db11103913088735205b5c (diff)
parentef9157259fb7bb3bc2c61df227e36f1b861a4753 (diff)
downloadlinux-fcc95f06403c956e3f50ca4a82db12b66a3078e0.tar.gz
linux-fcc95f06403c956e3f50ca4a82db12b66a3078e0.tar.bz2
linux-fcc95f06403c956e3f50ca4a82db12b66a3078e0.zip
Merge tag 'ceph-for-5.7-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "The main items are: - support for asynchronous create and unlink (Jeff Layton). Creates and unlinks are satisfied locally, without waiting for a reply from the MDS, provided the client has been granted appropriate caps (new in v15.y.z ("Octopus") release). This can be a big help for metadata heavy workloads such as tar and rsync. Opt-in with the new nowsync mount option. - multiple blk-mq queues for rbd (Hannes Reinecke and myself). When the driver was converted to blk-mq, we settled on a single blk-mq queue because of a global lock in libceph and some other technical debt. These have since been addressed, so allocate a queue per CPU to enhance parallelism. - don't hold onto caps that aren't actually needed (Zheng Yan). This has been our long-standing behavior, but it causes issues with some active/standby applications (synchronous I/O, stalls if the standby goes down, etc). - .snap directory timestamps consistent with ceph-fuse (Luis Henriques)" * tag 'ceph-for-5.7-rc1' of git://github.com/ceph/ceph-client: (49 commits) ceph: fix snapshot directory timestamps ceph: wait for async creating inode before requesting new max size ceph: don't skip updating wanted caps when cap is stale ceph: request new max size only when there is auth cap ceph: cleanup return error of try_get_cap_refs() ceph: return ceph_mdsc_do_request() errors from __get_parent() ceph: check all mds' caps after page writeback ceph: update i_requested_max_size only when sending cap msg to auth mds ceph: simplify calling of ceph_get_fmode() ceph: remove delay check logic from ceph_check_caps() ceph: consider inode's last read/write when calculating wanted caps ceph: always renew caps if mds_wanted is insufficient ceph: update dentry lease for async create ceph: attempt to do async create when possible ceph: cache layout in parent dir on first sync create ceph: add new MDS req field to hold delegated inode number ceph: decode interval_sets for delegated inos ceph: make ceph_fill_inode non-static ceph: perform asynchronous unlink if we have sufficient caps ceph: don't take refs to want mask unless we have all bits ...
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/rbd.c215
1 files changed, 79 insertions, 136 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6343402c09e6..1e0a6b19ae0d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -337,10 +337,7 @@ struct rbd_img_request {
u64 snap_id; /* for reads */
struct ceph_snap_context *snapc; /* for writes */
};
- union {
- struct request *rq; /* block request */
- struct rbd_obj_request *obj_request; /* obj req initiator */
- };
+ struct rbd_obj_request *obj_request; /* obj req initiator */
struct list_head lock_item;
struct list_head object_extents; /* obj_req.ex structs */
@@ -349,7 +346,6 @@ struct rbd_img_request {
struct pending_result pending;
struct work_struct work;
int work_result;
- struct kref kref;
};
#define for_each_obj_request(ireq, oreq) \
@@ -1320,15 +1316,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
-static void rbd_img_request_destroy(struct kref *kref);
-static void rbd_img_request_put(struct rbd_img_request *img_request)
-{
- rbd_assert(img_request != NULL);
- dout("%s: img %p (was %d)\n", __func__, img_request,
- kref_read(&img_request->kref));
- kref_put(&img_request->kref, rbd_img_request_destroy);
-}
-
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
struct rbd_obj_request *obj_request)
{
@@ -1366,18 +1353,10 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
static void img_request_layered_set(struct rbd_img_request *img_request)
{
set_bit(IMG_REQ_LAYERED, &img_request->flags);
- smp_mb();
-}
-
-static void img_request_layered_clear(struct rbd_img_request *img_request)
-{
- clear_bit(IMG_REQ_LAYERED, &img_request->flags);
- smp_mb();
}
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
- smp_mb();
return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
@@ -1619,10 +1598,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
if (!rbd_dev->parent_spec)
return false;
- down_read(&rbd_dev->header_rwsem);
if (rbd_dev->parent_overlap)
counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
- up_read(&rbd_dev->header_rwsem);
if (counter < 0)
rbd_warn(rbd_dev, "parent reference overflow");
@@ -1630,63 +1607,54 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
return counter > 0;
}
-/*
- * Caller is responsible for filling in the list of object requests
- * that comprises the image request, and the Linux request pointer
- * (if there is one).
- */
-static struct rbd_img_request *rbd_img_request_create(
- struct rbd_device *rbd_dev,
- enum obj_operation_type op_type,
- struct ceph_snap_context *snapc)
+static void rbd_img_request_init(struct rbd_img_request *img_request,
+ struct rbd_device *rbd_dev,
+ enum obj_operation_type op_type)
{
- struct rbd_img_request *img_request;
-
- img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
- if (!img_request)
- return NULL;
+ memset(img_request, 0, sizeof(*img_request));
img_request->rbd_dev = rbd_dev;
img_request->op_type = op_type;
- if (!rbd_img_is_write(img_request))
- img_request->snap_id = rbd_dev->spec->snap_id;
- else
- img_request->snapc = snapc;
-
- if (rbd_dev_parent_get(rbd_dev))
- img_request_layered_set(img_request);
INIT_LIST_HEAD(&img_request->lock_item);
INIT_LIST_HEAD(&img_request->object_extents);
mutex_init(&img_request->state_mutex);
- kref_init(&img_request->kref);
+}
+
+static void rbd_img_capture_header(struct rbd_img_request *img_req)
+{
+ struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+ lockdep_assert_held(&rbd_dev->header_rwsem);
- return img_request;
+ if (rbd_img_is_write(img_req))
+ img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+ else
+ img_req->snap_id = rbd_dev->spec->snap_id;
+
+ if (rbd_dev_parent_get(rbd_dev))
+ img_request_layered_set(img_req);
}
-static void rbd_img_request_destroy(struct kref *kref)
+static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
- struct rbd_img_request *img_request;
struct rbd_obj_request *obj_request;
struct rbd_obj_request *next_obj_request;
- img_request = container_of(kref, struct rbd_img_request, kref);
-
dout("%s: img %p\n", __func__, img_request);
WARN_ON(!list_empty(&img_request->lock_item));
for_each_obj_request_safe(img_request, obj_request, next_obj_request)
rbd_img_obj_request_del(img_request, obj_request);
- if (img_request_layered_test(img_request)) {
- img_request_layered_clear(img_request);
+ if (img_request_layered_test(img_request))
rbd_dev_parent_put(img_request->rbd_dev);
- }
if (rbd_img_is_write(img_request))
ceph_put_snap_context(img_request->snapc);
- kmem_cache_free(rbd_img_request_cache, img_request);
+ if (test_bit(IMG_REQ_CHILD, &img_request->flags))
+ kmem_cache_free(rbd_img_request_cache, img_request);
}
#define BITS_PER_OBJ 2
@@ -2849,17 +2817,22 @@ static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
struct rbd_img_request *img_req = obj_req->img_request;
+ struct rbd_device *parent = img_req->rbd_dev->parent;
struct rbd_img_request *child_img_req;
int ret;
- child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
- OBJ_OP_READ, NULL);
+ child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
if (!child_img_req)
return -ENOMEM;
+ rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
child_img_req->obj_request = obj_req;
+ down_read(&parent->header_rwsem);
+ rbd_img_capture_header(child_img_req);
+ up_read(&parent->header_rwsem);
+
dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
obj_req);
@@ -2888,7 +2861,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
obj_req->copyup_bvecs);
}
if (ret) {
- rbd_img_request_put(child_img_req);
+ rbd_img_request_destroy(child_img_req);
return ret;
}
@@ -3647,15 +3620,15 @@ again:
if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
struct rbd_obj_request *obj_req = img_req->obj_request;
- rbd_img_request_put(img_req);
+ rbd_img_request_destroy(img_req);
if (__rbd_obj_handle_request(obj_req, &result)) {
img_req = obj_req->img_request;
goto again;
}
} else {
- struct request *rq = img_req->rq;
+ struct request *rq = blk_mq_rq_from_pdu(img_req);
- rbd_img_request_put(img_req);
+ rbd_img_request_destroy(img_req);
blk_mq_end_request(rq, errno_to_blk_status(result));
}
}
@@ -4707,84 +4680,36 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
static void rbd_queue_workfn(struct work_struct *work)
{
- struct request *rq = blk_mq_rq_from_pdu(work);
- struct rbd_device *rbd_dev = rq->q->queuedata;
- struct rbd_img_request *img_request;
- struct ceph_snap_context *snapc = NULL;
+ struct rbd_img_request *img_request =
+ container_of(work, struct rbd_img_request, work);
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ enum obj_operation_type op_type = img_request->op_type;
+ struct request *rq = blk_mq_rq_from_pdu(img_request);
u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
u64 length = blk_rq_bytes(rq);
- enum obj_operation_type op_type;
u64 mapping_size;
int result;
- switch (req_op(rq)) {
- case REQ_OP_DISCARD:
- op_type = OBJ_OP_DISCARD;
- break;
- case REQ_OP_WRITE_ZEROES:
- op_type = OBJ_OP_ZEROOUT;
- break;
- case REQ_OP_WRITE:
- op_type = OBJ_OP_WRITE;
- break;
- case REQ_OP_READ:
- op_type = OBJ_OP_READ;
- break;
- default:
- dout("%s: non-fs request type %d\n", __func__, req_op(rq));
- result = -EIO;
- goto err;
- }
-
/* Ignore/skip any zero-length requests */
-
if (!length) {
dout("%s: zero-length request\n", __func__);
result = 0;
- goto err_rq;
- }
-
- if (op_type != OBJ_OP_READ) {
- if (rbd_is_ro(rbd_dev)) {
- rbd_warn(rbd_dev, "%s on read-only mapping",
- obj_op_name(op_type));
- result = -EIO;
- goto err;
- }
- rbd_assert(!rbd_is_snap(rbd_dev));
- }
-
- if (offset && length > U64_MAX - offset + 1) {
- rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
- length);
- result = -EINVAL;
- goto err_rq; /* Shouldn't happen */
+ goto err_img_request;
}
blk_mq_start_request(rq);
down_read(&rbd_dev->header_rwsem);
mapping_size = rbd_dev->mapping.size;
- if (op_type != OBJ_OP_READ) {
- snapc = rbd_dev->header.snapc;
- ceph_get_snap_context(snapc);
- }
+ rbd_img_capture_header(img_request);
up_read(&rbd_dev->header_rwsem);
if (offset + length > mapping_size) {
rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
length, mapping_size);
result = -EIO;
- goto err_rq;
- }
-
- img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
- if (!img_request) {
- result = -ENOMEM;
- goto err_rq;
+ goto err_img_request;
}
- img_request->rq = rq;
- snapc = NULL; /* img_request consumes a ref */
dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
img_request, obj_op_name(op_type), offset, length);
@@ -4801,23 +4726,51 @@ static void rbd_queue_workfn(struct work_struct *work)
return;
err_img_request:
- rbd_img_request_put(img_request);
-err_rq:
+ rbd_img_request_destroy(img_request);
if (result)
rbd_warn(rbd_dev, "%s %llx at %llx result %d",
obj_op_name(op_type), length, offset, result);
- ceph_put_snap_context(snapc);
-err:
blk_mq_end_request(rq, errno_to_blk_status(result));
}
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
- struct request *rq = bd->rq;
- struct work_struct *work = blk_mq_rq_to_pdu(rq);
+ struct rbd_device *rbd_dev = hctx->queue->queuedata;
+ struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
+ enum obj_operation_type op_type;
- queue_work(rbd_wq, work);
+ switch (req_op(bd->rq)) {
+ case REQ_OP_DISCARD:
+ op_type = OBJ_OP_DISCARD;
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ op_type = OBJ_OP_ZEROOUT;
+ break;
+ case REQ_OP_WRITE:
+ op_type = OBJ_OP_WRITE;
+ break;
+ case REQ_OP_READ:
+ op_type = OBJ_OP_READ;
+ break;
+ default:
+ rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
+ return BLK_STS_IOERR;
+ }
+
+ rbd_img_request_init(img_req, rbd_dev, op_type);
+
+ if (rbd_img_is_write(img_req)) {
+ if (rbd_is_ro(rbd_dev)) {
+ rbd_warn(rbd_dev, "%s on read-only mapping",
+ obj_op_name(img_req->op_type));
+ return BLK_STS_IOERR;
+ }
+ rbd_assert(!rbd_is_snap(rbd_dev));
+ }
+
+ INIT_WORK(&img_req->work, rbd_queue_workfn);
+ queue_work(rbd_wq, &img_req->work);
return BLK_STS_OK;
}
@@ -4984,18 +4937,8 @@ out:
return ret;
}
-static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- struct work_struct *work = blk_mq_rq_to_pdu(rq);
-
- INIT_WORK(work, rbd_queue_workfn);
- return 0;
-}
-
static const struct blk_mq_ops rbd_mq_ops = {
.queue_rq = rbd_queue_rq,
- .init_request = rbd_init_request,
};
static int rbd_init_disk(struct rbd_device *rbd_dev)
@@ -5027,8 +4970,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
- rbd_dev->tag_set.nr_hw_queues = 1;
- rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+ rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
+ rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
if (err)