From fd7e3f0d8f25e4e3fed9fa3a743af92ebcbaf4e9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 22 Jan 2019 11:56:30 +0100 Subject: rbd: get rid of obj_req->obj_request_count It is effectively unused. Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1e92b61d0bd5..d071608507f2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -291,7 +291,6 @@ struct rbd_img_request { int result; /* first nonzero obj_request result */ struct list_head object_extents; /* obj_req.ex structs */ - u32 obj_request_count; u32 pending_count; struct kref kref; @@ -1345,7 +1344,6 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, /* Image request now owns object's original reference */ obj_request->img_request = img_request; - img_request->obj_request_count++; img_request->pending_count++; dout("%s: img %p obj %p\n", __func__, img_request, obj_request); } @@ -1355,8 +1353,6 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, { dout("%s: img %p obj %p\n", __func__, img_request, obj_request); list_del(&obj_request->ex.oe_item); - rbd_assert(img_request->obj_request_count > 0); - img_request->obj_request_count--; rbd_assert(obj_request->img_request == img_request); rbd_obj_request_put(obj_request); } @@ -1672,7 +1668,6 @@ static void rbd_img_request_destroy(struct kref *kref) for_each_obj_request_safe(img_request, obj_request, next_obj_request) rbd_img_obj_request_del(img_request, obj_request); - rbd_assert(img_request->obj_request_count == 0); if (img_request_layered_test(img_request)) { img_request_layered_clear(img_request); -- cgit v1.2.3 From 6484cbe987e0e44b8ebf224fc9faf7f73ace10d2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 29 Jan 2019 12:46:25 +0100 Subject: rbd: handle DISCARD and WRITE_ZEROES separately With discard_zeroes_data 
gone in commit 48920ff2a5a9 ("block: remove the discard_zeroes_data flag"), continuing to provide this guarantee is pointless: applications can't query it and discards can only be used for deallocating. Add OBJ_OP_ZEROOUT and move the existing logic under it. As the first step to divorcing OBJ_OP_DISCARD, stop worrying about copyups but keep special casing whole-object layered discards. Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 61 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 10 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d071608507f2..3ef97121a8f5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -214,6 +214,7 @@ enum obj_operation_type { OBJ_OP_READ = 1, OBJ_OP_WRITE, OBJ_OP_DISCARD, + OBJ_OP_ZEROOUT, }; /* @@ -857,6 +858,8 @@ static char* obj_op_name(enum obj_operation_type op_type) return "write"; case OBJ_OP_DISCARD: return "discard"; + case OBJ_OP_ZEROOUT: + return "zeroout"; default: return "???"; } @@ -1419,6 +1422,7 @@ static bool rbd_img_is_write(struct rbd_img_request *img_req) return false; case OBJ_OP_WRITE: case OBJ_OP_DISCARD: + case OBJ_OP_ZEROOUT: return true; default: BUG(); @@ -1841,7 +1845,40 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) return 0; } -static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, +static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) +{ + return rbd_obj_is_tail(obj_req) ? 
CEPH_OSD_OP_TRUNCATE : + CEPH_OSD_OP_ZERO; +} + +static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) +{ + int ret; + + /* reverse map the entire object onto the parent */ + ret = rbd_obj_calc_img_extents(obj_req, true); + if (ret) + return ret; + + obj_req->osd_req = rbd_osd_req_create(obj_req, 1); + if (!obj_req->osd_req) + return -ENOMEM; + + if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { + osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0); + } else { + osd_req_op_extent_init(obj_req->osd_req, 0, + truncate_or_zero_opcode(obj_req), + obj_req->ex.oe_off, obj_req->ex.oe_len, + 0, 0); + } + + obj_req->write_state = RBD_OBJ_WRITE_FLAT; + rbd_osd_req_format_write(obj_req); + return 0; +} + +static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, unsigned int which) { u16 opcode; @@ -1856,10 +1893,8 @@ static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, CEPH_OSD_OP_DELETE, 0); opcode = 0; } - } else if (rbd_obj_is_tail(obj_req)) { - opcode = CEPH_OSD_OP_TRUNCATE; } else { - opcode = CEPH_OSD_OP_ZERO; + opcode = truncate_or_zero_opcode(obj_req); } if (opcode) @@ -1871,7 +1906,7 @@ static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, rbd_osd_req_format_write(obj_req); } -static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) +static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) { unsigned int num_osd_ops, which = 0; int ret; @@ -1907,7 +1942,7 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) return ret; } - __rbd_obj_setup_discard(obj_req, which); + __rbd_obj_setup_zeroout(obj_req, which); return 0; } @@ -1932,6 +1967,9 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) case OBJ_OP_DISCARD: ret = rbd_obj_setup_discard(obj_req); break; + case OBJ_OP_ZEROOUT: + ret = rbd_obj_setup_zeroout(obj_req); + break; default: rbd_assert(0); } @@ -2392,9 +2430,9 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 
bytes) case OBJ_OP_WRITE: __rbd_obj_setup_write(obj_req, 1); break; - case OBJ_OP_DISCARD: + case OBJ_OP_ZEROOUT: rbd_assert(!rbd_obj_is_entire(obj_req)); - __rbd_obj_setup_discard(obj_req, 1); + __rbd_obj_setup_zeroout(obj_req, 1); break; default: rbd_assert(0); @@ -2524,6 +2562,7 @@ static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) case OBJ_OP_WRITE: return rbd_obj_handle_write(obj_req); case OBJ_OP_DISCARD: + case OBJ_OP_ZEROOUT: if (rbd_obj_handle_write(obj_req)) { /* * Hide -ENOENT from delete/truncate/zero -- discarding @@ -3636,9 +3675,11 @@ static void rbd_queue_workfn(struct work_struct *work) switch (req_op(rq)) { case REQ_OP_DISCARD: - case REQ_OP_WRITE_ZEROES: op_type = OBJ_OP_DISCARD; break; + case REQ_OP_WRITE_ZEROES: + op_type = OBJ_OP_ZEROOUT; + break; case REQ_OP_WRITE: op_type = OBJ_OP_WRITE; break; @@ -3718,7 +3759,7 @@ static void rbd_queue_workfn(struct work_struct *work) img_request->rq = rq; snapc = NULL; /* img_request consumes a ref */ - if (op_type == OBJ_OP_DISCARD) + if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) result = rbd_img_fill_nodata(img_request, offset, length); else result = rbd_img_fill_from_bio(img_request, offset, length, -- cgit v1.2.3 From 0c93e1b7a26b418247218d08a6d0b95d61c9c415 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 30 Jan 2019 15:14:48 +0100 Subject: rbd: round off and ignore discards that are too small If, after rounding off, the discard request is smaller than alloc_size, drop it on the floor in __rbd_img_fill_request(). Default alloc_size to 64k. This should cover both HDD and SSD based bluestore OSDs and somewhat improve things for filestore. For OSDs on filestore with filestore_punch_hole = false, alloc_size is best set to object size in order to allow deletes and truncates and disallow zero op. 
Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 6 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3ef97121a8f5..3bd1af5a3d93 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -733,6 +733,7 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) */ enum { Opt_queue_depth, + Opt_alloc_size, Opt_lock_timeout, Opt_last_int, /* int args above */ @@ -749,6 +750,7 @@ enum { static match_table_t rbd_opts_tokens = { {Opt_queue_depth, "queue_depth=%d"}, + {Opt_alloc_size, "alloc_size=%d"}, {Opt_lock_timeout, "lock_timeout=%d"}, /* int args above */ {Opt_pool_ns, "_pool_ns=%s"}, @@ -765,6 +767,7 @@ static match_table_t rbd_opts_tokens = { struct rbd_options { int queue_depth; + int alloc_size; unsigned long lock_timeout; bool read_only; bool lock_on_read; @@ -773,6 +776,7 @@ struct rbd_options { }; #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ +#define RBD_ALLOC_SIZE_DEFAULT (64 * 1024) #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */ #define RBD_READ_ONLY_DEFAULT false #define RBD_LOCK_ON_READ_DEFAULT false @@ -812,6 +816,17 @@ static int parse_rbd_opts_token(char *c, void *private) } pctx->opts->queue_depth = intval; break; + case Opt_alloc_size: + if (intval < 1) { + pr_err("alloc_size out of range\n"); + return -EINVAL; + } + if (!is_power_of_2(intval)) { + pr_err("alloc_size must be a power of 2\n"); + return -EINVAL; + } + pctx->opts->alloc_size = intval; + break; case Opt_lock_timeout: /* 0 is "wait forever" (i.e. 
infinite timeout) */ if (intval < 0 || intval > INT_MAX / 1000) { @@ -1853,8 +1868,27 @@ static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) { + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; + u64 off = obj_req->ex.oe_off; + u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len; int ret; + /* + * Align the range to alloc_size boundary and punt on discards + * that are too small to free up any space. + * + * alloc_size == object_size && is_tail() is a special case for + * filestore with filestore_punch_hole = false, needed to allow + * truncate (in addition to delete). + */ + if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || + !rbd_obj_is_tail(obj_req)) { + off = round_up(off, rbd_dev->opts->alloc_size); + next_off = round_down(next_off, rbd_dev->opts->alloc_size); + if (off >= next_off) + return 1; + } + /* reverse map the entire object onto the parent */ ret = rbd_obj_calc_img_extents(obj_req, true); if (ret) @@ -1867,10 +1901,12 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0); } else { + dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, + obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, + off, next_off - off); osd_req_op_extent_init(obj_req->osd_req, 0, truncate_or_zero_opcode(obj_req), - obj_req->ex.oe_off, obj_req->ex.oe_len, - 0, 0); + off, next_off - off, 0, 0); } obj_req->write_state = RBD_OBJ_WRITE_FLAT; @@ -1953,10 +1989,10 @@ static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) */ static int __rbd_img_fill_request(struct rbd_img_request *img_req) { - struct rbd_obj_request *obj_req; + struct rbd_obj_request *obj_req, *next_obj_req; int ret; - for_each_obj_request(img_req, obj_req) { + for_each_obj_request_safe(img_req, obj_req, next_obj_req) { switch (img_req->op_type) { case OBJ_OP_READ: ret 
= rbd_obj_setup_read(obj_req); @@ -1973,8 +2009,14 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) default: rbd_assert(0); } - if (ret) + if (ret < 0) return ret; + if (ret > 0) { + img_req->xferred += obj_req->ex.oe_len; + img_req->pending_count--; + rbd_img_obj_request_del(img_req, obj_req); + continue; + } ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); if (ret) @@ -3764,7 +3806,7 @@ static void rbd_queue_workfn(struct work_struct *work) else result = rbd_img_fill_from_bio(img_request, offset, length, rq->bio); - if (result) + if (result || !img_request->pending_count) goto err_img_request; rbd_img_request_submit(img_request); @@ -5425,6 +5467,7 @@ static int rbd_add_parse_args(const char *buf, pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; + pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT; pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; @@ -5922,6 +5965,12 @@ static ssize_t do_rbd_add(struct bus_type *bus, if (rbd_dev->spec->snap_id != CEPH_NOSNAP) rbd_dev->opts->read_only = true; + if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { + rbd_warn(rbd_dev, "alloc_size adjusted to %u", + rbd_dev->layout.object_size); + rbd_dev->opts->alloc_size = rbd_dev->layout.object_size; + } + rc = rbd_dev_device_setup(rbd_dev); if (rc) goto err_out_image_probe; -- cgit v1.2.3 From 0b51c9d15ab481a5ad7124cc61f1ab5a10e57f67 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 7 Feb 2019 15:24:56 +0100 Subject: rbd: remove experimental designation from kernel layering Support for kernel layering hasn't been considered experimental for a few years now. All the issues that I'm aware of were shaken out in 2014 and early 2015. Moreover, most of that code was rewritten with the addition of support for fancy striping. 
Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3bd1af5a3d93..324f61bc5793 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5875,14 +5875,6 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) ret = rbd_dev_v2_parent_info(rbd_dev); if (ret) goto err_out_probe; - - /* - * Need to warn users if this image is the one being - * mapped and has a parent. - */ - if (!depth && rbd_dev->parent_spec) - rbd_warn(rbd_dev, - "WARNING: kernel layering is EXPERIMENTAL!"); } ret = rbd_dev_probe_parent(rbd_dev, depth); -- cgit v1.2.3 From 356889c49d84f11f446ec235bd52ca1a7d581aa0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 1 Mar 2019 12:06:24 +0100 Subject: rbd: clear ->xferred on error from rbd_obj_issue_copyup() Otherwise the assert in rbd_obj_end_request() is triggered. Fixes: 3da691bf4366 ("rbd: new request handling code") Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 324f61bc5793..c247938d220d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2585,6 +2585,7 @@ again: ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); if (ret) { obj_req->result = ret; + obj_req->xferred = 0; return true; } return false; -- cgit v1.2.3 From e28eded58bdb5579e7f772160f09d33760e3354d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 25 Feb 2019 11:42:26 +0100 Subject: rbd: factor out __rbd_osd_req_create() Allow passing a custom snapshot context: NULL for read and an empty snapshot context for deep-copyup. 
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c247938d220d..66915528298e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1486,18 +1486,16 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) } static struct ceph_osd_request * -rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) +__rbd_osd_req_create(struct rbd_obj_request *obj_req, + struct ceph_snap_context *snapc, unsigned int num_ops) { - struct rbd_img_request *img_req = obj_req->img_request; - struct rbd_device *rbd_dev = img_req->rbd_dev; + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_request *req; const char *name_format = rbd_dev->image_format == 1 ? RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; - req = ceph_osdc_alloc_request(osdc, - (rbd_img_is_write(img_req) ? 
img_req->snapc : NULL), - num_ops, false, GFP_NOIO); + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); if (!req) return NULL; @@ -1522,6 +1520,13 @@ err_req: return NULL; } +static struct ceph_osd_request * +rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) +{ + return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc, + num_ops); +} + static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) { ceph_osdc_put_request(osd_req); @@ -1769,7 +1774,7 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) { - obj_req->osd_req = rbd_osd_req_create(obj_req, 1); + obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1); if (!obj_req->osd_req) return -ENOMEM; -- cgit v1.2.3 From 13488d53775ba5f82dc4075c424d06dfe4b6b162 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 25 Feb 2019 12:37:50 +0100 Subject: rbd: stop copying num_osd_ops in rbd_obj_issue_copyup() In preparation for deep-flatten feature, stop copying num_osd_ops from the original request in rbd_obj_issue_copyup(). Split the calculation into count_{write,zeroout}_ops() respectively and determine whether the assert_exists guard is needed with the new rbd_obj_copyup_enabled(). As a nice side effect, we no longer guard in the writefull case as the copyup'ed object is always fully overwritten. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 90 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 31 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 66915528298e..f9cad40d95af 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1424,6 +1424,18 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) rbd_dev->layout.object_size; } +/* + * Must be called after rbd_obj_calc_img_extents(). 
+ */ +static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) +{ + if (!obj_req->num_img_extents || + rbd_obj_is_entire(obj_req)) + return false; + + return true; +} + static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) { return ceph_file_extents_bytes(obj_req->img_extents, @@ -1810,6 +1822,11 @@ static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, return 0; } +static int count_write_ops(struct rbd_obj_request *obj_req) +{ + return 2; /* setallochint + write/writefull */ +} + static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, unsigned int which) { @@ -1836,6 +1853,7 @@ static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) { unsigned int num_osd_ops, which = 0; + bool need_guard; int ret; /* reverse map the entire object onto the parent */ @@ -1843,22 +1861,21 @@ static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) if (ret) return ret; - if (obj_req->num_img_extents) { - obj_req->write_state = RBD_OBJ_WRITE_GUARD; - num_osd_ops = 3; /* stat + setallochint + write/writefull */ - } else { - obj_req->write_state = RBD_OBJ_WRITE_FLAT; - num_osd_ops = 2; /* setallochint + write/writefull */ - } + need_guard = rbd_obj_copyup_enabled(obj_req); + num_osd_ops = need_guard + count_write_ops(obj_req); obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); if (!obj_req->osd_req) return -ENOMEM; - if (obj_req->num_img_extents) { + if (need_guard) { ret = __rbd_obj_setup_stat(obj_req, which++); if (ret) return ret; + + obj_req->write_state = RBD_OBJ_WRITE_GUARD; + } else { + obj_req->write_state = RBD_OBJ_WRITE_FLAT; } __rbd_obj_setup_write(obj_req, which); @@ -1919,6 +1936,18 @@ static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) return 0; } +static int count_zeroout_ops(struct rbd_obj_request *obj_req) +{ + int num_osd_ops; + + if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) + num_osd_ops = 2; /* create 
+ truncate */ + else + num_osd_ops = 1; /* delete/truncate/zero */ + + return num_osd_ops; +} + static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, unsigned int which) { @@ -1950,6 +1979,7 @@ static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) { unsigned int num_osd_ops, which = 0; + bool need_guard; int ret; /* reverse map the entire object onto the parent */ @@ -1957,30 +1987,21 @@ static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) if (ret) return ret; - if (rbd_obj_is_entire(obj_req)) { - obj_req->write_state = RBD_OBJ_WRITE_FLAT; - if (obj_req->num_img_extents) - num_osd_ops = 2; /* create + truncate */ - else - num_osd_ops = 1; /* delete */ - } else { - if (obj_req->num_img_extents) { - obj_req->write_state = RBD_OBJ_WRITE_GUARD; - num_osd_ops = 2; /* stat + truncate/zero */ - } else { - obj_req->write_state = RBD_OBJ_WRITE_FLAT; - num_osd_ops = 1; /* truncate/zero */ - } - } + need_guard = rbd_obj_copyup_enabled(obj_req); + num_osd_ops = need_guard + count_zeroout_ops(obj_req); obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); if (!obj_req->osd_req) return -ENOMEM; - if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) { + if (need_guard) { ret = __rbd_obj_setup_stat(obj_req, which++); if (ret) return ret; + + obj_req->write_state = RBD_OBJ_WRITE_GUARD; + } else { + obj_req->write_state = RBD_OBJ_WRITE_FLAT; } __rbd_obj_setup_zeroout(obj_req, which); @@ -2439,18 +2460,25 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) { - unsigned int num_osd_ops = obj_req->osd_req->r_num_ops; + struct rbd_img_request *img_req = obj_req->img_request; + unsigned int num_osd_ops = 1; int ret; dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); rbd_osd_req_destroy(obj_req->osd_req); - /* 
- * Create a copyup request with the same number of OSD ops as - * the original request. The original request was stat + op(s), - * the new copyup request will be copyup + the same op(s). - */ + switch (img_req->op_type) { + case OBJ_OP_WRITE: + num_osd_ops += count_write_ops(obj_req); + break; + case OBJ_OP_ZEROOUT: + num_osd_ops += count_zeroout_ops(obj_req); + break; + default: + rbd_assert(0); + } + obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); if (!obj_req->osd_req) return -ENOMEM; @@ -2473,7 +2501,7 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) obj_req->copyup_bvec_count, bytes); - switch (obj_req->img_request->op_type) { + switch (img_req->op_type) { case OBJ_OP_WRITE: __rbd_obj_setup_write(obj_req, 1); break; -- cgit v1.2.3 From 3a482501cf701f56a454f9397aa96f477db87769 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Feb 2019 10:49:12 +0100 Subject: rbd: introduce rbd_obj_issue_copyup_ops() In preparation for deep-flatten feature, split rbd_obj_issue_copyup() into two functions and add a new write state to make the state machine slightly more clear. Make the copyup op optional and start using that for when the overlap goes to 0. 
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 76 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 33 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f9cad40d95af..aa95227fdee2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -236,7 +236,8 @@ enum obj_operation_type { enum rbd_obj_write_state { RBD_OBJ_WRITE_FLAT = 1, RBD_OBJ_WRITE_GUARD, - RBD_OBJ_WRITE_COPYUP, + RBD_OBJ_WRITE_READ_FROM_PARENT, + RBD_OBJ_WRITE_COPYUP_OPS, }; struct rbd_obj_request { @@ -2458,10 +2459,13 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) return true; } -static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) +#define MODS_ONLY U32_MAX + +static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) { struct rbd_img_request *img_req = obj_req->img_request; - unsigned int num_osd_ops = 1; + unsigned int num_osd_ops = (bytes != MODS_ONLY); + unsigned int which = 0; int ret; dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); @@ -2483,31 +2487,25 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) if (!obj_req->osd_req) return -ENOMEM; - ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup"); - if (ret) - return ret; + if (bytes != MODS_ONLY) { + ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd", + "copyup"); + if (ret) + return ret; - /* - * Only send non-zero copyup data to save some I/O and network - * bandwidth -- zero copyup data is equivalent to the object not - * existing. 
- */ - if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { - dout("%s obj_req %p detected zeroes\n", __func__, obj_req); - bytes = 0; + osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++, + obj_req->copyup_bvecs, + obj_req->copyup_bvec_count, + bytes); } - osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, - obj_req->copyup_bvecs, - obj_req->copyup_bvec_count, - bytes); switch (img_req->op_type) { case OBJ_OP_WRITE: - __rbd_obj_setup_write(obj_req, 1); + __rbd_obj_setup_write(obj_req, which); break; case OBJ_OP_ZEROOUT: rbd_assert(!rbd_obj_is_entire(obj_req)); - __rbd_obj_setup_zeroout(obj_req, 1); + __rbd_obj_setup_zeroout(obj_req, which); break; default: rbd_assert(0); @@ -2521,6 +2519,22 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) return 0; } +static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) +{ + /* + * Only send non-zero copyup data to save some I/O and network + * bandwidth -- zero copyup data is equivalent to the object not + * existing. + */ + if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { + dout("%s obj_req %p detected zeroes\n", __func__, obj_req); + bytes = 0; + } + + obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; + return rbd_obj_issue_copyup_ops(obj_req, bytes); +} + static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) { u32 i; @@ -2560,22 +2574,19 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) if (!obj_req->num_img_extents) { /* * The overlap has become 0 (most likely because the - * image has been flattened). Use rbd_obj_issue_copyup() - * to re-submit the original write request -- the copyup - * operation itself will be a no-op, since someone must - * have populated the child object while we weren't - * looking. Move to WRITE_FLAT state as we'll be done - * with the operation once the null copyup completes. + * image has been flattened). 
Re-submit the original write + * request -- pass MODS_ONLY since the copyup isn't needed + * anymore. */ - obj_req->write_state = RBD_OBJ_WRITE_FLAT; - return rbd_obj_issue_copyup(obj_req, 0); + obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; + return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); } ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); if (ret) return ret; - obj_req->write_state = RBD_OBJ_WRITE_COPYUP; + obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT; return rbd_obj_read_from_parent(obj_req); } @@ -2583,7 +2594,6 @@ static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) { int ret; -again: switch (obj_req->write_state) { case RBD_OBJ_WRITE_GUARD: rbd_assert(!obj_req->xferred); @@ -2602,6 +2612,7 @@ again: } /* fall through */ case RBD_OBJ_WRITE_FLAT: + case RBD_OBJ_WRITE_COPYUP_OPS: if (!obj_req->result) /* * There is no such thing as a successful short @@ -2609,10 +2620,9 @@ again: */ obj_req->xferred = obj_req->ex.oe_len; return true; - case RBD_OBJ_WRITE_COPYUP: - obj_req->write_state = RBD_OBJ_WRITE_GUARD; + case RBD_OBJ_WRITE_READ_FROM_PARENT: if (obj_req->result) - goto again; + return true; rbd_assert(obj_req->xferred); ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); -- cgit v1.2.3 From 89a59c1ca73b8dd43c208cdbd3658bd302cd41b4 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Feb 2019 14:20:28 +0100 Subject: rbd: copyup with an empty snapshot context (aka deep-copyup) This is the core of deep-flatten feature: sending a copyup request (i.e. a guarded write of the data read from the parent) with an empty snapshot context (snaps = [], seq = 0) causes the OSD to reflect the write in all existing snapshots. This allows "rbd flatten" to fully disconnect the clone image and its snapshots from the parent and make the parent snapshot removable. The actual modification request is sent only after deep-copyup request is completed. 
Waiting for deep-copyup reply is unnecessary, this will be improved in the future. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 79 insertions(+), 10 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aa95227fdee2..4befd8f6ac9c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -221,22 +221,32 @@ enum obj_operation_type { * Writes go through the following state machine to deal with * layering: * - * need copyup - * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP - * | ^ | - * v \------------------------------/ - * done - * ^ - * | - * RBD_OBJ_WRITE_FLAT + * . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . . + * . | . + * . v . + * . RBD_OBJ_WRITE_READ_FROM_PARENT. . . . + * . | . . + * . v v (deep-copyup . + * (image . RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC . not needed) . + * flattened) v | . . + * . v . . + * . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . . (copyup . + * | not needed) v + * v . + * done . . . . . . . . . . . . . . . . . . + * ^ + * | + * RBD_OBJ_WRITE_FLAT * * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether - * there is a parent or not. + * assert_exists guard is needed or not (in some cases it's not needed + * even if there is a parent). */ enum rbd_obj_write_state { RBD_OBJ_WRITE_FLAT = 1, RBD_OBJ_WRITE_GUARD, RBD_OBJ_WRITE_READ_FROM_PARENT, + RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC, RBD_OBJ_WRITE_COPYUP_OPS, }; @@ -422,6 +432,10 @@ static DEFINE_IDA(rbd_dev_id_ida); static struct workqueue_struct *rbd_wq; +static struct ceph_snap_context rbd_empty_snapc = { + .nref = REFCOUNT_INIT(1), +}; + /* * single-major requires >= 0.75 version of userspace rbd utility. 
*/ @@ -2461,6 +2475,38 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) #define MODS_ONLY U32_MAX +static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req, + u32 bytes) +{ + int ret; + + dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); + rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); + rbd_assert(bytes > 0 && bytes != MODS_ONLY); + rbd_osd_req_destroy(obj_req->osd_req); + + obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1); + if (!obj_req->osd_req) + return -ENOMEM; + + ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup"); + if (ret) + return ret; + + osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, + obj_req->copyup_bvecs, + obj_req->copyup_bvec_count, + bytes); + rbd_osd_req_format_write(obj_req); + + ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); + if (ret) + return ret; + + rbd_obj_request_submit(obj_req); + return 0; +} + static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) { struct rbd_img_request *img_req = obj_req->img_request; @@ -2469,7 +2515,8 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) int ret; dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); - rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); + rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT || + obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL); rbd_osd_req_destroy(obj_req->osd_req); switch (img_req->op_type) { @@ -2531,6 +2578,17 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) bytes = 0; } + if (obj_req->img_request->snapc->num_snaps && bytes > 0) { + /* + * Send a copyup request with an empty snapshot context to + * deep-copyup the object through all existing snapshots. + * A second request with the current snapshot context will be + * sent for the actual modification. 
+ */ + obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC; + return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes); + } + obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; return rbd_obj_issue_copyup_ops(obj_req, bytes); } @@ -2632,6 +2690,17 @@ static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) return true; } return false; + case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC: + if (obj_req->result) + return true; + + obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; + ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); + if (ret) { + obj_req->result = ret; + return true; + } + return false; default: BUG(); } -- cgit v1.2.3 From 9b17eb2ce102e3274dafb3776a699969f02f7611 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Feb 2019 15:51:39 +0100 Subject: rbd: whole-object write and zeroout should copyup when snapshots exist Otherwise, once the parent snapshot is removed, the clone's snapshot wouldn't reflect the state of the clone prior to whole-object write or zeroout because a deep-copyup was never done ("rbd flatten" wouldn't do it because the modified object would exist in HEAD). 
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4befd8f6ac9c..ccfbed8741b8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1445,7 +1445,8 @@ static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) { if (!obj_req->num_img_extents || - rbd_obj_is_entire(obj_req)) + (rbd_obj_is_entire(obj_req) && + !obj_req->img_request->snapc->num_snaps)) return false; return true; @@ -1955,7 +1956,8 @@ static int count_zeroout_ops(struct rbd_obj_request *obj_req) { int num_osd_ops; - if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) + if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && + !rbd_obj_copyup_enabled(obj_req)) num_osd_ops = 2; /* create + truncate */ else num_osd_ops = 1; /* delete/truncate/zero */ @@ -1970,8 +1972,9 @@ static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, if (rbd_obj_is_entire(obj_req)) { if (obj_req->num_img_extents) { - osd_req_op_init(obj_req->osd_req, which++, - CEPH_OSD_OP_CREATE, 0); + if (!rbd_obj_copyup_enabled(obj_req)) + osd_req_op_init(obj_req->osd_req, which++, + CEPH_OSD_OP_CREATE, 0); opcode = CEPH_OSD_OP_TRUNCATE; } else { osd_req_op_init(obj_req->osd_req, which++, @@ -2551,7 +2554,6 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) __rbd_obj_setup_write(obj_req, which); break; case OBJ_OP_ZEROOUT: - rbd_assert(!rbd_obj_is_entire(obj_req)); __rbd_obj_setup_zeroout(obj_req, which); break; default: -- cgit v1.2.3 From b9f6d447a6f67b2acc3c4a9d9adc2508986e8df9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 25 Feb 2019 18:55:38 +0100 Subject: rbd: advertise support for RBD_FEATURE_DEEP_FLATTEN All copyups perform deep-copyup regardless of whether deep-flatten feature is enabled. 
The feature bit is used to ensure that the image is written to only by new-enough clients that always perform deep-copyup. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block/rbd.c') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ccfbed8741b8..8dbfc5e54ae3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -115,12 +115,14 @@ static int atomic_dec_return_safe(atomic_t *v) #define RBD_FEATURE_LAYERING (1ULL<<0) #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) +#define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) #define RBD_FEATURE_DATA_POOL (1ULL<<7) #define RBD_FEATURE_OPERATIONS (1ULL<<8) #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ RBD_FEATURE_STRIPINGV2 | \ RBD_FEATURE_EXCLUSIVE_LOCK | \ + RBD_FEATURE_DEEP_FLATTEN | \ RBD_FEATURE_DATA_POOL | \ RBD_FEATURE_OPERATIONS) -- cgit v1.2.3