From 3469ac1aa3a2f1e2586a412923c414779a0af854 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:33:36 -0700 Subject: ceph: drop support for preferred_osd pgs This was an ill-conceived feature that has been removed from Ceph. Do this gracefully: - reject attempts to specify a preferred_osd via the ioctl - stop exposing this information via virtual xattrs - always fill in -1 for requests, in case we talk to an older server - don't calculate preferred_osd placements/pgids Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 47 ++++++++++------------------------------------- 1 file changed, 10 insertions(+), 37 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 29ad46ec9dcf..7d39f3cb4947 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1000,7 +1000,6 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, { unsigned num, num_mask; struct ceph_pg pgid; - s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); int poolid = le32_to_cpu(fl->fl_pg_pool); struct ceph_pg_pool_info *pool; unsigned ps; @@ -1011,23 +1010,13 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol, if (!pool) return -EIO; ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); - if (preferred >= 0) { - ps += preferred; - num = le32_to_cpu(pool->v.lpg_num); - num_mask = pool->lpg_num_mask; - } else { - num = le32_to_cpu(pool->v.pg_num); - num_mask = pool->pg_num_mask; - } + num = le32_to_cpu(pool->v.pg_num); + num_mask = pool->pg_num_mask; pgid.ps = cpu_to_le16(ps); - pgid.preferred = cpu_to_le16(preferred); + pgid.preferred = cpu_to_le16(-1); pgid.pool = fl->fl_pg_pool; - if (preferred >= 0) - dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, - (int)preferred); - else - dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); + dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); ol->ol_pgid = pgid; ol->ol_stripe_unit = fl->fl_object_stripe_unit; @@ -1046,23 +1035,17 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, struct ceph_pg_pool_info *pool; int ruleno; unsigned poolid, ps, pps, t; - int preferred; poolid = le32_to_cpu(pgid.pool); ps = le16_to_cpu(pgid.ps); - preferred = (s16)le16_to_cpu(pgid.preferred); pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); if (!pool) return NULL; /* pg_temp? */ - if (preferred >= 0) - t = ceph_stable_mod(ps, le32_to_cpu(pool->v.lpg_num), - pool->lpgp_num_mask); - else - t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), - pool->pgp_num_mask); + t = ceph_stable_mod(ps, le32_to_cpu(pool->v.pg_num), + pool->pgp_num_mask); pgid.ps = cpu_to_le16(t); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { @@ -1080,23 +1063,13 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, return NULL; } - /* don't forcefeed bad device ids to crush */ - if (preferred >= osdmap->max_osd || - preferred >= osdmap->crush->max_devices) - preferred = -1; - - if (preferred >= 0) - pps = ceph_stable_mod(ps, - le32_to_cpu(pool->v.lpgp_num), - pool->lpgp_num_mask); - else - pps = ceph_stable_mod(ps, - le32_to_cpu(pool->v.pgp_num), - pool->pgp_num_mask); + pps = ceph_stable_mod(ps, + le32_to_cpu(pool->v.pgp_num), + pool->pgp_num_mask); pps += poolid; *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, min_t(int, pool->v.size, *num), - preferred, osdmap->osd_weight); + -1, osdmap->osd_weight); return osds; } -- cgit v1.2.3 From 8b12d47b80c7a34dffdd98244d99316db490ec58 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:38:35 -0700 Subject: crush: clean up types, const-ness Move various types from int -> __u32 (or similar), and add const as appropriate. This reflects changes that have been present in the userland implementation for some time. Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/crush/crush.c | 8 ++++---- net/ceph/crush/mapper.c | 31 ++++++++++++++++--------------- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index d6ebb13a18a4..8dd19a0deedc 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -26,9 +26,9 @@ const char *crush_bucket_alg_name(int alg) * @b: bucket pointer * @p: item index in bucket */ -int crush_get_bucket_item_weight(struct crush_bucket *b, int p) +int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) { - if (p >= b->size) + if ((__u32)p >= b->size) return 0; switch (b->alg) { @@ -124,10 +124,9 @@ void crush_destroy_bucket(struct crush_bucket *b) */ void crush_destroy(struct crush_map *map) { - int b; - /* buckets */ if (map->buckets) { + __s32 b; for (b = 0; b < map->max_buckets; b++) { if (map->buckets[b] == NULL) continue; @@ -138,6 +137,7 @@ void crush_destroy(struct crush_map *map) /* rules */ if (map->rules) { + __u32 b; for (b = 0; b < map->max_rules; b++) kfree(map->rules[b]); kfree(map->rules); diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b79747c4b645..436102a8a461 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -32,9 +32,9 @@ * @type: storage ruleset type (user defined) * @size: output set size */ -int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) +int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) { - int i; + __u32 i; for (i = 0; i < map->max_rules; i++) { if (map->rules[i] && @@ -72,7 +72,7 @@ static int bucket_perm_choose(struct crush_bucket *bucket, unsigned i, s; /* start a new permutation if @x has changed */ - if (bucket->perm_x != x || bucket->perm_n == 0) { + if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { dprintk("bucket %d new x=%d\n", bucket->id, x); bucket->perm_x = x; @@ -219,7 +219,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, static int bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) { - int i; + __u32 i; int high = 0; __u64 high_draw = 0; __u64 draw; @@ -262,7 +262,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) * true if device is marked "out" (failed, fully offloaded) * of the cluster */ -static int is_out(struct crush_map *map, __u32 *weight, int item, int x) +static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) { if (weight[item] >= 0x10000) return 0; @@ -287,16 +287,16 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x) * @recurse_to_leaf: true if we want one device under each item of given type * @out2: second output vector for leaf items (if @recurse_to_leaf) */ -static int crush_choose(struct crush_map *map, +static int crush_choose(const struct crush_map *map, struct crush_bucket *bucket, - __u32 *weight, + const __u32 *weight, int x, int numrep, int type, int *out, int outpos, int firstn, int recurse_to_leaf, int *out2) { int rep; - int ftotal, flocal; + unsigned int ftotal, flocal; int retry_descent, retry_bucket, skip_rep; struct crush_bucket *in = bucket; int r; @@ -304,7 +304,7 @@ static int crush_choose(struct crush_map *map, int item = 0; int itemtype; int collide, reject; - const int orig_tries = 5; /* attempts before we fall back to search */ + const unsigned int orig_tries = 5; /* attempts before we fall back to search */ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", bucket->id, x, outpos, numrep); @@ -325,7 +325,7 @@ static int crush_choose(struct crush_map *map, r = rep; if (in->alg == CRUSH_BUCKET_UNIFORM) { /* be careful */ - if (firstn || numrep >= in->size) + if (firstn || (__u32)numrep >= in->size) /* r' = r + f_total */ r += ftotal; else if (in->size % numrep == 0) @@ -425,7 +425,7 @@ reject: /* else give up */ skip_rep = 1; dprintk(" reject %d collide %d " - "ftotal %d flocal %d\n", + "ftotal %u flocal %u\n", reject, collide, ftotal, flocal); } @@ -456,9 +456,9 @@ reject: * @result_max: maximum result size * @force: force initial replica choice; -1 for none */ -int crush_do_rule(struct crush_map *map, +int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - int force, __u32 *weight) + int force, const __u32 *weight) { int result_len; int force_context[CRUSH_MAX_DEPTH]; @@ -473,7 +473,7 @@ int crush_do_rule(struct crush_map *map, int osize; int *tmp; struct crush_rule *rule; - int step; + __u32 step; int i, j; int numrep; int firstn; @@ -488,7 +488,8 @@ int crush_do_rule(struct crush_map *map, /* * determine hierarchical context of force, if any. note * that this may or may not correspond to the specific types - * referenced by the crush rule. + * referenced by the crush rule. it will also only affect + * the first descent (TAKE). */ if (force >= 0 && force < map->max_devices && -- cgit v1.2.3 From c90f95ed46393e29d843686e21947d1c6fcb1164 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:35:09 -0700 Subject: crush: adjust local retry threshold This small adjustment reflects a change that was made in ceph.git commit af6a9f30696c900a2a8bd7ae24e8ed15fb4964bb, about 6 months ago. An N-1 search is not exhaustive. Fixed ceph.git bug #1594. Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/crush/mapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 436102a8a461..583f644b0e28 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -415,7 +415,7 @@ reject: if (collide && flocal < 3) /* retry locally a few times */ retry_bucket = 1; - else if (flocal < in->size + orig_tries) + else if (flocal <= in->size + orig_tries) /* exhaustive bucket search */ retry_bucket = 1; else if (ftotal < 20) -- cgit v1.2.3 From a1f4895be8bf1ba56c2306b058f51619e9b0e8f8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:35:24 -0700 Subject: crush: be more tolerant of nonsensical crush maps If we get a map that doesn't make sense, error out or ignore the badness instead of BUGging out. This reflects the ceph.git commits 9895f0bff7dc68e9b49b572613d242315fb11b6c and 8ded26472058d5205803f244c2f33cb6cb10de79. Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/crush/mapper.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 583f644b0e28..00baad5d3bde 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -152,8 +152,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, return bucket->h.items[i]; } - BUG_ON(1); - return 0; + dprintk("bad list sums for bucket %d\n", bucket->h.id); + return bucket->h.items[0]; } @@ -239,6 +239,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket, static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); + BUG_ON(in->size == 0); switch (in->alg) { case CRUSH_BUCKET_UNIFORM: return bucket_uniform_choose((struct crush_bucket_uniform *)in, @@ -253,7 +254,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) return bucket_straw_choose((struct crush_bucket_straw *)in, x, r); default: - BUG_ON(1); + dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; } } @@ -354,7 +355,11 @@ static int crush_choose(const struct crush_map *map, item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); - BUG_ON(item >= map->max_devices); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + skip_rep = 1; + break; + } /* desired type? */ if (item < 0) @@ -365,8 +370,12 @@ static int crush_choose(const struct crush_map *map, /* keep going? */ if (itemtype != type) { - BUG_ON(item >= 0 || - (-1-item) >= map->max_buckets); + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + skip_rep = 1; + break; + } in = map->buckets[-1-item]; retry_bucket = 1; continue; @@ -478,7 +487,10 @@ int crush_do_rule(const struct crush_map *map, int numrep; int firstn; - BUG_ON(ruleno >= map->max_rules); + if ((__u32)ruleno >= map->max_rules) { + dprintk(" bad ruleno %d\n", ruleno); + return 0; + } rule = map->rules[ruleno]; result_len = 0; @@ -528,7 +540,8 @@ int crush_do_rule(const struct crush_map *map, firstn = 1; case CRUSH_RULE_CHOOSE_LEAF_INDEP: case CRUSH_RULE_CHOOSE_INDEP: - BUG_ON(wsize == 0); + if (wsize == 0) + break; recurse_to_leaf = rule->steps[step].op == @@ -597,7 +610,9 @@ int crush_do_rule(const struct crush_map *map, break; default: - BUG_ON(1); + dprintk(" unknown op %d at step %d\n", + curstep->op, step); + break; } } return result_len; -- cgit v1.2.3 From 0668216efe16ab1adf077e5f138775cee2af927a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:35:48 -0700 Subject: crush: use a temporary variable to simplify crush_do_rule Use a temporary variable here to avoid repeated array lookups and clean up the code a bit. This reflects ceph.git commit 6b5be27634ad307b471a5bf0db85c4f5c834885f. Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/crush/mapper.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 00baad5d3bde..fba9460fe572 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -519,14 +519,15 @@ int crush_do_rule(const struct crush_map *map, } for (step = 0; step < rule->len; step++) { + struct crush_rule_step *curstep = &rule->steps[step]; + firstn = 0; - switch (rule->steps[step].op) { + switch (curstep->op) { case CRUSH_RULE_TAKE: - w[0] = rule->steps[step].arg1; + w[0] = curstep->arg1; /* find position in force_context/hierarchy */ - while (force_pos >= 0 && - force_context[force_pos] != w[0]) + while (force_pos >= 0 && force_context[force_pos] != w[0]) force_pos--; /* and move past it */ if (force_pos >= 0) @@ -538,15 +539,16 @@ int crush_do_rule(const struct crush_map *map, case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; + /* fall through */ case CRUSH_RULE_CHOOSE_LEAF_INDEP: case CRUSH_RULE_CHOOSE_INDEP: if (wsize == 0) break; recurse_to_leaf = - rule->steps[step].op == + curstep->op == CRUSH_RULE_CHOOSE_LEAF_FIRSTN || - rule->steps[step].op == + curstep->op == CRUSH_RULE_CHOOSE_LEAF_INDEP; /* reset output */ @@ -558,7 +560,7 @@ int crush_do_rule(const struct crush_map *map, * basically, numrep <= 0 means relative to * the provided result_max */ - numrep = rule->steps[step].arg1; + numrep = curstep->arg1; if (numrep <= 0) { numrep += result_max; if (numrep <= 0) @@ -569,7 +571,7 @@ int crush_do_rule(const struct crush_map *map, /* skip any intermediate types */ while (force_pos && force_context[force_pos] < 0 && - rule->steps[step].arg2 != + curstep->arg2 != map->buckets[-1 - force_context[force_pos]]->type) force_pos--; @@ -583,7 +585,7 @@ int crush_do_rule(const struct crush_map *map, map->buckets[-1-w[i]], weight, x, numrep, - rule->steps[step].arg2, + curstep->arg2, o+osize, j, firstn, recurse_to_leaf, c+osize); -- cgit v1.2.3 From 41ebcc0907c58f75d0b25afcaf8b9c35c6b1ad14 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:39:29 -0700 Subject: crush: remove forcefeed functionality Remove forcefeed functionality from CRUSH. This is an ugly misfeature that is mostly useless and unused. Remove it. Reflects ceph.git commit ed974b5000f2851207d860a651809af4a1867942. Reviewed-by: Alex Elder Signed-off-by: Sage Weil Conflicts: net/ceph/crush/mapper.c --- net/ceph/crush/mapper.c | 48 +----------------------------------------------- net/ceph/osdmap.c | 2 +- 2 files changed, 2 insertions(+), 48 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index fba9460fe572..11cf352201ba 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -463,15 +463,12 @@ reject: * @x: hash input * @result: pointer to result vector * @result_max: maximum result size - * @force: force initial replica choice; -1 for none */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - int force, const __u32 *weight) + const __u32 *weight) { int result_len; - int force_context[CRUSH_MAX_DEPTH]; - int force_pos = -1; int a[CRUSH_MAX_SET]; int b[CRUSH_MAX_SET]; int c[CRUSH_MAX_SET]; @@ -497,27 +494,6 @@ int crush_do_rule(const struct crush_map *map, w = a; o = b; - /* - * determine hierarchical context of force, if any. note - * that this may or may not correspond to the specific types - * referenced by the crush rule. it will also only affect - * the first descent (TAKE). - */ - if (force >= 0 && - force < map->max_devices && - map->device_parents[force] != 0 && - !is_out(map, weight, force, x)) { - while (1) { - force_context[++force_pos] = force; - if (force >= 0) - force = map->device_parents[force]; - else - force = map->bucket_parents[-1-force]; - if (force == 0) - break; - } - } - for (step = 0; step < rule->len; step++) { struct crush_rule_step *curstep = &rule->steps[step]; @@ -525,14 +501,6 @@ int crush_do_rule(const struct crush_map *map, switch (curstep->op) { case CRUSH_RULE_TAKE: w[0] = curstep->arg1; - - /* find position in force_context/hierarchy */ - while (force_pos >= 0 && force_context[force_pos] != w[0]) - force_pos--; - /* and move past it */ - if (force_pos >= 0) - force_pos--; - wsize = 1; break; @@ -567,20 +535,6 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; - if (osize == 0 && force_pos >= 0) { - /* skip any intermediate types */ - while (force_pos && - force_context[force_pos] < 0 && - curstep->arg2 != - map->buckets[-1 - - force_context[force_pos]]->type) - force_pos--; - o[osize] = force_context[force_pos]; - if (recurse_to_leaf) - c[osize] = force_context[0]; - j++; - force_pos--; - } osize += crush_choose(map, map->buckets[-1-w[i]], weight, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 7d39f3cb4947..9dda36f7aa9d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1069,7 +1069,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pps += poolid; *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, min_t(int, pool->v.size, *num), - -1, osdmap->osd_weight); + osdmap->osd_weight); return osds; } -- cgit v1.2.3 From fc7c3ae5ab9246ad96aab4d0d57f67e9255cfb56 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:36:35 -0700 Subject: crush: remove parent maps These were used for the ill-fated forcefeed feature. Remove them. Reflects ceph.git commit ebdf80edfecfbd5a842b71fbe5732857994380c1. Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/crush/crush.c | 25 ------------------------- net/ceph/osdmap.c | 7 ------- 2 files changed, 32 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 8dd19a0deedc..2160791acf03 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -46,29 +46,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) return 0; } -/** - * crush_calc_parents - Calculate parent vectors for the given crush map. - * @map: crush_map pointer - */ -void crush_calc_parents(struct crush_map *map) -{ - int i, b, c; - - for (b = 0; b < map->max_buckets; b++) { - if (map->buckets[b] == NULL) - continue; - for (i = 0; i < map->buckets[b]->size; i++) { - c = map->buckets[b]->items[i]; - BUG_ON(c >= map->max_devices || - c < -map->max_buckets); - if (c >= 0) - map->device_parents[c] = map->buckets[b]->id; - else - map->bucket_parents[-1-c] = map->buckets[b]->id; - } - } -} - void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { kfree(b->h.perm); @@ -143,8 +120,6 @@ void crush_destroy(struct crush_map *map) kfree(map->rules); } - kfree(map->bucket_parents); - kfree(map->device_parents); kfree(map); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9dda36f7aa9d..dac448ba68e4 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -161,13 +161,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->max_rules = ceph_decode_32(p); c->max_devices = ceph_decode_32(p); - c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); - if (c->device_parents == NULL) - goto badmem; - c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); - if (c->bucket_parents == NULL) - goto badmem; - c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); if (c->buckets == NULL) goto badmem; -- cgit v1.2.3 From f671d4cd9b36691ac4ef42cde44c1b7a84e13631 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:36:49 -0700 Subject: crush: fix tree node weight lookup Fix the node weight lookup for tree buckets by using a correct accessor. Reflects ceph.git commit d287ade5bcbdca82a3aef145b92924cf1e856733. Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/crush/crush.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 2160791acf03..b93575f4eb13 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -37,9 +37,7 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) case CRUSH_BUCKET_LIST: return ((struct crush_bucket_list *)b)->item_weights[p]; case CRUSH_BUCKET_TREE: - if (p & 1) - return ((struct crush_bucket_tree *)b)->node_weights[p]; - return 0; + return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; case CRUSH_BUCKET_STRAW: return ((struct crush_bucket_straw *)b)->item_weights[p]; } -- cgit v1.2.3 From 6eb43f4b5a2a74599b4ff17a97c03a342327ca65 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:37:05 -0700 Subject: crush: fix memory leak when destroying tree buckets Reflects ceph.git commit 46d63d98434b3bc9dad2fc9ab23cbaedc3bcb0e4. Reported-by: Alexander Lyakas Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/crush/crush.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index b93575f4eb13..089613234f03 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -62,6 +62,8 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { + kfree(b->h.perm); + kfree(b->h.items); kfree(b->node_weights); kfree(b); } -- cgit v1.2.3 From 8b393269008411a612ca549b733b4296e819f2fb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 7 May 2012 15:37:23 -0700 Subject: crush: warn on do_rule failure If we get an error code from crush_do_rule(), print an error to the console. Reviewed-by: Alex Elder Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index dac448ba68e4..2592f3cca987 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1027,7 +1027,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, struct ceph_pg_mapping *pg; struct ceph_pg_pool_info *pool; int ruleno; - unsigned poolid, ps, pps, t; + unsigned poolid, ps, pps, t, r; poolid = le32_to_cpu(pgid.pool); ps = le16_to_cpu(pgid.ps); @@ -1060,9 +1060,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, le32_to_cpu(pool->v.pgp_num), pool->pgp_num_mask); pps += poolid; - *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->v.size, *num), - osdmap->osd_weight); + r = crush_do_rule(osdmap->crush, ruleno, pps, osds, + min_t(int, pool->v.size, *num), + osdmap->osd_weight); + if (r < 0) { + pr_err("error %d from crush rule: pool %d ruleset %d type %d" + " size %d\n", r, poolid, pool->v.crush_ruleset, + pool->v.type, pool->v.size); + return NULL; + } + *num = r; return osds; } -- cgit v1.2.3 From 065a68f9167e20f321a62d044cb2c3024393d455 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 20 Apr 2012 15:49:43 -0500 Subject: ceph: osd_client: fix endianness bug in osd_req_encode_op() From Al Viro Al Viro noticed that we were using a non-cpu-encoded value in a switch statement in osd_req_encode_op(). The result would clearly not work correctly on a big-endian machine. Signed-off-by: Alex Elder --- net/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5e254055c910..daa2716a0c30 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -278,7 +278,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, { dst->op = cpu_to_le16(src->op); - switch (dst->op) { + switch (src->op) { case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: dst->extent.offset = -- cgit v1.2.3 From 57dac9d1620942608306d8c17c98a9d1568ffdf4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 10 May 2012 10:29:50 -0500 Subject: ceph: messenger: use read_partial() in read_partial_message() There are two blocks of code in read_partial_message()--those that read the header and footer of the message--that can be replaced by a call to read_partial(). Do that. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f0993af2ae4d..673133ee3181 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1628,7 +1628,7 @@ static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; int ret; - int to, left; + int to; unsigned front_len, middle_len, data_len; bool do_datacrc = !con->msgr->nocrc; int skip; @@ -1638,15 +1638,10 @@ static int read_partial_message(struct ceph_connection *con) dout("read_partial_message con %p msg %p\n", con, m); /* header */ - while (con->in_base_pos < sizeof(con->in_hdr)) { - left = sizeof(con->in_hdr) - con->in_base_pos; - ret = ceph_tcp_recvmsg(con->sock, - (char *)&con->in_hdr + con->in_base_pos, - left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } + to = 0; + ret = read_partial(con, &to, sizeof (con->in_hdr), &con->in_hdr); + if (ret <= 0) + return ret; crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); if (cpu_to_le32(crc) != con->in_hdr.crc) { @@ -1759,16 +1754,11 @@ static int read_partial_message(struct ceph_connection *con) } /* footer */ - to = sizeof(m->hdr) + sizeof(m->footer); - while (con->in_base_pos < to) { - left = to - con->in_base_pos; - ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + - (con->in_base_pos - sizeof(m->hdr)), - left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } + to = sizeof (m->hdr); + ret = read_partial(con, &to, sizeof (m->footer), &m->footer); + if (ret <= 0) + return ret; + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", m, front_len, m->footer.front_crc, middle_len, m->footer.middle_crc, data_len, m->footer.data_crc); -- cgit v1.2.3 From e6cee71fac27c946a0bbad754dd076e66c4e9dbd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 10 May 2012 10:29:50 -0500 Subject: ceph: messenger: update "to" in read_partial() caller read_partial() always increases whatever "to" value is supplied by adding the requested size to it, and that's the only thing it does with that pointed-to value. Do that pointer advance in the caller (and then only when the updated value will be subsequently used), and change the "to" parameter to be an in-only and non-pointer value. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 673133ee3181..37fd2aece232 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -992,11 +992,12 @@ static int prepare_read_message(struct ceph_connection *con) static int read_partial(struct ceph_connection *con, - int *to, int size, void *object) + int to, int size, void *object) { - *to += size; - while (con->in_base_pos < *to) { - int left = *to - con->in_base_pos; + int end = to + size; + + while (con->in_base_pos < end) { + int left = end - con->in_base_pos; int have = size - left; int ret = ceph_tcp_recvmsg(con->sock, object + have, left); if (ret <= 0) @@ -1017,14 +1018,16 @@ static int read_partial_banner(struct ceph_connection *con) dout("read_partial_banner %p at %d\n", con, con->in_base_pos); /* peer's banner */ - ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); + ret = read_partial(con, to, strlen(CEPH_BANNER), con->in_banner); if (ret <= 0) goto out; - ret = read_partial(con, &to, sizeof(con->actual_peer_addr), + to += strlen(CEPH_BANNER); + ret = read_partial(con, to, sizeof(con->actual_peer_addr), &con->actual_peer_addr); if (ret <= 0) goto out; - ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), + to += sizeof(con->actual_peer_addr); + ret = read_partial(con, to, sizeof(con->peer_addr_for_me), &con->peer_addr_for_me); if (ret <= 0) goto out; @@ -1038,10 +1041,11 @@ static int read_partial_connect(struct ceph_connection *con) dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); + ret = read_partial(con, to, sizeof(con->in_reply), &con->in_reply); if (ret <= 0) goto out; - ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), + to += sizeof(con->in_reply); + ret = read_partial(con, to, le32_to_cpu(con->in_reply.authorizer_len), con->auth_reply_buf); if (ret <= 0) goto out; @@ -1491,9 +1495,7 @@ static int process_connect(struct ceph_connection *con) */ static int read_partial_ack(struct ceph_connection *con) { - int to = 0; - - return read_partial(con, &to, sizeof(con->in_temp_ack), + return read_partial(con, 0, sizeof(con->in_temp_ack), &con->in_temp_ack); } @@ -1638,8 +1640,7 @@ static int read_partial_message(struct ceph_connection *con) dout("read_partial_message con %p msg %p\n", con, m); /* header */ - to = 0; - ret = read_partial(con, &to, sizeof (con->in_hdr), &con->in_hdr); + ret = read_partial(con, 0, sizeof (con->in_hdr), &con->in_hdr); if (ret <= 0) return ret; @@ -1755,7 +1756,7 @@ static int read_partial_message(struct ceph_connection *con) /* footer */ to = sizeof (m->hdr); - ret = read_partial(con, &to, sizeof (m->footer), &m->footer); + ret = read_partial(con, to, sizeof (m->footer), &m->footer); if (ret <= 0) return ret; -- cgit v1.2.3 From fd51653f78cf40a0516e521b6de22f329c5bad8d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 10 May 2012 10:29:50 -0500 Subject: ceph: messenger: change read_partial() to take "end" arg Make the second argument to read_partial() be the ending input byte position rather than the beginning offset it now represents. This amounts to moving the addition "to + size" into the caller. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 60 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 22 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 37fd2aece232..a659b4de64aa 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -992,10 +992,8 @@ static int prepare_read_message(struct ceph_connection *con) static int read_partial(struct ceph_connection *con, - int to, int size, void *object) + int end, int size, void *object) { - int end = to + size; - while (con->in_base_pos < end) { int left = end - con->in_base_pos; int have = size - left; @@ -1013,40 +1011,52 @@ static int read_partial(struct ceph_connection *con, */ static int read_partial_banner(struct ceph_connection *con) { - int ret, to = 0; + int size; + int end; + int ret; dout("read_partial_banner %p at %d\n", con, con->in_base_pos); /* peer's banner */ - ret = read_partial(con, to, strlen(CEPH_BANNER), con->in_banner); + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->in_banner); if (ret <= 0) goto out; - to += strlen(CEPH_BANNER); - ret = read_partial(con, to, sizeof(con->actual_peer_addr), - &con->actual_peer_addr); + + size = sizeof (con->actual_peer_addr); + end += size; + ret = read_partial(con, end, size, &con->actual_peer_addr); if (ret <= 0) goto out; - to += sizeof(con->actual_peer_addr); - ret = read_partial(con, to, sizeof(con->peer_addr_for_me), - &con->peer_addr_for_me); + + size = sizeof (con->peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->peer_addr_for_me); if (ret <= 0) goto out; + out: return ret; } static int read_partial_connect(struct ceph_connection *con) { - int ret, to = 0; + int size; + int end; + int ret; dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - ret = read_partial(con, to, sizeof(con->in_reply), &con->in_reply); + size = sizeof (con->in_reply); + end = size; + ret = read_partial(con, end, size, &con->in_reply); if (ret <= 0) goto out; - to += sizeof(con->in_reply); - ret = read_partial(con, to, le32_to_cpu(con->in_reply.authorizer_len), - con->auth_reply_buf); + + size = le32_to_cpu(con->in_reply.authorizer_len); + end += size; + ret = read_partial(con, end, size, con->auth_reply_buf); if (ret <= 0) goto out; @@ -1495,8 +1505,10 @@ static int process_connect(struct ceph_connection *con) */ static int read_partial_ack(struct ceph_connection *con) { - return read_partial(con, 0, sizeof(con->in_temp_ack), - &con->in_temp_ack); + int size = sizeof (con->in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->in_temp_ack); } @@ -1629,8 +1641,9 @@ static int read_partial_message_bio(struct ceph_connection *con, static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; + int size; + int end; int ret; - int to; unsigned front_len, middle_len, data_len; bool do_datacrc = !con->msgr->nocrc; int skip; @@ -1640,7 +1653,9 @@ static int read_partial_message(struct ceph_connection *con) dout("read_partial_message con %p msg %p\n", con, m); /* header */ - ret = read_partial(con, 0, sizeof (con->in_hdr), &con->in_hdr); + size = sizeof (con->in_hdr); + end = size; + ret = read_partial(con, end, size, &con->in_hdr); if (ret <= 0) return ret; @@ -1755,8 +1770,9 @@ static int read_partial_message(struct ceph_connection *con) } /* footer */ - to = sizeof (m->hdr); - ret = read_partial(con, to, sizeof (m->footer), &m->footer); + size = sizeof (m->footer); + end += size; + ret = read_partial(con, end, size, &m->footer); if (ret <= 0) return ret; -- cgit v1.2.3 From d329156f16306449c273002486c28de3ddddfd89 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: libceph: don't reset kvec in prepare_write_banner() Move the kvec reset for a connection out of prepare_write_banner and into its only caller. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a659b4de64aa..bcbd409b5ca7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -686,7 +686,6 @@ static int prepare_connect_authorizer(struct ceph_connection *con) static void prepare_write_banner(struct ceph_messenger *msgr, struct ceph_connection *con) { - ceph_con_out_kvec_reset(con); ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr), &msgr->my_enc_addr); @@ -726,10 +725,9 @@ static int prepare_write_connect(struct ceph_messenger *msgr, con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; + ceph_con_out_kvec_reset(con); if (include_banner) prepare_write_banner(msgr, con); - else - ceph_con_out_kvec_reset(con); ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); con->out_more = 0; -- cgit v1.2.3 From 84fb3adf6413862cff51d8af3fce5f0b655586a2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: ceph: messenger: reset connection kvec caller Reset a connection's kvec fields in the caller rather than in prepare_write_connect(). This ends up repeating a few lines of code but it's improving the separation between distinct operations on the connection, which we can take advantage of later. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index bcbd409b5ca7..cca3cf341d70 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -725,7 +725,6 @@ static int prepare_write_connect(struct ceph_messenger *msgr, con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - ceph_con_out_kvec_reset(con); if (include_banner) prepare_write_banner(msgr, con); ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); @@ -1389,6 +1388,7 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; + ceph_con_out_kvec_reset(con); ret = prepare_write_connect(con->msgr, con, 0); if (ret < 0) return ret; @@ -1409,6 +1409,7 @@ static int process_connect(struct ceph_connection *con) ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); + ceph_con_out_kvec_reset(con); prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); @@ -1432,6 +1433,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->in_connect.connect_seq)); con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); + ceph_con_out_kvec_reset(con); prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); break; @@ -1446,6 +1448,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_connect.global_seq)); get_global_seq(con->msgr, le32_to_cpu(con->in_connect.global_seq)); + ceph_con_out_kvec_reset(con); prepare_write_connect(con->msgr, con, 0); prepare_read_connect(con); break; @@ -1851,6 +1854,7 @@ more: /* open the socket first? */ if (con->sock == NULL) { + ceph_con_out_kvec_reset(con); prepare_write_connect(msgr, con, 1); prepare_read_banner(con); set_bit(CONNECTING, &con->state); -- cgit v1.2.3 From 41b90c00858129f52d08e6a05c9cfdb0f2bd074d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: ceph: messenger: send banner in process_connect() prepare_write_connect() has an argument indicating whether a banner should be sent out before sending out a connection message. It's only ever set in one of its callers, so move the code that arranges to send the banner into that caller and drop the "include_banner" argument from prepare_write_connect(). Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index cca3cf341d70..6b38b6fbb25f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -695,8 +695,7 @@ static void prepare_write_banner(struct ceph_messenger *msgr, } static int prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int include_banner) + struct ceph_connection *con) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -725,8 +724,6 @@ static int prepare_write_connect(struct ceph_messenger *msgr, con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - if (include_banner) - prepare_write_banner(msgr, con); ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); con->out_more = 0; @@ -1389,7 +1386,7 @@ static int process_connect(struct ceph_connection *con) } con->auth_retry = 1; ceph_con_out_kvec_reset(con); - ret = prepare_write_connect(con->msgr, con, 0); + ret = prepare_write_connect(con->msgr, con); if (ret < 0) return ret; prepare_read_connect(con); @@ -1410,7 +1407,7 @@ static int process_connect(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); ceph_con_out_kvec_reset(con); - prepare_write_connect(con->msgr, con, 0); + prepare_write_connect(con->msgr, con); prepare_read_connect(con); /* Tell ceph about it. */ @@ -1434,7 +1431,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_connect.connect_seq)); con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); ceph_con_out_kvec_reset(con); - prepare_write_connect(con->msgr, con, 0); + prepare_write_connect(con->msgr, con); prepare_read_connect(con); break; @@ -1449,7 +1446,7 @@ static int process_connect(struct ceph_connection *con) get_global_seq(con->msgr, le32_to_cpu(con->in_connect.global_seq)); ceph_con_out_kvec_reset(con); - prepare_write_connect(con->msgr, con, 0); + prepare_write_connect(con->msgr, con); prepare_read_connect(con); break; @@ -1855,7 +1852,8 @@ more: /* open the socket first? */ if (con->sock == NULL) { ceph_con_out_kvec_reset(con); - prepare_write_connect(msgr, con, 1); + prepare_write_banner(msgr, con); + prepare_write_connect(msgr, con); prepare_read_banner(con); set_bit(CONNECTING, &con->state); clear_bit(NEGOTIATING, &con->state); -- cgit v1.2.3 From e825a66df97776d30a48a187e3a986736af43945 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: ceph: drop msgr argument from prepare_write_connect() In all cases, the value passed as the msgr argument to prepare_write_connect() is just con->msgr. Just get the msgr value from the ceph connection and drop the unneeded argument. The only msgr passed to prepare_write_banner() is also therefore just the one from con->msgr, so change that function to drop the msgr argument as well. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6b38b6fbb25f..47499dc0e413 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -683,19 +683,17 @@ static int prepare_connect_authorizer(struct ceph_connection *con) /* * We connected to a peer and are saying hello. */ -static void prepare_write_banner(struct ceph_messenger *msgr, - struct ceph_connection *con) +static void prepare_write_banner(struct ceph_connection *con) { ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - ceph_con_out_kvec_add(con, sizeof (msgr->my_enc_addr), - &msgr->my_enc_addr); + ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); con->out_more = 0; set_bit(WRITE_PENDING, &con->state); } -static int prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con) +static int prepare_write_connect(struct ceph_connection *con) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; @@ -717,7 +715,7 @@ static int prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = cpu_to_le64(msgr->supported_features); + con->out_connect.features = cpu_to_le64(con->msgr->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1386,7 +1384,7 @@ static int process_connect(struct ceph_connection *con) } con->auth_retry = 1; ceph_con_out_kvec_reset(con); - ret = prepare_write_connect(con->msgr, con); + ret = prepare_write_connect(con); if (ret < 0) return ret; prepare_read_connect(con); @@ -1407,7 +1405,7 @@ static int process_connect(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); ceph_con_out_kvec_reset(con); - prepare_write_connect(con->msgr, con); + prepare_write_connect(con); prepare_read_connect(con); /* Tell ceph about it. */ @@ -1431,7 +1429,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_connect.connect_seq)); con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); ceph_con_out_kvec_reset(con); - prepare_write_connect(con->msgr, con); + prepare_write_connect(con); prepare_read_connect(con); break; @@ -1446,7 +1444,7 @@ static int process_connect(struct ceph_connection *con) get_global_seq(con->msgr, le32_to_cpu(con->in_connect.global_seq)); ceph_con_out_kvec_reset(con); - prepare_write_connect(con->msgr, con); + prepare_write_connect(con); prepare_read_connect(con); break; @@ -1840,7 +1838,6 @@ static void process_message(struct ceph_connection *con) */ static int try_write(struct ceph_connection *con) { - struct ceph_messenger *msgr = con->msgr; int ret = 1; dout("try_write start %p state %lu nref %d\n", con, con->state, @@ -1852,8 +1849,8 @@ more: /* open the socket first? */ if (con->sock == NULL) { ceph_con_out_kvec_reset(con); - prepare_write_banner(msgr, con); - prepare_write_connect(msgr, con); + prepare_write_banner(con); + prepare_write_connect(con); prepare_read_banner(con); set_bit(CONNECTING, &con->state); clear_bit(NEGOTIATING, &con->state); -- cgit v1.2.3 From e10c758e4031a801ea4d2f8fb39bf14c2658d74b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: ceph: don't set WRITE_PENDING too early prepare_write_connect() prepares a connect message, then sets WRITE_PENDING on the connection. Then *after* this, it calls prepare_connect_authorizer(), which updates the content of the connection buffer already queued for sending. It's also possible it will result in prepare_write_connect() returning -EAGAIN despite the WRITE_PENDING big getting set. Fix this by preparing the connect authorizer first, setting the WRITE_PENDING bit only after that is done. Partially addresses http://tracker.newdream.net/issues/2424 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 47499dc0e413..cf292939dd1e 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -697,6 +697,7 @@ static int prepare_write_connect(struct ceph_connection *con) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; + int ret; switch (con->peer_name.type) { case CEPH_ENTITY_TYPE_MON: @@ -723,11 +724,14 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.flags = 0; ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); + ret = prepare_connect_authorizer(con); + if (ret) + return ret; con->out_more = 0; set_bit(WRITE_PENDING, &con->state); - return prepare_connect_authorizer(con); + return 0; } /* -- cgit v1.2.3 From 5a0f8fdd8a0ebe320952a388331dc043d7e14ced Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 21:51:59 -0500 Subject: ceph: messenger: check prepare_write_connect() result prepare_write_connect() can return an error, but only one of its callers checks for it. All the rest are in functions that already return errors, so it should be fine to return the error if one gets returned. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index cf292939dd1e..8e76936a8c13 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1409,7 +1409,9 @@ static int process_connect(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); ceph_con_out_kvec_reset(con); - prepare_write_connect(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; prepare_read_connect(con); /* Tell ceph about it. */ @@ -1433,7 +1435,9 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_connect.connect_seq)); con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); ceph_con_out_kvec_reset(con); - prepare_write_connect(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1448,7 +1452,9 @@ static int process_connect(struct ceph_connection *con) get_global_seq(con->msgr, le32_to_cpu(con->in_connect.global_seq)); ceph_con_out_kvec_reset(con); - prepare_write_connect(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; prepare_read_connect(con); break; @@ -1854,7 +1860,9 @@ more: if (con->sock == NULL) { ceph_con_out_kvec_reset(con); prepare_write_banner(con); - prepare_write_connect(con); + ret = prepare_write_connect(con); + if (ret < 0) + goto out; prepare_read_banner(con); set_bit(CONNECTING, &con->state); clear_bit(NEGOTIATING, &con->state); -- cgit v1.2.3 From b1c6b9803f5491e94041e6da96bc9dec3870e792 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: ceph: messenger: rework prepare_connect_authorizer() Change prepare_connect_authorizer() so it returns without dropping the connection mutex if the connection has no get_authorizer method. Use the symbolic CEPH_AUTH_UNKNOWN instead of 0 when assigning authorization protocols. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 8e76936a8c13..09409a3d9500 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -656,19 +656,29 @@ static void prepare_write_keepalive(struct ceph_connection *con) static int prepare_connect_authorizer(struct ceph_connection *con) { void *auth_buf; - int auth_len = 0; - int auth_protocol = 0; + int auth_len; + int auth_protocol; + + if (!con->ops->get_authorizer) { + con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->out_connect.authorizer_len = 0; + + return 0; + } + + /* Can't hold the mutex while getting authorizer */ mutex_unlock(&con->mutex); - if (con->ops->get_authorizer) - con->ops->get_authorizer(con, &auth_buf, &auth_len, - &auth_protocol, &con->auth_reply_buf, - &con->auth_reply_buf_len, - con->auth_retry); + + auth_buf = NULL; + auth_len = 0; + auth_protocol = CEPH_AUTH_UNKNOWN; + con->ops->get_authorizer(con, &auth_buf, &auth_len, &auth_protocol, + &con->auth_reply_buf, &con->auth_reply_buf_len, + con->auth_retry); mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state) || - test_bit(OPENING, &con->state)) + if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) return -EAGAIN; con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); -- cgit v1.2.3 From ed96af646011412c2bf1ffe860db170db355fae5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: ceph: messenger: check return from get_authorizer In prepare_connect_authorizer(), a connection's get_authorizer method is called but ignores its return value. This function can return an error, so check for it and return it if that ever occurs. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 09409a3d9500..e0532d5b22f5 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -658,6 +658,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con) void *auth_buf; int auth_len; int auth_protocol; + int ret; if (!con->ops->get_authorizer) { con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; @@ -673,11 +674,14 @@ static int prepare_connect_authorizer(struct ceph_connection *con) auth_buf = NULL; auth_len = 0; auth_protocol = CEPH_AUTH_UNKNOWN; - con->ops->get_authorizer(con, &auth_buf, &auth_len, &auth_protocol, - &con->auth_reply_buf, &con->auth_reply_buf_len, - con->auth_retry); + ret = con->ops->get_authorizer(con, &auth_buf, &auth_len, + &auth_protocol, &con->auth_reply_buf, + &con->auth_reply_buf_len, con->auth_retry); mutex_lock(&con->mutex); + if (ret) + return ret; + if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) return -EAGAIN; -- cgit v1.2.3 From 6c4a19158b96ea1fb8acbe0c1d5493d9dcd2f147 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:38 -0500 Subject: ceph: define ceph_auth_handshake type The definitions for the ceph_mds_session and ceph_osd both contain five fields related only to "authorizers." Encapsulate those fields into their own struct type, allowing for better isolation in some upcoming patches. Fix the #includes in "linux/ceph/osd_client.h" to lay out their more complete canonical path. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index daa2716a0c30..66b09d6a1531 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -667,8 +667,8 @@ static void put_osd(struct ceph_osd *osd) if (atomic_dec_and_test(&osd->o_ref)) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (osd->o_authorizer) - ac->ops->destroy_authorizer(ac, osd->o_authorizer); + if (osd->o_auth.authorizer) + ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -2117,27 +2117,27 @@ static int get_authorizer(struct ceph_connection *con, struct ceph_auth_client *ac = osdc->client->monc.auth; int ret = 0; - if (force_new && o->o_authorizer) { - ac->ops->destroy_authorizer(ac, o->o_authorizer); - o->o_authorizer = NULL; + if (force_new && o->o_auth.authorizer) { + ac->ops->destroy_authorizer(ac, o->o_auth.authorizer); + o->o_auth.authorizer = NULL; } - if (o->o_authorizer == NULL) { + if (o->o_auth.authorizer == NULL) { ret = ac->ops->create_authorizer( ac, CEPH_ENTITY_TYPE_OSD, - &o->o_authorizer, - &o->o_authorizer_buf, - &o->o_authorizer_buf_len, - &o->o_authorizer_reply_buf, - &o->o_authorizer_reply_buf_len); + &o->o_auth.authorizer, + &o->o_auth.authorizer_buf, + &o->o_auth.authorizer_buf_len, + &o->o_auth.authorizer_reply_buf, + &o->o_auth.authorizer_reply_buf_len); if (ret) return ret; } *proto = ac->protocol; - *buf = o->o_authorizer_buf; - *len = o->o_authorizer_buf_len; - *reply_buf = o->o_authorizer_reply_buf; - *reply_len = o->o_authorizer_reply_buf_len; + *buf = o->o_auth.authorizer_buf; + *len = o->o_auth.authorizer_buf_len; + *reply_buf = o->o_auth.authorizer_reply_buf; + *reply_len = o->o_auth.authorizer_reply_buf_len; return 0; } @@ -2148,7 +2148,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); + return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) -- cgit v1.2.3 From 74f1869f76d043bad12ec03b4d5f04a8c3d1f157 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:39 -0500 Subject: ceph: messenger: reduce args to create_authorizer Make use of the new ceph_auth_handshake structure in order to reduce the number of arguments passed to the create_authorizor method in ceph_auth_client_ops. Use a local variable of that type as a shorthand in the get_authorizer method definitions. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/auth_none.c | 15 +++++++-------- net/ceph/auth_x.c | 15 +++++++-------- net/ceph/osd_client.c | 28 ++++++++++++---------------- 3 files changed, 26 insertions(+), 32 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 214c2bb43d62..925ca583c09c 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -59,9 +59,7 @@ static int handle_reply(struct ceph_auth_client *ac, int result, */ static int ceph_auth_none_create_authorizer( struct ceph_auth_client *ac, int peer_type, - struct ceph_authorizer **a, - void **buf, size_t *len, - void **reply_buf, size_t *reply_len) + struct ceph_auth_handshake *auth) { struct ceph_auth_none_info *ai = ac->private; struct ceph_none_authorizer *au = &ai->au; @@ -82,11 +80,12 @@ static int ceph_auth_none_create_authorizer( dout("built authorizer len %d\n", au->buf_len); } - *a = (struct ceph_authorizer *)au; - *buf = au->buf; - *len = au->buf_len; - *reply_buf = au->reply_buf; - *reply_len = sizeof(au->reply_buf); + auth->authorizer = (struct ceph_authorizer *) au; + auth->authorizer_buf = au->buf; + auth->authorizer_buf_len = au->buf_len; + auth->authorizer_reply_buf = au->reply_buf; + auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + return 0; bad2: diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 1587dc6010c6..a16bf14eb027 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -526,9 +526,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, static int ceph_x_create_authorizer( struct ceph_auth_client *ac, int peer_type, - struct ceph_authorizer **a, - void **buf, size_t *len, - void **reply_buf, size_t *reply_len) + struct ceph_auth_handshake *auth) { struct ceph_x_authorizer *au; struct ceph_x_ticket_handler *th; @@ -548,11 +546,12 @@ static int ceph_x_create_authorizer( return ret; } - *a = (struct ceph_authorizer *)au; - *buf = au->buf->vec.iov_base; - *len = au->buf->vec.iov_len; - *reply_buf = au->reply_buf; - *reply_len = sizeof(au->reply_buf); + auth->authorizer = (struct ceph_authorizer *) au; + auth->authorizer_buf = au->buf->vec.iov_base; + auth->authorizer_buf_len = au->buf->vec.iov_len; + auth->authorizer_reply_buf = au->reply_buf; + auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + return 0; } diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 66b09d6a1531..2da4b9e97dc1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2115,29 +2115,25 @@ static int get_authorizer(struct ceph_connection *con, struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; int ret = 0; - if (force_new && o->o_auth.authorizer) { - ac->ops->destroy_authorizer(ac, o->o_auth.authorizer); - o->o_auth.authorizer = NULL; - } - if (o->o_auth.authorizer == NULL) { - ret = ac->ops->create_authorizer( - ac, CEPH_ENTITY_TYPE_OSD, - &o->o_auth.authorizer, - &o->o_auth.authorizer_buf, - &o->o_auth.authorizer_buf_len, - &o->o_auth.authorizer_reply_buf, - &o->o_auth.authorizer_reply_buf_len); + if (force_new && auth->authorizer) { + ac->ops->destroy_authorizer(ac, auth->authorizer); + auth->authorizer = NULL; + } + if (auth->authorizer == NULL) { + ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, auth); if (ret) return ret; } *proto = ac->protocol; - *buf = o->o_auth.authorizer_buf; - *len = o->o_auth.authorizer_buf_len; - *reply_buf = o->o_auth.authorizer_reply_buf; - *reply_len = o->o_auth.authorizer_reply_buf_len; + *buf = auth->authorizer_buf; + *len = auth->authorizer_buf_len; + *reply_buf = auth->authorizer_reply_buf; + *reply_len = auth->authorizer_reply_buf_len; + return 0; } -- cgit v1.2.3 From a255651d4cad89f1a606edd36135af892ada4f20 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:39 -0500 Subject: ceph: ensure auth ops are defined before use In the create_authorizer method for both the mds and osd clients, the auth_client->ops pointer is blindly dereferenced. There is no obvious guarantee that this pointer has been assigned. And furthermore, even if the ops pointer is non-null there is definitely no guarantee that the create_authorizer or destroy_authorizer methods are defined. Add checks in both routines to make sure they are defined (non-null) before use. Add similar checks in a few other spots in these files while we're at it. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2da4b9e97dc1..f640bdf027e7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -664,10 +664,10 @@ static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) { + if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (osd->o_auth.authorizer) + if (ac->ops && ac->ops->destroy_authorizer) ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } @@ -2119,10 +2119,11 @@ static int get_authorizer(struct ceph_connection *con, int ret = 0; if (force_new && auth->authorizer) { - ac->ops->destroy_authorizer(ac, auth->authorizer); + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (auth->authorizer == NULL) { + if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, auth); if (ret) return ret; @@ -2144,6 +2145,10 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + /* + * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, + * XXX which do we do: succeed or fail? + */ return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); } @@ -2153,7 +2158,7 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - if (ac->ops->invalidate_authorizer) + if (ac->ops && ac->ops->invalidate_authorizer) ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); -- cgit v1.2.3 From a3530df33eb91d787d08c7383a0a9982690e42d0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:39 -0500 Subject: ceph: have get_authorizer methods return pointers Have the get_authorizer auth_client method return a ceph_auth pointer rather than an integer, pointer-encoding any returned error value. This is to pave the way for making use of the returned value in an upcoming patch. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 8 ++++---- net/ceph/osd_client.c | 19 ++++++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e0532d5b22f5..ac27a2c0694a 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -658,7 +658,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con) void *auth_buf; int auth_len; int auth_protocol; - int ret; + struct ceph_auth_handshake *auth; if (!con->ops->get_authorizer) { con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; @@ -674,13 +674,13 @@ static int prepare_connect_authorizer(struct ceph_connection *con) auth_buf = NULL; auth_len = 0; auth_protocol = CEPH_AUTH_UNKNOWN; - ret = con->ops->get_authorizer(con, &auth_buf, &auth_len, + auth = con->ops->get_authorizer(con, &auth_buf, &auth_len, &auth_protocol, &con->auth_reply_buf, &con->auth_reply_buf_len, con->auth_retry); mutex_lock(&con->mutex); - if (ret) - return ret; + if (IS_ERR(auth)) + return PTR_ERR(auth); if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) return -EAGAIN; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f640bdf027e7..fa74ae0ea910 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2108,15 +2108,19 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ -static int get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new) +/* + * Note: returned pointer is the address of a structure that's + * managed separately. Caller must *not* attempt to free it. + */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + void **buf, int *len, int *proto, + void **reply_buf, int *reply_len, + int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; - int ret = 0; if (force_new && auth->authorizer) { if (ac->ops && ac->ops->destroy_authorizer) @@ -2124,9 +2128,10 @@ static int get_authorizer(struct ceph_connection *con, auth->authorizer = NULL; } if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, auth); + int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); if (ret) - return ret; + return ERR_PTR(ret); } *proto = ac->protocol; @@ -2135,7 +2140,7 @@ static int get_authorizer(struct ceph_connection *con, *reply_buf = auth->authorizer_reply_buf; *reply_len = auth->authorizer_reply_buf_len; - return 0; + return auth; } -- cgit v1.2.3 From 8f43fb53894079bf0caab6e348ceaffe7adc651a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:39 -0500 Subject: ceph: use info returned by get_authorizer Rather than passing a bunch of arguments to be filled in with the content of the ceph_auth_handshake buffer now returned by the get_authorizer method, just use the returned information in the caller, and drop the unnecessary arguments. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 13 +++++++------ net/ceph/osd_client.c | 9 +-------- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ac27a2c0694a..6d82c1a1a89b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -671,20 +671,21 @@ static int prepare_connect_authorizer(struct ceph_connection *con) mutex_unlock(&con->mutex); - auth_buf = NULL; - auth_len = 0; auth_protocol = CEPH_AUTH_UNKNOWN; - auth = con->ops->get_authorizer(con, &auth_buf, &auth_len, - &auth_protocol, &con->auth_reply_buf, - &con->auth_reply_buf_len, con->auth_retry); + auth = con->ops->get_authorizer(con, &auth_protocol, con->auth_retry); + mutex_lock(&con->mutex); if (IS_ERR(auth)) return PTR_ERR(auth); - if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) return -EAGAIN; + auth_buf = auth->authorizer_buf; + auth_len = auth->authorizer_buf_len; + con->auth_reply_buf = auth->authorizer_reply_buf; + con->auth_reply_buf_len = auth->authorizer_reply_buf_len; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index fa74ae0ea910..b7d633cc96a6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2113,9 +2113,7 @@ static void put_osd_con(struct ceph_connection *con) * managed separately. Caller must *not* attempt to free it. */ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, - int force_new) + int *proto, int force_new) { struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; @@ -2133,12 +2131,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, if (ret) return ERR_PTR(ret); } - *proto = ac->protocol; - *buf = auth->authorizer_buf; - *len = auth->authorizer_buf_len; - *reply_buf = auth->authorizer_reply_buf; - *reply_len = auth->authorizer_reply_buf_len; return auth; } -- cgit v1.2.3 From 729796be9190f57ca40ccca315e8ad34a1eb8fef Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:39 -0500 Subject: ceph: return pointer from prepare_connect_authorizer() Change prepare_connect_authorizer() so it returns a pointer (or pointer-coded error). Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6d82c1a1a89b..f92d564c1505 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -653,7 +653,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ -static int prepare_connect_authorizer(struct ceph_connection *con) +static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connection *con) { void *auth_buf; int auth_len; @@ -664,7 +664,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con) con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; con->out_connect.authorizer_len = 0; - return 0; + return NULL; } /* Can't hold the mutex while getting authorizer */ @@ -677,9 +677,9 @@ static int prepare_connect_authorizer(struct ceph_connection *con) mutex_lock(&con->mutex); if (IS_ERR(auth)) - return PTR_ERR(auth); + return auth; if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) - return -EAGAIN; + return ERR_PTR(-EAGAIN); auth_buf = auth->authorizer_buf; auth_len = auth->authorizer_buf_len; @@ -692,7 +692,7 @@ static int prepare_connect_authorizer(struct ceph_connection *con) if (auth_len) ceph_con_out_kvec_add(con, auth_len, auth_buf); - return 0; + return auth; } /* @@ -712,7 +712,7 @@ static int prepare_write_connect(struct ceph_connection *con) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; - int ret; + struct ceph_auth_handshake *auth; switch (con->peer_name.type) { case CEPH_ENTITY_TYPE_MON: @@ -739,9 +739,9 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.flags = 0; ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); - ret = prepare_connect_authorizer(con); - if (ret) - return ret; + auth = prepare_connect_authorizer(con); + if (IS_ERR(auth)) + return PTR_ERR(auth); con->out_more = 0; set_bit(WRITE_PENDING, &con->state); -- cgit v1.2.3 From dac1e716c60161867a47745bca592987ca3a9cb2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:39 -0500 Subject: ceph: rename prepare_connect_authorizer() Change the name of prepare_connect_authorizer(). The next patch is going to make this function no longer add anything to the connection's out_kvec, so it will no longer fit the pattern of the rest of the prepare_connect_*() functions. In addition, pass the address of a variable that will hold the authorization protocol to use. Move the assignment of that to the connection's out_connect structure into prepare_write_connect(). Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f92d564c1505..bfddd87db788 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -653,11 +653,11 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ -static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connection *con) +static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, + int *auth_proto) { void *auth_buf; int auth_len; - int auth_protocol; struct ceph_auth_handshake *auth; if (!con->ops->get_authorizer) { @@ -671,8 +671,7 @@ static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connec mutex_unlock(&con->mutex); - auth_protocol = CEPH_AUTH_UNKNOWN; - auth = con->ops->get_authorizer(con, &auth_protocol, con->auth_retry); + auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); mutex_lock(&con->mutex); @@ -686,7 +685,6 @@ static struct ceph_auth_handshake *prepare_connect_authorizer(struct ceph_connec con->auth_reply_buf = auth->authorizer_reply_buf; con->auth_reply_buf_len = auth->authorizer_reply_buf_len; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); con->out_connect.authorizer_len = cpu_to_le32(auth_len); if (auth_len) @@ -712,6 +710,7 @@ static int prepare_write_connect(struct ceph_connection *con) { unsigned global_seq = get_global_seq(con->msgr, 0); int proto; + int auth_proto; struct ceph_auth_handshake *auth; switch (con->peer_name.type) { @@ -739,9 +738,11 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.flags = 0; ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); - auth = prepare_connect_authorizer(con); + auth_proto = CEPH_AUTH_UNKNOWN; + auth = get_connect_authorizer(con, &auth_proto); if (IS_ERR(auth)) return PTR_ERR(auth); + con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); con->out_more = 0; set_bit(WRITE_PENDING, &con->state); -- cgit v1.2.3 From 3da54776e2c0385c32d143fd497a7f40a88e29dd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 16 May 2012 15:16:39 -0500 Subject: ceph: add auth buf in prepare_write_connect() Move the addition of the authorizer buffer to a connection's out_kvec out of get_connect_authorizer() and into its caller. This way, the caller--prepare_write_connect()--can avoid adding the connect header to out_kvec before it has been fully initialized. Prior to this patch, it was possible for a connect header to be sent over the wire before the authorizer protocol or buffer length fields were initialized. An authorizer buffer associated with that header could also be queued to send only after the connection header that describes it was on the wire. Fixes http://tracker.newdream.net/issues/2424 Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- net/ceph/messenger.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ceph') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index bfddd87db788..1a80907282cc 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -656,8 +656,6 @@ static void prepare_write_keepalive(struct ceph_connection *con) static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, int *auth_proto) { - void *auth_buf; - int auth_len; struct ceph_auth_handshake *auth; if (!con->ops->get_authorizer) { @@ -680,15 +678,9 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) return ERR_PTR(-EAGAIN); - auth_buf = auth->authorizer_buf; - auth_len = auth->authorizer_buf_len; con->auth_reply_buf = auth->authorizer_reply_buf; con->auth_reply_buf_len = auth->authorizer_reply_buf_len; - con->out_connect.authorizer_len = cpu_to_le32(auth_len); - - if (auth_len) - ceph_con_out_kvec_add(con, auth_len, auth_buf); return auth; } @@ -737,12 +729,20 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - ceph_con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); auth_proto = CEPH_AUTH_UNKNOWN; auth = get_connect_authorizer(con, &auth_proto); if (IS_ERR(auth)) return PTR_ERR(auth); + con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->out_connect.authorizer_len = auth ? + cpu_to_le32(auth->authorizer_buf_len) : 0; + + ceph_con_out_kvec_add(con, sizeof (con->out_connect), + &con->out_connect); + if (auth && auth->authorizer_buf_len) + ceph_con_out_kvec_add(con, auth->authorizer_buf_len, + auth->authorizer_buf); con->out_more = 0; set_bit(WRITE_PENDING, &con->state); -- cgit v1.2.3 From 35f9f8a09e1e88e31bd34a1e645ca0e5f070dd5c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 16 May 2012 15:16:38 -0500 Subject: libceph: avoid unregistering osd request when not registered There is a race between two __unregister_request() callers: the reply path and the ceph_osdc_wait_request(). If we get a reply *and* the timeout expires at roughly the same time, both callers will try to unregister the request, and the second one will do bad things. Simply check if the request is still already unregistered; if so, return immediately and do nothing. Fixes http://tracker.newdream.net/issues/2420 Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ceph') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b7d633cc96a6..b098e7b591f0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -841,6 +841,12 @@ static void register_request(struct ceph_osd_client *osdc, static void __unregister_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { + if (RB_EMPTY_NODE(&req->r_node)) { + dout("__unregister_request %p tid %lld not registered\n", + req, req->r_tid); + return; + } + dout("__unregister_request %p tid %lld\n", req, req->r_tid); rb_erase(&req->r_node, &osdc->requests); osdc->num_requests--; -- cgit v1.2.3 From 6bd9adbdf9ca6a052b0b7455ac67b925eb38cfad Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 21 May 2012 09:45:23 -0700 Subject: libceph: fix pg_temp updates Usually, we are adding pg_temp entries or removing them. Occasionally they update. In that case, osdmap_apply_incremental() was failing because the rbtree entry already exists. Fix by removing the existing entry before inserting a new one. Fixes http://tracker.newdream.net/issues/2446 Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ceph') diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2592f3cca987..1892c523c43c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -883,8 +883,12 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, pglen = ceph_decode_32(p); if (pglen) { - /* insert */ ceph_decode_need(p, end, pglen*sizeof(u32), bad); + + /* removing existing (if any) */ + (void) __remove_pg_mapping(&map->pg_temp, pgid); + + /* insert */ pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); if (!pg) { err = -ENOMEM; -- cgit v1.2.3