diff options
Diffstat (limited to 'fs/ceph/crush/mapper.c')
-rw-r--r-- | fs/ceph/crush/mapper.c | 609 |
1 files changed, 0 insertions, 609 deletions
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c deleted file mode 100644 index a4eec133258e..000000000000 --- a/fs/ceph/crush/mapper.c +++ /dev/null @@ -1,609 +0,0 @@ - -#ifdef __KERNEL__ -# include <linux/string.h> -# include <linux/slab.h> -# include <linux/bug.h> -# include <linux/kernel.h> -# ifndef dprintk -# define dprintk(args...) -# endif -#else -# include <string.h> -# include <stdio.h> -# include <stdlib.h> -# include <assert.h> -# define BUG_ON(x) assert(!(x)) -# define dprintk(args...) /* printf(args) */ -# define kmalloc(x, f) malloc(x) -# define kfree(x) free(x) -#endif - -#include "crush.h" -#include "hash.h" - -/* - * Implement the core CRUSH mapping algorithm. - */ - -/** - * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. - * @map: the crush_map - * @ruleset: the storage ruleset id (user defined) - * @type: storage ruleset type (user defined) - * @size: output set size - */ -int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) -{ - int i; - - for (i = 0; i < map->max_rules; i++) { - if (map->rules[i] && - map->rules[i]->mask.ruleset == ruleset && - map->rules[i]->mask.type == type && - map->rules[i]->mask.min_size <= size && - map->rules[i]->mask.max_size >= size) - return i; - } - return -1; -} - - -/* - * bucket choose methods - * - * For each bucket algorithm, we have a "choose" method that, given a - * crush input @x and replica position (usually, position in output set) @r, - * will produce an item in the bucket. - */ - -/* - * Choose based on a random permutation of the bucket. - * - * We used to use some prime number arithmetic to do this, but it - * wasn't very random, and had some other bad behaviors. Instead, we - * calculate an actual random permutation of the bucket members. - * Since this is expensive, we optimize for the r=0 case, which - * captures the vast majority of calls. - */ -static int bucket_perm_choose(struct crush_bucket *bucket, - int x, int r) -{ - unsigned pr = r % bucket->size; - unsigned i, s; - - /* start a new permutation if @x has changed */ - if (bucket->perm_x != x || bucket->perm_n == 0) { - dprintk("bucket %d new x=%d\n", bucket->id, x); - bucket->perm_x = x; - - /* optimize common r=0 case */ - if (pr == 0) { - s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % - bucket->size; - bucket->perm[0] = s; - bucket->perm_n = 0xffff; /* magic value, see below */ - goto out; - } - - for (i = 0; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm_n = 0; - } else if (bucket->perm_n == 0xffff) { - /* clean up after the r=0 case above */ - for (i = 1; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm[bucket->perm[0]] = 0; - bucket->perm_n = 1; - } - - /* calculate permutation up to pr */ - for (i = 0; i < bucket->perm_n; i++) - dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); - while (bucket->perm_n <= pr) { - unsigned p = bucket->perm_n; - /* no point in swapping the final entry */ - if (p < bucket->size - 1) { - i = crush_hash32_3(bucket->hash, x, bucket->id, p) % - (bucket->size - p); - if (i) { - unsigned t = bucket->perm[p + i]; - bucket->perm[p + i] = bucket->perm[p]; - bucket->perm[p] = t; - } - dprintk(" perm_choose swap %d with %d\n", p, p+i); - } - bucket->perm_n++; - } - for (i = 0; i < bucket->size; i++) - dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); - - s = bucket->perm[pr]; -out: - dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, - bucket->size, x, r, pr, s); - return bucket->items[s]; -} - -/* uniform */ -static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, - int x, int r) -{ - return bucket_perm_choose(&bucket->h, x, r); -} - -/* list */ -static int bucket_list_choose(struct crush_bucket_list *bucket, - int x, int r) -{ - int i; - - for (i = bucket->h.size-1; i >= 0; i--) { - __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], - r, bucket->h.id); - w &= 0xffff; - dprintk("list_choose i=%d x=%d r=%d item %d weight %x " - "sw %x rand %llx", - i, x, r, bucket->h.items[i], bucket->item_weights[i], - bucket->sum_weights[i], w); - w *= bucket->sum_weights[i]; - w = w >> 16; - /*dprintk(" scaled %llx\n", w);*/ - if (w < bucket->item_weights[i]) - return bucket->h.items[i]; - } - - BUG_ON(1); - return 0; -} - - -/* (binary) tree */ -static int height(int n) -{ - int h = 0; - while ((n & 1) == 0) { - h++; - n = n >> 1; - } - return h; -} - -static int left(int x) -{ - int h = height(x); - return x - (1 << (h-1)); -} - -static int right(int x) -{ - int h = height(x); - return x + (1 << (h-1)); -} - -static int terminal(int x) -{ - return x & 1; -} - -static int bucket_tree_choose(struct crush_bucket_tree *bucket, - int x, int r) -{ - int n, l; - __u32 w; - __u64 t; - - /* start at root */ - n = bucket->num_nodes >> 1; - - while (!terminal(n)) { - /* pick point in [0, w) */ - w = bucket->node_weights[n]; - t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, - bucket->h.id) * (__u64)w; - t = t >> 32; - - /* descend to the left or right? */ - l = left(n); - if (t < bucket->node_weights[l]) - n = l; - else - n = right(n); - } - - return bucket->h.items[n >> 1]; -} - - -/* straw */ - -static int bucket_straw_choose(struct crush_bucket_straw *bucket, - int x, int r) -{ - int i; - int high = 0; - __u64 high_draw = 0; - __u64 draw; - - for (i = 0; i < bucket->h.size; i++) { - draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); - draw &= 0xffff; - draw *= bucket->straws[i]; - if (i == 0 || draw > high_draw) { - high = i; - high_draw = draw; - } - } - return bucket->h.items[high]; -} - -static int crush_bucket_choose(struct crush_bucket *in, int x, int r) -{ - dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); - switch (in->alg) { - case CRUSH_BUCKET_UNIFORM: - return bucket_uniform_choose((struct crush_bucket_uniform *)in, - x, r); - case CRUSH_BUCKET_LIST: - return bucket_list_choose((struct crush_bucket_list *)in, - x, r); - case CRUSH_BUCKET_TREE: - return bucket_tree_choose((struct crush_bucket_tree *)in, - x, r); - case CRUSH_BUCKET_STRAW: - return bucket_straw_choose((struct crush_bucket_straw *)in, - x, r); - default: - BUG_ON(1); - return in->items[0]; - } -} - -/* - * true if device is marked "out" (failed, fully offloaded) - * of the cluster - */ -static int is_out(struct crush_map *map, __u32 *weight, int item, int x) -{ - if (weight[item] >= 0x10000) - return 0; - if (weight[item] == 0) - return 1; - if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) - < weight[item]) - return 0; - return 1; -} - -/** - * crush_choose - choose numrep distinct items of given type - * @map: the crush_map - * @bucket: the bucket we are choose an item from - * @x: crush input value - * @numrep: the number of items to choose - * @type: the type of item to choose - * @out: pointer to output vector - * @outpos: our position in that vector - * @firstn: true if choosing "first n" items, false if choosing "indep" - * @recurse_to_leaf: true if we want one device under each item of given type - * @out2: second output vector for leaf items (if @recurse_to_leaf) - */ -static int crush_choose(struct crush_map *map, - struct crush_bucket *bucket, - __u32 *weight, - int x, int numrep, int type, - int *out, int outpos, - int firstn, int recurse_to_leaf, - int *out2) -{ - int rep; - int ftotal, flocal; - int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; - int r; - int i; - int item = 0; - int itemtype; - int collide, reject; - const int orig_tries = 5; /* attempts before we fall back to search */ - - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); - - for (rep = outpos; rep < numrep; rep++) { - /* keep trying until we get a non-out, non-colliding item */ - ftotal = 0; - skip_rep = 0; - do { - retry_descent = 0; - in = bucket; /* initial bucket */ - - /* choose through intervening buckets */ - flocal = 0; - do { - collide = 0; - retry_bucket = 0; - r = rep; - if (in->alg == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || numrep >= in->size) - /* r' = r + f_total */ - r += ftotal; - else if (in->size % numrep == 0) - /* r'=r+(n+1)*f_local */ - r += (numrep+1) * - (flocal+ftotal); - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } else { - if (firstn) - /* r' = r + f_total */ - r += ftotal; - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } - - /* bucket choose */ - if (in->size == 0) { - reject = 1; - goto reject; - } - if (flocal >= (in->size>>1) && - flocal > orig_tries) - item = bucket_perm_choose(in, x, r); - else - item = crush_bucket_choose(in, x, r); - BUG_ON(item >= map->max_devices); - - /* desired type? */ - if (item < 0) - itemtype = map->buckets[-1-item]->type; - else - itemtype = 0; - dprintk(" item %d type %d\n", item, itemtype); - - /* keep going? */ - if (itemtype != type) { - BUG_ON(item >= 0 || - (-1-item) >= map->max_buckets); - in = map->buckets[-1-item]; - retry_bucket = 1; - continue; - } - - /* collision? */ - for (i = 0; i < outpos; i++) { - if (out[i] == item) { - collide = 1; - break; - } - } - - reject = 0; - if (recurse_to_leaf) { - if (item < 0) { - if (crush_choose(map, - map->buckets[-1-item], - weight, - x, outpos+1, 0, - out2, outpos, - firstn, 0, - NULL) <= outpos) - /* didn't get leaf */ - reject = 1; - } else { - /* we already have a leaf! */ - out2[outpos] = item; - } - } - - if (!reject) { - /* out? */ - if (itemtype == 0) - reject = is_out(map, weight, - item, x); - else - reject = 0; - } - -reject: - if (reject || collide) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - /* retry locally a few times */ - retry_bucket = 1; - else if (flocal < in->size + orig_tries) - /* exhaustive bucket search */ - retry_bucket = 1; - else if (ftotal < 20) - /* then retry descent */ - retry_descent = 1; - else - /* else give up */ - skip_rep = 1; - dprintk(" reject %d collide %d " - "ftotal %d flocal %d\n", - reject, collide, ftotal, - flocal); - } - } while (retry_bucket); - } while (retry_descent); - - if (skip_rep) { - dprintk("skip rep\n"); - continue; - } - - dprintk("CHOOSE got %d\n", item); - out[outpos] = item; - outpos++; - } - - dprintk("CHOOSE returns %d\n", outpos); - return outpos; -} - - -/** - * crush_do_rule - calculate a mapping with the given input and rule - * @map: the crush_map - * @ruleno: the rule id - * @x: hash input - * @result: pointer to result vector - * @result_max: maximum result size - * @force: force initial replica choice; -1 for none - */ -int crush_do_rule(struct crush_map *map, - int ruleno, int x, int *result, int result_max, - int force, __u32 *weight) -{ - int result_len; - int force_context[CRUSH_MAX_DEPTH]; - int force_pos = -1; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int c[CRUSH_MAX_SET]; - int recurse_to_leaf; - int *w; - int wsize = 0; - int *o; - int osize; - int *tmp; - struct crush_rule *rule; - int step; - int i, j; - int numrep; - int firstn; - int rc = -1; - - BUG_ON(ruleno >= map->max_rules); - - rule = map->rules[ruleno]; - result_len = 0; - w = a; - o = b; - - /* - * determine hierarchical context of force, if any. note - * that this may or may not correspond to the specific types - * referenced by the crush rule. - */ - if (force >= 0) { - if (force >= map->max_devices || - map->device_parents[force] == 0) { - /*dprintk("CRUSH: forcefed device dne\n");*/ - rc = -1; /* force fed device dne */ - goto out; - } - if (!is_out(map, weight, force, x)) { - while (1) { - force_context[++force_pos] = force; - if (force >= 0) - force = map->device_parents[force]; - else - force = map->bucket_parents[-1-force]; - if (force == 0) - break; - } - } - } - - for (step = 0; step < rule->len; step++) { - firstn = 0; - switch (rule->steps[step].op) { - case CRUSH_RULE_TAKE: - w[0] = rule->steps[step].arg1; - if (force_pos >= 0) { - BUG_ON(force_context[force_pos] != w[0]); - force_pos--; - } - wsize = 1; - break; - - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: - case CRUSH_RULE_CHOOSE_FIRSTN: - firstn = 1; - case CRUSH_RULE_CHOOSE_LEAF_INDEP: - case CRUSH_RULE_CHOOSE_INDEP: - BUG_ON(wsize == 0); - - recurse_to_leaf = - rule->steps[step].op == - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || - rule->steps[step].op == - CRUSH_RULE_CHOOSE_LEAF_INDEP; - - /* reset output */ - osize = 0; - - for (i = 0; i < wsize; i++) { - /* - * see CRUSH_N, CRUSH_N_MINUS macros. - * basically, numrep <= 0 means relative to - * the provided result_max - */ - numrep = rule->steps[step].arg1; - if (numrep <= 0) { - numrep += result_max; - if (numrep <= 0) - continue; - } - j = 0; - if (osize == 0 && force_pos >= 0) { - /* skip any intermediate types */ - while (force_pos && - force_context[force_pos] < 0 && - rule->steps[step].arg2 != - map->buckets[-1 - - force_context[force_pos]]->type) - force_pos--; - o[osize] = force_context[force_pos]; - if (recurse_to_leaf) - c[osize] = force_context[0]; - j++; - force_pos--; - } - osize += crush_choose(map, - map->buckets[-1-w[i]], - weight, - x, numrep, - rule->steps[step].arg2, - o+osize, j, - firstn, - recurse_to_leaf, c+osize); - } - - if (recurse_to_leaf) - /* copy final _leaf_ values to output set */ - memcpy(o, c, osize*sizeof(*o)); - - /* swap t and w arrays */ - tmp = o; - o = w; - w = tmp; - wsize = osize; - break; - - - case CRUSH_RULE_EMIT: - for (i = 0; i < wsize && result_len < result_max; i++) { - result[result_len] = w[i]; - result_len++; - } - wsize = 0; - break; - - default: - BUG_ON(1); - } - } - rc = result_len; - -out: - return rc; -} - - |