compare.c

map_with_crush():

 1 void map_with_crush(int replication_count, int hosts_count, int object_map[][NUMBER_OF_OBJECTS]) {
 2   struct crush_map *m = crush_create();
 3   m->choose_local_tries = 0;
 4   m->choose_local_fallback_tries = 0;
 5   m->choose_total_tries = 50;
 6   m->chooseleaf_descend_once = 1;
 7   m->chooseleaf_vary_r = 1;
 8   m->chooseleaf_stable = 1;
 9   m->allowed_bucket_algs =
10     (1 << CRUSH_BUCKET_UNIFORM) |
11     (1 << CRUSH_BUCKET_LIST) |
12     (1 << CRUSH_BUCKET_STRAW2);
13   int root_type = 1;
14   int host_type = 2;  
15   int bucketno = 0;
16 
17   int hosts[hosts_count];
18   int weights[hosts_count];
19   int disk = 0;
20   int weight = 0x10000;  // 1.0
21   // 创建一个多个host类型的bucket,并给每个bucket添加一个disk的item
22   for(int host = 0; host < hosts_count; host++) {
23     struct crush_bucket *b;
24     // 函数创建一个bucket结构。需要指定CRUSH算法(uniform、list、tree、straw)、hash算法、
25     // bucket类型(自定义的type)、size(包括多少个item)、这些items的id数组、这些items的weights
26     b = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, host_type,
27                           0, NULL, NULL);
28     assert(b != NULL);
29     // 往bucket中添加item, disk为item的id,weight为item的权重
30     assert(crush_bucket_add_item(m, b, disk, weight) == 0);
31     // 将b添加到m中,m->buckets[],bucket的id为bucketno
32     assert(crush_add_bucket(m, 0, b, &bucketno) == 0);
33     hosts[host] = bucketno;
34     weights[host] = weight;
35     disk++;
36   }
37 
38   struct crush_bucket *root;
39   int bucket_root;
40   // 创建一个root类型的bucket,bucket中添加的item是hosts
41   root = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, root_type,
42                            hosts_count, hosts, weights);
43   assert(root != NULL);
44   // 将bucket添加带map中
45   assert(crush_add_bucket(m, 0, root, &bucket_root) == 0);
46   // 调整权重分布
47   assert(crush_reweight_bucket(m, root) == 0);
48   
49   struct crush_rule *r;
50   int minsize = 1;
51   int maxsize = 5;
52   int number_of_steps = 3;
53   r = crush_make_rule(number_of_steps, 0, 0, minsize, maxsize);
54   assert(r != NULL);
55   crush_rule_set_step(r, 0, CRUSH_RULE_TAKE, bucket_root, 0);
56   crush_rule_set_step(r, 1, CRUSH_RULE_CHOOSELEAF_FIRSTN, replication_count, host_type);
57   crush_rule_set_step(r, 2, CRUSH_RULE_EMIT, 0, 0);
58   int ruleno = crush_add_rule(m, r, -1);
59   assert(ruleno >= 0);
60 
61   crush_finalize(m);
62 
63   {
64     int result[replication_count];
65     __u32 weights[hosts_count];
66     for(int i = 0; i < hosts_count; i++)
67       weights[i] = 0x10000;
68     int cwin_size = crush_work_size(m, replication_count);
69     char cwin[cwin_size];
70     crush_init_workspace(m, cwin);
71     for(int x = 0; x < NUMBER_OF_OBJECTS; x++) {
72       memset(result, '', sizeof(int) * replication_count);
73       assert(crush_do_rule(m, ruleno, x, result, replication_count, weights, hosts_count, cwin) == replication_count);
74       for(int i = 0; i < replication_count; i++) {
75         object_map[i][x] = result[i];
76       }
77     }
78   }
79   crush_destroy(m);
80 }

构造crush map

1. 创建crush map

 struct crush_map *m = crush_create();

 1 struct crush_map *crush_create()
 2 {
 3     struct crush_map *m;
 4     m = malloc(sizeof(*m));
 5         if (!m)
 6                 return NULL;
 7     memset(m, 0, sizeof(*m));
 8 
 9     /* initialize legacy tunable values */
10     m->choose_local_tries = 2;
11     m->choose_local_fallback_tries = 5;
12     m->choose_total_tries = 19;
13     m->chooseleaf_descend_once = 0;
14     m->chooseleaf_vary_r = 0;
15     m->straw_calc_version = 0;
16 
17     // by default, use legacy types, and also exclude tree,
18     // since it was buggy.
19     m->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS;
20     return m;
21 }

2. 添加bucket

 分为两步:先make,后add

 1  // 创建一个多个host类型的bucket,并给每个bucket添加一个disk的item
 2   for(int host = 0; host < hosts_count; host++) {
 3     struct crush_bucket *b;
 4     // 函数创建一个bucket结构。需要指定CRUSH算法(uniform、list、tree、straw)、hash算法、
 5     // bucket类型(自定义的type)、size(包括多少个item)、这些items的id数组、这些items的weights
 6     b = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, host_type,
 7                           0, NULL, NULL);
 8     assert(b != NULL);
 9     // 往bucket中添加item, disk为item的id,weight为item的权重
10     assert(crush_bucket_add_item(m, b, disk, weight) == 0);
11     // 将b添加到m中,m->buckets[],bucket的id为bucketno
12     assert(crush_add_bucket(m, 0, b, &bucketno) == 0);
13     hosts[host] = bucketno;
14     weights[host] = weight;
15     disk++;
16   }

2.1 make bucket

先看一下bucket的数据结构:

 1 /** @ingroup API
 2  *
 3  * A bucket contains __size__ __items__ which are either positive
 4  * numbers or negative numbers that reference other buckets and is
 5  * uniquely identified with __id__ which is a negative number.  The
 6  * __weight__ of a bucket is the cumulative weight of all its
 7  * children.  A bucket is assigned a ::crush_algorithm that is used by
 8  * crush_do_rule() to draw an item depending on its weight.  A bucket
 9  * can be assigned a strictly positive (> 0) __type__ defined by the
10  * caller. The __type__ can be used by crush_do_rule(), when it is
11  * given as an argument of a rule step.
12  *
13  * A pointer to crush_bucket can safely be cast into the following
14  * structure, depending on the value of __alg__:
15  *
16  * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
17  * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
18  * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
19  *
20  * The weight of each item depends on the algorithm and the
21  * information about it is available in the corresponding structure
22  * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
23  *
24  * See crush_map for more information on how __id__ is used
25  * to reference the bucket.
26  */
27 struct crush_bucket {
28     __s32 id;        /*!< bucket identifier, < 0 and unique within a crush_map 小于0的唯一标记 */
29     __u16 type;      /*!< > 0 bucket type, defined by the caller > 0的值,用户自定义的类型 */
30     __u8 alg;        /*!< the item selection ::crush_algorithm 所采用的crush算法 */
31         /*! @cond INTERNAL */
32     __u8 hash;       /* which hash function to use, CRUSH_HASH_* 使用的hash算法类型 */
33     /*! @endcond */
34     __u32 weight;    /*!< 16.16 fixed point cumulated children weight */
35     __u32 size;      /*!< size of the __items__ array,如果size为0,则不包含item*/
36         __s32 *items;    /*!< array of children: < 0 are buckets, >= 0 items, 这是个数组,
37         记录所有的item, 如果item的值是负数,则为buket, 为自然数则是device */
38 };

产生一个bucket:

b = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, host_type, 0, NULL, NULL);

 1 struct crush_bucket*
 2 crush_make_bucket(struct crush_map *map,
 3           int alg, int hash, int type, int size,
 4           int *items,
 5           int *weights)
 6 {
 7     int item_weight;
 8 
 9     switch (alg) {
10     case CRUSH_BUCKET_UNIFORM:
11         if (size && weights)
12             item_weight = weights[0];
13         else
14             item_weight = 0;
15         return (struct crush_bucket *)crush_make_uniform_bucket(hash, type, size, items, item_weight);
16 
17     case CRUSH_BUCKET_LIST:
18         return (struct crush_bucket *)crush_make_list_bucket(hash, type, size, items, weights);
19 
20     case CRUSH_BUCKET_TREE:
21         return (struct crush_bucket *)crush_make_tree_bucket(hash, type, size, items, weights);
22 
23     case CRUSH_BUCKET_STRAW:
24         return (struct crush_bucket *)crush_make_straw_bucket(map, hash, type, size, items, weights);
25     case CRUSH_BUCKET_STRAW2:
26         return (struct crush_bucket *)crush_make_straw2_bucket(map, hash, type, size, items, weights);
27     }
28     return 0;
29 }
30 
31 struct crush_bucket_straw2 *
32 crush_make_straw2_bucket(struct crush_map *map,
33              int hash,          // hash值
34              int type,          // 类型
35              int size,          // item的大小
36              int *items,        // items
37              int *weights)      // item的weight值
38 {
39     struct crush_bucket_straw2 *bucket;
40     int i;
41 
42     bucket = malloc(sizeof(*bucket));
43         if (!bucket)
44                 return NULL;
45     memset(bucket, 0, sizeof(*bucket));
46     bucket->h.alg = CRUSH_BUCKET_STRAW2;
47     bucket->h.hash = hash;
48     bucket->h.type = type;
49     bucket->h.size = size;
50 
51     bucket->h.items = malloc(sizeof(__s32)*size);
52         if (!bucket->h.items)
53                 goto err;
54     bucket->item_weights = malloc(sizeof(__u32)*size);
55         if (!bucket->item_weights)
56                 goto err;
57 
58     bucket->h.weight = 0;
59     for (i=0; i<size; i++) {
60         bucket->h.items[i] = items[i];        // item中是item的id
61         bucket->h.weight += weights[i];       // 总的weight值
62         bucket->item_weights[i] = weights[i];
63     }
64 
65     return bucket;
66 err:
67         free(bucket->item_weights);
68         free(bucket->h.items);
69         free(bucket);
70         return NULL;
71 }

2.2 向bucket中添加item

入参:

1. struct crush_map *map  map

2. struct crush_bucket *b   bucket

3. int item       item的id

4. int weight   item的weight

assert(crush_bucket_add_item(m, b, disk, weight) == 0);

 1 int crush_bucket_add_item(struct crush_map *map,
 2               struct crush_bucket *b, int item, int weight)
 3 {
 4     switch (b->alg) {
 5     case CRUSH_BUCKET_UNIFORM:
 6         return crush_add_uniform_bucket_item((struct crush_bucket_uniform *)b, item, weight);
 7     case CRUSH_BUCKET_LIST:
 8         return crush_add_list_bucket_item((struct crush_bucket_list *)b, item, weight);
 9     case CRUSH_BUCKET_TREE:
10         return crush_add_tree_bucket_item((struct crush_bucket_tree *)b, item, weight);
11     case CRUSH_BUCKET_STRAW:
12         return crush_add_straw_bucket_item(map, (struct crush_bucket_straw *)b, item, weight);
13     case CRUSH_BUCKET_STRAW2:
14         return crush_add_straw2_bucket_item(map, (struct crush_bucket_straw2 *)b, item, weight);
15     default:
16         return -1;
17     }
18 }
19 
20 int crush_add_straw2_bucket_item(struct crush_map *map,
21                  struct crush_bucket_straw2 *bucket,
22                  int item, int weight)
23 {
24     int newsize = bucket->h.size + 1;          // 添加一个item
25 
26     void *_realloc = NULL;
27 
28     if ((_realloc = realloc(bucket->h.items, sizeof(__s32)*newsize)) == NULL) {
29         return -ENOMEM;
30     } else {
31         bucket->h.items = _realloc;
32     }
33     if ((_realloc = realloc(bucket->item_weights, sizeof(__u32)*newsize)) == NULL) {
34         return -ENOMEM;
35     } else {
36         bucket->item_weights = _realloc;
37     }
38 
39     bucket->h.items[newsize-1] = item;
40     bucket->item_weights[newsize-1] = weight;
41 
42     if (crush_addition_is_unsafe(bucket->h.weight, weight))
43                 return -ERANGE;
44 
45     bucket->h.weight += weight;
46     bucket->h.size++;
47 
48     return 0;
49 }

2.3  把bucket添加到map中

入参:

1. struct crush_map *map  

2. int id   bucket的id

3. struct crush_bucket *bucket   bucket的结构体

4. int *idout 和id值一致

crush_add_bucket(m, 0, b, &bucketno) == 0

 1 int crush_add_bucket(struct crush_map *map,
 2              int id,
 3              struct crush_bucket *bucket,
 4              int *idout)
 5 {
 6     int pos;
 7 
 8     /* find a bucket id */
 9     if (id == 0)
10         id = crush_get_next_bucket_id(map);
11     pos = -1 - id;
12 
13     while (pos >= map->max_buckets) {     // 扩充map的bucket数组大小
14         /* expand array */
15         int oldsize = map->max_buckets;
16         if (map->max_buckets)             // 如果已经设置最大的bucket个数,则加倍扩充
17             map->max_buckets *= 2;
18         else
19             map->max_buckets = 8;         // 未进行设置,则默认为8
20         void *_realloc = NULL;
21         if ((_realloc = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0]))) == NULL) {
22             return -ENOMEM; 
23         } else {
24             map->buckets = _realloc;
25         }
26         memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0]));  // 初始化
27     }
28 
29     if (map->buckets[pos] != 0) {   // 判断是否已经存在
30         return -EEXIST;
31     }
32 
33     /* add it */
34     bucket->id = id;
35     map->buckets[pos] = bucket;
36 
37     if (idout) *idout = id;    // 返回id
38     return 0;
39 }

2.4 构造root bucket

构造一个root类型的bucket,并将hosts添加到该bucket中;

root = crush_make_bucket(m, CRUSH_BUCKET_STRAW2, CRUSH_HASH_DEFAULT, root_type, hosts_count, hosts, weights);

 1 // 入参
 2 // struct crush_map *map
 3 // int alg 算法类型 CRUSH_BUCKET_STRAW2
 4 // int hash 选择使用的是那种hash函数类型
 5 // int type类型,root type
 6 // 包含的item的大小, hosts_count
 7 // int *items item的id数组 hosts
 8 // int *weights item的weight集合 weights
 9 struct crush_bucket*
10 crush_make_bucket(struct crush_map *map,
11           int alg, int hash, int type, int size,
12           int *items,
13           int *weights)

// 将bucket添加带map中
assert(crush_add_bucket(m, 0, root, &bucket_root) == 0);


// 调整权重分布
assert(crush_reweight_bucket(m, root) == 0);

 1 int crush_reweight_bucket(struct crush_map *crush, struct crush_bucket *b)
 2 {
 3     switch (b->alg) {
 4     case CRUSH_BUCKET_UNIFORM:
 5         return crush_reweight_uniform_bucket(crush, (struct crush_bucket_uniform *)b);
 6     case CRUSH_BUCKET_LIST:
 7         return crush_reweight_list_bucket(crush, (struct crush_bucket_list *)b);
 8     case CRUSH_BUCKET_TREE:
 9         return crush_reweight_tree_bucket(crush, (struct crush_bucket_tree *)b);
10     case CRUSH_BUCKET_STRAW:
11         return crush_reweight_straw_bucket(crush, (struct crush_bucket_straw *)b);
12     case CRUSH_BUCKET_STRAW2:
13         return crush_reweight_straw2_bucket(crush, (struct crush_bucket_straw2 *)b);
14     default:
15         return -1;
16     }
17 }
18 
19 static int crush_reweight_straw2_bucket(struct crush_map *crush, struct crush_bucket_straw2 *bucket)
20 {
21     unsigned i;
22 
23     bucket->h.weight = 0;
24     for (i = 0; i < bucket->h.size; i++) {
25         int id = bucket->h.items[i];
26         if (id < 0) {
27             struct crush_bucket *c = crush->buckets[-1-id];
28             crush_reweight_bucket(crush, c);        // 递归调用,逐级调整其实就是重新计算下每一层的bucket的weight
29             bucket->item_weights[i] = c->weight;
30         }
31 
32         if (crush_addition_is_unsafe(bucket->h.weight, bucket->item_weights[i]))
33             return -ERANGE;
34 
35         bucket->h.weight += bucket->item_weights[i];
36     }
37 
38     return 0;
39 }

至此,crush_map已经创建完成

梳理一下:

1. 创建一个空map;

2. 创建host类型的bucket,向bucket添加disk类型的item; bucket, bucket->h.items[]

3. 将host添加到bucket中; map->buckets[]

4. 创建root类型的bucket,将host类型的bucket添加到root下,并调整相应的weight值;

接下来分析ruleset规则的创建,它决定了rule的选路规则

rule的数据结构

 1 /*
 2  * The rule mask is used to describe what the rule is intended for.
 3  * Given a ruleset and size of output set, we search through the
 4  * rule list for a matching rule_mask.
 5  */
 6 struct crush_rule_mask {
 7     __u8 ruleset;  // ruleset的编号
 8     __u8 type;     // 类型
 9     __u8 min_size;
10     __u8 max_size;
11 };
12 
13 struct crush_rule {
14     __u32 len;   //steps数组的长度
15     struct crush_rule_mask mask;      // ruleset的相关配置参数
16     struct crush_rule_step steps[0];  // 每一个操作步骤,操作类型+参数
17 };

1. 首先,创建一个rule的数据结构,用于存储rule信息

r = crush_make_rule(number_of_steps, 0, 0, minsize, maxsize);

 1 // step的number,ruleset的id,type的id
 2 struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize)
 3 {
 4     struct crush_rule *rule;
 5     rule = malloc(crush_rule_size(len));
 6         if (!rule)
 7                 return NULL;
 8     rule->len = len;
 9     rule->mask.ruleset = ruleset;
10     rule->mask.type = type;
11     rule->mask.min_size = minsize;
12     rule->mask.max_size = maxsize;
13     return rule;
14 }

2. 接着,添加step:

 1 crush_rule_set_step(r, 0, CRUSH_RULE_TAKE, bucket_root, 0);
 2 
 3 crush_rule_set_step(r, 1, CRUSH_RULE_CHOOSELEAF_FIRSTN, replication_count, host_type);
 4 
 5 crush_rule_set_step(r, 2, CRUSH_RULE_EMIT, 0, 0);
 6 
 7 /*  入参:n为step的编号,op为操作码,arg为参数
 8  * be careful; this doesn't verify that the buffer you allocated is big enough!
 9  */
10 void crush_rule_set_step(struct crush_rule *rule, int n, int op, int arg1, int arg2)
11 {
12     assert((__u32)n < rule->len);
13     rule->steps[n].op = op;
14     rule->steps[n].arg1 = arg1;
15     rule->steps[n].arg2 = arg2;
16 }

3. 添加rule:

 1 int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno)
 2 {
 3     __u32 r;
 4     // 如果ruleno小于0,则自己分配连续的rule,否则,按照指定的ruleno
 5     if (ruleno < 0) {
 6         for (r=0; r < map->max_rules; r++)
 7             if (map->rules[r] == 0)
 8                 break;
 9         assert(r < CRUSH_MAX_RULES);
10     }
11     else
12         r = ruleno;
13 
14     if (r >= map->max_rules) {
15         /* expand array */
16         int oldsize;
17         void *_realloc = NULL;
18         if (map->max_rules +1 > CRUSH_MAX_RULES)
19             return -ENOSPC;
20         oldsize = map->max_rules;
21         map->max_rules = r+1;  
22         if ((_realloc = realloc(map->rules, map->max_rules * sizeof(map->rules[0]))) == NULL) {
23             return -ENOMEM; 
24         } else {
25             map->rules = _realloc;
26         } 
27         memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0]));
28     }
29 
30     /* add it */
31     map->rules[r] = rule;
32     return r;
33 }

看一下map的数据结构:

 1 /** @ingroup API
 2  *
 3  * A crush map define a hierarchy of crush_bucket that end with leaves
 4  * (buckets and leaves are called items) and a set of crush_rule to
 5  * map an integer to items with the crush_do_rule() function.
 6  *
 7  */
 8 struct crush_map {
 9         /*! An array of crush_bucket pointers of size __max_buckets__.
10          * An element of the array may be NULL if the bucket was removed with
11          * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
12          * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
13          */
14     struct crush_bucket **buckets;     // 动态二维数组,保存所有的bucket结构
15         /*! An array of crush_rule pointers of size __max_rules__.
16          * An element of the array may be NULL if the rule was removed (there is
17          * no API to do so but there may be one in the future). The rules must be added
18          * with crush_add_rule().
19          */
20     struct crush_rule **rules;        // 保存所有的Crush_rule 
21         __s32 max_buckets; /*!< the size of __buckets__ */
22     __u32 max_rules; /*!< the size of __rules__ */
23         /*! The value of the highest item stored in the crush_map + 1
24          */
25     __s32 max_devices;
26 
27     /*! choose local retries before re-descent */
28     __u32 choose_local_tries;
29     /*! choose local attempts using a fallback permutation before
30      *! re-descent */
31     __u32 choose_local_fallback_tries;
32     /*! choose attempts before giving up */
33     __u32 choose_total_tries;
34     /*! attempt chooseleaf inner descent once for firstn mode; on
35      *! reject retry outer descent.  Note that this does *not*
36      *! apply to a collision: in that case we will retry as we used
37      *! to. */
38     __u32 chooseleaf_descend_once;
39 
40     /*! if non-zero, feed r into chooseleaf, bit-shifted right by (r-1)
41      *! bits.  a value of 1 is best for new clusters.  for legacy clusters
42      *! that want to limit reshuffling, a value of 3 or 4 will make the
43      *! mappings line up a bit better with previous mappings. */
44     __u8 chooseleaf_vary_r;
45 
46     /*! if true, it makes chooseleaf firstn to return stable results (if
47      *! no local retry) so that data migrations would be optimal when some
48      *! device fails. */
49     __u8 chooseleaf_stable;
50 
51         /*! @cond INTERNAL */
52     /* This value is calculated after decode or construction by
53        the builder. It is exposed here (rather than having a
54        'build CRUSH working space' function) so that callers can
55        reserve a static buffer, allocate space on the stack, or
56        otherwise avoid calling into the heap allocator if they
57        want to. The size of the working space depends on the map,
58        while the size of the scratch vector passed to the mapper
59        depends on the size of the desired result set.
60 
61        Nothing stops the caller from allocating both in one swell
62        foop and passing in two points, though. */
63     size_t working_size;
64 
65 #ifndef __KERNEL__
66     /*
67      * version 0 (original) of straw_calc has various flaws.  version 1
68      * fixes a few of them.
69      */
70     __u8 straw_calc_version;
71 
72     /*
73      * allowed bucket algs is a bitmask, here the bit positions
74      * are CRUSH_BUCKET_*.  note that these are *bits* and
75      * CRUSH_BUCKET_* values are not, so we need to or together (1
76      * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
77      * minimize confusion (bucket type values start at 1).
78      */
79     __u32 allowed_bucket_algs;
80 
81     __u32 *choose_tries;
82 #endif
83     /*! @endcond */
84 };

4. finalize rule

 1 /* 更新working_size和max_devices
 2  * finalize should be called _after_ all buckets are added to the map.
 3  */
 4 void crush_finalize(struct crush_map *map)
 5 {
 6     int b;
 7     __u32 i;
 8 
 9     /* Calculate the needed working space while we do other
10        finalization tasks. */
11     map->working_size = sizeof(struct crush_work);
12     /* Space for the array of pointers to per-bucket workspace */
13     map->working_size += map->max_buckets *
14         sizeof(struct crush_work_bucket *);
15 
16     /* calc max_devices */
17     map->max_devices = 0;
18     for (b=0; b<map->max_buckets; b++) {
19         if (map->buckets[b] == 0)
20             continue;
21         // 更新map->max_devices
22         for (i=0; i<map->buckets[b]->size; i++)
23             if (map->buckets[b]->items[i] >= map->max_devices)
24                 map->max_devices = map->buckets[b]->items[i] + 1;
25 
26         switch (map->buckets[b]->alg) {
27         default:
28             /* The base case, permutation variables and
29                the pointer to the permutation array. */
30             map->working_size += sizeof(struct crush_work_bucket);
31             break;
32         }
33         /* Every bucket has a permutation array. */
34         map->working_size += map->buckets[b]->size * sizeof(__u32);
35     }
36 }

如何调用

先初始化空间:

 1 int cwin_size = crush_work_size(m, replication_count);  // 3倍空间,o和w
 2 
 3 crush_init_workspace(m, cwin);
 4 
 5 for(int x = 0; x < NUMBER_OF_OBJECTS; x++) {
 6       memset(result, '', sizeof(int) * replication_count);
 7       assert(crush_do_rule(m, ruleno, x, result, replication_count, weights, hosts_count, cwin) == replication_count);
 8       for(int i = 0; i < replication_count; i++) {
 9         object_map[i][x] = result[i];
10       }
11  }

crush do rule

  1 /**
  2  * crush_do_rule - calculate a mapping with the given input and rule
  3  * @map: the crush_map
  4  * @ruleno: the rule id
  5  * @x: hash input
  6  * @result: pointer to result vector
  7  * @result_max: maximum result size
  8  * @weight: weight vector (for map leaves)
  9  * @weight_max: size of weight vector
 10  * @cwin: Pointer to at least map->working_size bytes of memory or NULL.
 11  */
 12 int crush_do_rule(const struct crush_map *map,
 13           int ruleno, int x, int *result, int result_max,
 14           const __u32 *weight, int weight_max,
 15           void *cwin)
 16 {
 17     int result_len;
 18     struct crush_work *cw = cwin;
 19     int *a = (int *)((char *)cw + map->working_size);  // 用了一个scratch空间来完成item的搜索
 20     int *b = a + result_max;
 21     int *c = b + result_max;
 22     int *w = a;
 23     int *o = b;
 24     int recurse_to_leaf;
 25     int wsize = 0;
 26     int osize;
 27     int *tmp;
 28     const struct crush_rule *rule;
 29     __u32 step;
 30     int i, j;
 31     int numrep;
 32     int out_size;
 33     /*
 34      * the original choose_total_tries value was off by one (it
 35      * counted "retries" and not "tries").  add one.
 36      */
 37     int choose_tries = map->choose_total_tries + 1;
 38     int choose_leaf_tries = 0;
 39     /*
 40      * the local tries values were counted as "retries", though,
 41      * and need no adjustment
 42      */
 43     int choose_local_retries = map->choose_local_tries;
 44     int choose_local_fallback_retries = map->choose_local_fallback_tries;
 45 
 46     int vary_r = map->chooseleaf_vary_r;
 47     int stable = map->chooseleaf_stable;
 48 
 49     if ((__u32)ruleno >= map->max_rules) {
 50         dprintk(" bad ruleno %d
", ruleno);
 51         return 0;
 52     }
 53 
 54     rule = map->rules[ruleno];
 55     result_len = 0;
 56 
 57     for (step = 0; step < rule->len; step++) {
 58         int firstn = 0;
 59         const struct crush_rule_step *curstep = &rule->steps[step];
 60 
 61         switch (curstep->op) {
 62         case CRUSH_RULE_TAKE:
 63             if ((curstep->arg1 >= 0 &&
 64                  curstep->arg1 < map->max_devices) ||
 65                 (-1-curstep->arg1 >= 0 &&
 66                  -1-curstep->arg1 < map->max_buckets &&
 67                  map->buckets[-1-curstep->arg1])) {
 68                 w[0] = curstep->arg1;
 69                 wsize = 1;
 70             } else {
 71                 dprintk(" bad take value %d
", curstep->arg1);
 72             }
 73             break;
 74 
 75         case CRUSH_RULE_SET_CHOOSE_TRIES:
 76             if (curstep->arg1 > 0)
 77                 choose_tries = curstep->arg1;
 78             break;
 79 
 80         case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
 81             if (curstep->arg1 > 0)
 82                 choose_leaf_tries = curstep->arg1;
 83             break;
 84 
 85         case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
 86             if (curstep->arg1 >= 0)
 87                 choose_local_retries = curstep->arg1;
 88             break;
 89 
 90         case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
 91             if (curstep->arg1 >= 0)
 92                 choose_local_fallback_retries = curstep->arg1;
 93             break;
 94 
 95         case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
 96             if (curstep->arg1 >= 0)
 97                 vary_r = curstep->arg1;
 98             break;
 99 
100         case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
101             if (curstep->arg1 >= 0)
102                 stable = curstep->arg1;
103             break;
104 
105         case CRUSH_RULE_CHOOSELEAF_FIRSTN:
106         case CRUSH_RULE_CHOOSE_FIRSTN:
107             firstn = 1;
108             /* fall through */
109         case CRUSH_RULE_CHOOSELEAF_INDEP:
110         case CRUSH_RULE_CHOOSE_INDEP:
111             if (wsize == 0)
112                 break;
113 
114             recurse_to_leaf =
115                 curstep->op ==
116                  CRUSH_RULE_CHOOSELEAF_FIRSTN ||
117                 curstep->op ==
118                 CRUSH_RULE_CHOOSELEAF_INDEP;
119 
120             /* reset output */
121             osize = 0;
122 
123             for (i = 0; i < wsize; i++) {
124                 int bno;
125                 /*
126                  * see CRUSH_N, CRUSH_N_MINUS macros.
127                  * basically, numrep <= 0 means relative to
128                  * the provided result_max
129                  */
130                 numrep = curstep->arg1;
131                 if (numrep <= 0) {
132                     numrep += result_max;
133                     if (numrep <= 0)
134                         continue;
135                 }
136                 j = 0;
137                 /* make sure bucket id is valid */
138                 bno = -1 - w[i];
139                 if (bno < 0 || bno >= map->max_buckets) {
140                     // w[i] is probably CRUSH_ITEM_NONE
141                     dprintk("  bad w[i] %d
", w[i]);
142                     continue;
143                 }
144                 if (firstn) {
145                     int recurse_tries;
146                     if (choose_leaf_tries)
147                         recurse_tries =
148                             choose_leaf_tries;
149                     else if (map->chooseleaf_descend_once)
150                         recurse_tries = 1;
151                     else
152                         recurse_tries = choose_tries;
153                     osize += crush_choose_firstn(
154                         map,
155                         cw,
156                         map->buckets[bno],
157                         weight, weight_max,
158                         x, numrep,
159                         curstep->arg2,
160                         o+osize, j,
161                         result_max-osize,
162                         choose_tries,
163                         recurse_tries,
164                         choose_local_retries,
165                         choose_local_fallback_retries,
166                         recurse_to_leaf,
167                         vary_r,
168                         stable,
169                         c+osize,
170                         0);
171                 } else {
172                     out_size = ((numrep < (result_max-osize)) ?
173                             numrep : (result_max-osize));
174                     crush_choose_indep(
175                         map,
176                         cw,
177                         map->buckets[bno],
178                         weight, weight_max,
179                         x, out_size, numrep,
180                         curstep->arg2,
181                         o+osize, j,
182                         choose_tries,
183                         choose_leaf_tries ?
184                            choose_leaf_tries : 1,
185                         recurse_to_leaf,
186                         c+osize,
187                         0);
188                     osize += out_size;
189                 }
190             }
191 
192             if (recurse_to_leaf)
193                 /* copy final _leaf_ values to output set */
194                 memcpy(o, c, osize*sizeof(*o));
195 
196             /* swap o and w arrays */
197             tmp = o;
198             o = w;
199             w = tmp;
200             wsize = osize;
201             break;
202 
203 
204         case CRUSH_RULE_EMIT:
205             for (i = 0; i < wsize && result_len < result_max; i++) {
206                 result[result_len] = w[i];
207                 result_len++;
208             }
209             wsize = 0;
210             break;
211 
212         default:
213             dprintk(" unknown op %d at step %d
",
214                 curstep->op, step);
215             break;
216         }
217     }
218 
219     return result_len;
220 }
原文地址:https://www.cnblogs.com/yunlion/p/10712178.html