crush_do_rule

文件读写接口:

例如:rados_write(ioctx, "foo", buf, sizeof(buf), 0)

 1 /**
 2  * Write *len* bytes from *buf* into the *oid* object, starting at
 3  * offset *off*. The value of *len* must be <= UINT_MAX/2.
 4  *
 5  * @note This will never return a positive value not equal to len.   
 6  * @param io the io context in which the write will occur
 7  * @param oid name of the object       // object name
 8  * @param buf data to write            // object contents
 9  * @param len length of the data, in bytes  // number of bytes to write
10  * @param off byte offset in the object to begin writing at  // starting offset of the write
11  * @returns 0 on success, negative error code on failure
12  */
13 CEPH_RADOS_API int rados_write(rados_ioctx_t io, const char *oid,
14                                const char *buf, size_t len, uint64_t off);

 

do_rule函数:

参数 const WeightVector& weight 保存每个 item 的权重(weight)。
 1   template<typename WeightVector>   // wrapper around crush_do_rule() that returns the mapping in `out`
 2   void do_rule(int rule, int x, std::vector<int>& out, int maxout,
 3            const WeightVector& weight,
 4            uint64_t choose_args_index) const {
 5     int rawout[maxout];   // raw result buffer handed to crush_do_rule(), at most maxout entries
 6     char work[crush_work_size(crush, maxout)];   // workspace buffer, sized by crush_work_size()
 7     crush_init_workspace(crush, work);   // prepare the workspace before it is passed to crush_do_rule()
 8     crush_choose_arg_map arg_map = choose_args_get_with_fallback(   // look up the choose_args for the given index
 9       choose_args_index);
10     int numrep = crush_do_rule(crush, rule, x, rawout, maxout, &weight[0],   // weights are passed as a raw array plus its size
11                    weight.size(), work, arg_map.args);
12     if (numrep < 0)   // a negative return value is treated as "no results"
13       numrep = 0;
14     out.resize(numrep);   // `out` ends up holding exactly numrep entries
15     for (int i=0; i<numrep; i++)
16       out[i] = rawout[i];
17   }

 

函数原型:

  1 /**
  2  * crush_do_rule - calculate a mapping with the given input and rule
  3  * @map: the crush_map
  4  * @ruleno: the rule id       
  5  * @x: hash input                    value produced by hashing the object id together with the pool id
  6  * @result: pointer to result vector
  7  * @result_max: maximum result size
  8  * @weight: weight vector (for map leaves)
  9  * @weight_max: size of weight vector
 10  * @cwin: Pointer to at least map->working_size bytes of memory or NULL.
 11  */
 12 int crush_do_rule(const struct crush_map *map,
 13           int ruleno, int x, int *result, int result_max,
 14           const __u32 *weight, int weight_max,
 15           void *cwin, const struct crush_choose_arg *choose_args)
 16 {
 17     int result_len;
 18     struct crush_work *cw = cwin;
 19     int *a = (int *)((char *)cw + map->working_size);
 20     int *b = a + result_max;
 21     int *c = b + result_max;
 22     int *w = a;
 23     int *o = b;
/* a, b and c point to offsets 0, 1/3 and 2/3 of the scratch vector.
w = a; o = b;
w is used as a first-in-first-out queue for a breadth-first (BFS) traversal of the CRUSH map.
o stores the results selected by crush_choose_firstn.
c stores the final OSD selection.
If the result computed by crush_choose_firstn is not of OSD type, o is handed over to w,
so that w becomes the input of the next crush_choose_firstn call.
As described above, crush_do_rule iterates over the steps of the CRUSH rule. The rule can be found in memory: */
24 int recurse_to_leaf; 25 int wsize = 0; 26 int osize; 27 int *tmp; 28 const struct crush_rule *rule; 29 __u32 step; 30 int i, j; 31 int numrep; 32 int out_size; 33 /* 34 * the original choose_total_tries value was off by one (it 35 * counted "retries" and not "tries"). add one. 36 */ 37 int choose_tries = map->choose_total_tries + 1; 38 int choose_leaf_tries = 0; 39 /* 40 * the local tries values were counted as "retries", though, 41 * and need no adjustment 42 */ 43 int choose_local_retries = map->choose_local_tries; 44 int choose_local_fallback_retries = map->choose_local_fallback_tries; 45 46 int vary_r = map->chooseleaf_vary_r; 47 int stable = map->chooseleaf_stable; 48 49 if ((__u32)ruleno >= map->max_rules) { 50 dprintk(" bad ruleno %d\n", ruleno); 51 return 0; 52 } 53 54 rule = map->rules[ruleno]; 55 result_len = 0; 56 57 for (step = 0; step < rule->len; step++) { 58 int firstn = 0; 59 const struct crush_rule_step *curstep = &rule->steps[step]; 60 61 switch (curstep->op) { 62 case CRUSH_RULE_TAKE: 63 if ((curstep->arg1 >= 0 && 64 curstep->arg1 < map->max_devices) || 65 (-1-curstep->arg1 >= 0 && 66 -1-curstep->arg1 < map->max_buckets && 67 map->buckets[-1-curstep->arg1])) { 68 w[0] = curstep->arg1; 69 wsize = 1; 70 } else { 71 dprintk(" bad take value %d\n", curstep->arg1); 72 } 73 break; 74 75 case CRUSH_RULE_SET_CHOOSE_TRIES: 76 if (curstep->arg1 > 0) 77 choose_tries = curstep->arg1; 78 break; 79 80 case CRUSH_RULE_SET_CHOOSELEAF_TRIES: 81 if (curstep->arg1 > 0) 82 choose_leaf_tries = curstep->arg1; 83 break; 84 85 case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 86 if (curstep->arg1 >= 0) 87 choose_local_retries = curstep->arg1; 88 break; 89 90 case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 91 if (curstep->arg1 >= 0) 92 choose_local_fallback_retries = curstep->arg1; 93 break; 94 95 case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: 96 if (curstep->arg1 >= 0) 97 vary_r = curstep->arg1; 98 break; 99 100 case CRUSH_RULE_SET_CHOOSELEAF_STABLE: 101 if (curstep->arg1 >= 0) 102 
stable = curstep->arg1; 103 break; 104 105 case CRUSH_RULE_CHOOSELEAF_FIRSTN: 106 case CRUSH_RULE_CHOOSE_FIRSTN: 107 firstn = 1; 108 /* fall through */ 109 case CRUSH_RULE_CHOOSELEAF_INDEP: 110 case CRUSH_RULE_CHOOSE_INDEP: 111 if (wsize == 0) 112 break; 113 114 recurse_to_leaf = 115 curstep->op == 116 CRUSH_RULE_CHOOSELEAF_FIRSTN || 117 curstep->op == 118 CRUSH_RULE_CHOOSELEAF_INDEP; 119 120 /* reset output */ 121 osize = 0; 122 123 for (i = 0; i < wsize; i++) { 124 int bno; 125 numrep = curstep->arg1; 126 if (numrep <= 0) { 127 numrep += result_max; 128 if (numrep <= 0) 129 continue; 130 } 131 j = 0; 132 /* make sure bucket id is valid */ 133 bno = -1 - w[i]; 134 if (bno < 0 || bno >= map->max_buckets) { 135 // w[i] is probably CRUSH_ITEM_NONE 136 dprintk(" bad w[i] %d\n", w[i]); 137 continue; 138 } 139 if (firstn) { 140 int recurse_tries; 141 if (choose_leaf_tries) 142 recurse_tries = 143 choose_leaf_tries; 144 else if (map->chooseleaf_descend_once) 145 recurse_tries = 1; 146 else 147 recurse_tries = choose_tries; 148 osize += crush_choose_firstn( 149 map, 150 cw, 151 map->buckets[bno], 152 weight, weight_max, 153 x, numrep, 154 curstep->arg2, 155 o+osize, j, 156 result_max-osize, 157 choose_tries, 158 recurse_tries, 159 choose_local_retries, 160 choose_local_fallback_retries, 161 recurse_to_leaf, 162 vary_r, 163 stable, 164 c+osize, 165 0, 166 choose_args); 167 } else { 168 out_size = ((numrep < (result_max-osize)) ? 169 numrep : (result_max-osize)); 170 crush_choose_indep( 171 map, 172 cw, 173 map->buckets[bno], 174 weight, weight_max, 175 x, out_size, numrep, 176 curstep->arg2, 177 o+osize, j, 178 choose_tries, 179 choose_leaf_tries ? 
180 choose_leaf_tries : 1, 181 recurse_to_leaf, 182 c+osize, 183 0, 184 choose_args); 185 osize += out_size; 186 } 187 } 188 189 if (recurse_to_leaf) 190 /* copy final _leaf_ values to output set */ 191 memcpy(o, c, osize*sizeof(*o)); 192 193 /* swap o and w arrays */ 194 tmp = o; 195 o = w; 196 w = tmp; 197 wsize = osize; 198 break; 199 200 201 case CRUSH_RULE_EMIT: 202 for (i = 0; i < wsize && result_len < result_max; i++) { 203 result[result_len] = w[i]; 204 result_len++; 205 } 206 wsize = 0; 207 break; 208 209 default: 210 dprintk(" unknown op %d at step %d\n", 211 curstep->op, step); 212 break; 213 } 214 } 215 216 return result_len; 217 }

crush_choose_firstn 函数

这个函数递归地选择特定的bucket或者设备,并且可以处理冲突、失败等情况。
如果当前是choose过程,则通过调用crush_bucket_choose来直接选择。
如果当前是chooseleaf(选择叶子节点)过程,该函数将一直递归,直到得到叶子节点。

crush_bucket_choose 函数

crush_bucket_choose是CRUSH最重要的函数。因为默认的bucket类型是straw2(较早的版本为straw),常见的情况下我们会使用straw2类型的bucket,然后就会进入bucket_straw2_choose。

 /*
  * crush_bucket_choose - pick one item from bucket `in`.
  *
  * Dispatches on in->alg to the matching bucket_*_choose() helper and
  * returns whatever that helper returns. Only the uniform helper receives
  * `work`, and only the straw2 helper receives `arg` and `position`.
  * An empty bucket (in->size == 0) trips BUG_ON; an unrecognised algorithm
  * is logged and the bucket's first item is returned.
  */
 1 static int crush_bucket_choose(const struct crush_bucket *in,
 2                    struct crush_work_bucket *work,
 3                    int x, int r,
 4                                const struct crush_choose_arg *arg,
 5                                int position)
 6 {
 7     dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 8     BUG_ON(in->size == 0);
 9     switch (in->alg) {
10     case CRUSH_BUCKET_UNIFORM:
11         return bucket_uniform_choose(
12             (const struct crush_bucket_uniform *)in,
13             work, x, r);
14     case CRUSH_BUCKET_LIST:
15         return bucket_list_choose((const struct crush_bucket_list *)in,
16                       x, r);
17     case CRUSH_BUCKET_TREE:
18         return bucket_tree_choose((const struct crush_bucket_tree *)in,
19                       x, r);
20     case CRUSH_BUCKET_STRAW:
21         return bucket_straw_choose(
22             (const struct crush_bucket_straw *)in,
23             x, r);
24     case CRUSH_BUCKET_STRAW2:  // default algorithm
25         return bucket_straw2_choose(
26             (const struct crush_bucket_straw2 *)in,
27             x, r, arg, position);
28     default:
29         dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
30         return in->items[0];  // fall back to the first item for an unknown algorithm
31     }
32 }

关键选择函数:

 /*
  * bucket_straw2_choose - pick one item from a straw2 bucket.
  *
  * Computes a "draw" for every item in the bucket and returns the item
  * whose draw is highest. The per-item weights and ids come from
  * get_choose_arg_weights() / get_choose_arg_ids() (presumably these let
  * `arg` override the bucket's own values -- confirm against those helpers).
  */
 1 static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
 2                 int x, int r, const struct crush_choose_arg *arg,
 3                                 int position)
 4 {
 5     unsigned int i, high = 0;  /* high: index of the best draw seen so far */
 6     __s64 draw, high_draw = 0;
 7     __u32 *weights = get_choose_arg_weights(bucket, arg, position);
 8     __s32 *ids = get_choose_arg_ids(bucket, arg);
 9     for (i = 0; i < bucket->h.size; i++) {
10                 dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
11         if (weights[i]) {
12             draw = generate_exponential_distribution(bucket->h.hash, x, ids[i], r, weights[i]);
13         } else {
14             draw = S64_MIN;  /* zero-weight items get the lowest possible draw */
15         }
16 
17         if (i == 0 || draw > high_draw) {  /* strict '>': on a tie the earlier item is kept */
18             high = i;
19             high_draw = draw;
20         }
21     }
22 
23     return bucket->h.items[high];  /* the result is taken from the bucket's items[], not from ids[] */
24 }

 

posted @ 2019-04-11 16:57  yunlion  阅读(395)  评论(0编辑  收藏  举报