Memcached Source Code Analysis: slabs.c

#include "memcached.h"
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

typedef struct {
    unsigned int size;      /* sizes of items */     // size of each item (a.k.a. chunk) in this class
    unsigned int perslab;   /* how many items per slab */ // items per slab; a slab is also called a "page"

    /**
     * The free-item list of this slabclass, i.e. the list of usable items.
     * Every piece of usable memory in this slabclass lives here, and this is
     * the entry point of allocation: allocating memory means popping one item
     * off this list.
     *
     * Note: only newer memcached versions use slots as the list of *all* free
     * items; in older versions slots held only *recycled* items, and a freshly
     * allocated slab was tracked via the end_page_ptr / end_page_free fields,
     * which this version no longer uses.
     */
    void *slots;           /* list of item ptrs */
    unsigned int sl_curr;   /* total free items in list */  // how many free items remain, i.e. the length of slots

    unsigned int slabs;     /* how many slabs were allocated for this class */

    /**
     * slab_list is the list of slabs under this slabclass: logically an array
     * whose elements are slab pointers.
     * list_size is the capacity of that array.
     * Note how list_size differs from slabs above:
     * slab_list is an array of fixed capacity, and list_size is that capacity,
     * while slabs counts the slabs actually allocated so far.
     * So slabs == list_size means slab_list is full and must be grown.
     */
    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */

    unsigned int killing;  /* index+1 of dying slab, or zero if none */
    size_t requested; /* The number of requested bytes */
} slabclass_t;

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0; // overall memory limit
static size_t mem_malloced = 0; // memory allocated so far
static int power_largest;

static void *mem_base = NULL; // the preallocated memory block
static void *mem_current = NULL;
static size_t mem_avail = 0;

static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;

static int do_slabs_newslab(const unsigned int id);
static void *memory_allocate(size_t size);
static void do_slabs_free(void *ptr, const size_t size, unsigned int id);

static void slabs_preallocate (const unsigned int maxslabs);

// Find the slabclass whose chunk size fits an item of the given size
unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;

    if (size == 0)
        return 0;
    while (size > slabclass[res].size)
        if (res++ == power_largest)     /* won't fit in the biggest slab */
            return 0;
    return res;
}
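
The lookup is a plain linear scan: walk the classes from smallest to largest and return the first one whose chunk size fits, or 0 if nothing fits. Below is a minimal standalone sketch of the same idea with a hypothetical hard-coded class table (the sizes mirror the first few default factor-1.25 classes; the real table is built in slabs_init):

    #include <stdio.h>

    /* toy chunk sizes, pretending factor-grown classes 1..5 */
    static const unsigned int toy_sizes[] = {0, 96, 120, 152, 192, 240};
    #define TOY_LARGEST 5

    /* same shape as slabs_clsid: first class whose chunk fits, 0 if none */
    static unsigned int toy_clsid(size_t size) {
        int res = 1;
        if (size == 0)
            return 0;
        while (size > toy_sizes[res])
            if (res++ == TOY_LARGEST)
                return 0;
        return res;
    }

    int main(void) {
        printf("%u\n", toy_clsid(100));  /* -> 2, the 120-byte class */
        printf("%u\n", toy_clsid(300));  /* -> 0, too big for any class */
        return 0;
    }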

/**
 * Initialize the slabs subsystem and its memory management state.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;
    unsigned int size = sizeof(item) + settings.chunk_size;

    mem_limit = limit; // limit is the xx of the -m xx startup option: the overall memory cap

    if (prealloc) {
        /**
         * If the user enabled preallocation, grab the whole limit up front and
         * keep it in the global mem_base. From then on the server owns one big
         * block of memory, and every later allocation is carved out of it.
         */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    // Initialize each slabclass object below
    memset(slabclass, 0, sizeof(slabclass));

    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }
    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}
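
Each chunk size is the previous one times factor, rounded up to CHUNK_ALIGN_BYTES, until the next size would exceed one page. The sketch below reproduces that progression standalone; the constants are stand-ins for CHUNK_ALIGN_BYTES, settings.item_size_max, sizeof(item) + settings.chunk_size, and POWER_LARGEST, whose real values live elsewhere in memcached:

    #include <stdio.h>

    int main(void) {
        const unsigned int align = 8;           /* stand-in for CHUNK_ALIGN_BYTES */
        const unsigned int page  = 1024 * 1024; /* stand-in for settings.item_size_max */
        const double factor = 1.25;             /* default growth factor (-f 1.25) */
        unsigned int size = 48 + 48;            /* roughly sizeof(item) + default chunk_size */
        int i = 0;

        while (++i < 200 && size <= page / factor) {   /* 200 stands in for POWER_LARGEST */
            if (size % align)
                size += align - (size % align);        /* n-byte alignment, as in slabs_init */
            printf("class %3d: chunk size %7u perslab %5u\n", i, size, page / size);
            size *= factor;
        }
        /* the final class holds one full-page item, as in slabs_init */
        printf("class %3d: chunk size %7u perslab %5u\n", i, page, 1u);
        return 0;
    }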

/**
 * Memory preallocation. Called only when preallocation is enabled: carves one
 * slab off mem_base for every slabclass.
 */
static void slabs_preallocate (const unsigned int maxslabs) {
    int i;
    unsigned int prealloc = 0;

    for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
        if (++prealloc > maxslabs)
            return;
        if (do_slabs_newslab(i) == 0) {
            fprintf(stderr, "Error while preallocating slab memory!\n"
                "If using -L or other prealloc options, max memory must be "
                "at least %d megabytes.\n", power_largest);
            exit(1);
        }
    }
}

static int grow_slab_list (const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    /**
     * p->slab_list is an array of fixed capacity, and p->list_size is that
     * capacity: how many slab pointers the array can hold.
     * p->slabs is how many slabs have actually been allocated.
     * So slabs == list_size means slab_list is full and must be grown.
     */
    if (p->slabs == p->list_size) {
        size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
        if (new_list == 0) return 0;
        p->list_size = new_size;
        p->slab_list = new_list;
    }
    return 1;
}
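
This is the classic capacity-doubling dynamic array: start at 16 slots, double on overflow, and let realloc preserve the old contents. A minimal self-contained sketch of the same pattern (struct and function names are illustrative, not memcached's); note how it mirrors grow_slab_list's error handling, where a failed realloc leaves the old array untouched:

    #include <stdlib.h>

    struct ptr_vec {
        void **items;
        size_t used;
        size_t cap;
    };

    /* grow-on-demand append; returns 0 on allocation failure, 1 on success */
    int ptr_vec_push(struct ptr_vec *v, void *p) {
        if (v->used == v->cap) {
            size_t new_cap = v->cap ? v->cap * 2 : 16;
            void **n = realloc(v->items, new_cap * sizeof(void *));
            if (n == NULL) return 0;   /* old array is still valid */
            v->items = n;
            v->cap = new_cap;
        }
        v->items[v->used++] = p;
        return 1;
    }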

/**
 * Break a whole slab into individual items (a.k.a. chunks) and push each of
 * them onto the corresponding slots free list.
 */
static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        do_slabs_free(ptr, 0, id); // do_slabs_free makes this item usable, i.e. pushes it onto the slots list
        ptr += p->size;
    }
}

/**
 * Allocate a new slab for slabclass[id]. Called only when the slots list of
 * the current slabclass has no free space left.
 */
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
                                     : p->size * p->perslab; // with slab reassign on, every slab is a full page (~1MB); otherwise size * perslab
    char *ptr;

    /**
     * The if below reads:
     * if allocating would exceed the memory limit, fail and return 0;
     * otherwise call grow_slab_list to grow slab_list if needed, and if that
     * fails, stop here and return 0;
     * otherwise allocate the memory with memory_allocate, and if that fails,
     * likewise return 0.
     */
    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len); // wipe the new memory
    split_slab_page_into_freelist(ptr, id); // push the new slab's items onto the slots list

    p->slab_list[p->slabs++] = ptr; // record the new slab in the slab_list array
    mem_malloced += len; // account for the newly allocated bytes
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

/**
 * Allocate space from the slabclass matching the item size.
 */
static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) { // by default the largest class id is 200 and the smallest is 1
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id]; // slabclass is the global array of slabclass objects; fetch the one for this id
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    /**
     * The if below amounts to:
     * if p->sl_curr == 0, i.e. the slots list has no free space, call
     * do_slabs_newslab to allocate a new slab;
     * if p->sl_curr == 0 and do_slabs_newslab fails as well, enter the if and
     * ret stays NULL; otherwise fall through to the else-if below.
     */
    if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) { // we get here when the slots list still has free space
        /* return off our freelist */
        // hand out a free item: pop the head of the slots list
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    }

    if (ret) {
        p->requested += size; // success: account for the requested bytes
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

/**
 * Despite the name, do_slabs_free does not release memory back to the system;
 * it makes the space available again. What counts as "available"? Simply being
 * on the slots list of the current slabclass.
 * That is why a freshly allocated slab also goes through this function: it is
 * how the whole slab becomes usable.
 *
 * Note: in older memcached versions slots held only recycled item space;
 * now it holds every available item.
 */
static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;

    assert(((item *)ptr)->slabs_clsid == 0);
    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (id < POWER_SMALLEST || id > power_largest)
        return;

    MEMCACHED_SLABS_FREE(size, id, ptr);
    p = &slabclass[id];

    it = (item *)ptr;
    it->it_flags |= ITEM_SLABBED; // mark the item as slabbed
    it->prev = 0;
    it->next = p->slots;  // push onto the head of the slots list
    if (it->next) it->next->prev = it;
    p->slots = it;

    p->sl_curr++; // one more free item
    p->requested -= size;
    return;
}
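
Taken together, do_slabs_alloc and do_slabs_free treat slots as a LIFO stack built from an intrusive doubly linked list: free pushes at the head, alloc pops from the head. A self-contained sketch of that pattern with a toy node type (not memcached's item struct):

    #include <assert.h>
    #include <stddef.h>

    struct node {
        struct node *next;
        struct node *prev;
    };

    static struct node *slots = NULL;   /* head of the free list */
    static unsigned int sl_curr = 0;    /* number of free nodes */

    /* mirror of the free path: push at the head */
    static void freelist_push(struct node *it) {
        it->prev = NULL;
        it->next = slots;
        if (it->next) it->next->prev = it;
        slots = it;
        sl_curr++;
    }

    /* mirror of the alloc path: pop the head, or NULL if empty */
    static struct node *freelist_pop(void) {
        struct node *it = slots;
        if (it == NULL) return NULL;
        slots = it->next;
        if (it->next) it->next->prev = NULL;
        sl_curr--;
        return it;
    }

    int main(void) {
        struct node a, b;
        freelist_push(&a);
        freelist_push(&b);
        assert(freelist_pop() == &b);   /* LIFO: last freed, first reused */
        assert(freelist_pop() == &a);
        assert(freelist_pop() == NULL);
        return 0;
    }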

static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int zlength = strlen(z);
    return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
}
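
nz_strcmp compares a length-delimited buffer (nz, which need not be NUL-terminated, hence the explicit nzlength) against a C string z, returning 0 on an exact match as strcmp would. That is why get_stats below passes nkey alongside stat_type. A tiny standalone demo of the semantics, reusing the function verbatim:

    #include <stdio.h>
    #include <string.h>

    static int nz_strcmp(int nzlength, const char *nz, const char *z) {
        int zlength = strlen(z);
        return (zlength == nzlength) && (strncmp(nz, z, zlength) == 0) ? 0 : -1;
    }

    int main(void) {
        const char *buf = "slabsXYZ";               /* not a NUL-terminated key */
        printf("%d\n", nz_strcmp(5, buf, "slabs")); /* 0: lengths and bytes match */
        printf("%d\n", nz_strcmp(4, buf, "slabs")); /* -1: lengths differ */
        return 0;
    }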

bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    bool ret = true;

    if (add_stats != NULL) {
        if (!stat_type) {
            /* prepare general statistics for the engine */
            STATS_LOCK();
            APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
            APPEND_STAT("curr_items", "%u", stats.curr_items);
            APPEND_STAT("total_items", "%u", stats.total_items);
            STATS_UNLOCK();
            item_stats_totals(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
            item_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
            slabs_stats(add_stats, c);
        } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
            item_stats_sizes(add_stats, c);
        } else {
            ret = false;
        }
    } else {
        ret = false;
    }

    return ret;
}

static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i, total;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;
    threadlocal_stats_aggregate(&thread_stats);

    total = 0;
    for (i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *p = &slabclass[i];
        if (p->slabs != 0) {
            uint32_t perslab, slabs;
            slabs = p->slabs;
            perslab = p->perslab;

            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
            APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
            APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
            APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
            APPEND_NUM_STAT(i, "used_chunks", "%u",
                            slabs*perslab - p->sl_curr);
            APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
            /* Stat is dead, but displaying zero instead of removing it. */
            APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
            APPEND_NUM_STAT(i, "mem_requested", "%llu",
                            (unsigned long long)p->requested);
            APPEND_NUM_STAT(i, "get_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].get_hits);
            APPEND_NUM_STAT(i, "cmd_set", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].set_cmds);
            APPEND_NUM_STAT(i, "delete_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].delete_hits);
            APPEND_NUM_STAT(i, "incr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].incr_hits);
            APPEND_NUM_STAT(i, "decr_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].decr_hits);
            APPEND_NUM_STAT(i, "cas_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_hits);
            APPEND_NUM_STAT(i, "cas_badval", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].cas_badval);
            APPEND_NUM_STAT(i, "touch_hits", "%llu",
                            (unsigned long long)thread_stats.slab_stats[i].touch_hits);
            total++;
        }
    }

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}

/**
 * Allocate a block of memory.
 */
static void *memory_allocate(size_t size) {
    void *ret;

    /**
     * There are two allocation strategies:
     * 1) with preallocation enabled, carve the block out of the preallocated
     *    chunk, i.e. take the else branch below;
     * 2) without preallocation, just malloc it.
     *
     * See slabs_init for how preallocation is set up.
     */
    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        ret = malloc(size);
    } else {
        ret = mem_current;

        if (size > mem_avail) {
            return NULL;
        }

        /* mem_current pointer _must_ be aligned!!! */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        mem_current = ((char*)mem_current) + size;
        if (size < mem_avail) {
            mem_avail -= size;
        } else {
            mem_avail = 0;
        }
    }

    return ret;
}
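
The preallocation path is a classic bump allocator: hand out mem_current, round the request up to the alignment so the *next* pointer stays aligned, and advance. A minimal standalone sketch of the same scheme, assuming an 8-byte alignment in place of CHUNK_ALIGN_BYTES:

    #include <stdio.h>
    #include <stdlib.h>

    #define ALIGN 8  /* stand-in for CHUNK_ALIGN_BYTES */

    static char  *base, *cur;
    static size_t avail;

    static void *bump_alloc(size_t size) {
        if (size > avail) return NULL;
        void *ret = cur;
        if (size % ALIGN)
            size += ALIGN - (size % ALIGN);   /* keep the next pointer aligned */
        cur += size;
        avail = (size < avail) ? avail - size : 0;
        return ret;
    }

    int main(void) {
        avail = 1024;
        base = cur = malloc(avail);
        if (base == NULL) return 1;
        printf("%p\n", bump_alloc(13));  /* rounded to 16 internally */
        printf("%p\n", bump_alloc(32));  /* starts 16 bytes after the first */
        free(base);
        return 0;
    }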

void *slabs_alloc(size_t size, unsigned int id) {
    void *ret;

    pthread_mutex_lock(&slabs_lock);
    ret = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);
    return ret;
}

void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}

void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
{
    pthread_mutex_lock(&slabs_lock);
    slabclass_t *p;
    if (id < POWER_SMALLEST || id > power_largest) {
        fprintf(stderr, "Internal error! Invalid slab class\n");
        abort();
    }

    p = &slabclass[id];
    p->requested = p->requested - old + ntotal;
    pthread_mutex_unlock(&slabs_lock);
}

static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER;
static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER;
static volatile int do_run_slab_thread = 1;
static volatile int do_run_slab_rebalance_thread = 1;

#define DEFAULT_SLAB_BULK_CHECK 1
int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;

static int slab_rebalance_start(void) {
    slabclass_t *s_cls;
    int no_go = 0;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    if (slab_rebal.s_clsid < POWER_SMALLEST ||
        slab_rebal.s_clsid > power_largest  ||
        slab_rebal.d_clsid < POWER_SMALLEST ||
        slab_rebal.d_clsid > power_largest  ||
        slab_rebal.s_clsid == slab_rebal.d_clsid)
        no_go = -2;

    s_cls = &slabclass[slab_rebal.s_clsid];

    if (!grow_slab_list(slab_rebal.d_clsid)) {
        no_go = -1;
    }

    if (s_cls->slabs < 2)
        no_go = -3;

    if (no_go != 0) {
        pthread_mutex_unlock(&slabs_lock);
        pthread_mutex_unlock(&cache_lock);
        return no_go; /* Should use a wrapper function... */
    }

    s_cls->killing = 1;

    slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
    slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
        (s_cls->size * s_cls->perslab);
    slab_rebal.slab_pos   = slab_rebal.slab_start;
    slab_rebal.done       = 0;

    /* Also tells do_item_get to search for items in this slab */
    slab_rebalance_signal = 2;

    if (settings.verbose > 1) {
        fprintf(stderr, "Started a slab rebalance\n");
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = true;
    STATS_UNLOCK();

    return 0;
}

enum move_status {
    MOVE_PASS=0, MOVE_DONE, MOVE_BUSY, MOVE_LOCKED
};

static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int x;
    int was_busy = 0;
    int refcount = 0;
    enum move_status status = MOVE_PASS;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (x = 0; x < slab_bulk_check; x++) {
        item *it = slab_rebal.slab_pos;
        status = MOVE_PASS;
        if (it->slabs_clsid != 255) {
            void *hold_lock = NULL;
            uint32_t hv = hash(ITEM_key(it), it->nkey);
            if ((hold_lock = item_trylock(hv)) == NULL) {
                status = MOVE_LOCKED;
            } else {
                refcount = refcount_incr(&it->refcount);
                if (refcount == 1) { /* item is unlinked, unused */
                    if (it->it_flags & ITEM_SLABBED) {
                        /* remove from slab freelist */
                        if (s_cls->slots == it) {
                            s_cls->slots = it->next;
                        }
                        if (it->next) it->next->prev = it->prev;
                        if (it->prev) it->prev->next = it->next;
                        s_cls->sl_curr--;
                        status = MOVE_DONE;
                    } else {
                        status = MOVE_BUSY;
                    }
                } else if (refcount == 2) { /* item is linked but not busy */
                    if ((it->it_flags & ITEM_LINKED) != 0) {
                        do_item_unlink_nolock(it, hv);
                        status = MOVE_DONE;
                    } else {
                        /* refcount == 1 + !ITEM_LINKED means the item is being
                         * uploaded to, or was just unlinked but hasn't been freed
                         * yet. Let it bleed off on its own and try again later */
                        status = MOVE_BUSY;
                    }
                } else {
                    if (settings.verbose > 2) {
                        fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                                it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                    }
                    status = MOVE_BUSY;
                }
                item_trylock_unlock(hold_lock);
            }
        }

        switch (status) {
            case MOVE_DONE:
                it->refcount = 0;
                it->it_flags = 0;
                it->slabs_clsid = 255;
                break;
            case MOVE_BUSY:
                refcount_decr(&it->refcount);
            case MOVE_LOCKED:
                slab_rebal.busy_items++;
                was_busy++;
                break;
            case MOVE_PASS:
                break;
        }

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        /* Some items were busy, start again from the top */
        if (slab_rebal.busy_items) {
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return was_busy;
}

static void slab_rebalance_finish(void) {
    slabclass_t *s_cls;
    slabclass_t *d_cls;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];
    d_cls = &slabclass[slab_rebal.d_clsid];

    /* At this point the stolen slab is completely clear */
    s_cls->slab_list[s_cls->killing - 1] =
        s_cls->slab_list[s_cls->slabs - 1];
    s_cls->slabs--;
    s_cls->killing = 0;

    memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);

    d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
    split_slab_page_into_freelist(slab_rebal.slab_start,
                                  slab_rebal.d_clsid);

    slab_rebal.done       = 0;
    slab_rebal.s_clsid    = 0;
    slab_rebal.d_clsid    = 0;
    slab_rebal.slab_start = NULL;
    slab_rebal.slab_end   = NULL;
    slab_rebal.slab_pos   = NULL;

    slab_rebalance_signal = 0;

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = false;
    stats.slabs_moved++;
    STATS_UNLOCK();

    if (settings.verbose > 1) {
        fprintf(stderr, "finished a slab move\n");
    }
}

/*
 * When automatic slab reassignment is enabled, this function decides the
 * reassignment plan: which class to take a page from and which to give it to.
 */
static int slab_automove_decision(int *src, int *dst) {
    static uint64_t evicted_old[POWER_LARGEST];
    static unsigned int slab_zeroes[POWER_LARGEST];
    static unsigned int slab_winner = 0;
    static unsigned int slab_wins   = 0;
    uint64_t evicted_new[POWER_LARGEST];
    uint64_t evicted_diff = 0;
    uint64_t evicted_max  = 0;
    unsigned int highest_slab = 0;
    unsigned int total_pages[POWER_LARGEST];
    int i;
    int source = 0;
    int dest = 0;
    static rel_time_t next_run;

    /* Run less frequently than the slabmove tester. */
    if (current_time >= next_run) {
        next_run = current_time + 10;
    } else {
        return 0;
    }

    item_stats_evictions(evicted_new);
    pthread_mutex_lock(&cache_lock);
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        total_pages[i] = slabclass[i].slabs;
    }
    pthread_mutex_unlock(&cache_lock);

    /* Find a candidate source; something with zero evicts 3+ times */
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        evicted_diff = evicted_new[i] - evicted_old[i];
        if (evicted_diff == 0 && total_pages[i] > 2) {
            slab_zeroes[i]++;
            if (source == 0 && slab_zeroes[i] >= 3)
                source = i;
        } else {
            slab_zeroes[i] = 0;
            if (evicted_diff > evicted_max) {
                evicted_max = evicted_diff;
                highest_slab = i;
            }
        }
        evicted_old[i] = evicted_new[i];
    }

    /* Pick a valid destination */
    if (slab_winner != 0 && slab_winner == highest_slab) {
        slab_wins++;
        if (slab_wins >= 3)
            dest = slab_winner;
    } else {
        slab_wins = 1;
        slab_winner = highest_slab;
    }

    if (source && dest) {
        *src = source;
        *dst = dest;
        return 1;
    }
    return 0;
}

/* Slab rebalancer thread.
 * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
 * go to sleep if locks are contended.
 * Entry point of the slab maintenance thread.
 */
static void *slab_maintenance_thread(void *arg) {
    int src, dest;

    while (do_run_slab_thread) {
        if (settings.slab_automove == 1) {
            if (slab_automove_decision(&src, &dest) == 1) {
                /* Blind to the return codes. It will retry on its own */
                slabs_reassign(src, dest); // move a slab from src to dest
            }
            sleep(1);
        } else {
            /* Don't wake as often if we're not enabled.
             * This is lazier than setting up a condition right now. */
            sleep(5);
        }
    }
    return NULL;
}

/* Slab mover thread.
 * Sits waiting for a condition to jump off and shovel some memory about
 */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;
    /* So we first pass into cond_wait with the mutex held */
    mutex_lock(&slabs_rebalance_lock);

    while (do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specificity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(50);
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }
    return NULL;
}

static int slabs_reassign_pick_any(int dst) {
    static int cur = POWER_SMALLEST - 1;
    int tries = power_largest - POWER_SMALLEST + 1;
    for (; tries > 0; tries--) {
        cur++;
        if (cur > power_largest)
            cur = POWER_SMALLEST;
        if (cur == dst)
            continue;
        if (slabclass[cur].slabs > 1) {
            return cur;
        }
    }
    return -1;
}

static enum reassign_result_type do_slabs_reassign(int src, int dst) {
    if (slab_rebalance_signal != 0)
        return REASSIGN_RUNNING;

    if (src == dst)
        return REASSIGN_SRC_DST_SAME;

    /* Special indicator to choose ourselves. */
    if (src == -1) {
        src = slabs_reassign_pick_any(dst);
        /* TODO: If we end up back at -1, return a new error type */
    }

    if (src < POWER_SMALLEST || src > power_largest ||
        dst < POWER_SMALLEST || dst > power_largest)
        return REASSIGN_BADCLASS;

    if (slabclass[src].slabs < 2)
        return REASSIGN_NOSPARE;

    slab_rebal.s_clsid = src;
    slab_rebal.d_clsid = dst;

    slab_rebalance_signal = 1;
    pthread_cond_signal(&slab_rebalance_cond);

    return REASSIGN_OK;
}

enum reassign_result_type slabs_reassign(int src, int dst) {
    enum reassign_result_type ret;
    if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
        return REASSIGN_RUNNING;
    }
    ret = do_slabs_reassign(src, dst);
    pthread_mutex_unlock(&slabs_rebalance_lock);
    return ret;
}
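
slabs_reassign deliberately uses pthread_mutex_trylock rather than pthread_mutex_lock: if the rebalance lock is already held (the mover thread owns it while running), the caller gets REASSIGN_RUNNING back immediately instead of blocking. A minimal sketch of that non-blocking gate (names are illustrative, not memcached's):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t gate = PTHREAD_MUTEX_INITIALIZER;

    /* returns 1 if we did the work, 0 if someone else was already in */
    static int try_once(void) {
        if (pthread_mutex_trylock(&gate) != 0)
            return 0;              /* busy: report back, don't wait */
        /* ... critical section ... */
        pthread_mutex_unlock(&gate);
        return 1;
    }

    int main(void) {
        printf("%d\n", try_once());  /* 1: the lock was free */
        return 0;
    }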

/* If we hold this lock, rebalancer can't wake up or move */
void slabs_rebalancer_pause(void) {
    pthread_mutex_lock(&slabs_rebalance_lock);
}

void slabs_rebalancer_resume(void) {
    pthread_mutex_unlock(&slabs_rebalance_lock);
}

static pthread_t maintenance_tid;
static pthread_t rebalance_tid;

/**
 * Start the slab maintenance threads.
 */
int start_slab_maintenance_thread(void) {
    int ret;
    slab_rebalance_signal = 0;
    slab_rebal.slab_start = NULL;
    char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
    if (env != NULL) {
        slab_bulk_check = atoi(env);
        if (slab_bulk_check == 0) {
            slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
        }
    }

    if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
        fprintf(stderr, "Can't initialize rebalance condition\n");
        return -1;
    }
    pthread_mutex_init(&slabs_rebalance_lock, NULL);

    if ((ret = pthread_create(&maintenance_tid, NULL,
                              slab_maintenance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create slab maint thread: %s\n", strerror(ret));
        return -1;
    }
    if ((ret = pthread_create(&rebalance_tid, NULL,
                              slab_rebalance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
        return -1;
    }
    return 0;
}

/**
 * Stop the slab maintenance threads; same logic as stopping the hash table
 * maintenance thread.
 */
void stop_slab_maintenance_thread(void) {
    mutex_lock(&cache_lock);
    do_run_slab_thread = 0;
    do_run_slab_rebalance_thread = 0;
    pthread_cond_signal(&maintenance_cond);
    pthread_mutex_unlock(&cache_lock);

    /* Wait for the maintenance thread to stop */
    pthread_join(maintenance_tid, NULL);
    pthread_join(rebalance_tid, NULL);
}