Memcached源码分析之slabs.c
- #include "memcached.h"
- #include <sys/stat.h>
- #include <sys/socket.h>
- #include <sys/signal.h>
- #include <sys/resource.h>
- #include <fcntl.h>
- #include <netinet/in.h>
- #include <errno.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
- #include <assert.h>
- #include <pthread.h>
- typedef struct {
- unsigned int size; /* sizes of items */ /* i.e. the size of one item (chunk) in this class */
- unsigned int perslab; /* how many items per slab */ /* a slab is also called a "page" */
- /**
- Free-item list of this slab class. Every chunk this class can still hand out
- is on this list; allocation pops one entry off it (see do_slabs_alloc) and
- do_slabs_free pushes entries onto it.
- Historical note kept from the original annotation: newer memcached versions use
- slots for all free items, whereas older versions stored only recycled items here
- and tracked the unused tail of a fresh slab with end_page_ptr / end_page_free,
- which this version no longer uses.
- */
- void *slots; /* list of item ptrs */
- unsigned int sl_curr; /* total free items in list */ /* i.e. the current length of slots */
- unsigned int slabs; /* how many slabs were allocated for this class */
- /**
- slab_list is the array of slab (page) pointers owned by this class and
- list_size is the number of entries that array has room for.
- slabs counts the entries actually in use, so slabs == list_size means the
- array is full and must be enlarged (see grow_slab_list) before another
- slab can be recorded.
- */
- void **slab_list; /* array of slab pointers */
- unsigned int list_size; /* size of prev array */
- unsigned int killing; /* index+1 of dying slab, or zero if none */
- size_t requested; /* The number of requested bytes */
- } slabclass_t;
- static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES]; /* one descriptor per slab class, indexed by class id */
- static size_t mem_limit = 0; /* memory limit; set from the `limit` argument of slabs_init() */
- static size_t mem_malloced = 0; /* bytes accounted to slab pages so far (grown in do_slabs_newslab) */
- static int power_largest; /* id of the largest slab class; set in slabs_init() */
- static void *mem_base = NULL; /* preallocated arena, or NULL when memory is obtained with malloc per page */
- static void *mem_current = NULL; /* next unused byte inside the preallocated arena */
- static size_t mem_avail = 0; /* bytes still unused inside the preallocated arena */
- static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER; /* taken by the public slabs_* wrappers around the do_slabs_* workers */
- static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER; /* held by the rebalance thread while it runs; see slabs_rebalancer_pause() */
- static int do_slabs_newslab(const unsigned int id); /* forward declarations of helpers defined later in this file */
- static void *memory_allocate(size_t size);
- static void do_slabs_free(void *ptr, const size_t size, unsigned int id);
- static void slabs_preallocate (const unsigned int maxslabs);
- //根据item大小找到合适的slabclass
- unsigned int slabs_clsid(const size_t size) {
- int res = POWER_SMALLEST;
- if (size == 0)
- return 0;
- while (size > slabclass[res].size)
- if (res++ == power_largest) /* won't fit in the biggest slab */
- return 0;
- return res;
- }
- /**
- 初始化slabs,这里会对一些内存管理进行初始化
- */
- void slabs_init(const size_t limit, const double factor, const bool prealloc) {
- int i = POWER_SMALLEST - 1;
- unsigned int size = sizeof(item) + settings.chunk_size;
- mem_limit = limit; //这个limit就是启动时候用户设置的-m xx中的xx,最大的内存上限
- if (prealloc) {
- /**
- 如果用户开启了预分配,则先把上限的内存先分配出来,放到mem_base全局变量中。
- 所以这个时候服务就拥有了一大坨内存,以后要分配的内存都是从这一坨里面割下来。
- */
- mem_base = malloc(mem_limit);
- if (mem_base != NULL) {
- mem_current = mem_base;
- mem_avail = mem_limit;
- } else {
- fprintf(stderr, "Warning: Failed to allocate requested memory in"
- " one large chunk.\nWill allocate in smaller chunks\n");
- }
- }
- //下面是初始化各个slabclass对象
- memset(slabclass, 0, sizeof(slabclass));
- while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
- /* Make sure items are always n-byte aligned */
- if (size % CHUNK_ALIGN_BYTES)
- size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
- slabclass[i].size = size;
- slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
- size *= factor;
- if (settings.verbose > 1) {
- fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
- i, slabclass[i].size, slabclass[i].perslab);
- }
- }
- power_largest = i;
- slabclass[power_largest].size = settings.item_size_max;
- slabclass[power_largest].perslab = 1;
- if (settings.verbose > 1) {
- fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
- i, slabclass[i].size, slabclass[i].perslab);
- }
- {
- char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
- if (t_initial_malloc) {
- mem_malloced = (size_t)atol(t_initial_malloc);
- }
- }
- if (prealloc) {
- slabs_preallocate(power_largest);
- }
- }
- /**
- 内存预分配,如果用户开启了预分配,则会调用此方法,先从mem_base为分每个slabclass割一个slab大小下来。
- */
- static void slabs_preallocate (const unsigned int maxslabs) {
- int i;
- unsigned int prealloc = 0;
- for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) {
- if (++prealloc > maxslabs)
- return;
- if (do_slabs_newslab(i) == 0) {
- fprintf(stderr, "Error while preallocating slab memory!\n"
- "If using -L or other prealloc options, max memory must be "
- "at least %d megabytes.\n", power_largest);
- exit(1);
- }
- }
- }
- static int grow_slab_list (const unsigned int id) {
- slabclass_t *p = &slabclass[id];
- /**
- p->slab_list是一个空间大小固定的数组,是数组!而list_size是这个数组分配的空间。
- p->slabs代表已经分配出去的slabs数
- 而p->list_size代表可以用多少个slabs数
- 所以当slabs等于list_size的时候代表这个slab_list已经满了,得增大空间。
- */
- if (p->slabs == p->list_size) {
- size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16;
- void *new_list = realloc(p->slab_list, new_size * sizeof(void *)); //
- if (new_list == 0) return 0;
- p->list_size = new_size;
- p->slab_list = new_list;
- }
- return 1;
- }
- /**
- 把整个slab打散成一个个(也叫chunk)放到相应的slots链表中
- */
- static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
- slabclass_t *p = &slabclass[id];
- int x;
- for (x = 0; x < p->perslab; x++) {
- do_slabs_free(ptr, 0, id); //这个函数主要作用是让当前item空间可用,即加到slots链表中。
- ptr += p->size;
- }
- }
- /**
- 为slabclass[id]分配新的slab,仅当当前的slabclass中slots没有空闲的空间才调用
- 此函数分配新的slab
- */
- static int do_slabs_newslab(const unsigned int id) {
- slabclass_t *p = &slabclass[id];
- int len = settings.slab_reassign ? settings.item_size_max
- : p->size * p->perslab; //先判断是否开启了自定义slab大小,如果没有就按默认的,即约1M
- char *ptr;
- /**
- 下面if的逻辑是:
- 如果内存超出了限制,分配失败进入if,返回0
- 否则调用grow_slab_list检查是否要增大slab_list的大小
- 如果在grow_slab_list返回失败,则不继续分配空间,进入if,返回0
- 否则分配空间memory_allocate,如果分配失败,同样进入if,返回0;
- */
- if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
- (grow_slab_list(id) == 0) ||
- ((ptr = memory_allocate((size_t)len)) == 0)) {
- MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
- return 0;
- }
- memset(ptr, 0, (size_t)len); //清干净内存空间
- split_slab_page_into_freelist(ptr, id); //把新申请的slab放到slots中去
- p->slab_list[p->slabs++] = ptr; //把新的slab加到slab_list数组中
- mem_malloced += len; //记下已分配的空间大小
- MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
- return 1;
- }
- /**
- 根据item大小和slabsclass分配空间
- */
- static void *do_slabs_alloc(const size_t size, unsigned int id) {
- slabclass_t *p;
- void *ret = NULL;
- item *it = NULL;
- if (id < POWER_SMALLEST || id > power_largest) { //默认最大是200,最小是1
- MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
- return NULL;
- }
- p = &slabclass[id]; //slabclass是一个全局变量,是各个slabclass对象数组,在这取得当前id对应的slabclass
- assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
- /* fail unless we have space at the end of a recently allocated page,
- we have something on our freelist, or we could allocate a new page */
- /**
- 下面这个if的逻辑相当于:
- 如果p->sl_curr==0,即slots链表中没有空闲的空间,则do_slabs_newslab分配新slab
- 如果p->sl_curr==0,且do_slabs_newslab分配新slab失败,则进入if,ret = NULL,否则进入下面的elseif
- */
- if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
- /* We don't have more memory available */
- ret = NULL;
- } else if (p->sl_curr != 0) { //如果进入此分支是因为slots链表中还有空闲的空间
- /* return off our freelist */
- //把空闲的item分配出去
- it = (item *)p->slots;
- p->slots = it->next;
- if (it->next) it->next->prev = 0;
- p->sl_curr--;
- ret = (void *)it;
- }
- if (ret) {
- p->requested += size; //分配成功,记下已分配的字节数
- MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
- } else {
- MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
- }
- return ret;
- }
- /**
- 这个函数的命名虽然叫do_slabs_free,听上去好像是释放空间,其实质是把空间变成可用。
- 怎样的空间才算可用?就是加到当前slabclass的slots链表中而已。
- 所以新申请的slab也会调用这个函数,让整个slab变为可用。
- ps: 以前的memcached版本slots链表保存的是回收的item空间,而
- 现在保存的是所有可用的item空间。
- */
- static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
- slabclass_t *p;
- item *it;
- assert(((item *)ptr)->slabs_clsid == 0);
- assert(id >= POWER_SMALLEST && id <= power_largest);
- if (id < POWER_SMALLEST || id > power_largest)
- return;
- MEMCACHED_SLABS_FREE(size, id, ptr);
- p = &slabclass[id];
- it = (item *)ptr;
- it->it_flags |= ITEM_SLABBED; //把item标记为slabbed状态
- it->prev = 0;
- it->next = p->slots; //插入到slots链表中
- if (it->next) it->next->prev = it;
- p->slots = it;
- p->sl_curr++; //空闲item数加1
- p->requested -= size;
- return;
- }
/**
 * Compare a length-delimited string with a NUL-terminated one.
 *
 * @param nzlength  number of bytes in `nz`
 * @param nz        string that need not be NUL-terminated
 * @param z         NUL-terminated string
 * @return 0 when both have the same length and content, -1 otherwise
 */
static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int zlength = strlen(z);

    if (zlength != nzlength) {
        return -1;
    }
    if (strncmp(nz, z, zlength) != 0) {
        return -1;
    }
    return 0;
}
- bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
- bool ret = true;
- if (add_stats != NULL) {
- if (!stat_type) {
- /* prepare general statistics for the engine */
- STATS_LOCK();
- APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
- APPEND_STAT("curr_items", "%u", stats.curr_items);
- APPEND_STAT("total_items", "%u", stats.total_items);
- STATS_UNLOCK();
- item_stats_totals(add_stats, c);
- } else if (nz_strcmp(nkey, stat_type, "items") == 0) {
- item_stats(add_stats, c);
- } else if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
- slabs_stats(add_stats, c);
- } else if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
- item_stats_sizes(add_stats, c);
- } else {
- ret = false;
- }
- } else {
- ret = false;
- }
- return ret;
- }
- static void do_slabs_stats(ADD_STAT add_stats, void *c) {
- int i, total;
- /* Get the per-thread stats which contain some interesting aggregates */
- struct thread_stats thread_stats;
- threadlocal_stats_aggregate(&thread_stats);
- total = 0;
- for(i = POWER_SMALLEST; i <= power_largest; i++) {
- slabclass_t *p = &slabclass[i];
- if (p->slabs != 0) {
- uint32_t perslab, slabs;
- slabs = p->slabs;
- perslab = p->perslab;
- char key_str[STAT_KEY_LEN];
- char val_str[STAT_VAL_LEN];
- int klen = 0, vlen = 0;
- APPEND_NUM_STAT(i, "chunk_size", "%u", p->size);
- APPEND_NUM_STAT(i, "chunks_per_page", "%u", perslab);
- APPEND_NUM_STAT(i, "total_pages", "%u", slabs);
- APPEND_NUM_STAT(i, "total_chunks", "%u", slabs * perslab);
- APPEND_NUM_STAT(i, "used_chunks", "%u",
- slabs*perslab - p->sl_curr);
- APPEND_NUM_STAT(i, "free_chunks", "%u", p->sl_curr);
- /* Stat is dead, but displaying zero instead of removing it. */
- APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
- APPEND_NUM_STAT(i, "mem_requested", "%llu",
- (unsigned long long)p->requested);
- APPEND_NUM_STAT(i, "get_hits", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].get_hits);
- APPEND_NUM_STAT(i, "cmd_set", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].set_cmds);
- APPEND_NUM_STAT(i, "delete_hits", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].delete_hits);
- APPEND_NUM_STAT(i, "incr_hits", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].incr_hits);
- APPEND_NUM_STAT(i, "decr_hits", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].decr_hits);
- APPEND_NUM_STAT(i, "cas_hits", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].cas_hits);
- APPEND_NUM_STAT(i, "cas_badval", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].cas_badval);
- APPEND_NUM_STAT(i, "touch_hits", "%llu",
- (unsigned long long)thread_stats.slab_stats[i].touch_hits);
- total++;
- }
- }
- APPEND_STAT("active_slabs", "%d", total);
- APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
- add_stats(NULL, 0, NULL, 0, c);
- }
- /**
- 分配内存空间
- */
- static void *memory_allocate(size_t size) {
- void *ret;
- /**
- 有两种分配策略
- 1)如果是开启了内存预分配策略,则只需要从预分配好的内存块那里割一块出来。即进入下面的else分支
- 2)如果没有开启预分配,则malloc分配内存
- 关于预分配详见 slabs_init
- */
- if (mem_base == NULL) {
- /* We are not using a preallocated large memory chunk */
- ret = malloc(size);
- } else {
- ret = mem_current;
- if (size > mem_avail) {
- return NULL;
- }
- /* mem_current pointer _must_ be aligned!!! */
- if (size % CHUNK_ALIGN_BYTES) {
- size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
- }
- mem_current = ((char*)mem_current) + size;
- if (size < mem_avail) {
- mem_avail -= size;
- } else {
- mem_avail = 0;
- }
- }
- return ret;
- }
- void *slabs_alloc(size_t size, unsigned int id) {
- void *ret;
- pthread_mutex_lock(&slabs_lock);
- ret = do_slabs_alloc(size, id);
- pthread_mutex_unlock(&slabs_lock);
- return ret;
- }
- void slabs_free(void *ptr, size_t size, unsigned int id) {
- pthread_mutex_lock(&slabs_lock);
- do_slabs_free(ptr, size, id);
- pthread_mutex_unlock(&slabs_lock);
- }
- void slabs_stats(ADD_STAT add_stats, void *c) {
- pthread_mutex_lock(&slabs_lock);
- do_slabs_stats(add_stats, c);
- pthread_mutex_unlock(&slabs_lock);
- }
- void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal)
- {
- pthread_mutex_lock(&slabs_lock);
- slabclass_t *p;
- if (id < POWER_SMALLEST || id > power_largest) {
- fprintf(stderr, "Internal error! Invalid slab class\n");
- abort();
- }
- p = &slabclass[id];
- p->requested = p->requested - old + ntotal;
- pthread_mutex_unlock(&slabs_lock);
- }
- static pthread_cond_t maintenance_cond = PTHREAD_COND_INITIALIZER; /* signalled by stop_slab_maintenance_thread() */
- static pthread_cond_t slab_rebalance_cond = PTHREAD_COND_INITIALIZER; /* slab_rebalance_thread() waits on this; do_slabs_reassign() signals it */
- static volatile int do_run_slab_thread = 1; /* loop flag of slab_maintenance_thread(); cleared by stop_slab_maintenance_thread() */
- static volatile int do_run_slab_rebalance_thread = 1; /* loop flag of slab_rebalance_thread(); cleared by stop_slab_maintenance_thread() */
- #define DEFAULT_SLAB_BULK_CHECK 1 /* fallback value for slab_bulk_check */
- int slab_bulk_check = DEFAULT_SLAB_BULK_CHECK; /* upper bound on items examined per slab_rebalance_move() call; may be overridden through MEMCACHED_SLAB_BULK_CHECK */
- static int slab_rebalance_start(void) {
- slabclass_t *s_cls;
- int no_go = 0;
- pthread_mutex_lock(&cache_lock);
- pthread_mutex_lock(&slabs_lock);
- if (slab_rebal.s_clsid < POWER_SMALLEST ||
- slab_rebal.s_clsid > power_largest ||
- slab_rebal.d_clsid < POWER_SMALLEST ||
- slab_rebal.d_clsid > power_largest ||
- slab_rebal.s_clsid == slab_rebal.d_clsid)
- no_go = -2;
- s_cls = &slabclass[slab_rebal.s_clsid];
- if (!grow_slab_list(slab_rebal.d_clsid)) {
- no_go = -1;
- }
- if (s_cls->slabs < 2)
- no_go = -3;
- if (no_go != 0) {
- pthread_mutex_unlock(&slabs_lock);
- pthread_mutex_unlock(&cache_lock);
- return no_go; /* Should use a wrapper function... */
- }
- s_cls->killing = 1;
- slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
- slab_rebal.slab_end = (char *)slab_rebal.slab_start +
- (s_cls->size * s_cls->perslab);
- slab_rebal.slab_pos = slab_rebal.slab_start;
- slab_rebal.done = 0;
- /* Also tells do_item_get to search for items in this slab */
- slab_rebalance_signal = 2;
- if (settings.verbose > 1) {
- fprintf(stderr, "Started a slab rebalance\n");
- }
- pthread_mutex_unlock(&slabs_lock);
- pthread_mutex_unlock(&cache_lock);
- STATS_LOCK();
- stats.slab_reassign_running = true;
- STATS_UNLOCK();
- return 0;
- }
- enum move_status { /* Per-item outcome inside slab_rebalance_move():
-   MOVE_PASS   - nothing to do (the item is already marked with class id 255);
-   MOVE_DONE   - the item was taken off the free list or unlinked and is then cleared;
-   MOVE_BUSY   - the item could not be reclaimed now; the extra reference is dropped and it is retried;
-   MOVE_LOCKED - item_trylock() failed; counted as busy and retried.
- */
- MOVE_PASS=0, MOVE_DONE, MOVE_BUSY, MOVE_LOCKED
- };
- static int slab_rebalance_move(void) {
- slabclass_t *s_cls;
- int x;
- int was_busy = 0;
- int refcount = 0;
- enum move_status status = MOVE_PASS;
- pthread_mutex_lock(&cache_lock);
- pthread_mutex_lock(&slabs_lock);
- s_cls = &slabclass[slab_rebal.s_clsid];
- for (x = 0; x < slab_bulk_check; x++) {
- item *it = slab_rebal.slab_pos;
- status = MOVE_PASS;
- if (it->slabs_clsid != 255) {
- void *hold_lock = NULL;
- uint32_t hv = hash(ITEM_key(it), it->nkey);
- if ((hold_lock = item_trylock(hv)) == NULL) {
- status = MOVE_LOCKED;
- } else {
- refcount = refcount_incr(&it->refcount);
- if (refcount == 1) { /* item is unlinked, unused */
- if (it->it_flags & ITEM_SLABBED) {
- /* remove from slab freelist */
- if (s_cls->slots == it) {
- s_cls->slots = it->next;
- }
- if (it->next) it->next->prev = it->prev;
- if (it->prev) it->prev->next = it->next;
- s_cls->sl_curr--;
- status = MOVE_DONE;
- } else {
- status = MOVE_BUSY;
- }
- } else if (refcount == 2) { /* item is linked but not busy */
- if ((it->it_flags & ITEM_LINKED) != 0) {
- do_item_unlink_nolock(it, hv);
- status = MOVE_DONE;
- } else {
- /* refcount == 1 + !ITEM_LINKED means the item is being
- * uploaded to, or was just unlinked but hasn't been freed
- * yet. Let it bleed off on its own and try again later */
- status = MOVE_BUSY;
- }
- } else {
- if (settings.verbose > 2) {
- fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
- it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
- }
- status = MOVE_BUSY;
- }
- item_trylock_unlock(hold_lock);
- }
- }
- switch (status) {
- case MOVE_DONE:
- it->refcount = 0;
- it->it_flags = 0;
- it->slabs_clsid = 255;
- break;
- case MOVE_BUSY:
- refcount_decr(&it->refcount);
- case MOVE_LOCKED:
- slab_rebal.busy_items++;
- was_busy++;
- break;
- case MOVE_PASS:
- break;
- }
- slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
- if (slab_rebal.slab_pos >= slab_rebal.slab_end)
- break;
- }
- if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
- /* Some items were busy, start again from the top */
- if (slab_rebal.busy_items) {
- slab_rebal.slab_pos = slab_rebal.slab_start;
- slab_rebal.busy_items = 0;
- } else {
- slab_rebal.done++;
- }
- }
- pthread_mutex_unlock(&slabs_lock);
- pthread_mutex_unlock(&cache_lock);
- return was_busy;
- }
- static void slab_rebalance_finish(void) {
- slabclass_t *s_cls;
- slabclass_t *d_cls;
- pthread_mutex_lock(&cache_lock);
- pthread_mutex_lock(&slabs_lock);
- s_cls = &slabclass[slab_rebal.s_clsid];
- d_cls = &slabclass[slab_rebal.d_clsid];
- /* At this point the stolen slab is completely clear */
- s_cls->slab_list[s_cls->killing - 1] =
- s_cls->slab_list[s_cls->slabs - 1];
- s_cls->slabs--;
- s_cls->killing = 0;
- memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);
- d_cls->slab_list[d_cls->slabs++] = slab_rebal.slab_start;
- split_slab_page_into_freelist(slab_rebal.slab_start,
- slab_rebal.d_clsid);
- slab_rebal.done = 0;
- slab_rebal.s_clsid = 0;
- slab_rebal.d_clsid = 0;
- slab_rebal.slab_start = NULL;
- slab_rebal.slab_end = NULL;
- slab_rebal.slab_pos = NULL;
- slab_rebalance_signal = 0;
- pthread_mutex_unlock(&slabs_lock);
- pthread_mutex_unlock(&cache_lock);
- STATS_LOCK();
- stats.slab_reassign_running = false;
- stats.slabs_moved++;
- STATS_UNLOCK();
- if (settings.verbose > 1) {
- fprintf(stderr, "finished a slab move\n");
- }
- }
- /*
- slab自动重分配时,执行此函数做出重分配方案决定
- */
- static int slab_automove_decision(int *src, int *dst) {
- static uint64_t evicted_old[POWER_LARGEST];
- static unsigned int slab_zeroes[POWER_LARGEST];
- static unsigned int slab_winner = 0;
- static unsigned int slab_wins = 0;
- uint64_t evicted_new[POWER_LARGEST];
- uint64_t evicted_diff = 0;
- uint64_t evicted_max = 0;
- unsigned int highest_slab = 0;
- unsigned int total_pages[POWER_LARGEST];
- int i;
- int source = 0;
- int dest = 0;
- static rel_time_t next_run;
- /* Run less frequently than the slabmove tester. */
- if (current_time >= next_run) {
- next_run = current_time + 10;
- } else {
- return 0;
- }
- item_stats_evictions(evicted_new);
- pthread_mutex_lock(&cache_lock);
- for (i = POWER_SMALLEST; i < power_largest; i++) {
- total_pages[i] = slabclass[i].slabs;
- }
- pthread_mutex_unlock(&cache_lock);
- /* Find a candidate source; something with zero evicts 3+ times */
- for (i = POWER_SMALLEST; i < power_largest; i++) {
- evicted_diff = evicted_new[i] - evicted_old[i];
- if (evicted_diff == 0 && total_pages[i] > 2) {
- slab_zeroes[i]++;
- if (source == 0 && slab_zeroes[i] >= 3)
- source = i;
- } else {
- slab_zeroes[i] = 0;
- if (evicted_diff > evicted_max) {
- evicted_max = evicted_diff;
- highest_slab = i;
- }
- }
- evicted_old[i] = evicted_new[i];
- }
- /* Pick a valid destination */
- if (slab_winner != 0 && slab_winner == highest_slab) {
- slab_wins++;
- if (slab_wins >= 3)
- dest = slab_winner;
- } else {
- slab_wins = 1;
- slab_winner = highest_slab;
- }
- if (source && dest) {
- *src = source;
- *dst = dest;
- return 1;
- }
- return 0;
- }
- /* Slab rebalancer thread.
- * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
- * go to sleep if locks are contended
- 运行slab维护线程,slab维护线程的执行入口
- */
- static void *slab_maintenance_thread(void *arg) {
- int src, dest;
- while (do_run_slab_thread) {
- if (settings.slab_automove == 1) {
- if (slab_automove_decision(&src, &dest) == 1) {
- /* Blind to the return codes. It will retry on its own */
- slabs_reassign(src, dest); //移动slab,重分配
- }
- sleep(1);
- } else {
- /* Don't wake as often if we're not enabled.
- * This is lazier than setting up a condition right now. */
- sleep(5);
- }
- }
- return NULL;
- }
- /* Slab mover thread.
- * Sits waiting for a condition to jump off and shovel some memory about
- */
- static void *slab_rebalance_thread(void *arg) {
- int was_busy = 0;
- /* So we first pass into cond_wait with the mutex held */
- mutex_lock(&slabs_rebalance_lock);
- while (do_run_slab_rebalance_thread) {
- if (slab_rebalance_signal == 1) {
- if (slab_rebalance_start() < 0) {
- /* Handle errors with more specifity as required. */
- slab_rebalance_signal = 0;
- }
- was_busy = 0;
- } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
- was_busy = slab_rebalance_move();
- }
- if (slab_rebal.done) {
- slab_rebalance_finish();
- } else if (was_busy) {
- /* Stuck waiting for some items to unlock, so slow down a bit
- * to give them a chance to free up */
- usleep(50);
- }
- if (slab_rebalance_signal == 0) {
- /* always hold this lock while we're running */
- pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
- }
- }
- return NULL;
- }
- static int slabs_reassign_pick_any(int dst) {
- static int cur = POWER_SMALLEST - 1;
- int tries = power_largest - POWER_SMALLEST + 1;
- for (; tries > 0; tries--) {
- cur++;
- if (cur > power_largest)
- cur = POWER_SMALLEST;
- if (cur == dst)
- continue;
- if (slabclass[cur].slabs > 1) {
- return cur;
- }
- }
- return -1;
- }
- static enum reassign_result_type do_slabs_reassign(int src, int dst) {
- if (slab_rebalance_signal != 0)
- return REASSIGN_RUNNING;
- if (src == dst)
- return REASSIGN_SRC_DST_SAME;
- /* Special indicator to choose ourselves. */
- if (src == -1) {
- src = slabs_reassign_pick_any(dst);
- /* TODO: If we end up back at -1, return a new error type */
- }
- if (src < POWER_SMALLEST || src > power_largest ||
- dst < POWER_SMALLEST || dst > power_largest)
- return REASSIGN_BADCLASS;
- if (slabclass[src].slabs < 2)
- return REASSIGN_NOSPARE;
- slab_rebal.s_clsid = src;
- slab_rebal.d_clsid = dst;
- slab_rebalance_signal = 1;
- pthread_cond_signal(&slab_rebalance_cond);
- return REASSIGN_OK;
- }
- enum reassign_result_type slabs_reassign(int src, int dst) {
- enum reassign_result_type ret;
- if (pthread_mutex_trylock(&slabs_rebalance_lock) != 0) {
- return REASSIGN_RUNNING;
- }
- ret = do_slabs_reassign(src, dst);
- pthread_mutex_unlock(&slabs_rebalance_lock);
- return ret;
- }
- /* If we hold this lock, rebalancer can't wake up or move */
- void slabs_rebalancer_pause(void) {
- pthread_mutex_lock(&slabs_rebalance_lock);
- }
- void slabs_rebalancer_resume(void) {
- pthread_mutex_unlock(&slabs_rebalance_lock);
- }
- static pthread_t maintenance_tid; /* thread running slab_maintenance_thread(); created in start_slab_maintenance_thread() */
- static pthread_t rebalance_tid; /* thread running slab_rebalance_thread(); created in start_slab_maintenance_thread() */
- /**
- 启动slab维护线程
- */
- int start_slab_maintenance_thread(void) {
- int ret;
- slab_rebalance_signal = 0;
- slab_rebal.slab_start = NULL;
- char *env = getenv("MEMCACHED_SLAB_BULK_CHECK");
- if (env != NULL) {
- slab_bulk_check = atoi(env);
- if (slab_bulk_check == 0) {
- slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
- }
- }
- if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
- fprintf(stderr, "Can't intiialize rebalance condition\n");
- return -1;
- }
- pthread_mutex_init(&slabs_rebalance_lock, NULL);
- if ((ret = pthread_create(&maintenance_tid, NULL,
- slab_maintenance_thread, NULL)) != 0) {
- fprintf(stderr, "Can't create slab maint thread: %s\n", strerror(ret));
- return -1;
- }
- if ((ret = pthread_create(&rebalance_tid, NULL,
- slab_rebalance_thread, NULL)) != 0) {
- fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
- return -1;
- }
- return 0;
- }
- /**
- 停止slab维护线程,逻辑和停止哈希表维护线程一样。
- */
- void stop_slab_maintenance_thread(void) {
- mutex_lock(&cache_lock);
- do_run_slab_thread = 0;
- do_run_slab_rebalance_thread = 0;
- pthread_cond_signal(&maintenance_cond);
- pthread_mutex_unlock(&cache_lock);
- /* Wait for the maintenance thread to stop */
- pthread_join(maintenance_tid, NULL);
- pthread_join(rebalance_tid, NULL);
- }
不掉到水里,也永不知道自己有多大潜力!