items是memcache用来管理item的封装,采用的hash表和LRU链的形式,关于hash表的操作见我前几天的文章 memcache源码分析之assoc
关于item内容的存储机制简介
item的内容存储是在slab中管理的。为了对内存进行有效的管理,slab按内存块大小分成若干个桶来存储item的内容。简单举例解释一下:初始化时会创建块大小各不相同的桶,比如桶1里面的内存块都是80字节的,专门用来存储大小接近80字节的item;桶2的内存块是100字节的,专门用来存储大小接近100字节的item;桶3是120字节的,用来存储大小接近120字节的item,等等。所以,如果有一个item的大小是90字节,那它只能存储在100字节的桶内,不能存储在其他桶里面,120字节的也不可以。具体详细介绍请见我后续关于slab的文章。
问题:当100b的桶存储满的时候,memcache怎么办呢?
这个问题的答案就在本文介绍的内容里面。
为一个item分配存储空间的时候,具体的操作是这样的:
1、首先,计算该item占用的空间大小,只有知道了它的大小,才能知道它需要存储在哪个桶中。一个item的大小包括它的item结构体大小部分、名字长度部分、状态标识部分、内容大小部分等的总和。具体计算方法请看下面的代码分析中 item_make_header 函数。
2、然后寻找合适的slab用于存储,这一部分主要是比较item 和各slab桶的大小,寻找最合适的slab,此部分代码是文件 slabs.c 中的 slabs_clsid 函数,具体内容我后续关于slab的文章会详细分析。
3、从对应slab的tail队列中寻找是否存在过期的item,如果有,清除掉,此处操作最多尝试50次。
4、如果第3步操作失败,并且在对应slab中分配空间失败,那么从slab对应的tail队列中删除没有被引用的item,且最多也是尝试50次。
5、尝试从slab中分配空间。
6、如果第5步失败,会从slab对应的tail队列中删除3个小时(默认)之前的正在引用的item。
7、然后再次尝试从slab中分配空间。如果失败,返回NULL;成功则会设置item对应的一些信息(所属slab、引用计数、key、过期时间等),并返回该item的指针。
item的删除过程:
1、清除item的已链接状态(ITEM_LINKED),并从hash表中删除,此部分代码调用的是 memcache源码分析之assoc 中介绍到的函数assoc_delete
2、从LRU链中删除。函数item_unlink_q。
3、如果要清除item占用的资源,则调用函数do_item_remove和item_free,释放占用内存空间。
另外还提供了一些其他操作,分别包括,获取某个item(会判断是否过期),获取某个item(不判断是否过期),客户端通过flush_all操作清空所有过期item,item的新值替换,访问时间更新等。
当然,有item的删除操作,就要有相应的加入hash表和LRU链的操作。
另外,还提供了一些item和slab状态函数。
想了解详细代码的同学可以看一下下面的简要分析。有错误之处请指正。
items.h
/* See items.c */ uint64_t get_cas_id(void); /*@null@*/ item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes); void item_free(item *it); bool item_size_ok(const size_t nkey, const int flags, const int nbytes); int do_item_link(item *it); /** may fail if transgresses limits */ void do_item_unlink(item *it); void do_item_remove(item *it); void do_item_update(item *it); /** update LRU time to current and reposition */ int do_item_replace(item *it, item *new_it); /*@null@*/ char *do_item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes); void do_item_stats(ADD_STAT add_stats, void *c); /*@null@*/ void do_item_stats_sizes(ADD_STAT add_stats, void *c); void do_item_flush_expired(void); item *do_item_get(const char *key, const size_t nkey); item *do_item_get_nocheck(const char *key, const size_t nkey); void item_stats_reset(void); extern pthread_mutex_t cache_lock;
items.c
/* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#include "memcached.h"
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/signal.h>
#include <sys/resource.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <assert.h>

/* Forward Declarations */
static void item_link_q(item *it);
static void item_unlink_q(item *it);

/*
 * We only reposition items in the LRU queue if they haven't been repositioned
 * in this many seconds. That saves us from churning on frequently-accessed
 * items.
 */
#define ITEM_UPDATE_INTERVAL 60

#define LARGEST_ID POWER_LARGEST

/* Per-slab-class item statistics. */
typedef struct {
    unsigned int evicted;
    unsigned int evicted_nonzero;
    rel_time_t evicted_time;
    unsigned int reclaimed;
    unsigned int outofmemory;
    unsigned int tailrepairs;
} itemstats_t;

/* One LRU queue (head/tail pair) per slab class, indexed by slabs_clsid. */
static item *heads[LARGEST_ID];
static item *tails[LARGEST_ID];
static itemstats_t itemstats[LARGEST_ID];
/* Number of items currently linked in each slab class's LRU queue. */
static unsigned int sizes[LARGEST_ID];

/* Zeroes all per-class item statistics while holding cache_lock. */
void item_stats_reset(void) {
    pthread_mutex_lock(&cache_lock);
    memset(itemstats, 0, sizeof(itemstats));
    pthread_mutex_unlock(&cache_lock);
}

/* Returns a new CAS value: a process-wide counter, pre-incremented. */
uint64_t get_cas_id(void) {
    static uint64_t cas_id = 0;
    return ++cas_id;
}

/* Enable this for reference-count debugging. */
#if 0
# define DEBUG_REFCNT(it,op) \
                fprintf(stderr, "item %x refcnt(%c) %d %c%c%c\n", \
                        it, op, it->refcount, \
                        (it->it_flags & ITEM_LINKED) ? 'L' : ' ', \
                        (it->it_flags & ITEM_SLABBED) ? 'S' : ' ')
#else
# define DEBUG_REFCNT(it,op) while(0)
#endif

/**
 * Generates the variable-sized part of the header for an object.
 *
 * key     - The key
 * nkey    - The length of the key
 * flags   - key flags
 * nbytes  - Number of bytes to hold value and addition CRLF terminator
 * suffix  - Buffer for the "VALUE" line suffix (flags, size).
 * nsuffix - The length of the suffix is stored here.
 *
 * Returns the total size of the header.
 */
static size_t item_make_header(const uint8_t nkey, const int flags, const int nbytes,
                               char *suffix, uint8_t *nsuffix) {
    /* suffix is defined at 40 chars elsewhere.. */
    *nsuffix = (uint8_t) snprintf(suffix, 40, " %d %d\r\n", flags, nbytes - 2);
    return sizeof(item) + nkey + *nsuffix + nbytes;
}

/**
 * Allocates storage for a new item and fills in its header fields.
 *
 * The strategy, in order:
 *   1. scan up to 50 items from the LRU tail for an unreferenced, expired
 *      item and reuse its memory directly;
 *   2. otherwise ask the slab allocator;
 *   3. if that fails (and eviction is enabled), unlink the first
 *      unreferenced item found within 50 steps of the tail and retry;
 *   4. if that fails, unlink one referenced item older than
 *      TAIL_REPAIR_TIME (assumed to be a leaked reference) and retry.
 *
 * Returns the new item with refcount 1 (the caller's reference), or NULL
 * when no slab class fits the item or no memory could be obtained.
 */
/*@null@*/
item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    /* Total footprint of the item; nkey + 1 accounts for one extra byte after the key. */
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    /* Pick the slab class for this size; 0 means no class can hold it. */
    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return NULL;

    /* do a quick check if we have any expired items in the tail.. */
    int tries = 50;
    item *search;

    for (search = tails[id];
         tries > 0 && search != NULL;
         tries--, search = search->prev) {
        if (search->refcount == 0 &&
            (search->exptime != 0 && search->exptime < current_time)) {
            it = search;
            /* I don't want to actually free the object, just steal
             * the item to avoid to grab the slab mutex twice ;-)
             */
            STATS_LOCK();
            stats.reclaimed++;
            STATS_UNLOCK();
            itemstats[id].reclaimed++;
            /* Hold a temporary reference so do_item_unlink() does not free it. */
            it->refcount = 1;
            do_item_unlink(it);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
            it->refcount = 0;
            break;
        }
    }

    if (it == NULL && (it = slabs_alloc(ntotal, id)) == NULL) {
        /* No expired item at the tail and the slab allocation failed:
         * try to evict something.
         */
        tries = 50;

        /* If requested to not push old items out of cache when memory runs out,
         * we're out of luck at this point...
         */
        if (settings.evict_to_free == 0) {
            itemstats[id].outofmemory++;
            return NULL;
        }

        /*
         * try to get one off the right LRU
         * don't necessarily unlink the tail because it may be locked: refcount>0
         * search up from tail an item with refcount==0 and unlink it; give up after 50
         * tries
         */
        if (tails[id] == NULL) {
            itemstats[id].outofmemory++;
            return NULL;
        }

        for (search = tails[id];
             tries > 0 && search != NULL;
             tries--, search = search->prev) {
            if (search->refcount == 0) {
                if (search->exptime == 0 || search->exptime > current_time) {
                    /* Still-valid item: this is a real eviction. */
                    itemstats[id].evicted++;
                    itemstats[id].evicted_time = current_time - search->time;
                    if (search->exptime != 0)
                        itemstats[id].evicted_nonzero++;
                    STATS_LOCK();
                    stats.evictions++;
                    STATS_UNLOCK();
                } else {
                    /* Already expired: count it as reclaimed instead. */
                    itemstats[id].reclaimed++;
                    STATS_LOCK();
                    stats.reclaimed++;
                    STATS_UNLOCK();
                }
                do_item_unlink(search);
                break;
            }
        }

        it = slabs_alloc(ntotal, id);
        if (it == NULL) {
            itemstats[id].outofmemory++;
            /* Last ditch effort. There is a very rare bug which causes
             * refcount leaks. We've fixed most of them, but it still happens,
             * and it may happen in the future.
             * We can reasonably assume no item can stay locked for more than
             * three hours, so if we find one in the tail which is that old,
             * free it anyway.
             */
            tries = 50;
            for (search = tails[id];
                 tries > 0 && search != NULL;
                 tries--, search = search->prev) {
                /* Still referenced, but untouched for longer than TAIL_REPAIR_TIME. */
                if (search->refcount != 0 &&
                    search->time + TAIL_REPAIR_TIME < current_time) {
                    itemstats[id].tailrepairs++;
                    search->refcount = 0;
                    do_item_unlink(search);
                    break;
                }
            }
            it = slabs_alloc(ntotal, id);
            if (it == NULL) {
                return NULL;
            }
        }
    }

    assert(it->slabs_clsid == 0);

    it->slabs_clsid = id;

    assert(it != heads[it->slabs_clsid]);

    it->next = it->prev = it->h_next = 0;
    it->refcount = 1;     /* the caller will have a reference */
    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}

/**
 * Hands an item's memory back to the slab allocator.
 * The item must be unlinked, unreferenced and not at either end of its LRU.
 */
void item_free(item *it) {
    size_t ntotal = ITEM_ntotal(it);
    unsigned int clsid;
    assert((it->it_flags & ITEM_LINKED) == 0);
    assert(it != heads[it->slabs_clsid]);
    assert(it != tails[it->slabs_clsid]);
    assert(it->refcount == 0);

    /* so slab size changer can tell later if item is already free or not */
    clsid = it->slabs_clsid;
    it->slabs_clsid = 0;
    it->it_flags |= ITEM_SLABBED;
    DEBUG_REFCNT(it, 'F');
    slabs_free(it, ntotal, clsid);
}

/**
 * Returns true if an item with the given sizes would fit in any slab class,
 * false if it would not.
 */
bool item_size_ok(const size_t nkey, const int flags, const int nbytes) {
    char prefix[40];
    uint8_t nsuffix;

    return slabs_clsid(item_make_header(nkey + 1, flags, nbytes,
                                        prefix, &nsuffix)) != 0;
}

/* Inserts the item at the head of its slab class's LRU queue. */
static void item_link_q(item *it) { /* item is the new head */
    item **head, **tail;
    assert(it->slabs_clsid < LARGEST_ID);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    assert(it != *head);
    assert((*head && *tail) || (*head == 0 && *tail == 0));
    it->prev = 0;
    it->next = *head;
    if (it->next) it->next->prev = it;
    *head = it;
    /* The queue was empty, so the new item is the tail as well. */
    if (*tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;
    return;
}

/* Removes the item from its slab class's LRU queue. */
static void item_unlink_q(item *it) {
    item **head, **tail;
    assert(it->slabs_clsid < LARGEST_ID);
    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

    if (*head == it) {
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    sizes[it->slabs_clsid]--;
    return;
}

/**
 * Links the item into the hash table and the LRU queue, stamps its access
 * time, updates the global counters and assigns a CAS value.
 * Always returns 1.
 */
int do_item_link(item *it) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    /* Must be neither linked already nor handed back to the slab allocator. */
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;
    assoc_insert(it);

    STATS_LOCK();
    stats.curr_bytes += ITEM_ntotal(it);
    stats.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);

    item_link_q(it);

    return 1;
}

/**
 * Removes the item from the hash table and the LRU queue (no-op if it is
 * not linked). The memory is released right away if nobody references it.
 */
void do_item_unlink(item *it) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats.curr_bytes -= ITEM_ntotal(it);
        stats.curr_items -= 1;
        STATS_UNLOCK();
        assoc_delete(ITEM_key(it), it->nkey);
        item_unlink_q(it);
        if (it->refcount == 0) item_free(it);
    }
}

/**
 * Drops one reference to the item; frees it once it is both unreferenced
 * and unlinked.
 */
void do_item_remove(item *it) {
    MEMCACHED_ITEM_REMOVE(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);
    if (it->refcount != 0) {
        it->refcount--;
        DEBUG_REFCNT(it, '-');
    }
    if (it->refcount == 0 && (it->it_flags & ITEM_LINKED) == 0) {
        item_free(it);
    }
}

/**
 * Refreshes the item's access time and moves it to the LRU head, but only
 * if it has not been repositioned within ITEM_UPDATE_INTERVAL seconds.
 */
void do_item_update(item *it) {
    MEMCACHED_ITEM_UPDATE(ITEM_key(it), it->nkey, it->nbytes);
    if (it->time < current_time - ITEM_UPDATE_INTERVAL) {
        assert((it->it_flags & ITEM_SLABBED) == 0);

        if ((it->it_flags & ITEM_LINKED) != 0) {
            item_unlink_q(it);
            it->time = current_time;
            item_link_q(it);
        }
    }
}

/**
 * Replaces `it` with `new_it`: unlinks the old item, links the new one.
 * Returns the result of do_item_link(new_it).
 */
int do_item_replace(item *it, item *new_it) {
    MEMCACHED_ITEM_REPLACE(ITEM_key(it), it->nkey, it->nbytes,
                           ITEM_key(new_it), new_it->nkey, new_it->nbytes);
    assert((it->it_flags & ITEM_SLABBED) == 0);

    do_item_unlink(it);
    return do_item_link(new_it);
}

/**
 * Builds a textual dump ("ITEM <key> [<n> b; <t> s]" lines followed by
 * "END") of the items in one slab class's LRU queue, head first.
 *
 * slabs_clsid - slab class to dump
 * limit       - maximum number of items to show; 0 means no limit
 * bytes       - receives the length of the dump, excluding the final NUL
 *
 * Returns a malloc()ed buffer the caller must free, or NULL if the class id
 * is out of range or the buffer could not be allocated (in which case
 * *bytes is left untouched).
 */
/*@null@*/
char *do_item_cachedump(const unsigned int slabs_clsid, const unsigned int limit, unsigned int *bytes) {
    unsigned int memlimit = 2 * 1024 * 1024;   /* 2MB max response size */
    char *buffer;
    unsigned int bufcurr;
    item *it;
    unsigned int len;
    unsigned int shown = 0;
    char key_temp[KEY_MAX_LENGTH + 1];
    char temp[512];

    /* Reject ids that would index past the end of heads[]. */
    if (slabs_clsid >= LARGEST_ID) return NULL;
    it = heads[slabs_clsid];

    buffer = malloc((size_t)memlimit);
    if (buffer == NULL) return NULL;
    bufcurr = 0;

    while (it != NULL && (limit == 0 || shown < limit)) {
        assert(it->nkey <= KEY_MAX_LENGTH);
        /* Copy the key since it may not be null-terminated in the struct */
        memcpy(key_temp, ITEM_key(it), it->nkey);
        key_temp[it->nkey] = 0x00; /* terminate */
        len = snprintf(temp, sizeof(temp), "ITEM %s [%d b; %lu s]\r\n",
                       key_temp, it->nbytes - 2,
                       (unsigned long)it->exptime + process_started);
        if (bufcurr + len + 6 > memlimit)  /* 6 is END\r\n\0 */
            break;
        memcpy(buffer + bufcurr, temp, len);
        bufcurr += len;
        shown++;
        it = it->next;
    }

    /* Copies the terminating NUL too, but only "END\r\n" is counted. */
    memcpy(buffer + bufcurr, "END\r\n", 6);
    bufcurr += 5;

    *bytes = bufcurr;
    return buffer;
}

/**
 * Emits the per-class item statistics through add_stats for every slab
 * class whose LRU queue is non-empty, followed by a terminating empty stat.
 */
void do_item_stats(ADD_STAT add_stats, void *c) {
    int i;
    for (i = 0; i < LARGEST_ID; i++) {
        if (tails[i] != NULL) {
            /* fmt, key_str, val_str, klen and vlen are used by APPEND_NUM_FMT_STAT. */
            const char *fmt = "items:%d:%s";
            char key_str[STAT_KEY_LEN];
            char val_str[STAT_VAL_LEN];
            int klen = 0, vlen = 0;

            APPEND_NUM_FMT_STAT(fmt, i, "number", "%u", sizes[i]);
            APPEND_NUM_FMT_STAT(fmt, i, "age", "%u", tails[i]->time);
            APPEND_NUM_FMT_STAT(fmt, i, "evicted",
                                "%u", itemstats[i].evicted);
            APPEND_NUM_FMT_STAT(fmt, i, "evicted_nonzero",
                                "%u", itemstats[i].evicted_nonzero);
            APPEND_NUM_FMT_STAT(fmt, i, "evicted_time",
                                "%u", itemstats[i].evicted_time);
            APPEND_NUM_FMT_STAT(fmt, i, "outofmemory",
                                "%u", itemstats[i].outofmemory);
            APPEND_NUM_FMT_STAT(fmt, i, "tailrepairs",
                                "%u", itemstats[i].tailrepairs);
            APPEND_NUM_FMT_STAT(fmt, i, "reclaimed",
                                "%u", itemstats[i].reclaimed);
        }
    }

    /* getting here means both ascii and binary terminators fit */
    add_stats(NULL, 0, NULL, 0, c);
}

/** dumps out a list of objects of each size, with granularity of 32 bytes */
/*@null@*/
void do_item_stats_sizes(ADD_STAT add_stats, void *c) {

    /* max 1MB object, divided into 32 bytes size buckets */
    const int num_buckets = 32768;
    unsigned int *histogram = calloc(num_buckets, sizeof *histogram);

    if (histogram != NULL) {
        int i;

        /* build the histogram */
        for (i = 0; i < LARGEST_ID; i++) {
            item *iter = heads[i];
            while (iter) {
                int ntotal = ITEM_ntotal(iter);
                /* Round the size up to the next 32-byte bucket. */
                int bucket = ntotal / 32;
                if ((ntotal % 32) != 0) bucket++;
                if (bucket < num_buckets) histogram[bucket]++;
                iter = iter->next;
            }
        }

        /* write the buffer */
        for (i = 0; i < num_buckets; i++) {
            if (histogram[i] != 0) {
                char key[8];
                int klen = 0;
                klen = snprintf(key, sizeof(key), "%d", i * 32);
                assert(klen < sizeof(key));
                APPEND_STAT(key, "%u", histogram[i]);
            }
        }
        free(histogram);
    }
    add_stats(NULL, 0, NULL, 0, c);
}

/**
 * Looks up an item by key, honouring flush and expiry:
 * an item whose access time is at or before settings.oldest_live (once that
 * time has been reached), or whose exptime has passed, is unlinked and
 * reported as missing.
 *
 * Returns the item with its refcount incremented, or NULL.
 */
item *do_item_get(const char *key, const size_t nkey) {
    item *it = assoc_find(key, nkey);
    int was_found = 0;

    /* Debug trace; was_found stays 0 unless verbose output is enabled. */
    if (settings.verbose > 2) {
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND %s", key);
        } else {
            fprintf(stderr, "> FOUND KEY %s", ITEM_key(it));
            was_found++;
        }
    }

    /* Drop items invalidated by a flush (settings.oldest_live). */
    if (it != NULL && settings.oldest_live != 0 &&
        settings.oldest_live <= current_time &&
        it->time <= settings.oldest_live) {
        do_item_unlink(it);           /* MTSAFE - cache_lock held */
        it = NULL;
    }

    if (it == NULL && was_found) {
        fprintf(stderr, " -nuked by flush");
        was_found--;
    }

    /* Drop items whose expiry time has passed. */
    if (it != NULL && it->exptime != 0 && it->exptime <= current_time) {
        do_item_unlink(it);           /* MTSAFE - cache_lock held */
        it = NULL;
    }

    if (it == NULL && was_found) {
        fprintf(stderr, " -nuked by expire");
        was_found--;
    }

    if (it != NULL) {
        it->refcount++;
        DEBUG_REFCNT(it, '+');
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");

    return it;
}

/**
 * Looks up an item by key without any expiry or flush check.
 * Returns the item with its refcount incremented, or NULL if not found.
 */
item *do_item_get_nocheck(const char *key, const size_t nkey) {
    item *it = assoc_find(key, nkey);
    if (it) {
        it->refcount++;
        DEBUG_REFCNT(it, '+');
    }
    return it;
}

/**
 * Unlinks items whose access time is at or after settings.oldest_live.
 * Does nothing when settings.oldest_live is 0.
 */
void do_item_flush_expired(void) {
    int i;
    item *iter, *next;
    if (settings.oldest_live == 0)
        return;
    for (i = 0; i < LARGEST_ID; i++) {
        /* The LRU is sorted in decreasing time order, and an item's timestamp
         * is never newer than its last access time, so we only need to walk
         * back until we hit an item older than the oldest_live time.
         * The oldest_live checking will auto-expire the remaining items.
         */
        for (iter = heads[i]; iter != NULL; iter = next) {
            if (iter->time >= settings.oldest_live) {
                /* Save the successor first: unlinking may free iter. */
                next = iter->next;
                if ((iter->it_flags & ITEM_SLABBED) == 0) {
                    do_item_unlink(iter);
                }
            } else {
                /* We've hit the first old item. Continue to the next queue. */
                break;
            }
        }
    }
}