本文共 11781 字,大约阅读时间需要 39 分钟。
这篇文章的目的是想把Memcached的内存管理机制讲解清楚,在前面的文章中我们已经提交到Item是Memcached中存储的数据单元,而Item的内存分配策略就是本章的重点了。
Memcached的存储机制Slabs,所有Item相关的内存分配都由Slabs来实现的。typedef struct _stritem { /* Protected by LRU locks */ struct _stritem *next; struct _stritem *prev; /* Rest are protected by an item lock */ struct _stritem *h_next; /* hash chain next */ rel_time_t time; /* least recent access */ rel_time_t exptime; /* expire time */ int nbytes; /* size of data */ unsigned short refcount; uint8_t nsuffix; /* length of flags-and-length string */ uint8_t it_flags; /* ITEM_* above */ uint8_t slabs_clsid;/* which slab class we're in */ uint8_t nkey; /* key length, w/terminating null and padding */ /* this odd type prevents type-punning issues when we do * the little shuffle to save space when not using CAS. */ union { uint64_t cas; char end; } data[]; /* if it_flags & ITEM_CAS we have 8 bytes CAS */ /* then null-terminated key */ /* then " flags length\r\n" (no terminating null) */ /* then data with terminating \r\n (no terminating null; it's binary!) */} item;
static void settings_init(void) { settings.use_cas = true; settings.access = 0700; settings.port = 11211; settings.udpport = 0; /* By default this string should be NULL for getaddrinfo() */ settings.inter = NULL; settings.maxbytes = 64 * 1024 * 1024; /* default is 64MB */ settings.maxconns = 1024; /* to limit connections-related memory to about 5MB */ settings.verbose = 0; settings.oldest_live = 0; settings.oldest_cas = 0; /* supplements accuracy of oldest_live */ settings.evict_to_free = 1; /* push old items out of cache when memory runs out */ settings.socketpath = NULL; /* by default, not using a unix socket */ settings.factor = 1.25; settings.chunk_size = 48; /* space for a modest key and value */ settings.num_threads = 4; /* N workers */ settings.num_threads_per_udp = 0; settings.prefix_delimiter = ':'; settings.detail_enabled = 0; settings.reqs_per_event = 20; settings.backlog = 1024; settings.binding_protocol = negotiating_prot; settings.item_size_max = 1024 * 1024; /* The famous 1MB upper limit. */ settings.slab_page_size = 1024 * 1024; /* chunks are split from 1MB pages. */ settings.slab_chunk_size_max = settings.slab_page_size / 2; settings.sasl = false; settings.maxconns_fast = true; settings.lru_crawler = false; settings.lru_crawler_sleep = 100; settings.lru_crawler_tocrawl = 0; settings.lru_maintainer_thread = false; settings.lru_segmented = true; settings.hot_lru_pct = 20; settings.warm_lru_pct = 40; settings.hot_max_factor = 0.2; settings.warm_max_factor = 2.0; settings.inline_ascii_response = false; settings.temp_lru = false; settings.temporary_ttl = 61; settings.idle_timeout = 0; /* disabled */ settings.hashpower_init = 0; settings.slab_reassign = true; settings.slab_automove = 1; settings.slab_automove_ratio = 0.8; settings.slab_automove_window = 30; settings.shutdown_command = false; settings.tail_repair_time = TAIL_REPAIR_TIME_DEFAULT; settings.flush_enabled = true; settings.dump_enabled = true; settings.crawls_persleep = 1000; settings.logger_watcher_buf_size = LOGGER_WATCHER_BUF_SIZE; settings.logger_buf_size = LOGGER_BUF_SIZE; settings.drop_privileges = false;#ifdef MEMCACHED_DEBUG settings.relaxed_privileges = false;#endif}
slabclass_t结构的核心字段已经进行注释了,核心void *slots表示空间的chunk列表,其实也就是item列表。
typedef struct { unsigned int size; //当前的slabclass存储最大多大的item unsigned int perslab; //每一个slab上可以存储多少个item.每个slab大小为1M, 可以存储的item个数根据size决定。 void *slots; //空闲列表,其实是连续的内存分配,但是还是通过item结构中的item->next和item->prev连建立链表结构关系 unsigned int sl_curr; //当sl_curr=0的时候,说明已经没有空闲的item,需要分配一个新的slab(每个1M,可以切割成N多个Item结构) unsigned int slabs; //总共分配多少个slabs void **slab_list; //分配的slab链表 unsigned int list_size; /* size of prev array */ size_t requested; //总共请求的总bytes} slabclass_t;从这里开始进入slabclass内存初始化的核心逻辑,整体过程如下:slabclass 划分数据空间
#define POWER_SMALLEST 1#define SLAB_GLOBAL_PAGE_POOL 0#define POWER_LARGEST 256 /* actual cap is 255 */#define MAX_NUMBER_OF_SLAB_CLASSES (63 + 1)static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes) { int i = POWER_SMALLEST - 1; // 设置每个slabclass中item的size大小 unsigned int size = sizeof(item) + settings.chunk_size; memset(slabclass, 0, sizeof(slabclass)); // 每次以1.5倍的大小增加slabclass中item的size大小 while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) { if (slab_sizes != NULL) { if (slab_sizes[i-1] == 0) break; size = slab_sizes[i-1]; } else if (size >= settings.slab_chunk_size_max / factor) { // 如果已经达到了最大值settings.slab_chunk_size_max break; } // 执行对齐操作 if (size % CHUNK_ALIGN_BYTES) size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); //每个slabclass[i]存储最大多大的item,size表示item的大小 slabclass[i].size = size; slabclass[i].perslab = settings.slab_page_size / slabclass[i].size; // 这里以1.5倍的速率增加slabclass中item的大小 if (slab_sizes == NULL) size *= factor; } // 根据settings.slab_chunk_size_max的值设置最多多少个slabclass的个数 power_largest = i; slabclass[power_largest].size = settings.slab_chunk_size_max; slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max; { char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC"); if (t_initial_malloc) { mem_malloced = (size_t)atol(t_initial_malloc); } } // 进入slabclass的分配 if (prealloc && do_slab_prealloc) { slabs_preallocate(power_largest); }}static void slabs_preallocate (const unsigned int maxslabs) { int i; unsigned int prealloc = 0; // 遍历每个slabclass进行初始化,这里会综合考虑最大的支持的slabclass和根据内存分配设置的合适的个数, //这里的POWER_SMALLEST的值是1,slabclass的数组的第一个元素是特殊意义的 for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) { if (++prealloc > maxslabs) return; // 进行slabclass的分配 if (do_slabs_newslab(i) == 0) { exit(1); } }}// 这里的id就是slabclass的下标,这里已经进入到单个slabclass初始化过程了static int do_slabs_newslab(const unsigned int id) { slabclass_t *p = &slabclass[id]; slabclass_t *g = &slabclass[SLAB_GLOBAL_PAGE_POOL]; // 这里我们在计算slab的大小 int len = (settings.slab_reassign || settings.slab_chunk_size_max != settings.slab_page_size) ? settings.slab_page_size : p->size * p->perslab; char *ptr; if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0 && g->slabs == 0)) { mem_limit_reached = true; MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id); return 0; } // 通过memory_allocate函数分配len大小的内存空间 if ((grow_slab_list(id) == 0) || (((ptr = get_page_from_global_pool()) == NULL) && ((ptr = memory_allocate((size_t)len)) == 0))) { MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id); return 0; } // 初始化该块内存为0 memset(ptr, 0, (size_t)len); split_slab_page_into_freelist(ptr, id); // slab_list保存slabs个数的slab的指针 p->slab_list[p->slabs++] = ptr; MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id); return 1;}//初始化单个slabclass下第一个slab当中的item。static void split_slab_page_into_freelist(char *ptr, const unsigned int id) { slabclass_t *p = &slabclass[id]; int x; //遍历slab下的perslab个数的size大小的内存块,将所有的内存块挂到free_list当中 for (x = 0; x < p->perslab; x++) { do_slabs_free(ptr, 0, id); ptr += p->size; }}//初始化slab当中单个item的内存空间static void do_slabs_free(void *ptr, const size_t size, unsigned int id) { slabclass_t *p; item *it; p = &slabclass[id]; it = (item *)ptr; if ((it->it_flags & ITEM_CHUNKED) == 0) { //省略相关代码 } else { do_slabs_free_chunked(it, size); } return;}// 采用链表的头部插入法生成p->slots,也就是说slab当中分配的item以链表的形式进行连接并保存到p->slots当中static void do_slabs_free_chunked(item *it, const size_t size) { item_chunk *chunk = (item_chunk *) ITEM_data(it); slabclass_t *p; it->it_flags = ITEM_SLABBED; it->slabs_clsid = 0; it->prev = 0; // header object's original classid is stored in chunk. p = &slabclass[chunk->orig_clsid]; if (chunk->next) { chunk = chunk->next; chunk->prev = 0; } else { // header with no attached chunk chunk = NULL; } // return the header object. // TODO: This is in three places, here and in do_slabs_free(). it->prev = 0; it->next = p->slots; if (it->next) it->next->prev = it; p->slots = it; p->sl_curr++; // TODO: macro p->requested -= it->nkey + 1 + it->nsuffix + sizeof(item) + sizeof(item_chunk); if (settings.use_cas) { p->requested -= sizeof(uint64_t); } item_chunk *next_chunk; while (chunk) { assert(chunk->it_flags == ITEM_CHUNK); chunk->it_flags = ITEM_SLABBED; p = &slabclass[chunk->slabs_clsid]; chunk->slabs_clsid = 0; next_chunk = chunk->next; chunk->prev = 0; chunk->next = p->slots; if (chunk->next) chunk->next->prev = chunk; p->slots = chunk; p->sl_curr++; p->requested -= chunk->size + sizeof(item_chunk); chunk = next_chunk; } return;}
其实两幅图都是对的,但是侧重点稍微不一样。
slabs_clsid方法,主要通过item的长度来查询应该适合存放到哪个slabsclass_t上面。找的逻辑就是简单的找到第一个可以容纳得下size大小的slabclass。
//通过item的size,选择当前的item适合放在哪个slab class中unsigned int slabs_clsid(const size_t size) { int res = POWER_SMALLEST; //从id = 1开始查找 //slabclass这个结构上的size会存储该class适合多大的item存储 //例如 //slabclass[0] 存储96byte //slabclass[1] 存储120byte //slabclass[2] 存储150byte //则,如果存储的item等于109byte,则存储在slabclass[1]上 if (size == 0) return 0; while (size > slabclass[res].size) if (res++ == power_largest) /* won't fit in the biggest slab */ return 0; return res;}
转载地址:http://dzwno.baihongyu.com/