Memory Management Initialization (Part 4): mem_init, migrating from bootmem to the buddy system
mm_init() calls mem_init(), which releases into the buddy system both the low memory previously managed by the bootmem allocator and the high memory described by meminfo. Finally, the low-memory physical pages occupied by the bootmem bitmap itself are also handed over to the buddy system. Naturally, regions already in use are not released: the kernel image, the initial page tables, the pkmap page table, the struct page instances, the ramdisk, the per-CPU variables, dentry_hashtable and inode_hashtable (the segment at the start of the kernel region is released later).
start_kernel()
|---->page_address_init()
|     Relevant when high memory is supported.
|     Work: initialize the page_address_pool list; insert the
|     page_address_maps array elements into page_address_pool in
|     descending index order; initialize the page_address_htable array.
|
|---->setup_arch(&command_line);
|
|---->setup_per_cpu_areas();
|     Allocate space for the per-CPU variables.
|
|---->build_all_zonelists()
|     Build the fallback-zone lists for the zones in the system.
|     The construction in 2.6.34 does not match the figures on
|     pp. 134-135 of "Professional Linux Kernel Architecture"
|     (even for UMA). The book describes each zone having its own
|     zonelist; in 2.6.34, for UMA, the fallback lists of all zones
|     live in pglist_data->node_zonelists[0].
|
|     The per-CPU variable boot_pageset is also initialized here.
|
|---->page_alloc_init()
|     |---->hotcpu_notifier(page_alloc_cpu_notifier, 0);
|           CPU hotplug is not considered here.
|
|---->pidhash_init()
|     See below.
|     Allocate the hash table according to the number of low-memory
|     pages and the scale factor, and assign it to pid_hash.
|
|---->vfs_caches_init_early()
|     |---->dcache_init_early()
|     |     Allocate dentry_hashtable and set d_hash_shift and
|     |     d_hash_mask; same as pidhash_init(), except:
|     |     the scale factor differs (13 - PAGE_SHIFT);
|     |     the last argument passed to alloc_large_system_hash is 0.
|     |
|     |---->inode_init_early()
|           Allocate inode_hashtable and set i_hash_shift and
|           i_hash_mask; same as pidhash_init(), except:
|           the scale factor differs (14 - PAGE_SHIFT);
|           the last argument passed to alloc_large_system_hash is 0.
|
|---->mm_init()
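For reference, the table-sizing rule behind pidhash_init(), dcache_init_early() and inode_init_early() can be illustrated in isolation. The sketch below is a minimal user-space approximation of the 2.6.34 alloc_large_system_hash() sizing logic in mm/page_alloc.c, not the kernel code itself; PAGE_SHIFT = 12, the 128 MiB figure, and the function names roundup_pow_of_two()/hash_entries()/main() here are illustrative assumptions.

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

/* Round up to the next power of two (simplified stand-in for the
 * kernel helper of the same name). */
static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long r = 1;
        while (r < n)
                r <<= 1;
        return r;
}

/* Approximation of the sizing rule in alloc_large_system_hash():
 * one bucket per 2^scale bytes of low memory, rounded to a power
 * of two. */
static unsigned long hash_entries(unsigned long nr_kernel_pages, int scale)
{
        unsigned long numentries = nr_kernel_pages;

        /* Round the applicable memory size up to the nearest megabyte. */
        numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
        numentries >>= 20 - PAGE_SHIFT;
        numentries <<= 20 - PAGE_SHIFT;

        if (scale > PAGE_SHIFT)
                numentries >>= (scale - PAGE_SHIFT);
        else
                numentries <<= (PAGE_SHIFT - scale);

        return roundup_pow_of_two(numentries);
}

int main(void)
{
        /* Assume 128 MiB of low memory = 32768 pages of 4 KiB.
         * dcache_init_early() passes scale 13, inode_init_early() 14. */
        printf("dentry hash entries: %lu\n", hash_entries(32768, 13));
        printf("inode  hash entries: %lu\n", hash_entries(32768, 14));
        return 0;
}

The scale parameter thus halves the table size for each extra bit: the inode table ends up half the size of the dentry table for the same amount of low memory.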
void mm_init(void)
|---->mem_init()
|     Work: migrate from bootmem to the buddy system.
void mem_init(void)
|-->max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
|   max_pfn is the total number of physical memory pages;
|   PHYS_PFN_OFFSET is the page frame number of the start of physical
|   memory within the 4 GiB address space.
|   pfn_to_page(max_pfn + PHYS_PFN_OFFSET) is the virtual address of
|   the struct page instance for the page at the end of physical
|   memory; subtracting mem_map (the virtual address of the first
|   struct page) makes max_mapnr the number of struct page instances.
|
|-->free_unused_memmap_node(0, &meminfo)
|   For contiguous memory there are no gaps between banks, so
|   free_unused_memmap_node does nothing.
|
|-->totalram_pages += free_all_bootmem_node(pgdat);
|   |--->return free_all_bootmem_core(pgdat->bdata);
|        1. Release the unused low-memory pages to the buddy system.
|        2. Release the pages occupied by the bootmem bitmap itself
|           to the buddy system as well.
|
|-->for_each_nodebank(i, &meminfo, node = 0)
|--{
|   unsigned long start = bank_pfn_start(&meminfo.bank[i]);
|   unsigned long end = bank_pfn_end(&meminfo.bank[i]);
|
|   That is, free_area(start, end, NULL) is applied only to high
|   memory:
|   if (start >= max_low_pfn + PHYS_PFN_OFFSET)
|       totalhigh_pages += free_area(start, end, NULL);
|--}
|
|-->totalram_pages += totalhigh_pages;
|
|-->for_each_nodebank(i, &meminfo, node)
|--{
|   Count the pages already allocated (physical pages in use) into
|   reserved_pages, and the pages not yet allocated (physical pages
|   unused) into free_pages.
|   .........
|--}
|
|-->num_physpages = total size of the zones managed by the membanks
|   in meminfo.
|
|-->printk: nr_free_pages() << (PAGE_SHIFT - 10)
|   The value behind nr_free_pages() is actually maintained in
|   free_one_page() via
|   __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order),
|   which updates zone->vm_stat[NR_FREE_PAGES].
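Under the flat memory model, pfn_to_page() is plain pointer arithmetic on mem_map, which is why the max_mapnr expression above reduces to a subtraction. A minimal user-space sketch of that arithmetic; the macro layout follows the FLATMEM case of include/asm-generic/memory_model.h, while the concrete PHYS_PFN_OFFSET value and the 16-page array are assumptions for illustration.

#include <stdio.h>

/* Illustrative values: physical memory starting at 0x20000000 with
 * 4 KiB pages gives a starting pfn of 0x20000. */
#define PHYS_PFN_OFFSET 0x20000UL
#define ARCH_PFN_OFFSET PHYS_PFN_OFFSET

struct page { unsigned long flags; /* ... */ };

static struct page mem_map_storage[16];
static struct page *mem_map = mem_map_storage;

/* FLATMEM pfn_to_page(): index mem_map by (pfn - ARCH_PFN_OFFSET). */
#define pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET))

int main(void)
{
        unsigned long max_pfn = 16;     /* pretend 16 pages of RAM */

        /* Exactly the mem_init() expression: the offsets cancel, so
         * the result is simply the number of struct page instances. */
        unsigned long max_mapnr =
                pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;

        printf("max_mapnr = %lu\n", max_mapnr);  /* prints 16 */
        return 0;
}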
void free_unused_memmap_node(int node, struct meminfo *mi)
|-->unsigned long bank_start, prev_bank_end = 0;
|   unsigned int i = 0;
|
|-->for_each_nodebank(i, mi, node)
|   Iterate over all membanks of meminfo that belong to this node;
|   for UMA, the membanks split into a low-memory bank and a
|   high-memory bank.
|
|   struct membank *bank = &mi->bank[i];
|   bank_start = bank_pfn_start(bank);
|
|   if (prev_bank_end && prev_bank_end != bank_start)
|       free_memmap(node, prev_bank_end, bank_start);
|   For contiguous memory there are no gaps between banks, so
|   free_memmap is never called.
|
|   prev_bank_end = bank_pfn_end(bank);
|--
|
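The gap detection itself is a one-pass scan over banks sorted by address. A self-contained sketch of just that scan; struct membank is reduced to pfn bounds, and the bank layout in main() is a made-up example.

#include <stdio.h>

/* Reduced membank: just a [start, end) range of page frame numbers. */
struct membank { unsigned long start_pfn, end_pfn; };

/* Report every hole between consecutive banks; in the kernel this is
 * where free_memmap() returns the hole's memmap pages to bootmem. */
static void scan_for_holes(struct membank *bank, int nr)
{
        unsigned long prev_end = 0;
        int i;

        for (i = 0; i < nr; i++) {
                if (prev_end && prev_end != bank[i].start_pfn)
                        printf("hole: pfn %lu..%lu\n",
                               prev_end, bank[i].start_pfn);
                prev_end = bank[i].end_pfn;
        }
}

int main(void)
{
        /* Two banks with a hole in between (made-up pfns). */
        struct membank banks[] = {
                { 0x20000, 0x28000 },
                { 0x30000, 0x38000 },
        };
        scan_for_holes(banks, 2);  /* prints: hole: pfn 163840..196608 */
        return 0;
}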
void free_memmap(int node, unsigned long start_pfn,
unsigned long end_pfn)
|-->struct page *start_pg = NULL, *end_pg = NULL;
|   unsigned long pg = 0, pgend = 0;
|
|-->start_pg = pfn_to_page(start_pfn - 1) + 1;
|   The virtual address of the struct page instance for this pfn.
|   end_pg = pfn_to_page(end_pfn);
|   The virtual address of the struct page instance for this pfn.
|
|-->pg = PAGE_ALIGN(__pa(start_pg));
|   The physical address of the struct page instance for start_pfn,
|   rounded up to a page boundary.
|   pgend = __pa(end_pg) & PAGE_MASK;
|   The physical address of the struct page instance for end_pfn,
|   rounded down to a page boundary.
|
|-->free_bootmem_node(&contig_page_data, pg, pgend - pg);
|   Clear the bootmem bitmap bits for the pages in [pg, pgend).
|
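The rounding directions matter: pg is rounded up and pgend rounded down so that only memmap pages lying wholly inside the hole are freed; a partially used page on either edge stays reserved. A small sketch of the two macros (the addresses are illustrative):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
/* Round an address up to the next page boundary. */
#define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        unsigned long start = 0x80001234UL;  /* made-up physical addrs */
        unsigned long end   = 0x80005678UL;

        /* Shrink to the whole pages inside [start, end). */
        unsigned long pg    = PAGE_ALIGN(start);  /* 0x80002000 */
        unsigned long pgend = end & PAGE_MASK;    /* 0x80005000 */

        printf("free [%#lx, %#lx), %lu pages\n",
               pg, pgend, (pgend - pg) >> PAGE_SHIFT);
        return 0;
}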
int free_area(unsigned long pfn, unsigned long end, char *s)
|-->unsigned int pages = 0, size = (end - pfn) << (PAGE_SHIFT - 10);
|   size is the region size in KiB.
|
|--for (; pfn < end; pfn++)
|--{
|   struct page *page = pfn_to_page(pfn);
|   ClearPageReserved(page);
|   init_page_count(page);
|
|   __free_page(page);
|   |--->__free_pages(page, 0);
|        See below.
|
|   pages++;
|--}
|
unsigned long free_all_bootmem_core(bootmem_data_t *bdata)
|-->unsigned long start = bdata->node_min_pfn;
|   The first physical page frame number of low memory.
|   unsigned long end = bdata->node_low_pfn;
|   The last physical page frame number of low memory.
|
|-->while (start < end)
|--{
|   unsigned long *map = bdata->node_bootmem_map;
|   idx = start - bdata->node_min_pfn;
|   Offset of this page frame relative to the first one (counted
|   from 0).
|   vec = ~map[idx / BITS_PER_LONG];
|   The complement of one word of the bitmap (a set bit in vec
|   means the page is free).
|
|   if (vec == ~0UL && start + BITS_PER_LONG < end)
|   If the whole word of the bitmap is 0, i.e. every page in the
|   word can be released:
|   {   int order = ilog2(BITS_PER_LONG);
|       __free_pages_bootmem(pfn_to_page(start), order);
|       count += BITS_PER_LONG; }
|
|   else  // the word's bitmap is not all zero
|   {   Walk every bit of the word; let off be the bit's offset
|       within the word. If the bit in vec is 1 (page free), then:
|       page = pfn_to_page(start + off);
|       __free_pages_bootmem(page, 0);
|       count++; }
|
|   start += BITS_PER_LONG;
|--}
|
|-->page = virt_to_page(bdata->node_bootmem_map);
|   The struct page instance for the first page occupied by the
|   bitmap.
|
|   pages = bdata->node_low_pfn - bdata->node_min_pfn;
|   pages = bootmem_bootmap_pages(pages);
|   The number of pages occupied by the bitmap.
|
|   count += pages;
|   Update the total number of released pages.
|
|-->while (pages--)
|       __free_pages_bootmem(page++, 0);
|   Release the pages occupied by the bootmem bitmap itself to the
|   buddy system.
|
|-->return count;
|   Return the total number of pages released to the buddy system.
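The scan releases BITS_PER_LONG pages at once, as a single block of order ilog2(BITS_PER_LONG), whenever a whole bitmap word is free, and falls back to single pages otherwise. A user-space sketch of that word-at-a-time pattern; the bitmap contents and the free_block() stub are made up for the demo, and a 32-bit word stands in for BITS_PER_LONG.

#include <stdio.h>
#include <stdint.h>

#define BITS_PER_WORD 32   /* stand-in for BITS_PER_LONG */

/* Stand-in for __free_pages_bootmem(): report what would be freed. */
static void free_block(unsigned long pfn, int order)
{
        printf("free pfn %lu, order %d (%d pages)\n",
               pfn, order, 1 << order);
}

/* A set bit in map means the page is allocated, as in the bootmem
 * bitmap; the complement vec therefore marks free pages. */
static unsigned long scan(const uint32_t *map, unsigned long start,
                          unsigned long end)
{
        unsigned long count = 0;

        while (start < end) {
                uint32_t vec = ~map[start / BITS_PER_WORD];

                if (vec == UINT32_MAX && start + BITS_PER_WORD < end) {
                        /* Whole word free: release it as one order-5
                         * block, since ilog2(32) == 5. */
                        free_block(start, 5);
                        count += BITS_PER_WORD;
                } else {
                        unsigned int off;
                        for (off = 0; off < BITS_PER_WORD; off++)
                                if (vec & (1U << off)) {
                                        free_block(start + off, 0);
                                        count++;
                                }
                }
                start += BITS_PER_WORD;
        }
        return count;
}

int main(void)
{
        /* 96 pages in 3 words: word 0 fully free, word 1 half
         * allocated, word 2 fully allocated. */
        uint32_t map[3] = { 0x00000000u, 0x0000ffffu, 0xffffffffu };
        printf("released %lu pages\n", scan(map, 0, 96));
        return 0;
}

Note the `start + BITS_PER_WORD < end` guard: the last word is always released page by page, matching the kernel's behaviour.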
void __free_pages_bootmem(struct page *page, unsigned int order)
|-->if (order == 0)
|--{
|   __ClearPageReserved(page);
|   Clear PG_reserved in page->flags.
|   set_page_count(page, 0);
|   Set page->_count to 0.
|   set_page_refcounted(page);
|   Set page->_count to 1.
|   __free_page(page);
|   |-->__free_pages(page, 0)
|--}
|
|--else
|--{
|   int loop = 0;
|   for (loop = 0; loop < BITS_PER_LONG; loop++)
|   {   struct page *p = &page[loop];
|       __ClearPageReserved(p);
|       Clear PG_reserved in p->flags.
|       set_page_count(p, 0);
|       Set p->_count to 0. }
|
|   set_page_refcounted(page);
|   // Note: outside the loop, only the first struct page of the
|   // word-sized block gets its _count set to 1.
|   __free_pages(page, order);
|--}
void __free_pages(struct page *page, unsigned int order)
|-->if (put_page_testzero(page))
|--{
|   // put_page_testzero(page) decrements page->_count by 1 and
|   // tests whether page->_count has reached 0; if so, this block
|   // is executed.
|   if (order == 0)
|       free_hot_cold_page(page, 0);
|   else
|       __free_pages_ok(page, order);
|--}
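This is why __free_pages_bootmem() sets _count to 1 only on the head page: __free_pages() drops exactly one reference on the head and frees the whole 2^order block once it reaches zero, while the tail pages stay at 0. A toy model of that contract; the real page->_count is an atomic_t, mocked here with a plain int for a single-threaded demo, and free_pages_demo() is a made-up name.

#include <stdio.h>

/* Toy page with just a reference count. */
struct page { int count; };

/* Mock of put_page_testzero(): drop one reference, report zero. */
static int put_page_testzero(struct page *page)
{
        return --page->count == 0;
}

static void free_pages_demo(struct page *page, unsigned int order)
{
        if (put_page_testzero(page))
                printf("freeing a 2^%u-page block\n", order);
        else
                printf("still referenced, not freed\n");
}

int main(void)
{
        /* As after __free_pages_bootmem()'s setup: head _count = 1,
         * tail pages 0. Freeing by the head releases the block. */
        struct page block[4] = { { .count = 1 } };
        free_pages_demo(&block[0], 2);
        return 0;
}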
// Here we only look at the situation during system initialization.
/*
 * Free a 0-order page
 * cold == 1 ? free a cold page : free a hot page
 */
void free_hot_cold_page(struct page *page, int cold)
|-->struct zone *zone = page_zone(page);
|   Get the zone this page belongs to from page->flags.
|
|-->int migratetype = get_pageblock_migratetype(page);
|   Get the migrate type from the pageblock the page belongs to;
|   here MIGRATE_MOVABLE.
|
|-->set_page_private(page, migratetype);
|   During initialization, pages are set to MIGRATE_MOVABLE.
|
|-->struct per_cpu_pages *pcp = NULL;
|   pcp = &this_cpu_ptr(zone->pageset)->pcp;
|
|-->if (cold)
|       list_add_tail(&page->lru, &pcp->lists[migratetype]);
|   else
|       list_add(&page->lru, &pcp->lists[migratetype]);
|
|-->pcp->count++;
|
|   During initialization pcp->count goes 0 --> 1; pcp->high = 0;
|   pcp->batch = 1.
|-->if (pcp->count >= pcp->high)
|   {   free_pcppages_bulk(zone, pcp->batch, pcp);
|       pcp->count -= pcp->batch; }
|
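The per-CPU page cache is just a set of lists with a high-water mark: hot pages go to the list head (likely still cache-warm), cold pages to the tail, and once count reaches high, a batch is drained back to the buddy lists. A reduced sketch of that policy; list handling is simplified to an array, and the names drain_batch()/free_page_demo() and the HIGH/BATCH values are assumptions for the demo.

#include <stdio.h>

#define BATCH 4
#define HIGH  8

/* Reduced per-CPU page list: an array standing in for the kernel's
 * per-migratetype lru lists. Index 0 is the "hot" end. */
struct pcp {
        int pages[HIGH + 1];
        int count;
};

/* Stand-in for free_pcppages_bulk(): hand `batch` pages back to the
 * buddy system, taking them from the cold (tail) end. */
static void drain_batch(struct pcp *pcp, int batch)
{
        while (batch--)
                printf("to buddy: page %d\n", pcp->pages[--pcp->count]);
}

/* Mirror of free_hot_cold_page()'s policy: hot pages at the head,
 * cold pages at the tail, drain a batch when the high mark is hit. */
static void free_page_demo(struct pcp *pcp, int page, int cold)
{
        int i;

        if (cold) {
                pcp->pages[pcp->count] = page;       /* tail insert */
        } else {
                for (i = pcp->count; i > 0; i--)     /* head insert */
                        pcp->pages[i] = pcp->pages[i - 1];
                pcp->pages[0] = page;
        }
        if (++pcp->count >= HIGH)
                drain_batch(pcp, BATCH);
}

int main(void)
{
        struct pcp pcp = { .count = 0 };
        int n;

        for (n = 1; n <= 10; n++)
                free_page_demo(&pcp, n, n & 1);      /* alternate hot/cold */
        printf("%d pages still cached per-CPU\n", pcp.count);
        return 0;
}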
void free_pcppages_bulk(struct zone *zone, int count,
                        struct per_cpu_pages *pcp)
|-->We sidestep some issues here; since these notes focus on
|   initialization, only what the function does during initialization
|   is recorded:
|   list_del(&page->lru);  // remove from the MIGRATE_MOVABLE list
|   __free_one_page(page, zone, 0, page_private(page));
void __free_pages_ok(struct page *page, unsigned int order)
|-->free_one_page(page_zone(page), page, order,
|                 get_pageblock_migratetype(page));
|   |-->__free_one_page(page, zone, order, migratetype);
void __free_one_page(struct page *page, struct zone *zone,
                     unsigned int order, int migratetype)
|-->unsigned long page_idx = page_to_pfn(page)
|                            & ((1 << MAX_ORDER) - 1);
|
|--while (order < MAX_ORDER - 1)
|--{
|   unsigned long combined_idx;
|   struct page *buddy;
|
|   buddy = __page_find_buddy(page, page_idx, order);
|   Find the page that could form a buddy pair with page_idx.
|
|   Test whether the buddy of page_idx is in the buddy system:
|   if (!page_is_buddy(page, buddy, order))
|       break;
|
|   If it is, merge the buddies; merges can cascade, hence the
|   while loop:
|   list_del(&buddy->lru);
|
|   zone->free_area[order].nr_free--;
|   Meaning of nr_free: at a given order there are
|   nr_free * (2^order) free pages.
|
|   rmv_page_order(buddy);
|
|   combined_idx = __find_combined_index(page_idx, order);
|   combined_idx is computed because merging may cascade: page and
|   page_idx are advanced to the combined block and order is
|   incremented before the next iteration.
|--}
|
|   During initialization everything goes onto
|   free_list[MIGRATE_MOVABLE]:
|-->set_page_order(page, order);
|   list_add(&page->lru,
|            &zone->free_area[order].free_list[migratetype]);
|   zone->free_area[order].nr_free++;
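The buddy search is pure index arithmetic: at order n, the buddy of a block differs only in bit n of the page index, so __page_find_buddy() is an XOR and __find_combined_index() clears that bit. A standalone demo of the merge cascade on bare indices; the free-set here is a toy scenario spelled out in comments, not the kernel's free lists.

#include <stdio.h>

/* Buddy of block `idx` at `order`: flip bit `order` of the index. */
static unsigned long buddy_index(unsigned long idx, unsigned int order)
{
        return idx ^ (1UL << order);
}

/* Index of the merged block: clear bit `order`. */
static unsigned long combined_index(unsigned long idx, unsigned int order)
{
        return idx & ~(1UL << order);
}

int main(void)
{
        /* Toy scenario: pretend block 13 (order 0) and block 14
         * (order 1) are already free, then free page 12 and watch
         * the merges cascade. */
        unsigned long idx = 12;
        unsigned int order = 0;

        /* order 0: buddy of 12 is 13 -> merge into block 12, order 1 */
        printf("buddy of %lu at order %u: %lu\n",
               idx, order, buddy_index(idx, order));        /* 13 */
        idx = combined_index(idx, order); order++;

        /* order 1: buddy of 12 is 14 -> merge into block 12, order 2 */
        printf("buddy of %lu at order %u: %lu\n",
               idx, order, buddy_index(idx, order));        /* 14 */
        idx = combined_index(idx, order); order++;

        printf("final block: index %lu, order %u\n", idx, order);
        return 0;
}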