Linux Kernel Development有关内存管理
1 Pages
Page的概念来源为处理器Processor的部件MMU(Memory Management Unit),MMU通过设置好的页表(通过设置CR3寄存器,指向页目录所在的物理内存)对内存进行管理,管理操作包括:
a) 建立线性内存地址与物理内存地址的对应关系,即内核中的__pa()和__va()宏;
b) 管理哪些内存页驻留(Resident)于物理内存中,而哪些内存页被交换到Swap文件中;
c) 哪些内存页被映射到哪个进程的虚拟地址空间;
d) 管理哪些内存页存储磁盘上(或者文件系统中)文件的缓存;
数据结构, struct page
1: /*
2: * Each physical page in the system has a struct page associated with
3: * it to keep track of whatever it is we are using the page for at the
4: * moment. Note that we have no way to track which tasks are using
5: * a page, though if it is a pagecache page, rmap structures can tell us
6: * who is mapping it.
7: */
8: struct page {
9: unsigned long flags; /* Atomic flags, some possibly
10: * updated asynchronously */
11: atomic_t _count; /* Usage count, see below. */
12: union {
13: /*
14: * Count of ptes mapped in
15: * mms, to show when page is
16: * mapped & limit reverse map
17: * searches.
18: *
19: * Used also for tail pages
20: * refcounting instead of
21: * _count. Tail pages cannot
22: * be mapped and keeping the
23: * tail page _count zero at
24: * all times guarantees
25: * get_page_unless_zero() will
26: * never succeed on tail
27: * pages.
28: */
29: atomic_t _mapcount;
30:
31: struct { /* SLUB */
32: u16 inuse;
33: u16 objects;
34: };
35: };
36: union {
37: struct {
38: unsigned long private; /* Mapping-private opaque data:
39: * usually used for buffer_heads
40: * if PagePrivate set; used for
41: * swp_entry_t if PageSwapCache;
42: * indicates order in the buddy
43: * system if PG_buddy is set.
44: */
45: struct address_space *mapping; /* If low bit clear, points to
46: * inode address_space, or NULL.
47: * If page mapped as anonymous
48: * memory, low bit is set, and
49: * it points to anon_vma object:
50: * see PAGE_MAPPING_ANON below.
51: */
52: };
53: #if USE_SPLIT_PTLOCKS
54: spinlock_t ptl;
55: #endif
56: struct kmem_cache *slab; /* SLUB: Pointer to slab */
57: struct page *first_page; /* Compound tail pages */
58: };
59: union {
60: pgoff_t index; /* Our offset within mapping. */
61: void *freelist; /* SLUB: freelist req. slab lock */
62: };
63: struct list_head lru; /* Pageout list, eg. active_list
64: * protected by zone->lru_lock !
65: */
66: /*
67: * On machines where all RAM is mapped into kernel address space,
68: * we can simply calculate the virtual address. On machines with
69: * highmem some memory is mapped into kernel virtual memory
70: * dynamically, so we need a place to store that address.
71: * Note that this field could be 16 bits on x86 ... ;)
72: *
73: * Architectures with slow multiplication can define
74: * WANT_PAGE_VIRTUAL in asm/page.h
75: */
76: #if defined(WANT_PAGE_VIRTUAL)
77: void *virtual; /* Kernel virtual address (NULL if
78: not kmapped, ie. highmem) */
79: #endif /* WANT_PAGE_VIRTUAL */
80: #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
81: unsigned long debug_flags; /* Use atomic bitops on this */
82: #endif
83:
84: #ifdef CONFIG_KMEMCHECK
85: /*
86: * kmemcheck wants to track the status of each byte in a page; this
87: * is a pointer to such a status block. NULL if not tracked.
88: */
89: void *shadow;
90: #endif
91: };
每个page结构体对象,代表一个物理内存页。
x) flags
1: enum pageflags {
2: PG_locked, /* Page is locked. Don't touch. */
3: PG_error,
4: PG_referenced,
5: PG_uptodate,
6: PG_dirty,
7: PG_lru,
8: PG_active,
9: PG_slab,
10: PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
11: PG_arch_1,
12: PG_reserved,
13: PG_private, /* If pagecache, has fs-private data */
14: PG_private_2, /* If pagecache, has fs aux data */
15: PG_writeback, /* Page is under writeback */
16: #ifdef CONFIG_PAGEFLAGS_EXTENDED
17: PG_head, /* A head page */
18: PG_tail, /* A tail page */
19: #else
20: PG_compound, /* A compound page */
21: #endif
22: PG_swapcache, /* Swap page: swp_entry_t in private */
23: PG_mappedtodisk, /* Has blocks allocated on-disk */
24: PG_reclaim, /* To be reclaimed asap */
25: PG_swapbacked, /* Page is backed by RAM/swap */
26: PG_unevictable, /* Page is "unevictable" */
27: #ifdef CONFIG_MMU
28: PG_mlocked, /* Page is vma mlocked */
29: #endif
30: #ifdef CONFIG_ARCH_USES_PG_UNCACHED
31: PG_uncached, /* Page has been mapped as uncached */
32: #endif
33: #ifdef CONFIG_MEMORY_FAILURE
34: PG_hwpoison, /* hardware poisoned page. Don't touch */
35: #endif
36: #ifdef CONFIG_TRANSPARENT_HUGEPAGE
37: PG_compound_lock,
38: #endif
39: __NR_PAGEFLAGS,
40:
41: /* Filesystems */
42: PG_checked = PG_owner_priv_1,
43:
44: /* Two page bits are conscripted by FS-Cache to maintain local caching
45: * state. These bits are set on pages belonging to the netfs's inodes
46: * when those inodes are being locally cached.
47: */
48: PG_fscache = PG_private_2, /* page backed by cache */
49:
50: /* XEN */
51: PG_pinned = PG_owner_priv_1,
52: PG_savepinned = PG_dirty,
53:
54: /* SLOB */
55: PG_slob_free = PG_private,
56:
57: /* SLUB */
58: PG_slub_frozen = PG_active,
59: };
x) _count
引用计数,代表有多少处引用到该物理内存页对象。
访问_count时,不要直接读取该字段,而应调用page_count()函数进行访问。
x) private, mapping
1: struct address_space {
2: struct inode *host; /* owner: inode, block_device */
3: struct radix_tree_root page_tree; /* radix tree of all pages */
4: spinlock_t tree_lock; /* and lock protecting it */
5: unsigned int i_mmap_writable;/* count VM_SHARED mappings */
6: struct prio_tree_root i_mmap; /* tree of private and shared mappings */
7: struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
8: struct mutex i_mmap_mutex; /* protect tree, count, list */
9: /* Protected by tree_lock together with the radix tree */
10: unsigned long nrpages; /* number of total pages */
11: pgoff_t writeback_index;/* writeback starts here */
12: const struct address_space_operations *a_ops; /* methods */
13: unsigned long flags; /* error bits/gfp mask */
14: struct backing_dev_info *backing_dev_info; /* device readahead, etc */
15: spinlock_t private_lock; /* for use by the address_space */
16: struct list_head private_list; /* ditto */
17: struct address_space *assoc_mapping; /* ditto */
18: } __attribute__((aligned(sizeof(long))));
x) virtual
虚拟地址。如果该页属于高端内存(HighMem),它可能并没有被长久地映射到内核的地址空间中;当该页未被映射(kmap)时,该字段为NULL。
struct page
这个结构只是用来描述物理内存页,而与该页中存储的内容无关。
物理内存页的可能属主(Owner)有:
- user-space processes, 用户态的进程
- dynamically allocated kernel data, 内核态中动态分配的数据
- static kernel code, 内核静态代码
- the page cache,页缓存
可能程序员对为每个物理内存页都分配一个struct page而感到吃惊,“那得分配多少内存啊,多浪费啊!”。
实际上,以4GB物理内存为例(假设页大小为4KB,则约有100万个物理页;每个struct page约占40字节),大概需要40MB的内存来存储所有struct page的对象,相对于它能够管理的4GB物理内存,还是十分微不足道的。
2. Zones
“为什么不利用用户态的3GB的地址空间来映射内核的1GB地址空间映射不下的物理内存?”
因为其实,内核在工作时,是不考虑存在着用户空间的。
内核要处理的任务远比支撑用户空间要复杂。
为什么要使用不同的Zone?
有些硬件设备的DMA能访问的内存空间十分有限,比如x86的ISA总线,只能访问0~16MB的物理内存,那么如果这段内存被随便地分配掉了,ISA设备就可能无法工作了。
ISA
ISA插槽是基于ISA总线(Industry Standard Architecture,工业标准结构总线)的扩展插槽,其颜色一般为黑色,比PCI接口插槽要长些,位于主板的最下端。其工作频率为8MHz左右,为16位插槽,最大传输率16MB/sec,可插接显卡、声卡、网卡以及所谓的多功能接口卡等扩展插卡。其缺点是CPU资源占用太高,数据传输带宽太小,是已经被淘汰的插槽接口。(http://baike.baidu.com/view/13594.htm)
HighMem
对于32位x86体系结构,以896MB物理内存为界,大于该范围的为高端内存(High Memory),而小于该范围的为低端内存(Low Memory)。
内核无法直接映射HighMem区域的物理内存,只能以暂时的方式映射其中一小部分使用;当需要使用其他的高端内存时,可能需要打破之前建立的暂时映射,将其用于新的映射,就像是内存交换机制一样。