Linux Kernel Development有关内存管理

1 Pages

Page的概念来源为处理器Processor的部件MMU(Memory Management Unit)，MMU通过设置好的页表(通过设置CR3寄存器，指向页目录所在的物理内存)对内存进行管理，管理操作包括：

a) 建立线性内存地址与物理内存地址的对应关系，即内核中的__pa()和__va()宏；

b) 管理哪些内存页驻存(Resident)于物理内存中，而哪些内存被交换到Swap文件中；

c) 哪些内存页被映射到哪个进程的虚拟地址空间；

d) 管理哪些内存页存储磁盘上（或者文件系统中）文件的缓存；

数据结构， struct page

   1: /*   2:  * Each physical page in the system has a struct page associated with   3:  * it to keep track of whatever it is we are using the page for at the   4:  * moment. Note that we have no way to track which tasks are using   5:  * a page, though if it is a pagecache page, rmap structures can tell us   6:  * who is mapping it.   7:  */   8: struct page {   9:     unsigned long flags;        /* Atomic flags, some possibly  10:                      * updated asynchronously */  11:     atomic_t _count;        /* Usage count, see below. */  12:     union {  13:         /*  14:          * Count of ptes mapped in  15:          * mms, to show when page is  16:          * mapped & limit reverse map  17:          * searches.  18:          *  19:          * Used also for tail pages  20:          * refcounting instead of  21:          * _count. Tail pages cannot  22:          * be mapped and keeping the  23:          * tail page _count zero at  24:          * all times guarantees  25:          * get_page_unless_zero() will  26:          * never succeed on tail  27:          * pages.  28:          */  29:         atomic_t _mapcount;  30:    31:         struct {        /* SLUB */  32:             u16 inuse;  33:             u16 objects;  34:         };  35:     };  36:     union {  37:         struct {  38:         unsigned long private;        /* Mapping-private opaque data:  39:                           * usually used for buffer_heads  40:                          * if PagePrivate set; used for  41:                          * swp_entry_t if PageSwapCache;  42:                          * indicates order in the buddy  43:                          * system if PG_buddy is set.  44:                          */  45:         struct address_space *mapping;    /* If low bit clear, points to  46:                          * inode address_space, or NULL.  
47:                          * If page mapped as anonymous  48:                          * memory, low bit is set, and  49:                          * it points to anon_vma object:  50:                          * see PAGE_MAPPING_ANON below.  51:                          */  52:         };  53: #if USE_SPLIT_PTLOCKS  54:         spinlock_t ptl;  55: #endif  56:         struct kmem_cache *slab;    /* SLUB: Pointer to slab */  57:         struct page *first_page;    /* Compound tail pages */  58:     };  59:     union {  60:         pgoff_t index;        /* Our offset within mapping. */  61:         void *freelist;        /* SLUB: freelist req. slab lock */  62:     };  63:     struct list_head lru;        /* Pageout list, eg. active_list  64:                      * protected by zone->lru_lock !  65:                      */  66:     /*  67:      * On machines where all RAM is mapped into kernel address space,  68:      * we can simply calculate the virtual address. On machines with  69:      * highmem some memory is mapped into kernel virtual memory  70:      * dynamically, so we need a place to store that address.  71:      * Note that this field could be 16 bits on x86 ... ;)  72:      *  73:      * Architectures with slow multiplication can define  74:      * WANT_PAGE_VIRTUAL in asm/page.h  75:      */  76: #if defined(WANT_PAGE_VIRTUAL)  77:     void *virtual;            /* Kernel virtual address (NULL if  78:                        not kmapped, ie. highmem) */  79: #endif /* WANT_PAGE_VIRTUAL */  80: #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS  81:     unsigned long debug_flags;    /* Use atomic bitops on this */  82: #endif  83:    84: #ifdef CONFIG_KMEMCHECK  85:     /*  86:      * kmemcheck wants to track the status of each byte in a page; this  87:      * is a pointer to such a status block. NULL if not tracked.  88:      */  89:     void *shadow;  90: #endif  91: };

每个page结构体对象，代表一个物理内存页。

x) flags

   1: enum pageflags {   2:     PG_locked,        /* Page is locked. Don't touch. */   3:     PG_error,   4:     PG_referenced,   5:     PG_uptodate,   6:     PG_dirty,   7:     PG_lru,   8:     PG_active,   9:     PG_slab,  10:     PG_owner_priv_1,    /* Owner use. If pagecache, fs may use*/  11:     PG_arch_1,  12:     PG_reserved,  13:     PG_private,        /* If pagecache, has fs-private data */  14:     PG_private_2,        /* If pagecache, has fs aux data */  15:     PG_writeback,        /* Page is under writeback */  16: #ifdef CONFIG_PAGEFLAGS_EXTENDED  17:     PG_head,        /* A head page */  18:     PG_tail,        /* A tail page */  19: #else  20:     PG_compound,        /* A compound page */  21: #endif  22:     PG_swapcache,        /* Swap page: swp_entry_t in private */  23:     PG_mappedtodisk,    /* Has blocks allocated on-disk */  24:     PG_reclaim,        /* To be reclaimed asap */  25:     PG_swapbacked,        /* Page is backed by RAM/swap */  26:     PG_unevictable,        /* Page is "unevictable"  */  27: #ifdef CONFIG_MMU  28:     PG_mlocked,        /* Page is vma mlocked */  29: #endif  30: #ifdef CONFIG_ARCH_USES_PG_UNCACHED  31:     PG_uncached,        /* Page has been mapped as uncached */  32: #endif  33: #ifdef CONFIG_MEMORY_FAILURE  34:     PG_hwpoison,        /* hardware poisoned page. Don't touch */  35: #endif  36: #ifdef CONFIG_TRANSPARENT_HUGEPAGE  37:     PG_compound_lock,  38: #endif  39:     __NR_PAGEFLAGS,  40:    41:     /* Filesystems */  42:     PG_checked = PG_owner_priv_1,  43:    44:     /* Two page bits are conscripted by FS-Cache to maintain local caching  45:      * state.  These bits are set on pages belonging to the netfs's inodes  46:      * when those inodes are being locally cached.  
47:      */  48:     PG_fscache = PG_private_2,    /* page backed by cache */  49:    50:     /* XEN */  51:     PG_pinned = PG_owner_priv_1,  52:     PG_savepinned = PG_dirty,  53:    54:     /* SLOB */  55:     PG_slob_free = PG_private,  56:    57:     /* SLUB */  58:     PG_slub_frozen = PG_active,  59: };

x) _count

引用计数，代表有多少处引用到该物理内存页对象。

访问_count时，不要直接读取该字段，而应调用page_count()函数进行访问。

x) private, mapping

   1: struct address_space {   2:     struct inode        *host;        /* owner: inode, block_device */   3:     struct radix_tree_root    page_tree;    /* radix tree of all pages */   4:     spinlock_t        tree_lock;    /* and lock protecting it */   5:     unsigned int        i_mmap_writable;/* count VM_SHARED mappings */   6:     struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */   7:     struct list_head    i_mmap_nonlinear;/*list VM_NONLINEAR mappings */   8:     struct mutex        i_mmap_mutex;    /* protect tree, count, list */   9:     /* Protected by tree_lock together with the radix tree */  10:     unsigned long        nrpages;    /* number of total pages */  11:     pgoff_t            writeback_index;/* writeback starts here */  12:     const struct address_space_operations *a_ops;    /* methods */  13:     unsigned long        flags;        /* error bits/gfp mask */  14:     struct backing_dev_info *backing_dev_info; /* device readahead, etc */  15:     spinlock_t        private_lock;    /* for use by the address_space */  16:     struct list_head    private_list;    /* ditto */  17:     struct address_space    *assoc_mapping;    /* ditto */  18: } __attribute__((aligned(sizeof(long))));

x) virtual

虚拟地址，如果是高端内存(HighMem)，那么该物理页可能不是长久地映射到内核的内存空间中，所以该字段为NULL。

struct page

这个结构只是用来描述物理内存页，而与该页中存储的内容无关。

物理内存页的可能属主(Owner)有：

user-space processes, 用户态的进程
dynamically allocated kernel data，内核态中动态分配的数据
static kernel code，内核静态代码
the page cache，页缓存

程序员可能会对“为每个物理内存页都分配一个struct page”感到吃惊：“那得分配多少内存啊，多浪费啊！”

实际上以4GB内存为例，假设页大小为4KB、每个struct page约占40字节，则共有约100万个物理页，大概需要40MB的内存来存储所有struct page的对象，相对于它能够管理的4GB物理内存，还是十分微不足道的。

2. Zones

“为什么不利用用户态的3GB的地址空间来映射内核的1GB地址空间映射不下的物理内存？”

因为其实，内核在工作时，是不考虑存在着用户空间的。

内核要处理的任务远比支撑用户空间要复杂。

为什么要使用不同的Zone?

有些硬件设备的DMA能访问的内存空间十分有限，比如x86的ISA总线，只能访问0~16M的物理内存，那么如果这段内存被随便地分配掉了，ISA设备就可能无法工作了。

ISA

ISA插槽是基于ISA总线（Industrial Standard Architecture，工业标准结构总线）的扩展插槽，其颜色一般为黑色，比PCI接口插槽要长些，位于主板的最下端。其工作频率为8MHz左右，为16位插槽，最大传输率16MB/sec，可插接显卡，声卡，网卡以及所谓的多功能接口卡等扩展插卡。其缺点是CPU资源占用太高，数据传输带宽太小，是已经被淘汰的插槽接口。（http://baike.baidu.com/view/13594.htm）

HighMem

对于x86体系结构，以896MB物理内存为界，大于该范围的为高端内存(High Memory)，而小于该范围的为低端内存(Low Memory)。

内核无法直接映射HighMem区域的物理内存，只能以暂时的方式映射其中一小部分使用，当需要使用其他的高端内存时，可能需要打破之前建立的暂时映射，将其用于新的映射，就像是内存交换机制一样。

posted @ 2013-12-24 15:46 Daniel King 阅读(495) 评论(0) 编辑收藏举报

刷新页面返回顶部

Daniel King

淡泊明志，宁静致远

Linux Kernel Development有关内存管理

1 Pages

2. Zones

ISA