linux内存管理(六)- 内核新struct - folio
folio大约是在Linux 5.16中引入的,看起来像是对page的一层封装。这里有一篇讲解folio的很好的博客:《论好名字的重要性:Linux内核page到folio的变迁》(CSDN博客)。
/*
 * struct folio, as quoted here, consists of three anonymous unions laid out
 * back to back.  Each union overlays a group of named fields with one
 * struct page (page, __page_1 and __page_2 respectively), so the named
 * fields share storage with that struct page.
 *
 * The "private:" / "public:" markers are part of the quoted source and are
 * kept unchanged.
 */
struct folio {
	/* private: don't document the anon union */
	/* First union: named fields overlaid with 'struct page page'. */
	union {
		struct {
	/* public: */
			unsigned long flags;
			union {
				struct list_head lru;
	/* private: avoid cluttering the output */
				struct {
					void *__filler;
	/* public: */
					unsigned int mlock_count;
	/* private: */
				};
	/* public: */
			};
			struct address_space *mapping;
			pgoff_t index;
			union {
				void *private;
				swp_entry_t swap;
			};
			atomic_t _mapcount;
			atomic_t _refcount;
#ifdef CONFIG_MEMCG
			unsigned long memcg_data;
#endif
#if defined(WANT_PAGE_VIRTUAL)
			void *virtual;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
			int _last_cpupid;
#endif
	/* private: the union with struct page is transitional */
		};
		struct page page;
	};
	/* Second union: named fields overlaid with 'struct page __page_1'. */
	union {
		struct {
			/*
			 * The folio-based compound_order() quoted in this
			 * article reads the low 8 bits of _flags_1.
			 */
			unsigned long _flags_1;
			unsigned long _head_1;
			unsigned long _folio_avail;
	/* public: */
			atomic_t _entire_mapcount;
			atomic_t _nr_pages_mapped;
			atomic_t _pincount;
#ifdef CONFIG_64BIT
			unsigned int _folio_nr_pages;
#endif
	/* private: the union with struct page is transitional */
		};
		struct page __page_1;
	};
	/*
	 * Third union: two alternative field groups (the _hugetlb_* pointers,
	 * or _deferred_list) overlaid with 'struct page __page_2'.
	 */
	union {
		struct {
			unsigned long _flags_2;
			unsigned long _head_2;
	/* public: */
			void *_hugetlb_subpool;
			void *_hugetlb_cgroup;
			void *_hugetlb_cgroup_rsvd;
			void *_hugetlb_hwpoison;
	/* private: the union with struct page is transitional */
		};
		struct {
			unsigned long _flags_2a;
			unsigned long _head_2a;
	/* public: */
			struct list_head _deferred_list;
	/* private: the union with struct page is transitional */
		};
		struct page __page_2;
	};
};
简单来看,它是三个union的组合,每个union都与一个struct page重叠。其中第一个union的字段布局与struct page几乎一致。引入folio是为了解决长久以来page语义混乱的问题:page既可以代表单个页面,也可以代表连续的多个页面(复合页),甚至大页。page在内核中应用广泛,这种语义上的混乱人为地增加了编写和理解代码的难度。folio代表一个或多个page,本身就可以表达page的所有语义。在新的内核代码中,folio已在很多场合替代了page,但page依然存在。
比如compound_order的实现。在folio之前是这样的。
/*
 * compound_order() - pre-folio version.
 * @page: page to query.
 *
 * Return: 0 when PageHead(@page) is false; otherwise the compound_order
 * value stored in the page that immediately follows @page (page[1]).
 */
static inline unsigned int compound_order(struct page *page)
{
	return PageHead(page) ? page[1].compound_order : 0;
}
先检查page是否为复合页的首页(head page),如果不是则直接返回0;对于复合页,order保存在紧随其后的那个page(即page[1])的compound_order成员中。也就是说,单个page结构本身无法描述多个页面,而folio可以。
static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio->_flags_1 & 0xff; }
新的代码中首先将page强转为folio,再判断其是否为复合页;如果是复合页,order保存在_flags_1的低8位中。看起来并没有简化,反而更复杂了一点,但是整个操作可以在一个folio结构内完成,代码上不再需要显式访问tail page(page[1])。
看一下page结构
/*
 * struct page, as quoted here.
 *
 * Layout: a flags word, then a union that overlays several alternative
 * uses of the same words (page cache / anonymous pages, page_pool, tail
 * pages of a compound page, ZONE_DEVICE pages, or an rcu_head), then a
 * union of _mapcount and page_type, then _refcount, followed by fields
 * that exist only under the corresponding configuration macros.
 */
struct page {
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	/*
	 * Five words (20/40 bytes) are available in this union.
	 * WARNING: bit 0 of the first word is used for PageTail(). That
	 * means the other users of this union MUST NOT use the bit to
	 * avoid collision and false-positive PageTail().
	 */
	union {
		struct {	/* Page cache and anonymous pages */
			/**
			 * @lru: Pageout list, eg. active_list protected by
			 * lruvec->lru_lock.  Sometimes used as a generic list
			 * by the page owner.
			 */
			union {
				struct list_head lru;

				/* Or, for the Unevictable "LRU list" slot */
				struct {
					/* Always even, to negate PageTail */
					void *__filler;
					/* Count page's or folio's mlocks */
					unsigned int mlock_count;
				};

				/* Or, free page */
				struct list_head buddy_list;
				struct list_head pcp_list;
			};
			/* See page-flags.h for PAGE_MAPPING_FLAGS */
			struct address_space *mapping;
			union {
				pgoff_t index;		/* Our offset within mapping. */
				unsigned long share;	/* share count for fsdax */
			};
			/**
			 * @private: Mapping-private opaque data.
			 * Usually used for buffer_heads if PagePrivate.
			 * Used for swp_entry_t if PageSwapCache.
			 * Indicates order in the buddy system if PageBuddy.
			 */
			unsigned long private;
		};
		struct {	/* page_pool used by netstack */
			/**
			 * @pp_magic: magic value to avoid recycling non
			 * page_pool allocated pages.
			 */
			unsigned long pp_magic;
			struct page_pool *pp;
			unsigned long _pp_mapping_pad;
			unsigned long dma_addr;
			atomic_long_t pp_ref_count;
		};
		struct {	/* Tail pages of compound page */
			unsigned long compound_head;	/* Bit zero is set */
		};
		struct {	/* ZONE_DEVICE pages */
			/** @pgmap: Points to the hosting device page map. */
			struct dev_pagemap *pgmap;
			void *zone_device_data;
			/*
			 * ZONE_DEVICE private pages are counted as being
			 * mapped so the next 3 words hold the mapping, index,
			 * and private fields from the source anonymous or
			 * page cache page while the page is migrated to device
			 * private memory.
			 * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
			 * use the mapping, index, and private fields when
			 * pmem backed DAX files are mapped.
			 */
		};

		/** @rcu_head: You can use this to free a page by RCU. */
		struct rcu_head rcu_head;
	};

	union {		/* This union is 4 bytes in size. */
		/*
		 * If the page can be mapped to userspace, encodes the number
		 * of times this page is referenced by a page table.
		 */
		atomic_t _mapcount;

		/*
		 * If the page is neither PageSlab nor mappable to userspace,
		 * the value stored here may help determine what this page
		 * is used for.  See page-flags.h for a list of page types
		 * which are currently stored here.
		 */
		unsigned int page_type;
	};

	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
	atomic_t _refcount;

#ifdef CONFIG_MEMCG
	unsigned long memcg_data;
#endif

	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif

#ifdef CONFIG_KMSAN
	/*
	 * KMSAN metadata for this page:
	 *  - shadow page: every bit indicates whether the corresponding
	 *    bit of the original page is initialized (0) or not (1);
	 *  - origin page: every 4 bytes contain an id of the stack trace
	 *    where the uninitialized value was created.
	 */
	struct page *kmsan_shadow;
	struct page *kmsan_origin;
#endif
} _struct_page_alignment;
1. flags
enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_error, PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable,
...
flags由四部分构成(从高位到低位):| node | zone | last_cpupid | flags |,具体布局取决于内核配置。
2. mapping
mapping的最低两个bit用来区分映射类型(匿名映射、movable页或KSM映射)。对于匿名映射,它指向anon_vma;对于文件映射,它指向address_space结构。
/*
 * Tag values occupying the two lowest bits of a mapping pointer.
 * folio_mapping(), quoted later in this article, returns NULL when any bit
 * of PAGE_MAPPING_FLAGS is set in folio->mapping.
 *
 * Note that PAGE_MAPPING_KSM and PAGE_MAPPING_FLAGS expand to the same
 * value (0x3): the former names the state with both bits set, the latter
 * is the mask covering both bits.
 */
#define PAGE_MAPPING_ANON	0x1
#define PAGE_MAPPING_MOVABLE	0x2
#define PAGE_MAPPING_KSM	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
3. _refcount
表示页面在内核中的引用次数。大于0代表正在使用。
static inline void get_page(struct page *page) { folio_get(page_folio(page)); } static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } static inline void folio_ref_inc(struct folio *folio) { page_ref_inc(&folio->page); } static inline void page_ref_inc(struct page *page) { atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(page_ref_mod)) __page_ref_mod(page, 1); }
folio让page操作变得非常繁琐,这样真的好吗?
分配内存时_refcount + 1, 加入lru链表时+1等。
4. _mapcount
表示这个页面被页表映射的次数,用于反向映射。初始值为-1,代表没有页表映射;0代表被映射了一次,依此类推。
page相关的API
/*
 * page_zone() - look up the zone that @page belongs to.
 * @page: page to query.
 *
 * Return: pointer to the node_zones[] entry, indexed by
 * page_zonenum(@page), inside NODE_DATA() of the node reported by
 * page_to_nid(@page).
 */
static inline struct zone *page_zone(const struct page *page)
{
	int nid = page_to_nid(page);
	int zone_idx = page_zonenum(page);

	return NODE_DATA(nid)->node_zones + zone_idx;
}
/*
 * page_zone_id() - extract the zone-id field from @page->flags.
 * @page: page to query.
 *
 * Return: the flags word shifted right by ZONEID_PGSHIFT and masked with
 * ZONEID_MASK.
 */
static inline int page_zone_id(struct page *page)
{
	unsigned long shifted = page->flags >> ZONEID_PGSHIFT;

	return shifted & ZONEID_MASK;
}
mapping相关
struct address_space *page_mapping(struct page *page) { return folio_mapping(page_folio(page)); } struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(folio_test_slab(folio))) return NULL; if (unlikely(folio_test_swapcache(folio))) return swap_address_space(folio->swap); mapping = folio->mapping;
//如果是匿名页或ksm页 if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; return mapping; }
page_mapped
/*
 * page_mapped() - report whether @page is mapped.
 * @page: page to query.
 *
 * Return: for a compound page, the result of folio_large_is_mapped() on
 * the page's folio; otherwise true when @page->_mapcount is >= 0.
 */
static inline bool page_mapped(struct page *page)
{
	/* Compound pages are the uncommon case and are handled per folio. */
	if (unlikely(PageCompound(page)))
		return folio_large_is_mapped(page_folio(page));

	return atomic_read(&page->_mapcount) >= 0;
}
对于普通(非复合)页面,只需判断_mapcount的值是否大于等于0;对于复合页,则交给folio_large_is_mapped()判断。