SLUB object allocation
struct kmem_cache is defined as follows:
 62 struct kmem_cache {
 63     struct kmem_cache_cpu __percpu *cpu_slab;
 64     /* Used for retriving partial slabs etc */
 65     unsigned long flags;
 66     unsigned long min_partial;
 67     int size;           /* The size of an object including meta data */
 68     int object_size;    /* The size of an object without meta data */
 69     int offset;         /* Free pointer offset. */
 70     int cpu_partial;    /* Number of per cpu partial objects to keep around */
 71     struct kmem_cache_order_objects oo;
 72
 73     /* Allocation and freeing of slabs */
 74     struct kmem_cache_order_objects max;
 75     struct kmem_cache_order_objects min;
 76     gfp_t allocflags;   /* gfp flags to use on each alloc */
 77     int refcount;       /* Refcount for slab cache destroy */
 78     void (*ctor)(void *);
 79     int inuse;          /* Offset to metadata */
 80     int align;          /* Alignment */
 81     int reserved;       /* Reserved bytes at the end of slabs */
 82     const char *name;   /* Name (only for display!) */
 83     struct list_head list;  /* List of slab caches */
 84     int red_left_pad;   /* Left redzone padding size */
 85 #ifdef CONFIG_SYSFS
 86     struct kobject kobj;    /* For sysfs */
 87 #endif
 88 #ifdef CONFIG_MEMCG_KMEM
 89     struct memcg_cache_params memcg_params;
 90     int max_attr_size;  /* for propagation, maximum size of a stored attr */
 91 #ifdef CONFIG_SYSFS
 92     struct kset *memcg_kset;
 93 #endif
 94 #endif
 95
 96 #ifdef CONFIG_NUMA
 97     /*
 98      * Defragmentation by allocating from a remote node.
 99      */
100     int remote_node_defrag_ratio;
101 #endif
102
103 #ifdef CONFIG_KASAN
104     struct kasan_cache kasan_info;
105 #endif
106
107     struct kmem_cache_node *node[MAX_NUMNODES];
108 };
struct kmem_cache_cpu is defined as follows:
 40 struct kmem_cache_cpu {
 41     void **freelist;    /* Pointer to next available object */
 42     unsigned long tid;  /* Globally unique transaction id */
 43     struct page *page;  /* The slab from which we are allocating */
 44     struct page *partial;   /* Partially allocated frozen slabs */
 45 #ifdef CONFIG_SLUB_STATS
 46     unsigned stat[NR_SLUB_STAT_ITEMS];
 47 #endif
 48 };
struct kmem_cache_node is defined as follows:
326 struct kmem_cache_node {
327     spinlock_t list_lock;
328
329 #ifdef CONFIG_SLAB
330     struct list_head slabs_partial; /* partial list first, better asm code */
331     struct list_head slabs_full;
332     struct list_head slabs_free;
333     unsigned long free_objects;
334     unsigned int free_limit;
335     unsigned int colour_next;   /* Per-node cache coloring */
336     struct array_cache *shared; /* shared per node */
337     struct alien_cache **alien; /* on other nodes */
338     unsigned long next_reap;    /* updated without locking */
339     int free_touched;           /* updated without locking */
340 #endif
341
342 #ifdef CONFIG_SLUB
343     unsigned long nr_partial;
344     struct list_head partial;
345 #ifdef CONFIG_SLUB_DEBUG
346     atomic_long_t nr_slabs;
347     atomic_long_t total_objects;
348     struct list_head full;
349 #endif
350 #endif
351
352 };
In short, SLUB allocates an object through a cascade of fallbacks: it first looks at c->freelist; if that is empty, it transfers the objects of c->page to the freelist (get_freelist); if that still yields nothing, it takes a page from c->partial; failing that, it refills from node->partial; failing that, it tries neighboring NUMA nodes; and only then does it allocate a brand-new slab. The sketch below summarizes that order.
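As a quick orientation, here is a minimal user-space sketch of that fallback order and nothing more. All names (model_cache, the try_* helpers) are invented for illustration, the helpers are stubs, and none of the real locking, cmpxchg or per-CPU machinery is modeled:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical, heavily simplified stand-ins for the kernel structures. */
struct model_cache {
    void *cpu_freelist;   /* c->freelist   */
    void *cpu_page;       /* c->page       */
    void *cpu_partial;    /* c->partial    */
    void *node_partial;   /* node->partial */
};

/* Each step returns an object pointer or NULL; bodies are stubs. */
static void *try_cpu_freelist(struct model_cache *s)  { return s->cpu_freelist; }
static void *try_cpu_page(struct model_cache *s)      { return s->cpu_page; }
static void *try_cpu_partial(struct model_cache *s)   { return s->cpu_partial; }
static void *try_node_partial(struct model_cache *s)  { return s->node_partial; }
static void *try_other_nodes(struct model_cache *s)   { (void)s; return NULL; }
static void *try_new_slab(struct model_cache *s)      { (void)s; static char slab[64]; return slab; }

/* The fallback order described above, nothing more. */
static void *model_alloc(struct model_cache *s)
{
    void *obj;

    if ((obj = try_cpu_freelist(s)))  return obj;  /* fast path        */
    if ((obj = try_cpu_page(s)))      return obj;  /* get_freelist     */
    if ((obj = try_cpu_partial(s)))   return obj;  /* c->partial       */
    if ((obj = try_node_partial(s)))  return obj;  /* node->partial    */
    if ((obj = try_other_nodes(s)))   return obj;  /* remote NUMA node */
    return try_new_slab(s);                        /* allocate a slab  */
}

int main(void)
{
    struct model_cache s = { 0 };   /* everything empty: falls through to a new slab */
    printf("got object at %p\n", model_alloc(&s));
    return 0;
}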
Let's walk through this flow in the code.
2668 static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2669         gfp_t gfpflags, int node, unsigned long addr)
2670 {
2671     void *object;
2672     struct kmem_cache_cpu *c;
2673     struct page *page;
2674     unsigned long tid;
2675
2676     s = slab_pre_alloc_hook(s, gfpflags);
2677     if (!s)
2678         return NULL;
2679 redo:
2680     /*
2681      * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2682      * enabled. We may switch back and forth between cpus while
2683      * reading from one cpu area. That does not matter as long
2684      * as we end up on the original cpu again when doing the cmpxchg.
2685      *
2686      * We should guarantee that tid and kmem_cache are retrieved on
2687      * the same cpu. It could be different if CONFIG_PREEMPT so we need
2688      * to check if it is matched or not.
2689      */
2690     do {
2691         tid = this_cpu_read(s->cpu_slab->tid);
2692         c = raw_cpu_ptr(s->cpu_slab);
2693     } while (IS_ENABLED(CONFIG_PREEMPT) &&
2694          unlikely(tid != READ_ONCE(c->tid)));
2695
2696     /*
2697      * Irqless object alloc/free algorithm used here depends on sequence
2698      * of fetching cpu_slab's data. tid should be fetched before anything
2699      * on c to guarantee that object and page associated with previous tid
2700      * won't be used with current tid. If we fetch tid first, object and
2701      * page could be one associated with next tid and our alloc/free
2702      * request will be failed. In this case, we will retry. So, no problem.
2703      */
2704     barrier();
2705
2706     /*
2707      * The transaction ids are globally unique per cpu and per operation on
2708      * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
2709      * occurs on the right processor and that there was no operation on the
2710      * linked list in between.
2711      */
2712
2713     object = c->freelist;
2714     page = c->page;
2715     if (unlikely(!object || !node_match(page, node))) {
2716         object = __slab_alloc(s, gfpflags, node, addr, c);
2717         stat(s, ALLOC_SLOWPATH);
2718     } else {
2719         void *next_object = get_freepointer_safe(s, object);
The loop at lines 2690~2694 guarantees that tid and cpu_slab are read on the same CPU. The barrier() at line 2704 makes sure tid is fetched before the other cpu_slab fields, as the comment at 2696~2703 explains. At line 2715, if c->freelist is empty or the page does not match the requested node, __slab_alloc is called; it disables local interrupts and then calls ___slab_alloc. Otherwise the fast path simply pops the head of c->freelist: get_freepointer_safe reads the address of the next free object, which is stored inside the free object itself at s->offset, and a cmpxchg_double on (freelist, tid) commits the pop.
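Below is a minimal user-space sketch of that "free pointer inside the object" layout and of the fast-path pop. OBJ_SIZE, OBJ_OFF and NR_OBJ are made-up stand-ins for s->size, s->offset and the object count, and the plain assignment replaces the kernel's cmpxchg_double on (freelist, tid):

#include <stdio.h>
#include <string.h>

#define OBJ_SIZE 32   /* s->size: object stride, metadata included   */
#define OBJ_OFF   0   /* s->offset: where the free pointer is stored */
#define NR_OBJ    4

/* Read the "next free object" pointer stored inside a free object. */
static void *get_freepointer(void *object)
{
    void *next;
    memcpy(&next, (char *)object + OBJ_OFF, sizeof(next));
    return next;
}

int main(void)
{
    static char slab[NR_OBJ * OBJ_SIZE];
    void *freelist = NULL;

    /* Build a freelist: each free object stores the address of the next one. */
    for (int i = NR_OBJ - 1; i >= 0; i--) {
        void *obj = slab + i * OBJ_SIZE;
        memcpy((char *)obj + OBJ_OFF, &freelist, sizeof(freelist));
        freelist = obj;
    }

    /* Fast path: object = c->freelist; c->freelist = get_freepointer(object). */
    void *object = freelist;
    freelist = get_freepointer(object);
    printf("allocated %p, new freelist head %p\n", object, freelist);
    return 0;
}

With that picture in mind, here is ___slab_alloc: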
2537 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2538             unsigned long addr, struct kmem_cache_cpu *c)
2539 {
2540     void *freelist;
2541     struct page *page;
2542
2543     page = c->page;
2544     if (!page)
2545         goto new_slab;
2546 redo:
2547
     ...
2574
2575     /* must check again c->freelist in case of cpu migration or IRQ */
2576     freelist = c->freelist;
2577     if (freelist)
2578         goto load_freelist;
2579
2580     freelist = get_freelist(s, page);
2581
2582     if (!freelist) {
2583         c->page = NULL;
2584         stat(s, DEACTIVATE_BYPASS);
2585         goto new_slab;
2586     }
2587
2588     stat(s, ALLOC_REFILL);
2589
2590 load_freelist:
2591     /*
2592      * freelist is pointing to the list of objects to be used.
2593      * page is pointing to the page from which the objects are obtained.
2594      * That page must be frozen for per cpu allocations to work.
2595      */
2596     VM_BUG_ON(!c->page->frozen);
2597     c->freelist = get_freepointer(s, freelist);
2598     c->tid = next_tid(c->tid);
2599     return freelist;
2600
2601 new_slab:
2602
2603     if (c->partial) {
2604         page = c->page = c->partial;
2605         c->partial = page->next;
2606         stat(s, CPU_PARTIAL_ALLOC);
2607         c->freelist = NULL;
2608         goto redo;
2609     }
2610
2611     freelist = new_slab_objects(s, gfpflags, node, &c);
2612
2613     if (unlikely(!freelist)) {
2614         slab_out_of_memory(s, gfpflags, node);
2615         return NULL;
2616     }
2617
2618     page = c->page;
2619     if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2620         goto load_freelist;
2621
2622     /* Only entered in the debug case */
2623     if (kmem_cache_debug(s) &&
2624             !alloc_debug_processing(s, page, freelist, addr))
2625         goto new_slab;  /* Slab failed checks. Next slab needed */
2626
2627     deactivate_slab(s, page, get_freepointer(s, freelist));
2628     c->page = NULL;
2629     c->freelist = NULL;
2630     return freelist;
2631 }
___slab_alloc is the core of the slow path. Because the task may have migrated to another CPU, or an interrupt may have refilled the freelist in the meantime, it checks c->freelist again. If it is still empty, get_freelist grabs the whole per-page freelist: c->page->freelist becomes NULL, inuse is set to page->objects (every object is now accounted to this CPU, and stays that way for as long as the CPU owns the slab), and the page remains frozen as long as there was a freelist to take. A simplified model follows the listing below.
get_freelist is implemented as follows:
2494 static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2495 {
2496     struct page new;
2497     unsigned long counters;
2498     void *freelist;
2499
2500     do {
2501         freelist = page->freelist;
2502         counters = page->counters;
2503
2504         new.counters = counters;
2505         VM_BUG_ON(!new.frozen);
2506
2507         new.inuse = page->objects;
2508         new.frozen = freelist != NULL;
2509
2510     } while (!__cmpxchg_double_slab(s, page,
2511         freelist, counters,
2512         NULL, new.counters,
2513         "get_freelist"));
2514
2515     return freelist;
2516 }
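Stripped of the atomicity, get_freelist hands the page's whole freelist over to the CPU: afterwards page->freelist is NULL, inuse equals page->objects, and the page stays frozen only if there was something to take. Here is a single-threaded model of that state transition; model_page and its fields are invented, and the kernel performs the same change atomically with __cmpxchg_double_slab so that freelist and counters move together:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical miniature of the slab-page fields that get_freelist touches. */
struct model_page {
    void *freelist;        /* list of free objects in this slab        */
    unsigned int inuse;    /* objects handed out (or owned by the CPU) */
    unsigned int objects;  /* total objects in this slab               */
    bool frozen;           /* slab is owned by a per-CPU structure     */
};

/* Single-threaded model of the transition done under cmpxchg in the kernel. */
static void *model_get_freelist(struct model_page *page)
{
    void *freelist = page->freelist;

    page->inuse    = page->objects;       /* everything is now "in use"  */
    page->frozen   = (freelist != NULL);  /* unfreeze if nothing to take */
    page->freelist = NULL;                /* the CPU takes the whole list */
    return freelist;
}

int main(void)
{
    int dummy;
    struct model_page page = { .freelist = &dummy, .inuse = 5, .objects = 8, .frozen = true };

    void *list = model_get_freelist(&page);
    printf("took freelist %p, inuse=%u/%u, frozen=%d\n",
           list, page.inuse, page.objects, page.frozen);
    return 0;
}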
If c->page cannot supply an object either, control jumps to the new_slab: label. It first checks c->partial: if a frozen partial page is queued there, that page becomes the new c->page (c->partial advances to the next page in the list) and the redo path refills the freelist from it. Otherwise new_slab_objects is called:
2442 static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2443             int node, struct kmem_cache_cpu **pc)
2444 {
2445     void *freelist;
2446     struct kmem_cache_cpu *c = *pc;
2447     struct page *page;
2448
2449     freelist = get_partial(s, flags, node, c);
2450
2451     if (freelist)
2452         return freelist;
2453
2454     page = new_slab(s, flags, node);
2455     if (page) {
2456         c = raw_cpu_ptr(s->cpu_slab);
2457         if (c->page)
2458             flush_slab(s, c);
2459
2460         /*
2461          * No other reference to the page yet so we can
2462          * muck around with it freely without cmpxchg
2463          */
2464         freelist = page->freelist;
2465         page->freelist = NULL;
2466
2467         stat(s, ALLOC_SLAB);
2468         c->page = page;
2469         *pc = c;
2470     } else
2471         freelist = NULL;
2472
2473     return freelist;
2474 }
get_partial first searches the node that matches the request; if nothing usable is found there, it looks on the other nodes. get_partial_node:
1845 static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1846             struct kmem_cache_cpu *c, gfp_t flags)
1847 {
1848     struct page *page, *page2;
1849     void *object = NULL;
1850     int available = 0;
1851     int objects;
1852
1853     /*
1854      * Racy check. If we mistakenly see no partial slabs then we
1855      * just allocate an empty slab. If we mistakenly try to get a
1856      * partial slab and there is none available then get_partials()
1857      * will return NULL.
1858      */
1859     if (!n || !n->nr_partial)
1860         return NULL;
1861
1862     spin_lock(&n->list_lock);
1863     list_for_each_entry_safe(page, page2, &n->partial, lru) {
1864         void *t;
1865
1866         if (!pfmemalloc_match(page, flags))
1867             continue;
1868
1869         t = acquire_slab(s, n, page, object == NULL, &objects);
1870         if (!t)
1871             break;
1872
1873         available += objects;
1874         if (!object) {
1875             c->page = page;
1876             stat(s, ALLOC_FROM_PARTIAL);
1877             object = t;
1878         } else {
1879             put_cpu_partial(s, page, 0);
1880             stat(s, CPU_PARTIAL_NODE);
1881         }
1882         if (!kmem_cache_has_cpu_partial(s)
1883             || available > s->cpu_partial / 2)
1884             break;
1885
1886     }
1887     spin_unlock(&n->list_lock);
1888     return object;
1889 }
This function pulls several slabs off the node's partial list in a row: each slab is frozen and removed from the node's partial list; the first one becomes c->page and the rest are pushed onto c->partial. It stops once the number of available objects exceeds s->cpu_partial / 2.
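The stop condition is easy to see in isolation. Here is a tiny model of the refill accounting, assuming a hypothetical list of per-slab free-object counts; the first slab taken would become c->page and the rest would be stashed on c->partial:

#include <stdio.h>

int main(void)
{
    /* Hypothetical free-object counts of the slabs on node->partial. */
    int free_per_slab[] = { 3, 5, 2, 7, 4 };
    int nr_slabs = sizeof(free_per_slab) / sizeof(free_per_slab[0]);
    int cpu_partial = 16;   /* s->cpu_partial */
    int available = 0;

    for (int i = 0; i < nr_slabs; i++) {
        available += free_per_slab[i];    /* acquire_slab(): objects - inuse */
        printf("took slab %d, available=%d\n", i, available);
        if (available > cpu_partial / 2)  /* same stop condition as the loop above */
            break;
    }
    return 0;
}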
If nothing is found on this node, the search moves on to the neighboring nodes; that path is not covered here.
acquire_slab:
1799 static inline void *acquire_slab(struct kmem_cache *s,
1800         struct kmem_cache_node *n, struct page *page,
1801         int mode, int *objects)
1802 {
1803     void *freelist;
1804     unsigned long counters;
1805     struct page new;
1806
1807     lockdep_assert_held(&n->list_lock);
1808
1809     /*
1810      * Zap the freelist and set the frozen bit.
1811      * The old freelist is the list of objects for the
1812      * per cpu allocation list.
1813      */
1814     freelist = page->freelist;
1815     counters = page->counters;
1816     new.counters = counters;
1817     *objects = new.objects - new.inuse;
1818     if (mode) {
1819         new.inuse = page->objects;
1820         new.freelist = NULL;
1821     } else {
1822         new.freelist = freelist;
1823     }
1824
1825     VM_BUG_ON(new.frozen);
1826     new.frozen = 1;
1827
1828     if (!__cmpxchg_double_slab(s, page,
1829             freelist, counters,
1830             new.freelist, new.counters,
1831             "acquire_slab"))
1832         return NULL;
1833
1834     remove_partial(n, page);
1835     WARN_ON(!freelist);
1836     return freelist;
1837 }
One small detail: when no object has been obtained yet (mode == true), page->freelist is set to NULL to mark the whole freelist as taken by the CPU; when an object has already been obtained, the page keeps its freelist and put_cpu_partial later sets page->next to the previous c->partial head, so the list is never broken. A simplified sketch follows the listing below.
put_cpu_partial:
2265 static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2266 {
2267 #ifdef CONFIG_SLUB_CPU_PARTIAL
2268     struct page *oldpage;
2269     int pages;
2270     int pobjects;
2271
2272     preempt_disable();
2273     do {
2274         pages = 0;
2275         pobjects = 0;
2276         oldpage = this_cpu_read(s->cpu_slab->partial);
2277
2278         if (oldpage) {
2279             pobjects = oldpage->pobjects;
2280             pages = oldpage->pages;
2281             if (drain && pobjects > s->cpu_partial) {
2282                 unsigned long flags;
2283                 /*
2284                  * partial array is full. Move the existing
2285                  * set to the per node partial list.
2286                  */
2287                 local_irq_save(flags);
2288                 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2289                 local_irq_restore(flags);
2290                 oldpage = NULL;
2291                 pobjects = 0;
2292                 pages = 0;
2293                 stat(s, CPU_PARTIAL_DRAIN);
2294             }
2295         }
2296
2297         pages++;
2298         pobjects += page->objects - page->inuse;
2299
2300         page->pages = pages;
2301         page->pobjects = pobjects;
2302         page->next = oldpage;
2303
2304     } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2305         != oldpage);
2306     if (unlikely(!s->cpu_partial)) {
2307         unsigned long flags;
2308
2309         local_irq_save(flags);
2310         unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2311         local_irq_restore(flags);
2312     }
2313     preempt_enable();
2314 #endif
2315 }
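Ignoring the cmpxchg retry loop and the drain-to-node path, the core of put_cpu_partial is a push onto the front of the per-CPU partial list, with the running pages/pobjects totals kept in the list head. A simplified single-threaded sketch (model_page and model_put_cpu_partial are invented names; the kernel publishes the new head with this_cpu_cmpxchg rather than a plain store):

#include <stdio.h>

/* Hypothetical miniature of the fields put_cpu_partial manipulates. */
struct model_page {
    struct model_page *next;  /* singly linked c->partial list           */
    int objects, inuse;       /* per-slab object counts                  */
    int pages, pobjects;      /* running totals, stored in the list head */
};

/* Push a partially free slab onto the front of the per-CPU partial list. */
static void model_put_cpu_partial(struct model_page **cpu_partial, struct model_page *page)
{
    struct model_page *oldpage = *cpu_partial;
    int pages    = oldpage ? oldpage->pages    : 0;
    int pobjects = oldpage ? oldpage->pobjects : 0;

    page->pages    = pages + 1;
    page->pobjects = pobjects + (page->objects - page->inuse);
    page->next     = oldpage;     /* old head hangs off the new head */
    *cpu_partial   = page;        /* done with this_cpu_cmpxchg in the kernel */
}

int main(void)
{
    struct model_page *partial = NULL;
    struct model_page a = { .objects = 8, .inuse = 5 };
    struct model_page b = { .objects = 8, .inuse = 2 };

    model_put_cpu_partial(&partial, &a);
    model_put_cpu_partial(&partial, &b);
    printf("head holds %d pages, %d free objects\n", partial->pages, partial->pobjects);
    return 0;
}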
If no usable object can be found on any node, fresh pages are requested and a new slab is built (allocate_slab):
1581 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1582 {
1583     struct page *page;
1584     struct kmem_cache_order_objects oo = s->oo;
1585     gfp_t alloc_gfp;
1586     void *start, *p;
1587     int idx, order;
1588
1589     flags &= gfp_allowed_mask;
1590
1591     if (gfpflags_allow_blocking(flags))
1592         local_irq_enable();
1593
1594     flags |= s->allocflags;
1595
1596     /*
1597      * Let the initial higher-order allocation fail under memory pressure
1598      * so we fall-back to the minimum order allocation.
1599      */
1600     alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1601     if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
1602         alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
1603
1604     page = alloc_slab_page(s, alloc_gfp, node, oo);
1605     if (unlikely(!page)) {
1606         oo = s->min;
1607         alloc_gfp = flags;
1608         /*
1609          * Allocation may have failed due to fragmentation.
1610          * Try a lower order alloc if possible
1611          */
1612         page = alloc_slab_page(s, alloc_gfp, node, oo);
1613         if (unlikely(!page))
1614             goto out;
1615         stat(s, ORDER_FALLBACK);
1616     }
1617
     ...
1634     page->objects = oo_objects(oo);
1635
1636     order = compound_order(page);
1637     page->slab_cache = s;
1638     __SetPageSlab(page);
1639     if (page_is_pfmemalloc(page))
1640         SetPageSlabPfmemalloc(page);
1641
1642     start = page_address(page);
1643
1644     if (unlikely(s->flags & SLAB_POISON))
1645         memset(start, POISON_INUSE, PAGE_SIZE << order);
1646
1647     kasan_poison_slab(page);
1648
1649     for_each_object_idx(p, idx, s, start, page->objects) {
1650         setup_object(s, page, p);
1651         if (likely(idx < page->objects))
1652             set_freepointer(s, p, p + s->size);
1653         else
1654             set_freepointer(s, p, NULL);
1655     }
1656
1657     page->freelist = fixup_red_left(s, start);
1658     page->inuse = page->objects;
1659     page->frozen = 1;
1660
1661 out:
1662     if (gfpflags_allow_blocking(flags))
1663         local_irq_disable();
1664     if (!page)
1665         return NULL;
1666
1667     mod_zone_page_state(page_zone(page),
1668         (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1669         NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1670         1 << oo_order(oo));
1671
1672     inc_slabs_node(s, page_to_nid(page), page->objects);
1673
1674     return page;
1675 }
allocate_slab first requests 2^order pages, then freezes the slab and initializes its objects by writing, at each object's free-pointer offset, the address of the next free object. Back in new_slab_objects, c->page is set to the freshly allocated page and page->freelist is set to NULL to show that the CPU now owns the freelist.
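The loop at lines 1649~1655 is what turns the raw pages into a slab: it walks the memory in s->size strides and stores, at each object's free-pointer offset, the address of the next object, with NULL terminating the chain. A user-space sketch of that linking, using made-up PAGE_BYTES, OBJ_SIZE and OBJ_OFF constants (red-zone fixups and object setup are ignored):

#include <stdio.h>
#include <string.h>

#define PAGE_BYTES 256   /* stand-in for PAGE_SIZE << order */
#define OBJ_SIZE    64   /* s->size                         */
#define OBJ_OFF      0   /* s->offset (free pointer offset) */

static void set_freepointer(void *object, void *next)
{
    memcpy((char *)object + OBJ_OFF, &next, sizeof(next));
}

static void *get_freepointer(void *object)
{
    void *next;
    memcpy(&next, (char *)object + OBJ_OFF, sizeof(next));
    return next;
}

int main(void)
{
    static char page[PAGE_BYTES];        /* the freshly allocated "slab" */
    int objects = PAGE_BYTES / OBJ_SIZE;

    /* Link every object to the one after it; the last object ends the list. */
    for (int idx = 1; idx <= objects; idx++) {
        char *p = page + (idx - 1) * OBJ_SIZE;
        set_freepointer(p, idx < objects ? p + OBJ_SIZE : NULL);
    }

    /* page->freelist would now point at the first object; walk the chain. */
    int n = 0;
    for (void *p = page; p; p = get_freepointer(p))
        n++;
    printf("slab of %d objects, freelist links %d of them\n", objects, n);
    return 0;
}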
That concludes the SLUB object allocation path.