SLUB object allocation

struct kmem_cache is defined as follows:

62struct kmem_cache {
63    struct kmem_cache_cpu __percpu *cpu_slab;
64    /* Used for retriving partial slabs etc */
65    unsigned long flags;
66    unsigned long min_partial;
67    int size;        /* The size of an object including meta data */
68    int object_size;    /* The size of an object without meta data */
69    int offset;        /* Free pointer offset. */
70    int cpu_partial;    /* Number of per cpu partial objects to keep around */
71    struct kmem_cache_order_objects oo;
72
73    /* Allocation and freeing of slabs */
74    struct kmem_cache_order_objects max;
75    struct kmem_cache_order_objects min;
76    gfp_t allocflags;    /* gfp flags to use on each alloc */
77    int refcount;        /* Refcount for slab cache destroy */
78    void (*ctor)(void *);
79    int inuse;        /* Offset to metadata */
80    int align;        /* Alignment */
81    int reserved;        /* Reserved bytes at the end of slabs */
82    const char *name;    /* Name (only for display!) */
83    struct list_head list;    /* List of slab caches */
84    int red_left_pad;    /* Left redzone padding size */
85#ifdef CONFIG_SYSFS
86    struct kobject kobj;    /* For sysfs */
87#endif
88#ifdef CONFIG_MEMCG_KMEM
89    struct memcg_cache_params memcg_params;
90    int max_attr_size; /* for propagation, maximum size of a stored attr */
91#ifdef CONFIG_SYSFS
92    struct kset *memcg_kset;
93#endif
94#endif
95
96#ifdef CONFIG_NUMA
97    /*
98     * Defragmentation by allocating from a remote node.
99     */
100    int remote_node_defrag_ratio;
101#endif
102
103#ifdef CONFIG_KASAN
104    struct kasan_cache kasan_info;
105#endif
106
107    struct kmem_cache_node *node[MAX_NUMNODES];
108};

struct kmem_cache_cpu is defined as follows:

40struct kmem_cache_cpu {
41    void **freelist;    /* Pointer to next available object */
42    unsigned long tid;    /* Globally unique transaction id */
43    struct page *page;    /* The slab from which we are allocating */
44    struct page *partial;    /* Partially allocated frozen slabs */
45#ifdef CONFIG_SLUB_STATS
46    unsigned stat[NR_SLUB_STAT_ITEMS];
47#endif
48};

struct kmem_cache_node is defined as follows:

326struct kmem_cache_node {
327    spinlock_t list_lock;
328
329#ifdef CONFIG_SLAB
330    struct list_head slabs_partial;    /* partial list first, better asm code */
331    struct list_head slabs_full;
332    struct list_head slabs_free;
333    unsigned long free_objects;
334    unsigned int free_limit;
335    unsigned int colour_next;    /* Per-node cache coloring */
336    struct array_cache *shared;    /* shared per node */
337    struct alien_cache **alien;    /* on other nodes */
338    unsigned long next_reap;    /* updated without locking */
339    int free_touched;        /* updated without locking */
340#endif
341
342#ifdef CONFIG_SLUB
343    unsigned long nr_partial;
344    struct list_head partial;
345#ifdef CONFIG_SLUB_DEBUG
346    atomic_long_t nr_slabs;
347    atomic_long_t total_objects;
348    struct list_head full;
349#endif
350#endif
351
352};

In short, when SLUB allocates an object it first looks at c->freelist. If that is empty, it transfers the objects of c->page onto the per-cpu freelist (get_freelist). If that still yields nothing, it takes a slab from c->partial; failing that, from node->partial; failing that, from neighboring NUMA nodes; and if all of these come up empty, it allocates a brand-new slab.
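
Before diving into the kernel source, here is a deliberately simplified user-space sketch of just this lookup order. Everything in it (fake_page, fake_cpu_slab, pop_object, ...) is a made-up stand-in, not kernel code; locking, cmpxchg, frozen/inuse bookkeeping, the NUMA fallback and the allocation of a new slab are all left out:

#include <stddef.h>

/* Toy model of the SLUB lookup order only -- all names are invented. */
struct fake_page { void *freelist; struct fake_page *next; };

struct fake_cpu_slab {
    void *freelist;             /* models c->freelist */
    struct fake_page *page;     /* models c->page     */
    struct fake_page *partial;  /* models c->partial  */
};

struct fake_node { struct fake_page *partial; };  /* models node->partial */

/* Pop one object; a free object stores the address of the next free object. */
static void *pop_object(void **freelist)
{
    void *object = *freelist;
    if (object)
        *freelist = *(void **)object;
    return object;
}

static void *slab_alloc_sketch(struct fake_cpu_slab *c, struct fake_node *node)
{
    void *object;

    if ((object = pop_object(&c->freelist)))       /* 1. per-cpu freelist (fast path) */
        return object;

    if (c->page && c->page->freelist) {            /* 2. refill from c->page          */
        c->freelist = c->page->freelist;           /*    (what get_freelist does)     */
        c->page->freelist = NULL;
        return pop_object(&c->freelist);
    }

    if (c->partial) {                              /* 3. per-cpu partial slabs        */
        c->page = c->partial;
        c->partial = c->partial->next;
        c->freelist = c->page->freelist;
        c->page->freelist = NULL;
        return pop_object(&c->freelist);
    }

    if (node->partial) {                           /* 4. node partial list            */
        c->page = node->partial;
        node->partial = node->partial->next;
        c->freelist = c->page->freelist;
        c->page->freelist = NULL;
        return pop_object(&c->freelist);
    }

    return NULL;                                   /* 5. would allocate a new slab    */
}

The rest of this post is about how the kernel performs each of these steps safely, without taking locks on the hot paths.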

Now let's walk through this flow in the actual kernel code.

2668static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2669        gfp_t gfpflags, int node, unsigned long addr)
2670{
2671    void *object;
2672    struct kmem_cache_cpu *c;
2673    struct page *page;
2674    unsigned long tid;
2675
2676    s = slab_pre_alloc_hook(s, gfpflags);
2677    if (!s)
2678        return NULL;
2679redo:
2680    /*
2681     * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2682     * enabled. We may switch back and forth between cpus while
2683     * reading from one cpu area. That does not matter as long
2684     * as we end up on the original cpu again when doing the cmpxchg.
2685     *
2686     * We should guarantee that tid and kmem_cache are retrieved on
2687     * the same cpu. It could be different if CONFIG_PREEMPT so we need
2688     * to check if it is matched or not.
2689     */
2690    do {
2691        tid = this_cpu_read(s->cpu_slab->tid);
2692        c = raw_cpu_ptr(s->cpu_slab);
2693    } while (IS_ENABLED(CONFIG_PREEMPT) &&
2694         unlikely(tid != READ_ONCE(c->tid)));
2695
2696    /*
2697     * Irqless object alloc/free algorithm used here depends on sequence
2698     * of fetching cpu_slab's data. tid should be fetched before anything
2699     * on c to guarantee that object and page associated with previous tid
2700     * won't be used with current tid. If we fetch tid first, object and
2701     * page could be one associated with next tid and our alloc/free
2702     * request will be failed. In this case, we will retry. So, no problem.
2703     */
2704    barrier();
2705
2706    /*
2707     * The transaction ids are globally unique per cpu and per operation on
2708     * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
2709     * occurs on the right processor and that there was no operation on the
2710     * linked list in between.
2711     */
2712
2713    object = c->freelist;
2714    page = c->page;
2715    if (unlikely(!object || !node_match(page, node))) {
2716        object = __slab_alloc(s, gfpflags, node, addr, c);
2717        stat(s, ALLOC_SLOWPATH);
2718    } else {
2719        void *next_object = get_freepointer_safe(s, object);

Lines 2690-2694 make sure that tid and cpu_slab are read from the same CPU. The barrier() at line 2704 keeps the reads ordered: tid must be fetched before c->freelist and c->page, for the reason given in the comment above it. At line 2715, if c->freelist is empty (or c->page does not belong to the requested node), the slow path __slab_alloc is taken; __slab_alloc disables local interrupts and then calls ___slab_alloc:

2537static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2538              unsigned long addr, struct kmem_cache_cpu *c)
2539{
2540    void *freelist;
2541    struct page *page;
2542
2543    page = c->page;
2544    if (!page)
2545        goto new_slab;
2546redo:
2547
     ...
2574
2575    /* must check again c->freelist in case of cpu migration or IRQ */
2576    freelist = c->freelist;
2577    if (freelist)
2578        goto load_freelist;
2579
2580    freelist = get_freelist(s, page);
2581
2582    if (!freelist) {
2583        c->page = NULL;
2584        stat(s, DEACTIVATE_BYPASS);
2585        goto new_slab;
2586    }
2587
2588    stat(s, ALLOC_REFILL);
2589
2590load_freelist:
2591    /*
2592     * freelist is pointing to the list of objects to be used.
2593     * page is pointing to the page from which the objects are obtained.
2594     * That page must be frozen for per cpu allocations to work.
2595     */
2596    VM_BUG_ON(!c->page->frozen);
2597    c->freelist = get_freepointer(s, freelist);
2598    c->tid = next_tid(c->tid);
2599    return freelist;
2600
2601new_slab:
2602
2603    if (c->partial) {
2604        page = c->page = c->partial;
2605        c->partial = page->next;
2606        stat(s, CPU_PARTIAL_ALLOC);
2607        c->freelist = NULL;
2608        goto redo;
2609    }
2610
2611    freelist = new_slab_objects(s, gfpflags, node, &c);
2612
2613    if (unlikely(!freelist)) {
2614        slab_out_of_memory(s, gfpflags, node);
2615        return NULL;
2616    }
2617
2618    page = c->page;
2619    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2620        goto load_freelist;
2621
2622    /* Only entered in the debug case */
2623    if (kmem_cache_debug(s) &&
2624            !alloc_debug_processing(s, page, freelist, addr))
2625        goto new_slab;    /* Slab failed checks. Next slab needed */
2626
2627    deactivate_slab(s, page, get_freepointer(s, freelist));
2628    c->page = NULL;
2629    c->freelist = NULL;
2630    return freelist;
2631}

___slab_alloc is the main body of the slow path. Because the task may have migrated to another CPU, or a local interrupt may have freed objects in the meantime, c->freelist is checked once more. If it is still empty, get_freelist detaches c->page->freelist: the page's own freelist becomes NULL and inuse is set to page->objects, so from the page's point of view every object is taken until the slab is deactivated (frozen stays set as long as the detached freelist was non-empty).

get_freelist is implemented as follows:
2494static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2495{
2496    struct page new;
2497    unsigned long counters;
2498    void *freelist;
2499
2500    do {
2501        freelist = page->freelist;
2502        counters = page->counters;
2503
2504        new.counters = counters;
2505        VM_BUG_ON(!new.frozen);
2506
2507        new.inuse = page->objects;
2508        new.frozen = freelist != NULL;
2509
2510    } while (!__cmpxchg_double_slab(s, page,
2511        freelist, counters,
2512        NULL, new.counters,
2513        "get_freelist"));
2514
2515    return freelist;
2516}
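
get_freelist detaches the whole of page->freelist in a single lockless step: it snapshots freelist and counters, builds the new page state (inuse = page->objects, freelist = NULL, frozen kept only if there was something on the list) and retries until __cmpxchg_double_slab installs the new state atomically. The same read/build/compare-and-swap retry pattern can be sketched in user space with C11 atomics; this toy version swaps a single pointer only, whereas the kernel swaps freelist and counters together in one cmpxchg_double:

#include <stdatomic.h>
#include <stddef.h>

/* Stand-in object type: a free object's first word holds the next pointer. */
struct fake_object { struct fake_object *next; };

/* Stand-in for page->freelist. */
static _Atomic(struct fake_object *) page_freelist;

/*
 * Detach the entire freelist, in the spirit of get_freelist(): keep
 * re-reading the head and trying to replace it with NULL until the CAS
 * succeeds, proving that nobody touched the list in between.
 */
static struct fake_object *detach_freelist(void)
{
    struct fake_object *old = atomic_load(&page_freelist);

    while (!atomic_compare_exchange_weak(&page_freelist, &old, NULL))
        ;   /* on failure 'old' is reloaded with the current head; retry */

    return old;   /* the caller now owns the whole chain */
}
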
If c->page has no objects either, control jumps to the new_slab: label. There c->partial is checked first; if a partial slab is queued there, it becomes the new c->page and the code jumps back to redo: to load its freelist. Otherwise new_slab_objects is called:
2442static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2443            int node, struct kmem_cache_cpu **pc)
2444{
2445    void *freelist;
2446    struct kmem_cache_cpu *c = *pc;
2447    struct page *page;
2448
2449    freelist = get_partial(s, flags, node, c);
2450
2451    if (freelist)
2452        return freelist;
2453
2454    page = new_slab(s, flags, node);
2455    if (page) {
2456        c = raw_cpu_ptr(s->cpu_slab);
2457        if (c->page)
2458            flush_slab(s, c);
2459
2460        /*
2461         * No other reference to the page yet so we can
2462         * muck around with it freely without cmpxchg
2463         */
2464        freelist = page->freelist;
2465        page->freelist = NULL;
2466
2467        stat(s, ALLOC_SLAB);
2468        c->page = page;
2469        *pc = c;
2470    } else
2471        freelist = NULL;
2472
2473    return freelist;
2474}

get_partial first searches the requested node (the local node when no specific node was asked for); if nothing is found there, it falls back to the other nodes. get_partial_node:
1845static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1846                struct kmem_cache_cpu *c, gfp_t flags)
1847{
1848    struct page *page, *page2;
1849    void *object = NULL;
1850    int available = 0;
1851    int objects;
1852
1853    /*
1854     * Racy check. If we mistakenly see no partial slabs then we
1855     * just allocate an empty slab. If we mistakenly try to get a
1856     * partial slab and there is none available then get_partials()
1857     * will return NULL.
1858     */
1859    if (!n || !n->nr_partial)
1860        return NULL;
1861
1862    spin_lock(&n->list_lock);
1863    list_for_each_entry_safe(page, page2, &n->partial, lru) {
1864        void *t;
1865
1866        if (!pfmemalloc_match(page, flags))
1867            continue;
1868
1869        t = acquire_slab(s, n, page, object == NULL, &objects);
1870        if (!t)
1871            break;
1872
1873        available += objects;
1874        if (!object) {
1875            c->page = page;
1876            stat(s, ALLOC_FROM_PARTIAL);
1877            object = t;
1878        } else {
1879            put_cpu_partial(s, page, 0);
1880            stat(s, CPU_PARTIAL_NODE);
1881        }
1882        if (!kmem_cache_has_cpu_partial(s)
1883            || available > s->cpu_partial / 2)
1884            break;
1885
1886    }
1887    spin_unlock(&n->list_lock);
1888    return object;
1889}

This function takes several slabs in a row from the node's partial list, freezes them, removes them from the node's partial list and attaches them to the per-cpu side: the first one becomes c->page, the rest go onto c->partial. It stops as soon as the number of available objects exceeds s->cpu_partial / 2.

If nothing is found on this node, the search moves on to the neighboring NUMA nodes; that path is not covered here.

acquire_slab:

1799static inline void *acquire_slab(struct kmem_cache *s,
1800        struct kmem_cache_node *n, struct page *page,
1801        int mode, int *objects)
1802{
1803    void *freelist;
1804    unsigned long counters;
1805    struct page new;
1806
1807    lockdep_assert_held(&n->list_lock);
1808
1809    /*
1810     * Zap the freelist and set the frozen bit.
1811     * The old freelist is the list of objects for the
1812     * per cpu allocation list.
1813     */
1814    freelist = page->freelist;
1815    counters = page->counters;
1816    new.counters = counters;
1817    *objects = new.objects - new.inuse;
1818    if (mode) {
1819        new.inuse = page->objects;
1820        new.freelist = NULL;
1821    } else {
1822        new.freelist = freelist;
1823    }
1824
1825    VM_BUG_ON(new.frozen);
1826    new.frozen = 1;
1827
1828    if (!__cmpxchg_double_slab(s, page,
1829            freelist, counters,
1830            new.freelist, new.counters,
1831            "acquire_slab"))
1832        return NULL;
1833
1834    remove_partial(n, page);
1835    WARN_ON(!freelist);
1836    return freelist;
1837}

A small detail here: when object is still NULL (mode == true, i.e. this is the first slab acquired), page->freelist is set to NULL to mark the slab as the one currently being allocated from. For the slabs acquired after an object has already been found, the freelist is kept, and put_cpu_partial makes page->next point to the previous c->partial, so the chain of per-cpu partial slabs is never broken.

put_cpu_partial:
2265static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2266{
2267#ifdef CONFIG_SLUB_CPU_PARTIAL
2268    struct page *oldpage;
2269    int pages;
2270    int pobjects;
2271
2272    preempt_disable();
2273    do {
2274        pages = 0;
2275        pobjects = 0;
2276        oldpage = this_cpu_read(s->cpu_slab->partial);
2277
2278        if (oldpage) {
2279            pobjects = oldpage->pobjects;
2280            pages = oldpage->pages;
2281            if (drain && pobjects > s->cpu_partial) {
2282                unsigned long flags;
2283                /*
2284                 * partial array is full. Move the existing
2285                 * set to the per node partial list.
2286                 */
2287                local_irq_save(flags);
2288                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2289                local_irq_restore(flags);
2290                oldpage = NULL;
2291                pobjects = 0;
2292                pages = 0;
2293                stat(s, CPU_PARTIAL_DRAIN);
2294            }
2295        }
2296
2297        pages++;
2298        pobjects += page->objects - page->inuse;
2299
2300        page->pages = pages;
2301        page->pobjects = pobjects;
2302        page->next = oldpage;
2303
2304    } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2305                                != oldpage);
2306    if (unlikely(!s->cpu_partial)) {
2307        unsigned long flags;
2308
2309        local_irq_save(flags);
2310        unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2311        local_irq_restore(flags);
2312    }
2313    preempt_enable();
2314#endif
2315}

If no usable object could be found on any node, fresh memory is requested and a new slab is created:

1581static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1582{
1583    struct page *page;
1584    struct kmem_cache_order_objects oo = s->oo;
1585    gfp_t alloc_gfp;
1586    void *start, *p;
1587    int idx, order;
1588
1589    flags &= gfp_allowed_mask;
1590
1591    if (gfpflags_allow_blocking(flags))
1592        local_irq_enable();
1593
1594    flags |= s->allocflags;
1595
1596    /*
1597     * Let the initial higher-order allocation fail under memory pressure
1598     * so we fall-back to the minimum order allocation.
1599     */
1600    alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1601    if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
1602        alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
1603
1604    page = alloc_slab_page(s, alloc_gfp, node, oo);
1605    if (unlikely(!page)) {
1606        oo = s->min;
1607        alloc_gfp = flags;
1608        /*
1609         * Allocation may have failed due to fragmentation.
1610         * Try a lower order alloc if possible
1611         */
1612        page = alloc_slab_page(s, alloc_gfp, node, oo);
1613        if (unlikely(!page))
1614            goto out;
1615        stat(s, ORDER_FALLBACK);
1616    }
1617
     ...
1634    page->objects = oo_objects(oo);
1635
1636    order = compound_order(page);
1637    page->slab_cache = s;
1638    __SetPageSlab(page);
1639    if (page_is_pfmemalloc(page))
1640        SetPageSlabPfmemalloc(page);
1641
1642    start = page_address(page);
1643
1644    if (unlikely(s->flags & SLAB_POISON))
1645        memset(start, POISON_INUSE, PAGE_SIZE << order);
1646
1647    kasan_poison_slab(page);
1648
1649    for_each_object_idx(p, idx, s, start, page->objects) {
1650        setup_object(s, page, p);
1651        if (likely(idx < page->objects))
1652            set_freepointer(s, p, p + s->size);
1653        else
1654            set_freepointer(s, p, NULL);
1655    }
1656
1657    page->freelist = fixup_red_left(s, start);
1658    page->inuse = page->objects;
1659    page->frozen = 1;
1660
1661out:
1662    if (gfpflags_allow_blocking(flags))
1663        local_irq_disable();
1664    if (!page)
1665        return NULL;
1666
1667    mod_zone_page_state(page_zone(page),
1668        (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1669        NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1670        1 << oo_order(oo));
1671
1672    inc_slabs_node(s, page_to_nid(page), page->objects);
1673
1674    return page;
1675}

allocate_slab first allocates 2^order pages (the order comes from s->oo, falling back to s->min under memory pressure), then freezes the slab and initializes its objects: at offset s->offset inside each object it stores the address of the next free object. Back in new_slab_objects, c->page is set to the freshly allocated page and page->freelist is set to NULL to show that the page is now in use by this CPU.
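
The chaining performed by the for_each_object_idx() loop above is easy to model in user space: each free object stores the address of the next free object inside itself, at a fixed offset. The sizes and the offset below are made up for illustration (the kernel uses s->size and s->offset, and also handles red zones and poisoning, which are skipped here):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define OBJ_SIZE   64   /* stand-in for s->size       */
#define NR_OBJECTS 8    /* stand-in for page->objects */
#define FP_OFFSET  0    /* stand-in for s->offset     */

/* Mimic set_freepointer()/get_freepointer(): the pointer to the next free
 * object lives inside the object itself, at FP_OFFSET. */
static void set_fp(void *object, void *next)
{
    memcpy((char *)object + FP_OFFSET, &next, sizeof(next));
}

static void *get_fp(void *object)
{
    void *next;
    memcpy(&next, (char *)object + FP_OFFSET, sizeof(next));
    return next;
}

int main(void)
{
    char *slab = malloc((size_t)OBJ_SIZE * NR_OBJECTS);  /* "the slab page" */
    char *p = slab;
    int idx;

    /* Chain the objects the way allocate_slab() does: each object points
     * to the next one, the last object points to NULL. */
    for (idx = 0; idx < NR_OBJECTS; idx++, p += OBJ_SIZE)
        set_fp(p, idx < NR_OBJECTS - 1 ? p + OBJ_SIZE : NULL);

    /* page->freelist would now point at the first object; allocating an
     * object is just popping the head and following the stored pointer,
     * which is what get_freepointer() does on the fast path. */
    void *freelist = slab;
    void *object = freelist;
    freelist = get_fp(object);

    printf("allocated %p, next free object %p\n", object, freelist);
    free(slab);
    return 0;
}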

This completes the walk through SLUB object allocation.