SLUB object allocation

The kmem_cache structure looks like this:

62struct kmem_cache {
63    struct kmem_cache_cpu __percpu *cpu_slab;
64    /* Used for retriving partial slabs etc */
65    unsigned long flags;
66    unsigned long min_partial;
67    int size;        /* The size of an object including meta data */
68    int object_size;    /* The size of an object without meta data */
69    int offset;        /* Free pointer offset. */
70    int cpu_partial;    /* Number of per cpu partial objects to keep around */
71    struct kmem_cache_order_objects oo;
72
73    /* Allocation and freeing of slabs */
74    struct kmem_cache_order_objects max;
75    struct kmem_cache_order_objects min;
76    gfp_t allocflags;    /* gfp flags to use on each alloc */
77    int refcount;        /* Refcount for slab cache destroy */
78    void (*ctor)(void *);
79    int inuse;        /* Offset to metadata */
80    int align;        /* Alignment */
81    int reserved;        /* Reserved bytes at the end of slabs */
82    const char *name;    /* Name (only for display!) */
83    struct list_head list;    /* List of slab caches */
84    int red_left_pad;    /* Left redzone padding size */
85#ifdef CONFIG_SYSFS
86    struct kobject kobj;    /* For sysfs */
87#endif
88#ifdef CONFIG_MEMCG_KMEM
89    struct memcg_cache_params memcg_params;
90    int max_attr_size; /* for propagation, maximum size of a stored attr */
91#ifdef CONFIG_SYSFS
92    struct kset *memcg_kset;
93#endif
94#endif
95
96#ifdef CONFIG_NUMA
97    /*
98     * Defragmentation by allocating from a remote node.
99     */
100    int remote_node_defrag_ratio;
101#endif
102
103#ifdef CONFIG_KASAN
104    struct kasan_cache kasan_info;
105#endif
106
107    struct kmem_cache_node *node[MAX_NUMNODES];
108};

kmem_cache_cpu is defined as follows:

40struct kmem_cache_cpu {
41    void **freelist;    /* Pointer to next available object */
42    unsigned long tid;    /* Globally unique transaction id */
43    struct page *page;    /* The slab from which we are allocating */
44    struct page *partial;    /* Partially allocated frozen slabs */
45#ifdef CONFIG_SLUB_STATS
46    unsigned stat[NR_SLUB_STAT_ITEMS];
47#endif
48};

kmem_cache_node is defined as follows:

326struct kmem_cache_node {
327    spinlock_t list_lock;
328
329#ifdef CONFIG_SLAB
330    struct list_head slabs_partial;    /* partial list first, better asm code */
331    struct list_head slabs_full;
332    struct list_head slabs_free;
333    unsigned long free_objects;
334    unsigned int free_limit;
335    unsigned int colour_next;    /* Per-node cache coloring */
336    struct array_cache *shared;    /* shared per node */
337    struct alien_cache **alien;    /* on other nodes */
338    unsigned long next_reap;    /* updated without locking */
339    int free_touched;        /* updated without locking */
340#endif
341
342#ifdef CONFIG_SLUB
343    unsigned long nr_partial;
344    struct list_head partial;
345#ifdef CONFIG_SLUB_DEBUG
346    atomic_long_t nr_slabs;
347    atomic_long_t total_objects;
348    struct list_head full;
349#endif
350#endif
351
352};
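
Before walking the allocation path, here is a small orientation sketch (illustrative only, not kernel code) of how the per-CPU and per-node pieces hang off a kmem_cache; the fields referenced are the ones in the structures quoted above.

/* Illustrative sketch: reaching the fast-path (per-CPU) and slow-path
 * (per-node) state from a kmem_cache.
 */
static void slub_structs_sketch(struct kmem_cache *s, int nid)
{
    /* fast-path state: one kmem_cache_cpu per CPU */
    struct kmem_cache_cpu *c = raw_cpu_ptr(s->cpu_slab);

    /* slow-path state: one kmem_cache_node per NUMA node */
    struct kmem_cache_node *n = s->node[nid];

    (void)c->freelist;   /* next free object in the active slab c->page */
    (void)c->partial;    /* per-CPU list of partially used, frozen slabs */
    (void)n->nr_partial; /* number of slabs on this node's partial list */
}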

In short, when SLUB allocates an object it first looks at c->freelist. If that is empty, it refills the per-CPU freelist from c->page (get_freelist). If that still yields nothing, it takes a slab from c->partial; failing that, from node->partial; failing that, from neighboring NUMA nodes; and as a last resort it allocates a brand-new slab.
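
The same fallback order, written out as a pseudocode sketch. The helper names (try_cpu_freelist() and friends) are invented purely for illustration; the comments name the real kernel functions covered below.

/* Pseudocode sketch of the SLUB allocation fallback order. */
void *slub_alloc_sketch(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
    void *obj;

    if ((obj = try_cpu_freelist(c)))     /* 1. c->freelist, lockless fast path */
        return obj;
    if ((obj = refill_from_cpu_page(c))) /* 2. c->page via get_freelist() */
        return obj;
    if ((obj = take_cpu_partial(c)))     /* 3. c->partial (new_slab: label) */
        return obj;
    if ((obj = take_node_partial(s)))    /* 4. node partial list, get_partial_node() */
        return obj;
    if ((obj = take_remote_partial(s)))  /* 5. partial slabs on other NUMA nodes */
        return obj;
    return alloc_new_slab(s);            /* 6. new_slab() from the page allocator */
}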

Let's walk through this flow in the code.

2668static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2669        gfp_t gfpflags, int node, unsigned long addr)
2670{
2671    void *object;
2672    struct kmem_cache_cpu *c;
2673    struct page *page;
2674    unsigned long tid;
2675
2676    s = slab_pre_alloc_hook(s, gfpflags);
2677    if (!s)
2678        return NULL;
2679redo:
2680    /*
2681     * Must read kmem_cache cpu data via this cpu ptr. Preemption is
2682     * enabled. We may switch back and forth between cpus while
2683     * reading from one cpu area. That does not matter as long
2684     * as we end up on the original cpu again when doing the cmpxchg.
2685     *
2686     * We should guarantee that tid and kmem_cache are retrieved on
2687     * the same cpu. It could be different if CONFIG_PREEMPT so we need
2688     * to check if it is matched or not.
2689     */
2690    do {
2691        tid = this_cpu_read(s->cpu_slab->tid);
2692        c = raw_cpu_ptr(s->cpu_slab);
2693    } while (IS_ENABLED(CONFIG_PREEMPT) &&
2694         unlikely(tid != READ_ONCE(c->tid)));
2695
2696    /*
2697     * Irqless object alloc/free algorithm used here depends on sequence
2698     * of fetching cpu_slab's data. tid should be fetched before anything
2699     * on c to guarantee that object and page associated with previous tid
2700     * won't be used with current tid. If we fetch tid first, object and
2701     * page could be one associated with next tid and our alloc/free
2702     * request will be failed. In this case, we will retry. So, no problem.
2703     */
2704    barrier();
2705
2706    /*
2707     * The transaction ids are globally unique per cpu and per operation on
2708     * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
2709     * occurs on the right processor and that there was no operation on the
2710     * linked list in between.
2711     */
2712
2713    object = c->freelist;
2714    page = c->page;
2715    if (unlikely(!object || !node_match(page, node))) {
2716        object = __slab_alloc(s, gfpflags, node, addr, c);
2717        stat(s, ALLOC_SLOWPATH);
2718    } else {
2719        void *next_object = get_freepointer_safe(s, object);
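
The quoted function is cut off above. Roughly, as a paraphrased sketch rather than a verbatim quote, the rest of the fast path pops the object with a cmpxchg_double on the (freelist, tid) pair:

    /* Simplified sketch of how the fast path finishes: atomically move
     * (c->freelist, c->tid) from (object, tid) to
     * (next_object, next_tid(tid)).  If another alloc/free or a CPU
     * migration happened in between, the cmpxchg fails and we retry
     * from redo.
     */
    if (unlikely(!this_cpu_cmpxchg_double(
            s->cpu_slab->freelist, s->cpu_slab->tid,
            object, tid,
            next_object, next_tid(tid)))) {
        note_cmpxchg_failure("slab_alloc", s, tid);
        goto redo;
    }
    prefetch_freepointer(s, next_object);
    stat(s, ALLOC_FASTPATH);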

Lines 2690~2695 make sure that tid and cpu_slab are fetched from the same CPU. Line 2704 adds a barrier so that tid is read before the other cpu_slab fields (freelist, page), as the comment block above it explains; otherwise an object and page belonging to a different tid could be paired with the current tid. At line 2715, if c->freelist is empty (or c->page does not belong to the requested node), __slab_alloc is called; it disables local CPU interrupts and then calls ___slab_alloc:

2537static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2538              unsigned long addr, struct kmem_cache_cpu *c)
2539{
2540    void *freelist;
2541    struct page *page;
2542
2543    page = c->page;
2544    if (!page)
2545        goto new_slab;
2546redo:
2547
     ...
2574
2575    /* must check again c->freelist in case of cpu migration or IRQ */
2576    freelist = c->freelist;
2577    if (freelist)
2578        goto load_freelist;
2579
2580    freelist = get_freelist(s, page);
2581
2582    if (!freelist) {
2583        c->page = NULL;
2584        stat(s, DEACTIVATE_BYPASS);
2585        goto new_slab;
2586    }
2587
2588    stat(s, ALLOC_REFILL);
2589
2590load_freelist:
2591    /*
2592     * freelist is pointing to the list of objects to be used.
2593     * page is pointing to the page from which the objects are obtained.
2594     * That page must be frozen for per cpu allocations to work.
2595     */
2596    VM_BUG_ON(!c->page->frozen);
2597    c->freelist = get_freepointer(s, freelist);
2598    c->tid = next_tid(c->tid);
2599    return freelist;
2600
2601new_slab:
2602
2603    if (c->partial) {
2604        page = c->page = c->partial;
2605        c->partial = page->next;
2606        stat(s, CPU_PARTIAL_ALLOC);
2607        c->freelist = NULL;
2608        goto redo;
2609    }
2610
2611    freelist = new_slab_objects(s, gfpflags, node, &c);
2612
2613    if (unlikely(!freelist)) {
2614        slab_out_of_memory(s, gfpflags, node);
2615        return NULL;
2616    }
2617
2618    page = c->page;
2619    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2620        goto load_freelist;
2621
2622    /* Only entered in the debug case */
2623    if (kmem_cache_debug(s) &&
2624            !alloc_debug_processing(s, page, freelist, addr))
2625        goto new_slab;    /* Slab failed checks. Next slab needed */
2626
2627    deactivate_slab(s, page, get_freepointer(s, freelist));
2628    c->page = NULL;
2629    c->freelist = NULL;
2630    return freelist;
2631}
___slab_alloc is the main slow-path routine. Because the task may have migrated to another CPU, or an interrupt may have modified the freelist in the meantime, it first re-checks whether c->freelist has an object. If it is still empty, get_freelist grabs c->page->freelist: it sets the page's freelist to NULL and sets page->inuse to page->objects, so the whole slab is accounted to this CPU (and stays frozen) until it is used up.

get_freelist is implemented as follows:
2494static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2495{
2496    struct page new;
2497    unsigned long counters;
2498    void *freelist;
2499
2500    do {
2501        freelist = page->freelist;
2502        counters = page->counters;
2503
2504        new.counters = counters;
2505        VM_BUG_ON(!new.frozen);
2506
2507        new.inuse = page->objects;
2508        new.frozen = freelist != NULL;
2509
2510    } while (!__cmpxchg_double_slab(s, page,
2511        freelist, counters,
2512        NULL, new.counters,
2513        "get_freelist"));
2514
2515    return freelist;
2516}
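
A note on why the "new.counters = counters; new.inuse = ...; new.frozen = ..." pattern works: for slab pages, struct page keeps inuse, objects and frozen as bitfields that share storage with counters, so __cmpxchg_double_slab can compare-and-exchange freelist and the whole counter word as one pair. An abridged sketch of that part of struct page (reconstructed from memory of the 4.x layout, so treat it as an approximation):

    void *freelist;             /* first free object in this slab */
    union {
        unsigned long counters; /* swapped together with freelist */
        struct {
            unsigned inuse:16;   /* objects handed out */
            unsigned objects:15; /* total objects in this slab */
            unsigned frozen:1;   /* slab owned by a CPU's cpu_slab */
        };
    };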
If no object can be obtained from c->page either, control jumps to the new_slab: label. If c->partial holds a slab, it becomes the new c->page (c->partial advances to page->next, c->freelist is cleared) and the code goes back to redo to refill from it. Otherwise execution falls through to new_slab_objects:
2442static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2443            int node, struct kmem_cache_cpu **pc)
2444{
2445    void *freelist;
2446    struct kmem_cache_cpu *c = *pc;
2447    struct page *page;
2448
2449    freelist = get_partial(s, flags, node, c);
2450
2451    if (freelist)
2452        return freelist;
2453
2454    page = new_slab(s, flags, node);
2455    if (page) {
2456        c = raw_cpu_ptr(s->cpu_slab);
2457        if (c->page)
2458            flush_slab(s, c);
2459
2460        /*
2461         * No other reference to the page yet so we can
2462         * muck around with it freely without cmpxchg
2463         */
2464        freelist = page->freelist;
2465        page->freelist = NULL;
2466
2467        stat(s, ALLOC_SLAB);
2468        c->page = page;
2469        *pc = c;
2470    } else
2471        freelist = NULL;
2472
2473    return freelist;
2474}
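
new_slab_objects first tries get_partial(). Paraphrased as a sketch (not a verbatim quote), get_partial picks a node to search and only scans other nodes when the caller did not pin the request to a specific node:

/* Paraphrased sketch of get_partial(): try the preferred node's partial
 * list first, then (only for node == NUMA_NO_NODE requests) fall back to
 * other nodes.
 */
static void *get_partial_sketch(struct kmem_cache *s, gfp_t flags, int node,
        struct kmem_cache_cpu *c)
{
    void *object;
    int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;

    object = get_partial_node(s, get_node(s, searchnode), c, flags);
    if (object || node != NUMA_NO_NODE)
        return object;

    return get_any_partial(s, flags, c);    /* remote-node fallback */
}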
get_partial first searches the node matching the request (falling back to the local memory node when no node was specified); if nothing is found there, it looks for objects on other nearby nodes. get_partial_node does the per-node work:
1845static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1846                struct kmem_cache_cpu *c, gfp_t flags)
1847{
1848    struct page *page, *page2;
1849    void *object = NULL;
1850    int available = 0;
1851    int objects;
1852
1853    /*
1854     * Racy check. If we mistakenly see no partial slabs then we
1855     * just allocate an empty slab. If we mistakenly try to get a
1856     * partial slab and there is none available then get_partials()
1857     * will return NULL.
1858     */
1859    if (!n || !n->nr_partial)
1860        return NULL;
1861
1862    spin_lock(&n->list_lock);
1863    list_for_each_entry_safe(page, page2, &n->partial, lru) {
1864        void *t;
1865
1866        if (!pfmemalloc_match(page, flags))
1867            continue;
1868
1869        t = acquire_slab(s, n, page, object == NULL, &objects);
1870        if (!t)
1871            break;
1872
1873        available += objects;
1874        if (!object) {
1875            c->page = page;
1876            stat(s, ALLOC_FROM_PARTIAL);
1877            object = t;
1878        } else {
1879            put_cpu_partial(s, page, 0);
1880            stat(s, CPU_PARTIAL_NODE);
1881        }
1882        if (!kmem_cache_has_cpu_partial(s)
1883            || available > s->cpu_partial / 2)
1884            break;
1885
1886    }
1887    spin_unlock(&n->list_lock);
1888    return object;
1889}

This function takes slabs one after another from the node's partial list: each slab is frozen (acquire_slab) and removed from the node's partial list, then handed to the CPU. The first slab becomes c->page; the following ones are pushed onto c->partial via put_cpu_partial. The loop stops once the number of free objects gathered exceeds s->cpu_partial / 2 (so, for example, with cpu_partial of 30 it stops after collecting more than 15 free objects).

If nothing was found on this node, neighboring nodes are searched next; that path is not covered here.

acquire_slab:

1799static inline void *acquire_slab(struct kmem_cache *s,
1800        struct kmem_cache_node *n, struct page *page,
1801        int mode, int *objects)
1802{
1803    void *freelist;
1804    unsigned long counters;
1805    struct page new;
1806
1807    lockdep_assert_held(&n->list_lock);
1808
1809    /*
1810     * Zap the freelist and set the frozen bit.
1811     * The old freelist is the list of objects for the
1812     * per cpu allocation list.
1813     */
1814    freelist = page->freelist;
1815    counters = page->counters;
1816    new.counters = counters;
1817    *objects = new.objects - new.inuse;
1818    if (mode) {
1819        new.inuse = page->objects;
1820        new.freelist = NULL;
1821    } else {
1822        new.freelist = freelist;
1823    }
1824
1825    VM_BUG_ON(new.frozen);
1826    new.frozen = 1;
1827
1828    if (!__cmpxchg_double_slab(s, page,
1829            freelist, counters,
1830            new.freelist, new.counters,
1831            "acquire_slab"))
1832        return NULL;
1833
1834    remove_partial(n, page);
1835    WARN_ON(!freelist);
1836    return freelist;
1837}

A small detail here: if object is still NULL (mode is true), acquire_slab sets page->freelist to NULL and inuse to page->objects, marking the slab as fully taken by this CPU. If an object has already been found (mode is false), the page keeps its freelist, and put_cpu_partial later links it into the per-CPU list by pointing page->next at the previous c->partial, so the chain is never broken.

put_cpu_partial:
2265static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2266{
2267#ifdef CONFIG_SLUB_CPU_PARTIAL
2268    struct page *oldpage;
2269    int pages;
2270    int pobjects;
2271
2272    preempt_disable();
2273    do {
2274        pages = 0;
2275        pobjects = 0;
2276        oldpage = this_cpu_read(s->cpu_slab->partial);
2277
2278        if (oldpage) {
2279            pobjects = oldpage->pobjects;
2280            pages = oldpage->pages;
2281            if (drain && pobjects > s->cpu_partial) {
2282                unsigned long flags;
2283                /*
2284                 * partial array is full. Move the existing
2285                 * set to the per node partial list.
2286                 */
2287                local_irq_save(flags);
2288                unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2289                local_irq_restore(flags);
2290                oldpage = NULL;
2291                pobjects = 0;
2292                pages = 0;
2293                stat(s, CPU_PARTIAL_DRAIN);
2294            }
2295        }
2296
2297        pages++;
2298        pobjects += page->objects - page->inuse;
2299
2300        page->pages = pages;
2301        page->pobjects = pobjects;
2302        page->next = oldpage;
2303
2304    } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2305                                != oldpage);
2306    if (unlikely(!s->cpu_partial)) {
2307        unsigned long flags;
2308
2309        local_irq_save(flags);
2310        unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2311        local_irq_restore(flags);
2312    }
2313    preempt_enable();
2314#endif
2315}
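
One detail in the code above: the per-CPU partial list is singly linked through page->next, and page->pages / page->pobjects are the cumulative counts recorded when that page was pushed, so only the head page's values describe the whole list. A small illustrative helper (not kernel code):

/* Illustrative only: walking a CPU's partial list built by
 * put_cpu_partial().  Each page is linked via page->next; the number of
 * free objects in each slab is page->objects - page->inuse.
 */
static int count_cpu_partial_free_sketch(struct kmem_cache_cpu *c)
{
    struct page *p;
    int free_objects = 0;

    for (p = c->partial; p; p = p->next)
        free_objects += p->objects - p->inuse;

    return free_objects;
}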

If no usable object could be found on any node, new memory is requested from the page allocator and a fresh slab is created.
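
Before reading allocate_slab(), recall the kmem_cache_order_objects fields (oo, min, max) in the struct at the top: they pack the slab's page order and its object count into a single word. The accessors look roughly like this (a sketch assuming the usual encoding with a single unsigned long member x and a 16-bit split; treat the exact constants as an assumption):

/* Sketch of the slub order/objects encoding: high bits hold the page
 * order, low bits the number of objects per slab.
 */
#define OO_SHIFT    16
#define OO_MASK     ((1 << OO_SHIFT) - 1)

static inline int oo_order(struct kmem_cache_order_objects x)
{
    return x.x >> OO_SHIFT;
}

static inline int oo_objects(struct kmem_cache_order_objects x)
{
    return x.x & OO_MASK;
}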

1581static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1582{
1583    struct page *page;
1584    struct kmem_cache_order_objects oo = s->oo;
1585    gfp_t alloc_gfp;
1586    void *start, *p;
1587    int idx, order;
1588
1589    flags &= gfp_allowed_mask;
1590
1591    if (gfpflags_allow_blocking(flags))
1592        local_irq_enable();
1593
1594    flags |= s->allocflags;
1595
1596    /*
1597     * Let the initial higher-order allocation fail under memory pressure
1598     * so we fall-back to the minimum order allocation.
1599     */
1600    alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
1601    if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
1602        alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
1603
1604    page = alloc_slab_page(s, alloc_gfp, node, oo);
1605    if (unlikely(!page)) {
1606        oo = s->min;
1607        alloc_gfp = flags;
1608        /*
1609         * Allocation may have failed due to fragmentation.
1610         * Try a lower order alloc if possible
1611         */
1612        page = alloc_slab_page(s, alloc_gfp, node, oo);
1613        if (unlikely(!page))
1614            goto out;
1615        stat(s, ORDER_FALLBACK);
1616    }
1617
     ...
1634    page->objects = oo_objects(oo);
1635
1636    order = compound_order(page);
1637    page->slab_cache = s;
1638    __SetPageSlab(page);
1639    if (page_is_pfmemalloc(page))
1640        SetPageSlabPfmemalloc(page);
1641
1642    start = page_address(page);
1643
1644    if (unlikely(s->flags & SLAB_POISON))
1645        memset(start, POISON_INUSE, PAGE_SIZE << order);
1646
1647    kasan_poison_slab(page);
1648
1649    for_each_object_idx(p, idx, s, start, page->objects) {
1650        setup_object(s, page, p);
1651        if (likely(idx < page->objects))
1652            set_freepointer(s, p, p + s->size);
1653        else
1654            set_freepointer(s, p, NULL);
1655    }
1656
1657    page->freelist = fixup_red_left(s, start);
1658    page->inuse = page->objects;
1659    page->frozen = 1;
1660
1661out:
1662    if (gfpflags_allow_blocking(flags))
1663        local_irq_disable();
1664    if (!page)
1665        return NULL;
1666
1667    mod_zone_page_state(page_zone(page),
1668        (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1669        NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1670        1 << oo_order(oo));
1671
1672    inc_slabs_node(s, page_to_nid(page), page->objects);
1673
1674    return page;
1675}

allocate_slab first asks the page allocator for a compound page of order oo_order(s->oo) (falling back to the minimum order s->min under memory pressure), freezes the slab, and initializes its objects by storing the address of the next free object at each object's free-pointer offset. Back in new_slab_objects, c->page is set to the freshly allocated page and page->freelist is set to NULL to show that the slab is now in use by this CPU.
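
The "store the next free object's address at the object's offset" step is what set_freepointer()/get_freepointer() do; stripped of the debug and hardening variants, they boil down to something like this (simplified sketch):

/* Simplified sketch: the free pointer lives inside the free object
 * itself, at byte offset s->offset, so a slab needs no external metadata
 * to track its freelist.
 */
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
    return *(void **)((char *)object + s->offset);
}

static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
    *(void **)((char *)object + s->offset) = fp;
}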

That completes the walk through SLUB object allocation.
