Unity IL2CPP Memory Management
Allocation
Key structures
- hblk
struct hblk {
char hb_body[HBLKSIZE];
};
BoehmGC allocates memory in blocks. HBLKSIZE is the size of each block; the default is 4096, which matches the page size. Presumably any reasonable value should be a multiple of the page size.
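Because blocks are HBLKSIZE-aligned, the block containing any interior pointer can be recovered by masking off the low bits. A minimal sketch of the idea behind the library's HBLKPTR macro (consistent with the struct above, assuming HBLKSIZE is a power of two):

/* Round an interior pointer down to the start of its 4 KB block. */
# define HBLKPTR(bytes) ((struct hblk *)((word)(bytes) & ~(word)(HBLKSIZE - 1)))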
- hblkhdr
- The header for each block. hb_n_marks is the number of marked ("referenced") objects in this hblk; the hb_marks array holds the mark bit for every object, with 1 meaning marked and 0 unmarked. MARK_BITS_SZ = (MARK_BITS_PER_HBLK/CPP_WORDSZ + 1) = 5 on 64-bit.
struct hblkhdr {
struct hblk * hb_next; /* Link field for hblk free list */
/* and for lists of chunks waiting to be */
/* reclaimed. */
struct hblk * hb_prev; /* Backwards link for free list. */
struct hblk * hb_block; /* The corresponding block. */
unsigned char hb_obj_kind;
/* Kind of objects in the block. Each kind */
/* identifies a mark procedure and a set of */
/* list headers. Sometimes called regions. */
unsigned char hb_flags;
# define IGNORE_OFF_PAGE 1 /* Ignore pointers that do not */
/* point to the first page of */
/* this object. */
# define WAS_UNMAPPED 2 /* This is a free block, which has */
/* been unmapped from the address */
/* space. */
/* GC_remap must be invoked on it */
/* before it can be reallocated. */
/* Only set with USE_MUNMAP. */
# define FREE_BLK 4 /* Block is free, i.e. not in use. */
# ifdef ENABLE_DISCLAIM
# define HAS_DISCLAIM 8
/* This kind has a callback on reclaim. */
# define MARK_UNCONDITIONALLY 0x10
/* Mark from all objects, marked or */
/* not. Used to mark objects needed by */
/* reclaim notifier. */
# endif
# ifdef MARK_BIT_PER_GRANULE
# define LARGE_BLOCK 0x20
# endif
unsigned short hb_last_reclaimed;
/* Value of GC_gc_no when block was */
/* last allocated or swept. May wrap. */
/* For a free block, this is maintained */
/* only for USE_MUNMAP, and indicates */
/* when the header was allocated, or */
/* when the size of the block last */
/* changed. */
# ifdef MARK_BIT_PER_OBJ
unsigned32 hb_inv_sz; /* A good upper bound for 2**32/hb_sz. */
/* For large objects, we use */
/* LARGE_INV_SZ. */
# define LARGE_INV_SZ (1 << 16)
# endif
word hb_sz; /* If in use, size in bytes, of objects in the block. */
/* if free, the size in bytes of the whole block. */
/* We assume that this is convertible to signed_word */
/* without generating a negative result. We avoid */
/* generating free blocks larger than that. */
word hb_descr; /* object descriptor for marking. See */
/* gc_mark.h. */
# ifdef MARK_BIT_PER_GRANULE
unsigned short * hb_map; /* Essentially a table of remainders */
/* mod BYTES_TO_GRANULES(hb_sz), except */
/* for large blocks. See GC_obj_map. */
# endif
# ifdef PARALLEL_MARK
volatile AO_t hb_n_marks; /* Number of set mark bits, excluding */
/* the one always set at the end. */
/* Currently it is concurrently */
/* updated and hence only approximate. */
/* But a zero value does guarantee that */
/* the block contains no marked */
/* objects. */
/* Ensuring this property means that we */
/* never decrement it to zero during a */
/* collection, and hence the count may */
/* be one too high. Due to concurrent */
/* updates, an arbitrary number of */
/* increments, but not all of them (!) */
/* may be lost, hence it may in theory */
/* be much too low. */
/* The count may also be too high if */
/* multiple mark threads mark the */
/* same object due to a race. */
# else
size_t hb_n_marks; /* Without parallel marking, the count */
/* is accurate. */
# endif
# ifdef USE_MARK_BYTES
# define MARK_BITS_SZ (MARK_BITS_PER_HBLK + 1)
/* Unlike the other case, this is in units of bytes. */
/* Since we force double-word alignment, we need at most one */
/* mark bit per 2 words. But we do allocate and set one */
/* extra mark bit to avoid an explicit check for the */
/* partial object at the end of each block. */
union {
char _hb_marks[MARK_BITS_SZ];
/* The i'th byte is 1 if the object */
/* starting at granule i or object i is */
/* marked, 0 o.w. */
/* The mark bit for the "one past the */
/* end" object is always set to avoid a */
/* special case test in the marker. */
word dummy; /* Force word alignment of mark bytes. */
} _mark_byte_union;
# define hb_marks _mark_byte_union._hb_marks
# else
# define MARK_BITS_SZ (MARK_BITS_PER_HBLK/CPP_WORDSZ + 1)
word hb_marks[MARK_BITS_SZ];
# endif /* !USE_MARK_BYTES */
};
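For the word-packed layout (the non-USE_MARK_BYTES branch above), testing object n's mark bit is one index, shift, and mask. A minimal sketch, mirroring the idea of the library's mark_bit_from_hdr:

/* Test mark bit n in the hb_marks word array (non-USE_MARK_BYTES layout). */
static int is_marked(struct hblkhdr *hhdr, size_t n)
{
    return (int)((hhdr->hb_marks[n / CPP_WORDSZ] >> (n % CPP_WORDSZ)) & 1);
}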
- bottom_index
typedef struct bi {
    hdr * index[BOTTOM_SZ];
        /* The bottom level index contains one of three kinds of values: */
        /* 0 means we're not responsible for this block, */
        /*   or this is a block other than the first one in a free block. */
        /* 1 < (long)X <= MAX_JUMP means the block starts at least */
        /*   X * HBLKSIZE bytes before the current address. */
        /* A valid pointer points to a hdr structure. (The above can't be */
        /* valid pointers due to the GET_MEM return convention.) */
    struct bi * asc_link;   /* All indices are linked in */
                            /* ascending order... */
    struct bi * desc_link;  /* ... and in descending order. */
    word key;               /* high order address bits. */
# ifdef HASH_TL
    struct bi * hash_link;  /* Hash chain link. */
# endif
} bottom_index;
A bottom_index manages BOTTOM_SZ (1 << 10 = 1024) blocks, covering 4 MB by default.
- _GC_arrays
struct _GC_arrays {
word _heapsize; /* Heap size in bytes (value never goes down). */
word _requested_heapsize; /* Heap size due to explicit expansion. */
ptr_t _last_heap_addr;
ptr_t _prev_heap_addr;
word _large_free_bytes;
/* Total bytes contained in blocks on large object free */
/* list. */
word _large_allocd_bytes;
/* Total number of bytes in allocated large objects blocks. */
/* For the purposes of this counter and the next one only, a */
/* large object is one that occupies a block of at least */
/* 2*HBLKSIZE. */
word _max_large_allocd_bytes;
/* Maximum number of bytes that were ever allocated in */
/* large object blocks. This is used to help decide when it */
/* is safe to split up a large block. */
word _bytes_allocd_before_gc;
/* Number of bytes allocated before this */
/* collection cycle. */
# ifndef SEPARATE_GLOBALS
# define GC_bytes_allocd GC_arrays._bytes_allocd
word _bytes_allocd;
/* Number of bytes allocated during this collection cycle. */
# endif
word _bytes_dropped;
/* Number of black-listed bytes dropped during GC cycle */
/* as a result of repeated scanning during allocation */
/* attempts. These are treated largely as allocated, */
/* even though they are not useful to the client. */
word _bytes_finalized;
/* Approximate number of bytes in objects (and headers) */
/* that became ready for finalization in the last */
/* collection. */
word _bytes_freed;
/* Number of explicitly deallocated bytes of memory */
/* since last collection. */
word _finalizer_bytes_freed;
/* Bytes of memory explicitly deallocated while */
/* finalizers were running. Used to approximate memory */
/* explicitly deallocated by finalizers. */
ptr_t _scratch_end_ptr;
ptr_t _scratch_last_end_ptr;
/* Used by headers.c, and can easily appear to point to */
/* heap. Also used by GC_register_dynamic_libraries(). */
mse *_mark_stack;
/* Limits of stack for GC_mark routine. All ranges */
/* between GC_mark_stack (incl.) and GC_mark_stack_top */
/* (incl.) still need to be marked from. */
mse *_mark_stack_limit;
# ifdef PARALLEL_MARK
mse *volatile _mark_stack_top;
/* Updated only with mark lock held, but read asynchronously. */
/* TODO: Use union to avoid casts to AO_t */
# else
mse *_mark_stack_top;
# endif
word _composite_in_use; /* Number of bytes in the accessible */
/* composite objects. */
word _atomic_in_use; /* Number of bytes in the accessible */
/* atomic objects. */
# ifdef USE_MUNMAP
# define GC_unmapped_bytes GC_arrays._unmapped_bytes
word _unmapped_bytes;
# else
# define GC_unmapped_bytes 0
# endif
bottom_index * _all_nils;
# ifdef ENABLE_TRACE
# define GC_trace_addr GC_arrays._trace_addr
ptr_t _trace_addr;
# endif
GC_mark_proc _mark_procs[MAX_MARK_PROCS];
/* Table of user-defined mark procedures. There is */
/* a small number of these, which can be referenced */
/* by DS_PROC mark descriptors. See gc_mark.h. */
char _modws_valid_offsets[sizeof(word)];
/* GC_valid_offsets[i] ==> */
/* GC_modws_valid_offsets[i%sizeof(word)] */
# if !defined(MSWIN32) && !defined(MSWINCE) && !defined(CYGWIN32)
# define GC_root_index GC_arrays._root_index
struct roots * _root_index[RT_SIZE];
# endif
# ifdef SAVE_CALL_CHAIN
# define GC_last_stack GC_arrays._last_stack
struct callinfo _last_stack[NFRAMES];
/* Stack at last garbage collection. Useful for */
/* debugging mysterious object disappearances. In the */
/* multi-threaded case, we currently only save the */
/* calling stack. */
# endif
# ifndef SEPARATE_GLOBALS
# define GC_objfreelist GC_arrays._objfreelist
void *_objfreelist[MAXOBJGRANULES+1];
/* free list for objects */
# define GC_aobjfreelist GC_arrays._aobjfreelist
void *_aobjfreelist[MAXOBJGRANULES+1];
/* free list for atomic objects */
# endif
void *_uobjfreelist[MAXOBJGRANULES+1];
/* Uncollectible but traced objects. */
/* Objects on this and _auobjfreelist */
/* are always marked, except during */
/* garbage collections. */
# ifdef GC_ATOMIC_UNCOLLECTABLE
# define GC_auobjfreelist GC_arrays._auobjfreelist
void *_auobjfreelist[MAXOBJGRANULES+1];
/* Atomic uncollectible but traced objects. */
# endif
size_t _size_map[MAXOBJBYTES+1];
/* Number of granules to allocate when asked for a certain */
/* number of bytes. Should be accessed with the allocation */
/* lock held. */
# ifdef MARK_BIT_PER_GRANULE
# define GC_obj_map GC_arrays._obj_map
unsigned short * _obj_map[MAXOBJGRANULES + 1];
/* If not NULL, then a pointer to a map of valid */
/* object addresses. */
/* _obj_map[sz_in_granules][i] is */
/* i % sz_in_granules. */
/* This is now used purely to replace a */
/* division in the marker by a table lookup. */
/* _obj_map[0] is used for large objects and */
/* contains all nonzero entries. This gets us */
/* out of the marker fast path without an extra */
/* test. */
# define MAP_LEN BYTES_TO_GRANULES(HBLKSIZE)
# endif
# define VALID_OFFSET_SZ HBLKSIZE
char _valid_offsets[VALID_OFFSET_SZ];
/* GC_valid_offsets[i] == TRUE ==> i */
/* is registered as a displacement. */
# if defined(PROC_VDB) || defined(MPROTECT_VDB) \
|| defined(GWW_VDB) || defined(MANUAL_VDB)
# define GC_grungy_pages GC_arrays._grungy_pages
page_hash_table _grungy_pages; /* Pages that were dirty at last */
/* GC_read_dirty. */
# endif
# if defined(MPROTECT_VDB) || defined(MANUAL_VDB)
# define GC_dirty_pages GC_arrays._dirty_pages
volatile page_hash_table _dirty_pages;
/* Pages dirtied since last GC_read_dirty. */
# endif
# if (defined(CHECKSUMS) && defined(GWW_VDB)) || defined(PROC_VDB)
# define GC_written_pages GC_arrays._written_pages
page_hash_table _written_pages; /* Pages ever dirtied */
# endif
# define GC_heap_sects GC_arrays._heap_sects
struct HeapSect {
ptr_t hs_start;
size_t hs_bytes;
} _heap_sects[MAX_HEAP_SECTS]; /* Heap segments potentially */
/* containing client objects. */
# if defined(USE_PROC_FOR_LIBRARIES)
# define GC_our_memory GC_arrays._our_memory
struct HeapSect _our_memory[MAX_HEAP_SECTS];
/* All GET_MEM allocated */
/* memory. Includes block */
/* headers and the like. */
# endif
# if defined(MSWIN32) || defined(MSWINCE) || defined(CYGWIN32)
# define GC_heap_bases GC_arrays._heap_bases
ptr_t _heap_bases[MAX_HEAP_SECTS];
/* Start address of memory regions obtained from kernel. */
# endif
# ifdef MSWINCE
# define GC_heap_lengths GC_arrays._heap_lengths
word _heap_lengths[MAX_HEAP_SECTS];
/* Committed lengths of memory regions obtained from kernel. */
# endif
struct roots _static_roots[MAX_ROOT_SETS];
struct exclusion _excl_table[MAX_EXCLUSIONS];
/* Block header index; see gc_headers.h */
bottom_index * _top_index[TOP_SZ];
};
The GC's global bookkeeping structure. _top_index holds pointers to all bottom_index entries; TOP_SZ defaults to 1 << 11 = 2048.
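GET_BI below reduces the high address bits to a GC_top_index slot with TL_HASH; with HASH_TL enabled this is, as far as I can tell, just a mask:

# define TL_HASH(hi) ((hi) & (TOP_SZ - 1))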
Mapping between block and header
- Finding the header from a block
- First, find the bi from the block pointer:
# define GET_BI(p, bottom_indx) \
    do { \
        REGISTER word hi = (word)(p) >> (LOG_BOTTOM_SZ + LOG_HBLKSIZE); \
        REGISTER bottom_index * _bi = GC_top_index[TL_HASH(hi)]; \
        while (_bi -> key != hi && _bi != GC_all_nils) \
            _bi = _bi -> hash_link; \
        (bottom_indx) = _bi; \
    } while (0)
- p is the block's start address; LOG_BOTTOM_SZ is log2 of BOTTOM_SZ, and LOG_HBLKSIZE is log2 of the block size.
- Compute the hash value: shift p right by (LOG_BOTTOM_SZ + LOG_HBLKSIZE), then constrain it to the range of GC_top_index via TL_HASH.
- From the way the hash is computed, all addresses inside one block (they differ only in the low LOG_HBLKSIZE bits) yield the same hi.
- Bits 12 through 21 (zero-based, i.e. LOG_HBLKSIZE up to LOG_BOTTOM_SZ + LOG_HBLKSIZE - 1) determine the block's slot in bi->index.
- Bits 22 and above (LOG_BOTTOM_SZ + LOG_HBLKSIZE onward), reduced by TL_HASH to 11 bits, determine the bi's slot in GC_top_index (see the worked example after this list).
- Fetch the bi from GC_top_index using that hash value.
- Why also walk _bi -> hash_link? Because different bi entries can hash to the same slot.
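A worked example, assuming LOG_HBLKSIZE = 12, LOG_BOTTOM_SZ = 10, TOP_SZ = 2048, and a hypothetical address:

/* p = 0x12345678                                               */
/* offset within the block : p & 0xFFF           = 0x678        */
/* slot in bi->index       : (p >> 12) & 0x3FF   = 0x345        */
/* hi (bottom_index key)   : p >> 22             = 0x48         */
/* slot in GC_top_index    : 0x48 & (2048 - 1)   = 0x48         */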
- Then, from the bi, find the corresponding header:
#define HDR_FROM_BI(bi, p) \
    ((bi)->index[((word)(p) >> LOG_HBLKSIZE) & (BOTTOM_SZ - 1)])
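Putting the two macros together, a minimal sketch of the whole pointer-to-header lookup (equivalent in spirit to the library's GET_HDR, not its exact code):

/* Map any address to the header of the block containing it. */
static hdr * lookup_header(ptr_t p)
{
    word hi = (word)p >> (LOG_BOTTOM_SZ + LOG_HBLKSIZE);
    bottom_index * bi = GC_top_index[TL_HASH(hi)];
    while (bi != GC_all_nils && bi->key != hi)
        bi = bi->hash_link;   /* skip entries that collided in the hash */
    /* If no heap block covers p, bi is GC_all_nils and the slot is 0. */
    return bi->index[((word)p >> LOG_HBLKSIZE) & (BOTTOM_SZ - 1)];
}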
- Finding the block from a header
- via the hb_block field
Total addressable memory
1 << LOG_TOP_SZ << LOG_BOTTOM_SZ << LOG_HBLKSIZE = 2^(11+10+12) = 8 GB. In other words, if small objects keep being allocated and total usage exceeds 8 GB, further allocation fails.
Allocation flow
GC_API GC_ATTR_MALLOC void * GC_CALL GC_malloc_kind_global(size_t lb, int k)
{
GC_ASSERT(k < MAXOBJKINDS);
if (SMALL_OBJ(lb)) {
void *op;
void **opp;
size_t lg;
DCL_LOCK_STATE;
GC_DBG_COLLECT_AT_MALLOC(lb);
LOCK();
lg = GC_size_map[lb];
opp = &GC_obj_kinds[k].ok_freelist[lg];
op = *opp;
if (EXPECT(op != NULL, TRUE)) {
if (k == PTRFREE) {
*opp = obj_link(op);
} else {
GC_ASSERT(0 == obj_link(op)
|| ((word)obj_link(op)
<= (word)GC_greatest_plausible_heap_addr
&& (word)obj_link(op)
>= (word)GC_least_plausible_heap_addr));
*opp = obj_link(op);
obj_link(op) = 0;
}
GC_bytes_allocd += GRANULES_TO_BYTES((word)lg);
UNLOCK();
return op;
}
UNLOCK();
}
/* We make the GC_clear_stack() call a tail one, hoping to get more */
/* of the stack. */
return GC_clear_stack(GC_generic_malloc(lb, k));
}
- k takes three values:
- NORMAL: regular allocation
- PTRFREE: pointer-free allocation (contents are never scanned)
- UNCOLLECTABLE: internal allocations that are never collected
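For reference, client code normally reaches this path through the public bdwgc entry points; a minimal usage sketch:

#include <gc.h>

void example(void)
{
    void *p  = GC_MALLOC(64);         /* NORMAL: contents scanned for pointers */
    void *pa = GC_MALLOC_ATOMIC(64);  /* PTRFREE: contents never scanned */
    (void)p; (void)pa;
}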
- If the object is small (SMALL_OBJ is true, i.e. less than HBLKSIZE/2 = 2048 bytes), allocate from the free lists in GC_obj_kinds:
- GC_size_map maps a requested byte size to the number of granules (multiples of GRANULE_BYTES) to allocate; it is indexed by byte size (length MAXOBJBYTES + 1 = 2049), and its values go up to 2048/16 = 128 granules.
- GC_obj_kinds holds one obj_kind structure per kind:
- lg is the size in granules (a multiple of GRANULE_BYTES)
- ok_freelist is a one-dimensional array; ok_freelist[lg] is the head of the free list for that granule count, so its length is MAXOBJGRANULES + 1 = 129.
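obj_link in the code above is how the free list threads through the chunks themselves; in bdwgc it is essentially the first word of each free chunk. A minimal sketch of the fast-path pop:

# define obj_link(p) (*(void **)(p))   /* next pointer stored in the chunk itself */

/* Pop the head chunk off a free list; *opp is the list head. */
static void * pop_free(void **opp)
{
    void *op = *opp;
    if (op != NULL) {
        *opp = obj_link(op);   /* head = head->next */
        obj_link(op) = 0;      /* erase the link word before handing it out */
    }
    return op;
}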
- Otherwise, take the general allocation path; the core implementation is GC_INNER ptr_t GC_alloc_large(size_t lb, int k, unsigned flags)
- Large-object allocation also rounds the requested size up to a multiple of GRANULE_BYTES, so this rounding wastes fewer than 16 bytes.
- flags = 0
- GC_allochblk performs the actual allocation:
- allocate the block
- allocate the block's header
- build the free links: step through the block at intervals of the object size, storing each chunk's address into the previous chunk, forming a linked list (see the sketch below).
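A minimal sketch of that last step, modeled on the idea behind GC_build_fl (names and exact layout are illustrative; assumes sz is a multiple of GRANULE_BYTES and at least sizeof(void *)):

/* Thread a fresh 4 KB block into a free list of sz-byte chunks. */
/* Each chunk's first word holds the address of the next chunk.  */
static void * build_free_list(struct hblk *h, size_t sz)
{
    char *p    = (char *)h;
    char *last = (char *)h + (HBLKSIZE / sz - 1) * sz; /* last chunk that fits */
    for (; p < last; p += sz)
        *(void **)p = p + sz;    /* link each chunk to the one after it */
    *(void **)p = NULL;          /* the final chunk terminates the list */
    return (void *)h;            /* head of the new free list */
}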
Free-chunk management
- Two approaches encountered so far:
- The IL2CPP approach: record the address of the next free chunk at the start of each free chunk.
- The Unity approach: because the block size is fixed and the granule size GRANULE is fixed, the number of bits needed to mark chunks free is also fixed; a fixed-size area at the start of each block records free/used status as a bitmap.
- Compared with approach one: forming a linked list incurs a cost every time a chunk enters the free list:
- record the next chunk's address
- erase the contents, because the collector scans memory, and erasing is needed so that as much as possible can be freed
Also, when an entire block becomes free, its chunks must be removed from the free list by traversal before the block can be used elsewhere. Approach two carries a memory overhead, but the ratio is under 1/(16*8) = 1/128, which is acceptable; marking a chunk free is just a bit flip and very fast. Approach two seems the better design (see the sketch below).
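A minimal sketch of approach two, assuming 4 KB blocks and 16-byte granules (256 granules, so the bitmap costs 256 bits = 32 bytes per block, i.e. 1/128 of it):

#include <stdint.h>

#define BLOCK_SIZE 4096
#define GRANULE    16
#define NGRANULES  (BLOCK_SIZE / GRANULE)   /* 256 granules per block */

typedef struct {
    uint64_t free_bits[NGRANULES / 64];     /* bit i set => granule i is free */
    /* ... payload granules occupy the rest of the block ... */
} block_header;

/* Marking a chunk free is a single bit flip. */
static void mark_free(block_header *b, unsigned i)
{
    b->free_bits[i / 64] |= (uint64_t)1 << (i % 64);
}

static int is_free(const block_header *b, unsigned i)
{
    return (int)((b->free_bits[i / 64] >> (i % 64)) & 1);
}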
Summary
- A three-level structure, _GC_arrays -> bottom_index -> hblkhdr, manages all blocks. Each block is 4 KB; one bi manages 1024 block headers, covering 4 MB.
- Small objects share blocks by granule size; within a single block, all objects are the same size. Large objects get blocks of their own (larger than 4 KB).
- A purpose-built hash scheme maps a block address to its bi, and the bi then yields the hblkhdr.