可可西

FMallocBinned2内存分配器

FMallocBinned2是虚幻引擎实现的第二代装箱内存分配器,其重要的配置参数及成员变量如下:

// Limits on cached OS frees: an entry count and a byte budget that depends on the platform word size.
// NOTE(review): presumably consumed by the cached OS page allocator - its code is not shown here.
#define BINNED2_MAX_CACHED_OS_FREES (64)
#if PLATFORM_64BITS
    #define BINNED2_MAX_CACHED_OS_FREES_BYTE_LIMIT (64*1024*1024) // 64MB
#else
    #define BINNED2_MAX_CACHED_OS_FREES_BYTE_LIMIT (16*1024*1024) // 16MB
#endif

// Core layout constants of the binned allocator.
#define BINNED2_LARGE_ALLOC                    65536        // Alignment of OS-allocated pointer - pool-allocated pointers will have a non-aligned pointer
#define BINNED2_MINIMUM_ALIGNMENT_SHIFT        4            // Alignment of blocks, expressed as a shift
#define BINNED2_MINIMUM_ALIGNMENT            16            // Alignment of blocks
#define BINNED2_MAX_SMALL_POOL_SIZE            (32768-16)    // Maximum block size in GMallocBinned2SmallBlockSizes
#define BINNED2_SMALL_POOL_COUNT            45


// Default values of the tunables; they are mapped onto the GMallocBinned2* names further down.
#define DEFAULT_GMallocBinned2PerThreadCaches 1
#define DEFAULT_GMallocBinned2LockFreeCaches 0
#define DEFAULT_GMallocBinned2BundleCount 64
#define DEFAULT_GMallocBinned2AllocExtra 32
#define BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle 8

// AGGRESSIVE_MEMORY_SAVING must be defined by the build; it selects the default bundle size.
#if !defined(AGGRESSIVE_MEMORY_SAVING)
    #error "AGGRESSIVE_MEMORY_SAVING must be defined"
#endif
#if AGGRESSIVE_MEMORY_SAVING
    #define DEFAULT_GMallocBinned2BundleSize 8192
#else
    #define DEFAULT_GMallocBinned2BundleSize BINNED2_LARGE_ALLOC  // 64KB
#endif


// Switch between runtime-tunable globals (1) and compile-time constants (0) for the allocator tunables.
#define BINNED2_ALLOW_RUNTIME_TWEAKING 0
#if BINNED2_ALLOW_RUNTIME_TWEAKING
    // Declarations only. An `extern` declaration that carries an initializer is a definition, and a
    // definition in a header is emitted once per translation unit that includes it. The matching
    // definitions (initialized from the DEFAULT_* / BINNED2_MAX_* values) belong in exactly one .cpp file.
    extern CORE_API int32 GMallocBinned2PerThreadCaches;
    extern CORE_API int32 GMallocBinned2BundleSize;
    extern CORE_API int32 GMallocBinned2BundleCount;
    extern CORE_API int32 GMallocBinned2MaxBundlesBeforeRecycle;
    extern CORE_API int32 GMallocBinned2AllocExtra;
#else
    // Tweaking disabled: the tunables collapse to their compile-time defaults.
    #define GMallocBinned2PerThreadCaches DEFAULT_GMallocBinned2PerThreadCaches  // 1
    #define GMallocBinned2BundleSize DEFAULT_GMallocBinned2BundleSize  // 64KB (8192 when AGGRESSIVE_MEMORY_SAVING is non-zero)
    #define GMallocBinned2BundleCount DEFAULT_GMallocBinned2BundleCount  // 64
    #define GMallocBinned2MaxBundlesBeforeRecycle BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle  // 8
    #define GMallocBinned2AllocExtra DEFAULT_GMallocBinned2AllocExtra  // 32
#endif

// ... ... 

// Block sizes are based around getting the maximum amount of allocations per pool, with as little alignment waste as possible.
// Block sizes should be close to even divisors of the system page size, and well distributed.
// They must be 16-byte aligned as well.
static uint16 SmallBlockSizes[] =  // 45 block-size classes in total; a request is served from the smallest class that is large enough
{
    16, 32, 48, 64, 80, 96, 112, 128,
    160, 192, 224, 256, 288, 320, 384, 448,
    512, 576, 640, 704, 768, 896, 1024 - 16, 1168,
    1360, 1632, 2048 - 16, 2336, 2720, 3264, 4096 - 16, 4368,
    4672, 5040, 5456, 5952, 6544 - 16, 7280, 8192 - 16, 9360,
    10912, 13104, 16384 - 16, 21840, 32768 - 16 
};// An FFreeBlock header describes the blocks and itself sits at the head of a block, taking sizeof(FFreeBlock) (16 per the original notes). The largest class is 32768-16 so that two of them still fit in a 64KB page.

// ... ...

class CORE_API FMallocBinned2 final : public FMalloc
{

private:
    // ... ...
    FPtrToPoolMapping PtrToPoolMapping;  // Parameters for mapping a pointer to its pool hash bucket; set up through Init() in the constructor

    // Pool tables for different pool sizes
    FPoolTable SmallPoolTables[BINNED2_SMALL_POOL_COUNT]; // One table per block-size class (BINNED2_SMALL_POOL_COUNT is 45); all pools in one table share the same block size

    PoolHashBucket* HashBuckets;  // Hash bucket array; allocated from the OS and default-constructed in the constructor
    PoolHashBucket* HashBucketFreeList; // Starts as nullptr. NOTE(review): the original notes describe this as the buckets used when a key misses in HashBuckets - confirm against the lookup code
    uint64 NumPoolsPerPage; // Number of FPoolInfo entries per page: PageSize / sizeof(FPoolInfo), i.e. 65536 / 32 = 2048 with the sizes quoted in these notes
    // ... ...
    
    FCriticalSection Mutex; // Critical section taken through FScopeLock around the shared pool state
    
    // ... ...

public:
    // ... ...
    static uint16 SmallBlockSizesReversed[BINNED2_SMALL_POOL_COUNT]; // this is reversed to get the smallest elements on our main cache line // SmallBlockSizes in reverse order
    static FMallocBinned2* MallocBinned2; // The current Binned2 allocator instance (set to `this` in the constructor)
    static uint32 Binned2TlsSlot; // TLS slot id for the per-thread caches; 0 means not allocated yet. All threads share this one slot
    static uint32 PageSize;  // Constants.BinnedPageSize (64KB in the stats dumps quoted in these notes)
    static uint32 OsAllocationGranularity;  // Constants.BinnedAllocationGranularity (4096 in the stats dumps), falling back to PageSize when it is 0

    // Mapping of sizes to small table indices.
    // The table has 2048 entries holding 0,0,1,2,3,4,5,6,7,8,8, ... ,44,44,44: entry (Size >> 4) is the index
    // into SmallPoolTables of the smallest block-size class that fits. It is generated once in the constructor
    // so that Malloc can go straight from the requested size to the matching pool table.
    static uint8 MemSizeToIndex[1 + (BINNED2_MAX_SMALL_POOL_SIZE >> BINNED2_MINIMUM_ALIGNMENT_SHIFT)];

    // ... ...
};

 

FMallocBinned2内存分配器的初始化

// 在其构造函数中

根据所在平台硬件和操作系统,来设置内存分配器的相关参数

/**
 * Constructs the (single) FMallocBinned2 instance.
 *
 * Reads the platform memory constants, validates the block-size table, fills the per-class pool tables and the
 * size -> pool-index lookup table, allocates the pool hash buckets from the OS and publishes the instance
 * through MallocBinned2 / GFixedMallocLocationPtr.
 */
FMallocBinned2::FMallocBinned2()
    : HashBucketFreeList(nullptr)
{
    static bool bOnce = false;
    check(!bOnce); // this is now a singleton-like thing and you cannot make multiple copies
    bOnce = true;

    // Reverse the pool sizes for cache coherency: SmallBlockSizesReversed is SmallBlockSizes back to front.
    // SmallBlockSizes is not modified anywhere in this constructor, so doing this once is sufficient.
    for (uint32 Index = 0; Index != BINNED2_SMALL_POOL_COUNT; ++Index)
    {
        uint32 Partner = BINNED2_SMALL_POOL_COUNT - Index - 1;
        SmallBlockSizesReversed[Index] = SmallBlockSizes[Partner];
    }
    
    // Configure the allocator from the platform / OS memory constants.
    FGenericPlatformMemoryConstants Constants = FPlatformMemory::GetConstants();
    PageSize = Constants.BinnedPageSize;
    OsAllocationGranularity = Constants.BinnedAllocationGranularity ? Constants.BinnedAllocationGranularity : PageSize;
    NumPoolsPerPage = PageSize / sizeof(FPoolInfo);
    PtrToPoolMapping.Init(PageSize, NumPoolsPerPage, Constants.AddressLimit); // set up the pointer -> hash bucket parameters

    // Sanity checks on the constants and on the block-size table.
    checkf(FMath::IsPowerOfTwo(PageSize), TEXT("OS page size must be a power of two"));
    checkf(FMath::IsPowerOfTwo(Constants.AddressLimit), TEXT("OS address limit must be a power of two"));
    checkf(Constants.AddressLimit > PageSize, TEXT("OS address limit must be greater than the page size")); // Check to catch 32 bit overflow in AddressLimit
    // The last entry of the strictly increasing table is the largest block size.
    checkf(SmallBlockSizes[BINNED2_SMALL_POOL_COUNT - 1] == BINNED2_MAX_SMALL_POOL_SIZE, TEXT("BINNED2_MAX_SMALL_POOL_SIZE must equal the largest block size"));
    checkf(PageSize % BINNED2_LARGE_ALLOC == 0, TEXT("OS page size must be a multiple of BINNED2_LARGE_ALLOC"));
    checkf(sizeof(FMallocBinned2::FFreeBlock) <= SmallBlockSizes[0], TEXT("Pool header must be able to fit into the smallest block"));
    static_assert(UE_ARRAY_COUNT(SmallBlockSizes) == BINNED2_SMALL_POOL_COUNT, "Small block size array size must match BINNED2_SMALL_POOL_COUNT");
    static_assert(UE_ARRAY_COUNT(SmallBlockSizes) <= 256, "Small block size array size must fit in a byte");
    static_assert(sizeof(FFreeBlock) <= BINNED2_MINIMUM_ALIGNMENT, "Free block struct must be small enough to fit into a block.");

    // Init pool tables: copy each of the 45 block sizes into its pool table.
    for (uint32 Index = 0; Index != BINNED2_SMALL_POOL_COUNT; ++Index)
    {
        checkf(Index == 0 || SmallBlockSizes[Index - 1] < SmallBlockSizes[Index], TEXT("Small block sizes must be strictly increasing"));
        checkf(SmallBlockSizes[Index] <= PageSize, TEXT("Small block size must be small enough to fit into a page"));
        checkf(SmallBlockSizes[Index] % BINNED2_MINIMUM_ALIGNMENT == 0, TEXT("Small block size must be a multiple of BINNED2_MINIMUM_ALIGNMENT"));

        SmallPoolTables[Index].BlockSize = SmallBlockSizes[Index];
    }

    // Set up pool mappings. The table has 2048 entries holding 0,0,1,2,3,4,5,6,7,8,8, ... ,44,44,44;
    // entry (Size >> BINNED2_MINIMUM_ALIGNMENT_SHIFT) is the SmallPoolTables index for that size.
    uint8* IndexEntry = MemSizeToIndex;
    uint32  PoolIndex  = 0;
    for (uint32 Index = 0; Index != 1 + (BINNED2_MAX_SMALL_POOL_SIZE >> BINNED2_MINIMUM_ALIGNMENT_SHIFT); ++Index)
    {
        
        uint32 BlockSize = Index << BINNED2_MINIMUM_ALIGNMENT_SHIFT; // inverse of int32 Index = int32((Size >> BINNED2_MINIMUM_ALIGNMENT_SHIFT));
        // Advance to the first block-size class that can hold BlockSize bytes.
        while (SmallBlockSizes[PoolIndex] < BlockSize)
        {
            ++PoolIndex;
            check(PoolIndex != BINNED2_SMALL_POOL_COUNT);
        }
        check(PoolIndex < 256);
        *IndexEntry++ = uint8(PoolIndex);
    }

    uint64 MaxHashBuckets = PtrToPoolMapping.GetMaxHashBuckets();

    {
        LLM_PLATFORM_SCOPE(ELLMTag::FMalloc);
        // Allocate the pool hash bucket array directly from the OS, rounded up to the OS allocation granularity.
        HashBuckets = (PoolHashBucket*)FPlatformMemory::BinnedAllocFromOS(Align(MaxHashBuckets * sizeof(PoolHashBucket), OsAllocationGranularity));
#if BINNED2_ALLOCATOR_STATS
        Binned2HashMemory += Align(MaxHashBuckets * sizeof(PoolHashBucket), OsAllocationGranularity);
#endif
    }

    DefaultConstructItems<PoolHashBucket>(HashBuckets, MaxHashBuckets); // default-construct every bucket in the raw OS memory
    MallocBinned2 = this;
    GFixedMallocLocationPtr = (FMalloc**)(&MallocBinned2);
}

具体数值如下:

 

FPoolTable 

// 同一Block大小内存池表

/** Pool table: all pools of one block size. sizeof(FPoolTable) is 24 according to the original notes. */
struct FPoolTable
{
    FPoolList ActivePools;  // List of pools that still have free blocks
    FPoolList ExhaustedPools; // List of pools that are full (nothing left to allocate)
    uint32    BlockSize;  // Block size shared by every pool in this table

    // ... ...
};

 

FPoolList 

// 内存池链表

// Linked list of pools. sizeof(FPoolList) is 8 according to the original notes (a single pointer).
struct FPoolList
{
    // ... ...

private:
    FPoolInfo* Front; // First pool in the list
};

 

FPoolInfo 

// 内存池

FPoolInfo中的所有Block为空闲时,才释放其占用的内存页

// A memory pool. sizeof(FPoolInfo) is 32 according to the original notes.
struct FMallocBinned2::FPoolInfo  
{
    // ... ...
 // Number of allocated blocks; when it reaches 0 the whole pool and the memory it manages are released
 public:    uint16      Taken;          // Number of allocated elements in this pool, when counts down to zero can free the entire pool    
 public:    ECanary        Canary;    // See ECanary
 // Number of bytes allocated
 private:    uint32      AllocSize;      // Number of bytes allocated
 // Binned mode: head of this pool's list of free blocks. Non-binned mode: describes the allocation made directly by the OS.
 public:    FFreeBlock* FirstFreeBlock; // Pointer to first free memory in this pool or the OS Allocation Size in bytes if this allocation is not binned
 // Next pool in the list
 public:    FPoolInfo*  Next;           // Pointer to next pool
 public:    FPoolInfo** PtrToPrevNext;  // Pointer to whichever pointer points to this pool
 
    // ... ...
};

 

FFreeBlock 

// 内存块

// Header describing a run of free blocks. sizeof(FFreeBlock) is 16 according to the original notes.
struct FFreeBlock
{
    // ... ...
    uint16 BlockSize;                // Size of the blocks that this list points to  // i.e. the BlockSize of the owning pool
    uint8 PoolIndex;                // Index of this pool  // i.e. the index of the owning pool
    uint8 Canary;                    // Constant value of 0xe3  // fixed marker used to detect out-of-bounds writes / corruption of this header
    uint32 NumFreeBlocks;          // Number of consecutive free blocks here, at least 1.  // free block count
    void*  NextFreeBlock;          // Next free block in another pool // when a block is freed, an FFreeBlock is built at its address and pushed onto the front of the pool's FirstFreeBlock list
};

 

PoolHashBucket 

// 内存池哈希桶

// Parameters used to map a pointer to its pool hash bucket. sizeof(FPtrToPoolMapping) is 32 according to the original notes.
struct FPtrToPoolMapping
{
    // ... ...

private:
    /** Shift to apply to a pointer to get the reference from the indirect tables */
    uint64 PtrToPoolPageBitShift;

    /** Shift required to get required hash table key. */
    uint64 HashKeyShift;

    /** Used to mask off the bits that have been used to lookup the indirect table */
    uint64 PoolMask;

    // PageSize dependent constants
    uint64 MaxHashBuckets;
};

/** Pool hash bucket: holds the pools for one key derived from a memory address. sizeof(PoolHashBucket) is 32 according to the original notes. */
struct FMallocBinned2::PoolHashBucket
{
    UPTRINT         BucketIndex; // Hash key; per the original notes Key = Ptr >> HashKeyShift (a 27-bit shift on the author's setup - TODO confirm)
    FPoolInfo*      FirstPool; // Start of the FPoolInfo storage for this bucket (a chunk of PageSize bytes, 64KB, per the original notes)
    PoolHashBucket* Prev; // Previous hash bucket
    PoolHashBucket* Next; // Next hash bucket
    
    // ... ...
};

 

从内存池Pool中分配内存给Block

struct FMallocBinned2::FPoolInfo
{
    // ... ...
    /** Hands out one block from this pool. The pool must have a free regular block. */
    void* AllocateRegularBlock()
    {
        check(HasFreeRegularBlock()); // the pool must still have a free block
        ++Taken; // one more block is now allocated from this pool
        void* Result = FirstFreeBlock->AllocateRegularBlock(); // carve the block out of the first free run
        // ExhaustPoolIfNecessary is not shown here. The original notes say it moves a pool without free blocks
        // to the ExhaustedPools list. NOTE(review): MallocExternalSmall also links exhausted pools itself - confirm which one does the move.
        ExhaustPoolIfNecessary();
        return Result;
    }
    
    // ... ...
};


struct FFreeBlock
{
    // ... ...
    /** Takes one block out of this free run and returns its address. */
    FORCEINLINE void* AllocateRegularBlock()
    {
        --NumFreeBlocks;  // one fewer free block in this run
        if (IsAligned(this, BINNED2_LARGE_ALLOC)) // BINNED2_LARGE_ALLOC is 64KB: is this header at a 64KB-aligned address?
        {
            // Addresses are measured back from (this + 64KB); as NumFreeBlocks shrinks, successive calls return increasing addresses (front to back).
            return (uint8*)this + BINNED2_LARGE_ALLOC - (NumFreeBlocks + 1) * BlockSize;
        }
        // Otherwise index from this header; successive calls return decreasing addresses (back to front).
        return (uint8*)this + (NumFreeBlocks)* BlockSize;
    }
    
    // ... ...
};

 

TLS Cache机制

与FMallocBinned内存分配器相比,FMallocBinned2最大的改进:

引入了TLS(Thread Local Storage线程局部存储。线程可以有自己的存储空间,以键值对形式存储一些自己独有的变量)缓存,来优化内存的分配速度

各线程会记录被free的地址,把它们保存到一个列表中,当这个线程再有malloc请求来时,如果BlockSize匹配,则直接返回之前缓存的free地址

这样就不需要再访问FPoolTable SmallPoolTables[BINNED2_SMALL_POOL_COUNT]了,因此也不用再加互斥锁了

 

各个线程在启动时,通过调用FMemory::SetupTLSCachesOnCurrentThread()创建自己的TLS数据FPerThreadFreeBlockLists

各个线程在创建FPerThreadFreeBlockLists后,都会把它添加到Binned2的RegisteredFreeBlockLists数组中记录。代码如下:

/**
 * Creates the calling thread's per-thread free block lists.
 * Does nothing when runtime tweaking is compiled out and per-thread caches are disabled.
 */
void FMallocBinned2::SetupTLSCachesOnCurrentThread()
{
    const bool bWantThreadCaches = BINNED2_ALLOW_RUNTIME_TWEAKING || GMallocBinned2PerThreadCaches;
    if (bWantThreadCaches)
    {
        // The slot id is shared by every thread, so it is only allocated while it is still 0.
        if (FMallocBinned2::Binned2TlsSlot == 0)
        {
            FMallocBinned2::Binned2TlsSlot = FPlatformTLS::AllocTlsSlot();
        }
        check(FMallocBinned2::Binned2TlsSlot);

        // Each thread then stores its own lists in that slot.
        FPerThreadFreeBlockLists::SetTLS();
    }
}


/**
 * Ensures the calling thread has an FPerThreadFreeBlockLists stored in the Binned2 TLS slot.
 * When the slot is still empty for this thread, the object is placement-constructed in memory taken directly
 * from the OS, stored in the slot and registered in the global list of per-thread free block lists.
 */
void FMallocBinned2::FPerThreadFreeBlockLists::SetTLS()
{
    check(FMallocBinned2::Binned2TlsSlot);
    FPerThreadFreeBlockLists* ThreadSingleton = (FPerThreadFreeBlockLists*)FPlatformTLS::GetTlsValue(FMallocBinned2::Binned2TlsSlot);
    if (!ThreadSingleton)
    {
        LLM_PLATFORM_SCOPE(ELLMTag::FMalloc);
        // Allocation size is rounded up to the OS allocation granularity.
        ThreadSingleton = new (FPlatformMemory::BinnedAllocFromOS(Align(sizeof(FPerThreadFreeBlockLists), FMallocBinned2::OsAllocationGranularity))) FPerThreadFreeBlockLists();
#if BINNED2_ALLOCATOR_STATS
        Binned2TLSMemory += Align(sizeof(FPerThreadFreeBlockLists), FMallocBinned2::OsAllocationGranularity);
#endif
        FPlatformTLS::SetTlsValue(FMallocBinned2::Binned2TlsSlot, ThreadSingleton);
        FMallocBinned2::Private::RegisterThreadFreeBlockLists(ThreadSingleton);
    }
}

/** Returns the array of registered per-thread free block lists, held in a function-local static. */
static TArray<FPerThreadFreeBlockLists*>& GetRegisteredFreeBlockLists()
{
    static TArray<FPerThreadFreeBlockLists*> RegisteredFreeBlockLists;
    return RegisteredFreeBlockLists;
}
/**
 * Appends a thread's free block lists to the registered array.
 * The append runs under the registration mutex; RecursionCounter is bumped around it in stats-validation builds.
 */
static void RegisterThreadFreeBlockLists( FPerThreadFreeBlockLists* FreeBlockLists )
{
    FScopeLock Lock(&GetFreeBlockListsRegistrationMutex());
#if BINNED2_ALLOCATOR_STATS_VALIDATION
    ++RecursionCounter;
#endif
    GetRegisteredFreeBlockLists().Add(FreeBlockLists);
#if BINNED2_ALLOCATOR_STATS_VALIDATION
    --RecursionCounter;
#endif
}

 

各线程调用FMemory::SetupTLSCachesOnCurrentThread()的情况:

 

每个线程都会有一份FPerThreadFreeBlockLists副本,其中FreeLists[]数组也通过BlockSize产生,有45个元素

每个元素类型为FFreeBlockList,包含FBundle PartialBundle链表(未装满的Bundle)和FBundle FullBundle链表(已装满的Bundle)

为了让Binned2不过多占用内存空间,给FBundle的FBundleNode节点数做了限制(不能超过GMallocBinned2BundleCount,即64个)及总容量不能大于GMallocBinned2BundleSize【64KB】(即:FBundleNode的Count * BlockSize < 64KB)

 

/** Per-thread cache of freed blocks, one free list per block-size class. */
struct FPerThreadFreeBlockLists
{
    // ... ...
private:
    FFreeBlockList FreeLists[BINNED2_SMALL_POOL_COUNT]; // BINNED2_SMALL_POOL_COUNT is 45: one entry per block-size class
};

// Free list for one block-size class. sizeof(FFreeBlockList) is 32 according to the original notes.
struct FFreeBlockList
{
    // ... ...
private:
    FBundle PartialBundle; // Bundle that is not full yet
    FBundle FullBundle; // Bundle that is already full
};

// A linked list of FBundleNode. sizeof(FBundle) is 16 according to the original notes.
struct FBundle
{
    // ... ...
    FBundleNode* Head;  // Head of the list
    uint32       Count; // NOTE(review): presumably the number of nodes in the list - confirm
};

// One node of a bundle. sizeof(FBundleNode) is 16 according to the original notes.
struct FBundleNode
{
    FBundleNode* NextNodeInCurrentBundle; // Next node within the current bundle
    union
    {
        FBundleNode* NextBundle; // Next bundle. Per the original notes, FFreeBlockList::PopBundles chains the bundles (PartialBundle->NextBundle = FullBundle) so that FreeBundles can walk and release them in one loop
        int32 Count; // Number of nodes in the current list
    };
};

注:FBundleNode*指向的是Block内存块区域,把Ptr指针转换成FBundleNode*后,其size为16字节,对其修改不会影响到其他内存空间

 

FGlobalRecycler 

// 用于缓存各个线程的FBundle FullBundle和FBundle PartialBundle链表的头指针

struct FGlobalRecycler  // sizeof(FGlobalRecycler) is 64*45 = 2880 according to the original notes
{

    // ... ...

private:
    struct FPaddedBundlePointer // sizeof(FPaddedBundlePointer) is 8*8 = 64 according to the original notes
    {
        FBundleNode* FreeBundles[BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle]; // BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle is 8. Per the original notes the slots are updated atomically via FPlatformAtomics::InterlockedCompareExchangePointer (that code is elided here)

        // ... ...
    };
    
    // ... ...
    
    MS_ALIGN(PLATFORM_CACHE_LINE_SIZE) FPaddedBundlePointer Bundles[BINNED2_SMALL_POOL_COUNT] GCC_ALIGN(PLATFORM_CACHE_LINE_SIZE); // BINNED2_SMALL_POOL_COUNT is 45: one padded slot array per block-size class
};

static FGlobalRecycler GGlobalRecycler;  // Each block-size class has room for 8 bundles. Per the original notes: PushBundle succeeds while a slot is free and fails otherwise; PopBundle fails and returns null when all slots are empty.

 

Free释放内存时TLS Cache的流程细节如下:

 

Free掉Ptr指针的内存占用  

/**
 * Frees the memory at Ptr.
 *
 * Pool-allocated pointers are pushed onto the calling thread's free list when one exists; whatever ends up in
 * BundlesToRecycle is returned to the pools under Mutex. OS allocations are looked up under Mutex and handed to
 * the cached OS page allocator. A null Ptr falls through both branches and does nothing.
 */
void FMallocBinned2::FreeExternal(void* Ptr)
{
    if (!IsOSAllocation(Ptr)) // was this block allocated directly by the OS?
    {
        // Binned mode: the block belongs to a pool
        check(Ptr); // null is 64k aligned so we should not be here
        FFreeBlock* BasePtr = GetPoolHeaderFromPointer(Ptr); // pool header (an FFreeBlock) for this pointer
        BasePtr->CanaryTest();
        uint32 BlockSize = BasePtr->BlockSize;
        uint32 PoolIndex = BasePtr->PoolIndex;

        FBundleNode* BundlesToRecycle = nullptr;
        FPerThreadFreeBlockLists* Lists = GMallocBinned2PerThreadCaches ? FPerThreadFreeBlockLists::Get() : nullptr;
        if (Lists)
        {
            // RecycleFullBundle is not shown here; per the original notes:
            // - a non-empty FullBundle for this class is offered to GGlobalRecycler for caching;
            // - when the recycler has no free slot for this class, the FullBundle head is returned in BundlesToRecycle;
            // - when the recycler has a free slot, the bundle is stored there and null is returned.
            BundlesToRecycle = Lists->RecycleFullBundle(BasePtr->PoolIndex); 
            bool bPushed = Lists->Free(Ptr, PoolIndex, BlockSize); // push Ptr onto the head of this class's PartialBundle (per the original notes)
            check(bPushed);
#if BINNED2_ALLOCATOR_STATS
            Lists->AllocatedMemory -= BlockSize;
#endif
        }
        else
        {
            // No per-thread lists: recycle Ptr itself as a single-node bundle.
            BundlesToRecycle = (FBundleNode*)Ptr;
            BundlesToRecycle->NextNodeInCurrentBundle = nullptr;
        }
        if (BundlesToRecycle) // non-null when the recycler had no room for the full bundle, or when there are no per-thread lists
        {
            BundlesToRecycle->NextBundle = nullptr;
            FScopeLock Lock(&Mutex);
            Private::FreeBundles(*this, BundlesToRecycle, BlockSize, PoolIndex); // return the blocks in BundlesToRecycle to their pools
#if BINNED2_ALLOCATOR_STATS
            if (!Lists)
            {
                // lists track their own stat track them instead in the global stat if we don't have lists
                AllocatedSmallPoolMemory -= ((int64)(BlockSize));
            }
#endif
        }
    }
    else if (Ptr)
    {
        // Non-binned mode: the block was allocated directly by the OS
        FScopeLock Lock(&Mutex);
        FPoolInfo* Pool = Private::FindPoolInfo(*this, Ptr);
        if (!Pool)
        {
            UE_LOG(LogMemory, Fatal, TEXT("FMallocBinned2 Attempt to free an unrecognized block %p"), Ptr);
        }
        UPTRINT PoolOsBytes = Pool->GetOsAllocatedBytes();
        SIZE_T PoolOSRequestedBytes = Pool->GetOSRequestedBytes();

#if BINNED2_ALLOCATOR_STATS
        AllocatedLargePoolMemory -= ((int64)PoolOSRequestedBytes);
        AllocatedLargePoolMemoryWAlignment -= ((int64)PoolOsBytes);
#endif

        checkf(PoolOSRequestedBytes <= PoolOsBytes, TEXT("FMallocBinned2::FreeExternal %d %d"), int32(PoolOSRequestedBytes), int32(PoolOsBytes));
        Pool->SetCanary(FPoolInfo::ECanary::Unassigned, true, false);
        // Free an OS allocation.
        CachedOSPageAllocator.Free(Ptr, PoolOsBytes);
    }
}


/**
 * Returns every block referenced by a chain of bundles to its owning pool.
 *
 * @param Allocator         Allocator whose pool tables and OS page cache are updated.
 * @param BundlesToRecycle  Head of the bundle chain (linked through NextBundle); each bundle is a list of nodes
 *                          linked through NextNodeInCurrentBundle.
 * @param InBlockSize       Block size written into each rebuilt free block header.
 * @param InPoolIndex       Index of the pool table the blocks belong to.
 *
 * Both callers visible in this file hold Allocator.Mutex while calling this.
 */
static void FreeBundles(FMallocBinned2& Allocator, FBundleNode* BundlesToRecycle, uint32 InBlockSize, uint32 InPoolIndex)
{
    FPoolTable& Table = Allocator.SmallPoolTables[InPoolIndex];

    // Walk the chain of bundles and release the memory each node points at
    FBundleNode* Bundle = BundlesToRecycle;
    while (Bundle)
    {
        // Read the links before the node memory is overwritten with an FFreeBlock header below.
        FBundleNode* NextBundle = Bundle->NextBundle;

        FBundleNode* Node = Bundle;
        do
        {
            FBundleNode* NextNode = Node->NextNodeInCurrentBundle;
            FPoolInfo*   NodePool = FindPoolInfo(Allocator, Node);
            if (!NodePool)
            {
                UE_LOG(LogMemory, Fatal, TEXT("FMallocBinned2 Attempt to free an unrecognized small block %p"), Node);
            }
            NodePool->CheckCanary(FPoolInfo::ECanary::FirstFreeBlockIsPtr);

            // If this pool was exhausted, move to available list.
            if (!NodePool->FirstFreeBlock) // no free block: the pool is currently on the ExhaustedPools list
            {
                Table.ActivePools.LinkToFront(NodePool); // move the pool from ExhaustedPools to the front of ActivePools
            }
            else
            {
                check(NodePool->FirstFreeBlock->Canary == 0 || NodePool->FirstFreeBlock->IsCanaryOk());
            }

            // Free a pooled allocation: build a new FFreeBlock at the node's address with NumFreeBlocks = 1 and push it onto the front of NodePool->FirstFreeBlock
            FFreeBlock* Free = (FFreeBlock*)Node;
            Free->NumFreeBlocks = 1;
            Free->NextFreeBlock = NodePool->FirstFreeBlock;
            Free->BlockSize     = InBlockSize;
            Free->Canary = FFreeBlock::CANARY_VALUE;
            Free->PoolIndex = InPoolIndex;
            NodePool->FirstFreeBlock   = Free;

            // Free this pool.
            check(NodePool->Taken >= 1);
            if (--NodePool->Taken == 0) // every block of this pool is free again
            {
                NodePool->SetCanary(FPoolInfo::ECanary::Unassigned, true, false);
                FFreeBlock* BasePtrOfNode = GetPoolHeaderFromPointer(Node);

                // Free the OS memory.
                NodePool->Unlink(); // detach the pool from its FPoolList
                Allocator.CachedOSPageAllocator.Free(BasePtrOfNode, Allocator.PageSize); // give the pool's whole page back
#if BINNED2_ALLOCATOR_STATS
                AllocatedOSSmallPoolMemory -= ((int64)Allocator.PageSize);
#endif
            }

            Node = NextNode; // continue with the next node of this bundle
        } while (Node);

        Bundle = NextBundle;
    }
}

 

Malloc申请内存时TLS Cache的流程细节如下:

 

Malloc分配内存

/**
 * Routes an allocation request to the pooled or the OS-backed path.
 * UseSmallAlloc decides: Size <= BINNED2_MAX_SMALL_POOL_SIZE & Alignment <= BINNED2_MINIMUM_ALIGNMENT.
 */
FORCEINLINE void* MallocSelect(SIZE_T Size, uint32 Alignment)
{
    if (!UseSmallAlloc(Size, Alignment))
    {
        // Too large or over-aligned: allocated directly by the OS and recorded in the HashBuckets table.
        return MallocExternalLarge(Size, Alignment);
    }

    // Small request: served from the memory pools.
    return MallocExternalSmall(Size, Alignment);
}


/**
 * Allocates a small block of at least Size bytes from the pools.
 *
 * Tries the calling thread's cache first (no lock). Otherwise takes Mutex, allocates from the front active pool
 * of the matching table (creating a pool when none is active) and, when per-thread lists exist, pre-fills the
 * thread cache with up to GMallocBinned2AllocExtra additional blocks.
 */
void* FMallocBinned2::MallocExternalSmall(SIZE_T Size, uint32 Alignment)
{
    uint32 PoolIndex = BoundSizeToPoolIndex(Size); // index into SmallPoolTables for this size
    
    // Prefer a cached block from the thread-local cache
    FPerThreadFreeBlockLists* Lists = GMallocBinned2PerThreadCaches ? FPerThreadFreeBlockLists::Get() : nullptr;
    if (Lists)
    {
        // ObtainRecycledPartial is not shown here; per the original notes, when this class's PartialBundle is empty it pops a
        // bundle for this class from GGlobalRecycler into PartialBundle, then reports whether PartialBundle is non-empty
        if (Lists->ObtainRecycledPartial(PoolIndex)) 
        {
            if (void* Result = Lists->Malloc(PoolIndex)) // pop one node from the head of this class's PartialBundle (per the original notes)
            {
#if BINNED2_ALLOCATOR_STATS
                uint32 BlockSize = PoolIndexToBlockSize(PoolIndex);
                Lists->AllocatedMemory += BlockSize;
#endif
                return Result; // the thread cache had a block of this class: hand it out without locking
            }
        }
    }

    FScopeLock Lock(&Mutex); // take the allocator mutex; released automatically at scope exit

    // Allocate from small object pool.
    FPoolTable& Table = SmallPoolTables[PoolIndex]; // pool table for this block-size class

    FPoolInfo* Pool;
    if (!Table.ActivePools.IsEmpty()) // the table has at least one active pool
    {
        Pool = &Table.ActivePools.GetFrontPool();  // use the first active pool
    }
    else
    {
        Pool = &Table.ActivePools.PushNewPoolToFront(*this, Table.BlockSize, PoolIndex); // create a new pool
    }

    void* Result = Pool->AllocateRegularBlock(); // take one block from the pool (asserts that the pool has a free block)
#if BINNED2_ALLOCATOR_STATS
    AllocatedSmallPoolMemory += PoolIndexToBlockSize(PoolIndex);
#endif // BINNED2_ALLOCATOR_STATS
    if (GMallocBinned2AllocExtra) // GMallocBinned2AllocExtra defaults to 32; this section pre-fills the thread cache
    {
        if (Lists)
        {
            // prefill the free list with some allocations so we are less likely to hit this slow path with the mutex 
            for (int32 Index = 0; Index < GMallocBinned2AllocExtra && Pool->HasFreeRegularBlock(); Index++)
            {
                if (!Lists->Free(Result, PoolIndex, Table.BlockSize)) // on success Result has been pushed onto the thread cache (per the original notes)
                {
                    break;
                }
                Result = Pool->AllocateRegularBlock(); // take another block from the pool to return (or cache next)
            }
        }
    }
    if (!Pool->HasFreeRegularBlock()) // the pool has no free block left
    {
        Table.ExhaustedPools.LinkToFront(Pool); // move it to the ExhaustedPools list
    }

    return Result;
}

 

在GC Mark时调用FMemory::Trim(bool bTrimThreadCaches) → FMallocBinned2::FlushCurrentThreadCache()来回收TLS Cache缓存住的内存

/**
 * Returns every block cached in the calling thread's free lists to the pools.
 * Measures how long it waited for Mutex and how long the whole trim took, and logs a warning (outside the lock)
 * when either exceeds GMallocBinned2FlushThreadCacheMaxWaitTime.
 */
void FMallocBinned2::FlushCurrentThreadCache()
{
    double StartTimeInner = FPlatformTime::Seconds();
    QUICK_SCOPE_CYCLE_COUNTER(STAT_FMallocBinned2_FlushCurrentThreadCache);
    FPerThreadFreeBlockLists* Lists = FPerThreadFreeBlockLists::Get(); // the calling thread's free block lists (null when it has none)

    double WaitForMutexTime = 0.0;
    double WaitForMutexAndTrimTime = 0.0;

    if (Lists)
    {
        FScopeLock Lock(&Mutex);
        WaitForMutexTime = FPlatformTime::Seconds() - StartTimeInner;
        for (int32 PoolIndex = 0; PoolIndex != BINNED2_SMALL_POOL_COUNT; ++PoolIndex) // visit all 45 block-size classes
        {
            FBundleNode* Bundles = Lists->PopBundles(PoolIndex); // per the original notes this chains this class's bundles (PartialBundle->NextBundle = FullBundle) so FreeBundles can walk them in one loop
            if (Bundles)
            {
                Private::FreeBundles(*this, Bundles, PoolIndexToBlockSize(PoolIndex), PoolIndex); // release the memory cached in this class's PartialBundle and FullBundle
            }
        }
        WaitForMutexAndTrimTime = FPlatformTime::Seconds() - StartTimeInner;
    }

    // These logs must happen outside the above mutex to avoid deadlocks
    if (WaitForMutexTime > GMallocBinned2FlushThreadCacheMaxWaitTime)
    {
        UE_LOG(LogMemory, Warning, TEXT("FMallocBinned2 took %6.2fms to wait for mutex for trim."), WaitForMutexTime * 1000.0f);
    }
    if (WaitForMutexAndTrimTime > GMallocBinned2FlushThreadCacheMaxWaitTime)
    {
        UE_LOG(LogMemory, Warning, TEXT("FMallocBinned2 took %6.2fms to wait for mutex AND trim."), WaitForMutexAndTrimTime * 1000.0f);
    }
}

注:Private::FreeBundles并没有释放GGlobalRecycler.FreeBundles.Bundles当前PoolIndex档位的所有共8个FBundleNode缓存

      仅仅是释放了当前线程TLS的FPerThreadFreeBlockLists链表PartialBundle、FullBundle缓存住的内存,不会使已交换到GGlobalRecycler.FreeBundles.Bundles中的Bundle缓存住的内存释放

 

调用堆栈如下:

UE4Editor-Core-Win64-Debug.dll!FMallocBinned2::FlushCurrentThreadCache() Line 1070
UE4Editor-Core-Win64-Debug.dll!UE4Function_Private::TFunctionRefCaller<<lambda_7e8e4a5f748f352266e45175872bd5ce>,void __cdecl(enum ENamedThreads::Type)>::Call(void * Obj=0x000000afec167308, ENamedThreads::Type & <Params_0>=GameThread_Local) Line 549
UE4Editor-Core-Win64-Debug.dll!UE4Function_Private::TFunctionRefBase<UE4Function_Private::TFunctionStorage<0>,void __cdecl(enum ENamedThreads::Type)>::operator()(ENamedThreads::Type <Params_0>=GameThread_Local) Line 677
UE4Editor-Core-Win64-Debug.dll!FTaskGraphInterface::BroadcastSlow_OnlyUseForSpecialPurposes(bool bDoTaskThreads=false, bool bDoBackgroundThreads=false, TFunction<void __cdecl(enum ENamedThreads::Type)> & Callback={...}) Line 2047
UE4Editor-Core-Win64-Debug.dll!FMallocBinned2::Trim(bool bTrimThreadCaches=true) Line 1114
UE4Editor-Core-Win64-Debug.dll!FMemory::Trim(bool bTrimThreadCaches=true) Line 529
UE4Editor-CoreUObject-Win64-Debug.dll!CollectGarbageInternal(EObjectFlags KeepFlags=RF_NoFlags, bool bPerformFullPurge=false) Line 2066
UE4Editor-CoreUObject-Win64-Debug.dll!TryCollectGarbage(EObjectFlags KeepFlags=RF_NoFlags, bool bPerformFullPurge=false) Line 2177
UE4Editor-Engine-Win64-Debug.dll!UEngine::PerformGarbageCollectionAndCleanupActors() Line 1427
UE4Editor-Engine-Win64-Debug.dll!UEngine::ConditionalCollectGarbage() Line 1401  // GC Mark
UE4Editor-Engine-Win64-Debug.dll!UWorld::Tick(ELevelTick TickType=LEVELTICK_All, float DeltaSeconds=0.0134023018) Line 1657
UE4Editor-Engine-Win64-Debug.dll!UGameEngine::Tick(float DeltaSeconds=0.0134023018, bool bIdleMode=false) Line 1794
UE4Editor-Win64-Debug.exe!FEngineLoop::Tick() Line 4836
UE4Editor-Win64-Debug.exe!EngineTick() Line 63
UE4Editor-Win64-Debug.exe!GuardedMain(const wchar_t * CmdLine=0x000002964c338500) Line 172
UE4Editor-Win64-Debug.exe!WinMain(HINSTANCE__ * hInInstance=0x00007ff768840000, HINSTANCE__ * hPrevInstance=0x0000000000000000, char * __formal=0x0000029648594866, int nCmdShow=10) Line 257 

 

Android(小米10)DumpPlatformAndAllocatorStats统计信息:

[2021.05.27-15.59.49:152][ 66]LogMemory: Platform Memory Stats for Android
[2021.05.27-15.59.49:152][ 66]LogMemory: Process Physical Memory: 1207.08 MB used, 1254.11 MB peak
[2021.05.27-15.59.49:152][ 66]LogMemory: Process Virtual Memory: 8984.62 MB used, 9077.56 MB peak
[2021.05.27-15.59.49:152][ 66]LogMemory: Physical Memory: 5445.78 MB used,  2177.80 MB free, 7623.57 MB total
[2021.05.27-15.59.49:153][ 66]LogMemory: Virtual Memory: 608.03 MB used,  1439.97 MB free, 2048.00 MB total
[2021.05.27-15.59.49:153][ 66]LogMemory: PageSize: 4096, BinnedPageSize: 65536, BinnedAllocationGranularity: 4096, AddressLimit: 8589934592
[2021.05.27-15.59.49:154][ 66]FMallocBinned2 Mem report
[2021.05.27-15.59.49:154][ 66]Constants.BinnedPageSize = 65536
[2021.05.27-15.59.49:154][ 66]Constants.BinnedAllocationGranularity = 4096
[2021.05.27-15.59.49:154][ 66]Small Pool Allocations: 388.752121mb  (including block size padding)
[2021.05.27-15.59.49:155][ 66]Small Pool OS Allocated: 419.000000mb
[2021.05.27-15.59.49:155][ 66]Large Pool Requested Allocations: 204.530167mb
[2021.05.27-15.59.49:155][ 66]Large Pool OS Allocated: 205.332031mb
[2021.05.27-15.59.49:155][ 66]Requested Allocations: 204.530167mb
[2021.05.27-15.59.49:155][ 66]OS Allocated: 205.332031mb
[2021.05.27-15.59.49:155][ 66]PoolInfo: 1.687500mb
[2021.05.27-15.59.49:155][ 66]Hash: 0.003906mb
[2021.05.27-15.59.49:156][ 66]TLS: 0.066406mb
[2021.05.27-15.59.49:156][ 66]Total allocated from OS: 626.089844mb
[2021.05.27-15.59.49:156][ 66]Cached free OS pages: 3.894531mb

 

PC下DumpPlatformAndAllocatorStats统计信息:

[2021.06.04-06.12.34:488][748]LogMemory: Platform Memory Stats for Windows
[2021.06.04-06.12.34:488][748]LogMemory: Process Physical Memory: 704.69 MB used, 775.71 MB peak
[2021.06.04-06.12.34:488][748]LogMemory: Process Virtual Memory: 784.52 MB used, 888.80 MB peak
[2021.06.04-06.12.34:488][748]LogMemory: Physical Memory: 24035.47 MB used,  8565.63 MB free, 32601.11 MB total
[2021.06.04-06.12.34:488][748]LogMemory: Virtual Memory: 134206408.00 MB used,  11316.60 MB free, 134217728.00 MB total
[2021.06.04-06.12.34:489][748]FMallocBinned2 Mem report
[2021.06.04-06.12.34:489][748]Constants.BinnedPageSize = 65536
[2021.06.04-06.12.34:489][748]Constants.BinnedAllocationGranularity = 4096
[2021.06.04-06.12.34:489][748]Small Pool Allocations: 130.058121mb  (including block size padding)
[2021.06.04-06.12.34:489][748]Small Pool OS Allocated: 157.312500mb
[2021.06.04-06.12.34:489][748]Large Pool Requested Allocations: 141.529739mb
[2021.06.04-06.12.34:489][748]Large Pool OS Allocated: 141.667969mb
[2021.06.04-06.12.34:489][748]Requested Allocations: 141.529739mb
[2021.06.04-06.12.34:489][748]OS Allocated: 141.667969mb
[2021.06.04-06.12.34:489][748]PoolInfo: 0.500000mb
[2021.06.04-06.12.34:489][748]Hash: 0.007813mb
[2021.06.04-06.12.34:489][748]TLS: 0.128906mb
[2021.06.04-06.12.34:489][748]Total allocated from OS: 299.617188mb
[2021.06.04-06.12.34:490][748]Cached free OS pages: 34.992188mb

 

参考

UE4 MallocBinned2分配器

People Mountain People Sea(服务器篇)

 

posted on 2021-06-05 11:08  可可西  阅读(1668)  评论(0编辑  收藏  举报

导航