The FMallocBinned2 Memory Allocator
FMallocBinned2 is the second-generation binned memory allocator implemented in Unreal Engine. Its key configuration macros and member variables are shown below:
#define BINNED2_MAX_CACHED_OS_FREES (64)
#if PLATFORM_64BITS
	#define BINNED2_MAX_CACHED_OS_FREES_BYTE_LIMIT (64*1024*1024) // 64MB
#else
	#define BINNED2_MAX_CACHED_OS_FREES_BYTE_LIMIT (16*1024*1024)
#endif

#define BINNED2_LARGE_ALLOC               65536      // Alignment of OS-allocated pointer - pool-allocated pointers will have a non-aligned pointer
#define BINNED2_MINIMUM_ALIGNMENT_SHIFT   4          // Alignment of blocks, expressed as a shift
#define BINNED2_MINIMUM_ALIGNMENT         16         // Alignment of blocks
#define BINNED2_MAX_SMALL_POOL_SIZE       (32768-16) // Maximum block size in GMallocBinned2SmallBlockSizes
#define BINNED2_SMALL_POOL_COUNT          45

#define DEFAULT_GMallocBinned2PerThreadCaches 1
#define DEFAULT_GMallocBinned2LockFreeCaches 0
#define DEFAULT_GMallocBinned2BundleCount 64
#define DEFAULT_GMallocBinned2AllocExtra 32
#define BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle 8

#if !defined(AGGRESSIVE_MEMORY_SAVING)
	#error "AGGRESSIVE_MEMORY_SAVING must be defined"
#endif
#if AGGRESSIVE_MEMORY_SAVING
	#define DEFAULT_GMallocBinned2BundleSize 8192
#else
	#define DEFAULT_GMallocBinned2BundleSize BINNED2_LARGE_ALLOC // 64KB
#endif

#define BINNED2_ALLOW_RUNTIME_TWEAKING 0
#if BINNED2_ALLOW_RUNTIME_TWEAKING
	extern CORE_API int32 GMallocBinned2PerThreadCaches;
	extern CORE_API int32 GMallocBinned2BundleSize = DEFAULT_GMallocBinned2BundleSize;
	extern CORE_API int32 GMallocBinned2BundleCount = DEFAULT_GMallocBinned2BundleCount;
	extern CORE_API int32 GMallocBinned2MaxBundlesBeforeRecycle = BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle;
	extern CORE_API int32 GMallocBinned2AllocExtra = DEFAULT_GMallocBinned2AllocExtra;
#else
	#define GMallocBinned2PerThreadCaches DEFAULT_GMallocBinned2PerThreadCaches // 1
	#define GMallocBinned2BundleSize DEFAULT_GMallocBinned2BundleSize           // 64KB
	#define GMallocBinned2BundleCount DEFAULT_GMallocBinned2BundleCount         // 64
	#define GMallocBinned2MaxBundlesBeforeRecycle BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle // 8
	#define GMallocBinned2AllocExtra DEFAULT_GMallocBinned2AllocExtra           // 32
#endif

// ... ...

// Block sizes are based around getting the maximum amount of allocations per pool, with as little alignment waste as possible.
// Block sizes should be close to even divisors of the system page size, and well distributed.
// They must be 16-byte aligned as well.
static uint16 SmallBlockSizes[] = // 45 block-size bins; an allocation request is rounded up to the closest bin
{
	16, 32, 48, 64, 80, 96, 112, 128, 160, 192, 224, 256, 288, 320, 384, 448,
	512, 576, 640, 704, 768, 896, 1024 - 16, 1168, 1360, 1632, 2048 - 16, 2336,
	2720, 3264, 4096 - 16, 4368, 4672, 5040, 5456, 5952, 6544 - 16, 7280,
	8192 - 16, 9360, 10912, 13104, 16384 - 16, 21840, 32768 - 16
};
// FFreeBlock describes a block and itself lives inside one, at the block's head, so it costs some space:
// sizeof(FFreeBlock) is 16. The largest bin is 32768-16 so that two top-bin blocks fit into one 64KB page.

// ... ...

class CORE_API FMallocBinned2 final : public FMalloc
{
private:
	// ... ...
	FPtrToPoolMapping PtrToPoolMapping; // parameters of the pool hash buckets

	// Pool tables for different pool sizes
	FPoolTable SmallPoolTables[BINNED2_SMALL_POOL_COUNT]; // list of all pool tables; every pool in one table uses the same block size; BINNED2_SMALL_POOL_COUNT is 45

	PoolHashBucket* HashBuckets;        // pool hash buckets used on a key hit
	PoolHashBucket* HashBucketFreeList; // pool hash buckets used on a key miss

	uint64 NumPoolsPerPage; // number of pools per page: 65536 / sizeof(FPoolInfo) = 65536/32 = 2048
	// ... ...
	FCriticalSection Mutex; // critical-section object used with FScopeLock for mutually exclusive access
	// ... ...

public:
	// ... ...
	static uint16 SmallBlockSizesReversed[BINNED2_SMALL_POOL_COUNT]; // this is reversed to get the smallest elements on our main cache line
	static FMallocBinned2* MallocBinned2;  // the current Binned2 allocator instance
	static uint32 Binned2TlsSlot;          // whether the TLS slot for MallocBinned2 has been created (0 means not yet); all threads share the same Binned2 TLS slot
	static uint32 PageSize;                // Constants.BinnedPageSize, 64KB
	static uint32 OsAllocationGranularity; // Constants.BinnedAllocationGranularity, 4096

	// Mapping of sizes to small table indices.
	// The array has 2048 entries holding 0,0,1,2,3,4,5,6,7,8,8, ... ,44,44,44; it maps a requested size to an
	// index into FPoolTable SmallPoolTables. UE4 pre-generates this size-to-PoolTable-index table during
	// initialization, so Malloc can find the right FPoolTable for a requested size with a single lookup.
	static uint8 MemSizeToIndex[1 + (BINNED2_MAX_SMALL_POOL_SIZE >> BINNED2_MINIMUM_ALIGNMENT_SHIFT)];
	// ... ...
};
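How MemSizeToIndex is consumed: the requested size is rounded up to the next multiple of 16 and the result indexes the table. A condensed sketch of BoundSizeToPoolIndex (paraphrased from the engine's MallocBinned2.h; asserts abbreviated, details may vary by engine version):

FORCEINLINE uint32 BoundSizeToPoolIndex(SIZE_T Size)
{
	// round Size up to a multiple of BINNED2_MINIMUM_ALIGNMENT (16), then shift to get the table slot
	auto Index = ((Size + BINNED2_MINIMUM_ALIGNMENT - 1) >> BINNED2_MINIMUM_ALIGNMENT_SHIFT);
	checkSlow(Index <= (BINNED2_MAX_SMALL_POOL_SIZE >> BINNED2_MINIMUM_ALIGNMENT_SHIFT));
	return uint32(MemSizeToIndex[Index]); // e.g. Size = 100 -> Index = 7 -> pool index 6 -> 112-byte blocks
}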
Initialization of FMallocBinned2
Initialization happens in the constructor, which configures the allocator according to the platform hardware and operating system:
FMallocBinned2::FMallocBinned2()
	: HashBucketFreeList(nullptr)
{
	static bool bOnce = false;
	check(!bOnce); // this is now a singleton-like thing and you cannot make multiple copies
	bOnce = true;

	// Initialize SmallBlockSizesReversed as the reverse of the SmallBlockSizes array
	for (uint32 Index = 0; Index != BINNED2_SMALL_POOL_COUNT; ++Index)
	{
		uint32 Partner = BINNED2_SMALL_POOL_COUNT - Index - 1;
		SmallBlockSizesReversed[Index] = SmallBlockSizes[Partner];
	}

	// Configure the allocator according to the platform hardware and operating system
	FGenericPlatformMemoryConstants Constants = FPlatformMemory::GetConstants();
	PageSize = Constants.BinnedPageSize;
	OsAllocationGranularity = Constants.BinnedAllocationGranularity ? Constants.BinnedAllocationGranularity : PageSize;
	NumPoolsPerPage = PageSize / sizeof(FPoolInfo);
	PtrToPoolMapping.Init(PageSize, NumPoolsPerPage, Constants.AddressLimit); // initialize the pool hash bucket parameters

	// Sanity checks
	checkf(FMath::IsPowerOfTwo(PageSize), TEXT("OS page size must be a power of two"));
	checkf(FMath::IsPowerOfTwo(Constants.AddressLimit), TEXT("OS address limit must be a power of two"));
	checkf(Constants.AddressLimit > PageSize, TEXT("OS address limit must be greater than the page size")); // Check to catch 32 bit overflow in AddressLimit
	checkf(SmallBlockSizes[BINNED2_SMALL_POOL_COUNT - 1] == BINNED2_MAX_SMALL_POOL_SIZE, TEXT("BINNED2_MAX_SMALL_POOL_SIZE must equal the smallest block size"));
	checkf(PageSize % BINNED2_LARGE_ALLOC == 0, TEXT("OS page size must be a multiple of BINNED2_LARGE_ALLOC"));
	checkf(sizeof(FMallocBinned2::FFreeBlock) <= SmallBlockSizes[0], TEXT("Pool header must be able to fit into the smallest block"));
	static_assert(UE_ARRAY_COUNT(SmallBlockSizes) == BINNED2_SMALL_POOL_COUNT, "Small block size array size must match BINNED2_SMALL_POOL_COUNT");
	static_assert(UE_ARRAY_COUNT(SmallBlockSizes) <= 256, "Small block size array size must fit in a byte");
	static_assert(sizeof(FFreeBlock) <= BINNED2_MINIMUM_ALIGNMENT, "Free block struct must be small enough to fit into a block.");

	// Init pool tables. Fill in the 45 block-size bins from SmallBlockSizes
	for (uint32 Index = 0; Index != BINNED2_SMALL_POOL_COUNT; ++Index)
	{
		checkf(Index == 0 || SmallBlockSizes[Index - 1] < SmallBlockSizes[Index], TEXT("Small block sizes must be strictly increasing"));
		checkf(SmallBlockSizes[Index] <= PageSize, TEXT("Small block size must be small enough to fit into a page"));
		checkf(SmallBlockSizes[Index] % BINNED2_MINIMUM_ALIGNMENT == 0, TEXT("Small block size must be a multiple of BINNED2_MINIMUM_ALIGNMENT"));
		SmallPoolTables[Index].BlockSize = SmallBlockSizes[Index];
	}

	// Set up pool mappings. The array has 2048 entries holding 0,0,1,2,3,4,5,6,7,8,8, ... ,44,44,44,
	// used to compute the index of a given size into FPoolTable SmallPoolTables
	uint8* IndexEntry = MemSizeToIndex;
	uint32 PoolIndex = 0;
	for (uint32 Index = 0; Index != 1 + (BINNED2_MAX_SMALL_POOL_SIZE >> BINNED2_MINIMUM_ALIGNMENT_SHIFT); ++Index)
	{
		uint32 BlockSize = Index << BINNED2_MINIMUM_ALIGNMENT_SHIFT; // inverse of int32 Index = int32((Size >> BINNED2_MINIMUM_ALIGNMENT_SHIFT));
		while (SmallBlockSizes[PoolIndex] < BlockSize)
		{
			++PoolIndex;
			check(PoolIndex != BINNED2_SMALL_POOL_COUNT);
		}
		check(PoolIndex < 256);
		*IndexEntry++ = uint8(PoolIndex);
	}

	// now reverse the pool sizes for cache coherency
	// (re-initializes SmallBlockSizesReversed as the reverse of SmallBlockSizes)
	for (uint32 Index = 0; Index != BINNED2_SMALL_POOL_COUNT; ++Index)
	{
		uint32 Partner = BINNED2_SMALL_POOL_COUNT - Index - 1;
		SmallBlockSizesReversed[Index] = SmallBlockSizes[Partner];
	}

	uint64 MaxHashBuckets = PtrToPoolMapping.GetMaxHashBuckets();
	{
		LLM_PLATFORM_SCOPE(ELLMTag::FMalloc);
		// Allocate memory for the pool hash buckets used on a key hit
		HashBuckets = (PoolHashBucket*)FPlatformMemory::BinnedAllocFromOS(Align(MaxHashBuckets * sizeof(PoolHashBucket), OsAllocationGranularity));
#if BINNED2_ALLOCATOR_STATS
		Binned2HashMemory += Align(MaxHashBuckets * sizeof(PoolHashBucket), OsAllocationGranularity);
#endif
	}
	DefaultConstructItems<PoolHashBucket>(HashBuckets, MaxHashBuckets); // default-construct and initialize HashBuckets

	MallocBinned2 = this;
	GFixedMallocLocationPtr = (FMalloc**)(&MallocBinned2);
}
The concrete structures and their sizes are as follows:
FPoolTable
// A pool table: all pools in it share one block size

/** Pool table, sizeof(FPoolTable) is 24 */
struct FPoolTable
{
	FPoolList ActivePools;    // list of pools that still have free blocks
	FPoolList ExhaustedPools; // list of pools that are full (nothing left to allocate)
	uint32    BlockSize;      // block size shared by every pool in this table
	// ... ...
};
FPoolList
// A pool list

// Pool list, sizeof(FPoolList) is 8
struct FPoolList
{
	// ... ...
private:
	FPoolInfo* Front;
};
FPoolInfo
// A memory pool

The pages owned by an FPoolInfo are only returned to the OS once every block in it is free.

// Memory pool, sizeof(FPoolInfo) is 32
struct FMallocBinned2::FPoolInfo
{
	// ... ...
public:
	uint16 Taken;   // Number of allocated elements in this pool, when counts down to zero can free the entire pool (and the memory FirstFreeBlock points at)
public:
	ECanary Canary; // See ECanary
private:
	uint32 AllocSize; // Number of bytes allocated
public:
	FFreeBlock* FirstFreeBlock; // Pointer to first free memory in this pool, or the OS allocation size in bytes if this allocation is not binned
	                            // (binned mode: head of the pool's free-block list; unbinned mode: the block came straight from the OS)
public:
	FPoolInfo* Next;           // Pointer to next pool
public:
	FPoolInfo** PtrToPrevNext; // Pointer to whichever pointer points to this pool
	// ... ...
};
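PtrToPrevNext is what lets a pool unlink itself from whichever list (ActivePools or ExhaustedPools) currently holds it, without knowing the list head. A sketch of the link/unlink pair (condensed from the engine source; details may vary by version):

void FMallocBinned2::FPoolInfo::Link(FPoolInfo*& PrevNext)
{
	if (PrevNext)
	{
		PrevNext->PtrToPrevNext = &Next; // the old front now points back at our Next field
	}
	Next          = PrevNext;
	PtrToPrevNext = &PrevNext; // remember the pointer that points at us (a list head or a Next field)
	PrevNext      = this;
}

void FMallocBinned2::FPoolInfo::Unlink()
{
	if (Next)
	{
		Next->PtrToPrevNext = PtrToPrevNext;
	}
	*PtrToPrevNext = Next; // splice ourselves out, whether we were the front or mid-list
}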
FFreeBlock
// A memory block

// Memory block, sizeof(FFreeBlock) is 16
struct FFreeBlock
{
	// ... ...
	uint16 BlockSize;     // Size of the blocks that this list points to (the owning pool's block size)
	uint8  PoolIndex;     // Index of the pool this block belongs to
	uint8  Canary;        // Constant value of 0xe3; used to detect out-of-bounds writes, i.e. whether this block's data has been corrupted
	uint32 NumFreeBlocks; // Number of consecutive free blocks here, at least 1
	void*  NextFreeBlock; // Next free block in another pool; when a block is freed, an FFreeBlock is built in place and pushed onto the head of the pool's FirstFreeBlock list
};
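When a fresh 64KB page is handed to a pool, an FFreeBlock header is constructed at the page start and records how many blocks the page can carve out. A sketch of the constructor (paraphrased from the engine header); note how it gives up one block when the 16-byte header would not otherwise fit, which is exactly why bins like 1024-16 and 32768-16 exist:

FFreeBlock(uint32 InPageSize, uint16 InBlockSize, uint8 InPoolIndex)
	: BlockSize(InBlockSize)
	, PoolIndex(InPoolIndex)
	, Canary(CANARY_VALUE)
{
	NumFreeBlocks = InPageSize / InBlockSize;
	// if the 16-byte header does not fit into the leftover space, sacrifice one block for it
	if (NumFreeBlocks * InBlockSize + BINNED2_MINIMUM_ALIGNMENT > InPageSize)
	{
		NumFreeBlocks--;
	}
	check(NumFreeBlocks >= 1);
	// e.g. BlockSize = 32768-16: 65536/32752 = 2 blocks, and 2*32752 + 16 = 65520 <= 65536, so both blocks survive
}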
PoolHashBucket
// Pool hash buckets

// Parameters of the pool hash buckets, sizeof(FPtrToPoolMapping) is 32
struct FPtrToPoolMapping
{
	// ... ...
private:
	/** Shift to apply to a pointer to get the reference from the indirect tables */
	uint64 PtrToPoolPageBitShift;
	/** Shift required to get required hash table key. */
	uint64 HashKeyShift;
	/** Used to mask off the bits that have been used to lookup the indirect table */
	uint64 PoolMask;
	// PageSize dependent constants
	uint64 MaxHashBuckets;
};

/** Pool hash bucket: holds the pool list for a key hashed from a memory address, sizeof(PoolHashBucket) is 32 */
struct FMallocBinned2::PoolHashBucket
{
	UPTRINT BucketIndex;  // hash key: Key = Ptr >> Allocator.HashKeyShift, i.e. the address shifted right by 27 bits
	FPoolInfo* FirstPool; // points to the start of this bucket's pool memory block (one page, 64KB: the value of the PageSize member)
	PoolHashBucket* Prev; // previous pool hash bucket
	PoolHashBucket* Next; // next pool hash bucket
	// ... ...
};
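Where the 27-bit shift comes from: with a 64KB page (shift 16) and 2048 FPoolInfo entries per page (shift 11), HashKeyShift = 16 + 11 = 27. A sketch of how the fields are derived and used (paraphrased from the engine source; details may vary by version):

void FPtrToPoolMapping::Init(uint32 InPageSize, uint64 InNumPoolsPerPage, uint64 AddressLimit)
{
	uint64 PoolPageToPoolBitShift = FPlatformMath::CeilLogTwo(InNumPoolsPerPage); // 2048 -> 11

	PtrToPoolPageBitShift = FPlatformMath::CeilLogTwo(InPageSize);          // 65536 -> 16
	HashKeyShift          = PtrToPoolPageBitShift + PoolPageToPoolBitShift; // 27
	PoolMask              = (1ull << PoolPageToPoolBitShift) - 1;
	MaxHashBuckets        = AddressLimit >> HashKeyShift;                   // with AddressLimit = 2^33 this yields 64
}

FORCEINLINE void FPtrToPoolMapping::GetHashBucketAndPoolIndices(const void* InPtr, uint32& OutBucketIndex, UPTRINT& OutBucketCollision, uint32& OutPoolIndex) const
{
	OutBucketCollision = (UPTRINT)InPtr >> HashKeyShift;                // the hash key stored in PoolHashBucket::BucketIndex
	OutBucketIndex = uint32(OutBucketCollision & (MaxHashBuckets - 1)); // slot in the HashBuckets array
	OutPoolIndex   = uint32(((UPTRINT)InPtr >> PtrToPoolPageBitShift) & PoolMask); // which FPoolInfo within the bucket's page
}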
Allocating a block out of a pool
struct FMallocBinned2::FPoolInfo
{
	// ... ...
	void* AllocateRegularBlock()
	{
		check(HasFreeRegularBlock()); // verify this FPoolInfo still has a free block
		++Taken;                      // one more block handed out
		void* Result = FirstFreeBlock->AllocateRegularBlock(); // carve the block
		ExhaustPoolIfNecessary();     // if no free block remains, move this FPoolInfo onto the FPoolList ExhaustedPools list
		return Result;
	}
	// ... ...
};

struct FFreeBlock
{
	// ... ...
	FORCEINLINE void* AllocateRegularBlock()
	{
		--NumFreeBlocks; // one fewer free block
		if (IsAligned(this, BINNED2_LARGE_ALLOC)) // BINNED2_LARGE_ALLOC is 64KB: is this FFreeBlock the header at the start of a 64KB page?
		{
			// page-header case: hand out blocks front-to-back, packed against the end of the 64KB page
			return (uint8*)this + BINNED2_LARGE_ALLOC - (NumFreeBlocks + 1) * BlockSize;
		}
		// free-list-entry case: hand out blocks back-to-front relative to this header
		return (uint8*)this + (NumFreeBlocks)* BlockSize;
	}
	// ... ...
};
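The IsAligned test above works because a pool page always keeps its FFreeBlock header at the 64KB boundary, so a pointer handed out from a pool is never 64KB-aligned; only direct OS allocations are. The same invariant drives pointer classification in the free path. A sketch of the two helpers (condensed from the engine header):

FORCEINLINE bool IsOSAllocation(const void* Ptr)
{
	return IsAligned(Ptr, BINNED2_LARGE_ALLOC); // 64KB-aligned => allocated directly from the OS
}

static FORCEINLINE FFreeBlock* GetPoolHeaderFromPointer(void* Ptr)
{
	// round any small-block pointer down to its 64KB page start, where the FFreeBlock header lives
	return (FFreeBlock*)AlignDown(Ptr, BINNED2_LARGE_ALLOC);
}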
The TLS Cache Mechanism
Compared with the first-generation FMallocBinned allocator, the biggest improvement in FMallocBinned2 is:
the introduction of a TLS (Thread Local Storage: each thread gets its own storage, holding thread-private variables as key-value pairs) cache to speed up allocation.
Each thread records the addresses it frees into a per-thread list; when a later malloc request on that thread matches the BlockSize, a cached freed address is returned directly.
This avoids touching FPoolTable SmallPoolTables[BINNED2_SMALL_POOL_COUNT] altogether, so no mutex needs to be taken.
At startup, each thread calls FMemory::SetupTLSCachesOnCurrentThread() to create its own TLS data, an FPerThreadFreeBlockLists.
After creating its FPerThreadFreeBlockLists, every thread also registers it in Binned2's RegisteredFreeBlockLists array. The code:
void FMallocBinned2::SetupTLSCachesOnCurrentThread()
{
	if (!BINNED2_ALLOW_RUNTIME_TWEAKING && !GMallocBinned2PerThreadCaches)
	{
		return;
	}
	if (!FMallocBinned2::Binned2TlsSlot)
	{
		FMallocBinned2::Binned2TlsSlot = FPlatformTLS::AllocTlsSlot(); // executed only once; the TLS slot is globally unique
	}
	check(FMallocBinned2::Binned2TlsSlot);
	FPerThreadFreeBlockLists::SetTLS(); // each thread creates its own TLS data
}

void FMallocBinned2::FPerThreadFreeBlockLists::SetTLS()
{
	check(FMallocBinned2::Binned2TlsSlot);
	FPerThreadFreeBlockLists* ThreadSingleton = (FPerThreadFreeBlockLists*)FPlatformTLS::GetTlsValue(FMallocBinned2::Binned2TlsSlot);
	if (!ThreadSingleton)
	{
		LLM_PLATFORM_SCOPE(ELLMTag::FMalloc);
		ThreadSingleton = new (FPlatformMemory::BinnedAllocFromOS(Align(sizeof(FPerThreadFreeBlockLists), FMallocBinned2::OsAllocationGranularity))) FPerThreadFreeBlockLists();
#if BINNED2_ALLOCATOR_STATS
		Binned2TLSMemory += Align(sizeof(FPerThreadFreeBlockLists), FMallocBinned2::OsAllocationGranularity);
#endif
		FPlatformTLS::SetTlsValue(FMallocBinned2::Binned2TlsSlot, ThreadSingleton);
		FMallocBinned2::Private::RegisterThreadFreeBlockLists(ThreadSingleton);
	}
}

static TArray<FPerThreadFreeBlockLists*>& GetRegisteredFreeBlockLists()
{
	static TArray<FPerThreadFreeBlockLists*> RegisteredFreeBlockLists;
	return RegisteredFreeBlockLists;
}

static void RegisterThreadFreeBlockLists(FPerThreadFreeBlockLists* FreeBlockLists)
{
	FScopeLock Lock(&GetFreeBlockListsRegistrationMutex());
#if BINNED2_ALLOCATOR_STATS_VALIDATION
	++RecursionCounter;
#endif
	GetRegisteredFreeBlockLists().Add(FreeBlockLists);
#if BINNED2_ALLOCATOR_STATS_VALIDATION
	--RecursionCounter;
#endif
}
Every thread owns its own copy of FPerThreadFreeBlockLists, whose FreeLists[] array is also laid out by BlockSize and has 45 elements.
Each element is an FFreeBlockList, containing an FBundle PartialBundle (a not-yet-full bundle) and an FBundle FullBundle (a filled bundle).
To keep Binned2 from hoarding too much memory, a bundle's FBundleNode count is capped (at GMallocBinned2BundleCount, i.e. 64) and so is its total payload, which must stay under GMallocBinned2BundleSize (64KB), i.e. the bundle's Count * BlockSize < 64KB.
struct FPerThreadFreeBlockLists
{
	// ... ...
private:
	FFreeBlockList FreeLists[BINNED2_SMALL_POOL_COUNT]; // BINNED2_SMALL_POOL_COUNT is 45: one list per block-size bin
};

// sizeof(FFreeBlockList) is 32
struct FFreeBlockList
{
	// ... ...
private:
	FBundle PartialBundle;
	FBundle FullBundle;
};

// List of FBundleNodes, sizeof(FBundle) is 16
struct FBundle
{
	// ... ...
	FBundleNode* Head; // head of the list
	uint32 Count;
};

// sizeof(FBundleNode) is 16
struct FBundleNode
{
	FBundleNode* NextNodeInCurrentBundle; // next FBundleNode within the current bundle
	union
	{
		FBundleNode* NextBundle; // next bundle; FFreeBlockList::PopBundles chains FullBundle behind PartialBundle
		                         // (PartialBundle->NextBundle = FullBundle) so FreeBundles can release both in one loop
		int32 Count;             // number of FBundleNodes in the current list
	};
};
Note: an FBundleNode* points into the Block's own memory. After casting a freed Ptr to FBundleNode*, only its 16 bytes are written; since no block is smaller than 16 bytes, modifying it cannot touch any other memory.
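The push/pop operations on a bundle are plain intrusive-list manipulations on the freed blocks themselves. A sketch (condensed from the engine header):

FORCEINLINE void FBundle::PushHead(FBundleNode* Node)
{
	Node->NextNodeInCurrentBundle = Head; // the freed block's first 16 bytes become the list node
	Node->NextBundle = nullptr;
	Head = Node;
	Count++;
}

FORCEINLINE FBundleNode* FBundle::PopHead()
{
	FBundleNode* Result = Head;
	Count--;
	Head = Head->NextNodeInCurrentBundle;
	return Result;
}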
FGlobalRecycler
// Caches the head pointers of the threads' FBundle FullBundle and FBundle PartialBundle lists
struct FGlobalRecycler // sizeof(FGlobalRecycler) is 64*45 = 2880
{
	// ... ...
private:
	struct FPaddedBundlePointer // sizeof(FPaddedBundlePointer) is 8*8 = 64
	{
		// BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle is 8; slots are swapped atomically via
		// FPlatformAtomics::InterlockedCompareExchangePointer to stay safe under multi-threaded access
		FBundleNode* FreeBundles[BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle];
		// ... ...
	};
	// ... ...
	MS_ALIGN(PLATFORM_CACHE_LINE_SIZE) FPaddedBundlePointer Bundles[BINNED2_SMALL_POOL_COUNT] GCC_ALIGN(PLATFORM_CACHE_LINE_SIZE); // BINNED2_SMALL_POOL_COUNT is 45
};
static FGlobalRecycler GGlobalRecycler;
// Each block-size bin holds up to 8 bundles. PushBundle succeeds while a slot is free and fails otherwise;
// PopBundle fails and returns null once every slot is empty.
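A sketch of the lock-free slot scan (paraphrased from the engine source; the engine also clamps the slot count against the runtime tweakable, omitted here since runtime tweaking is off):

bool FGlobalRecycler::PushBundle(uint32 InPoolIndex, FBundleNode* InBundle)
{
	for (uint32 Slot = 0; Slot < BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle; Slot++)
	{
		if (!Bundles[InPoolIndex].FreeBundles[Slot])
		{
			// claim the empty slot atomically; InterlockedCompareExchangePointer returns the previous value
			if (!FPlatformAtomics::InterlockedCompareExchangePointer((void**)&Bundles[InPoolIndex].FreeBundles[Slot], InBundle, nullptr))
			{
				return true;
			}
		}
	}
	return false; // all 8 slots occupied; the caller must free the bundle back to the pools
}

FBundleNode* FGlobalRecycler::PopBundle(uint32 InPoolIndex)
{
	for (uint32 Slot = 0; Slot < BINNED2_MAX_GMallocBinned2MaxBundlesBeforeRecycle; Slot++)
	{
		FBundleNode* Result = Bundles[InPoolIndex].FreeBundles[Slot];
		if (Result)
		{
			// take the bundle only if the slot still holds the value we just read
			if (FPlatformAtomics::InterlockedCompareExchangePointer((void**)&Bundles[InPoolIndex].FreeBundles[Slot], nullptr, Result) == Result)
			{
				return Result;
			}
		}
	}
	return nullptr; // every slot empty
}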
The detailed TLS cache flow when freeing memory is as follows. Freeing the memory owned by Ptr:
void FMallocBinned2::FreeExternal(void* Ptr)
{
	if (!IsOSAllocation(Ptr)) // was this block allocated directly from the OS?
	{
		// Binned mode: memory pool
		check(Ptr); // null is 64k aligned so we should not be here
		FFreeBlock* BasePtr = GetPoolHeaderFromPointer(Ptr); // round down to the page header FFreeBlock*
		BasePtr->CanaryTest();
		uint32 BlockSize = BasePtr->BlockSize;
		uint32 PoolIndex = BasePtr->PoolIndex;

		FBundleNode* BundlesToRecycle = nullptr;
		FPerThreadFreeBlockLists* Lists = GMallocBinned2PerThreadCaches ? FPerThreadFreeBlockLists::Get() : nullptr;
		if (Lists)
		{
			// If this bin's FullBundle.Head is non-null, try to park it in GGlobalRecycler's slot array for this bin:
			//   - a slot is free: the bundle is parked there and null is returned
			//   - no slot is free: BundlesToRecycle is set to FullBundle.Head, and it must be freed for real below
			BundlesToRecycle = Lists->RecycleFullBundle(BasePtr->PoolIndex);
			bool bPushed = Lists->Free(Ptr, PoolIndex, BlockSize); // push Ptr onto the head of this bin's PartialBundle
			check(bPushed);
#if BINNED2_ALLOCATOR_STATS
			Lists->AllocatedMemory -= BlockSize;
#endif
		}
		else
		{
			BundlesToRecycle = (FBundleNode*)Ptr;
			BundlesToRecycle->NextNodeInCurrentBundle = nullptr;
		}
		if (BundlesToRecycle) // non-null means all 8 slots of GGlobalRecycler.Bundles[PoolIndex].FreeBundles were occupied: actually release the memory
		{
			BundlesToRecycle->NextBundle = nullptr;
			FScopeLock Lock(&Mutex);
			Private::FreeBundles(*this, BundlesToRecycle, BlockSize, PoolIndex); // release the blocks held by BundlesToRecycle
#if BINNED2_ALLOCATOR_STATS
			if (!Lists)
			{
				// lists track their own stat track them instead in the global stat if we don't have lists
				AllocatedSmallPoolMemory -= ((int64)(BlockSize));
			}
#endif
		}
	}
	else if (Ptr)
	{
		// Unbinned mode: allocated directly by the OS
		FScopeLock Lock(&Mutex);
		FPoolInfo* Pool = Private::FindPoolInfo(*this, Ptr);
		if (!Pool)
		{
			UE_LOG(LogMemory, Fatal, TEXT("FMallocBinned2 Attempt to free an unrecognized block %p"), Ptr);
		}
		UPTRINT PoolOsBytes = Pool->GetOsAllocatedBytes();
		SIZE_T PoolOSRequestedBytes = Pool->GetOSRequestedBytes();
#if BINNED2_ALLOCATOR_STATS
		AllocatedLargePoolMemory -= ((int64)PoolOSRequestedBytes);
		AllocatedLargePoolMemoryWAlignment -= ((int64)PoolOsBytes);
#endif
		checkf(PoolOSRequestedBytes <= PoolOsBytes, TEXT("FMallocBinned2::FreeExternal %d %d"), int32(PoolOSRequestedBytes), int32(PoolOsBytes));
		Pool->SetCanary(FPoolInfo::ECanary::Unassigned, true, false);
		// Free an OS allocation.
		CachedOSPageAllocator.Free(Ptr, PoolOsBytes);
	}
}

static void FreeBundles(FMallocBinned2& Allocator, FBundleNode* BundlesToRecycle, uint32 InBlockSize, uint32 InPoolIndex)
{
	FPoolTable& Table = Allocator.SmallPoolTables[InPoolIndex];

	// Walk the BundlesToRecycle list, releasing the block each node points at
	FBundleNode* Bundle = BundlesToRecycle;
	while (Bundle)
	{
		FBundleNode* NextBundle = Bundle->NextBundle;
		FBundleNode* Node = Bundle;
		do
		{
			FBundleNode* NextNode = Node->NextNodeInCurrentBundle;
			FPoolInfo* NodePool = FindPoolInfo(Allocator, Node);
			if (!NodePool)
			{
				UE_LOG(LogMemory, Fatal, TEXT("FMallocBinned2 Attempt to free an unrecognized small block %p"), Node);
			}
			NodePool->CheckCanary(FPoolInfo::ECanary::FirstFreeBlockIsPtr);

			// If this pool was exhausted, move to available list.
			if (!NodePool->FirstFreeBlock) // NodePool currently sits on the FPoolList ExhaustedPools list
			{
				Table.ActivePools.LinkToFront(NodePool); // move NodePool from ExhaustedPools back to ActivePools
			}
			else
			{
				check(NodePool->FirstFreeBlock->Canary == 0 || NodePool->FirstFreeBlock->IsCanaryOk());
			}

			// Free a pooled allocation: build a fresh FFreeBlock at Node with NumFreeBlocks = 1
			// and push it onto the head of NodePool->FirstFreeBlock
			FFreeBlock* Free = (FFreeBlock*)Node;
			Free->NumFreeBlocks = 1;
			Free->NextFreeBlock = NodePool->FirstFreeBlock;
			Free->BlockSize = InBlockSize;
			Free->Canary = FFreeBlock::CANARY_VALUE;
			Free->PoolIndex = InPoolIndex;
			NodePool->FirstFreeBlock = Free;

			// Free this pool.
			check(NodePool->Taken >= 1);
			if (--NodePool->Taken == 0) // every block of NodePool is now free
			{
				NodePool->SetCanary(FPoolInfo::ECanary::Unassigned, true, false);
				FFreeBlock* BasePtrOfNode = GetPoolHeaderFromPointer(Node);
				// Free the OS memory.
				NodePool->Unlink(); // detach NodePool from its FPoolList
				Allocator.CachedOSPageAllocator.Free(BasePtrOfNode, Allocator.PageSize); // reclaim NodePool's whole page
#if BINNED2_ALLOCATOR_STATS
				AllocatedOSSmallPoolMemory -= ((int64)Allocator.PageSize);
#endif
			}
			Node = NextNode; // next FBundleNode
		} while (Node);
		Bundle = NextBundle;
	}
}
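The two per-thread helpers used above are small. A sketch of Lists->Free and the full-bundle recycling (condensed from the engine header; names follow the engine, minor details may differ by version):

FORCEINLINE bool FFreeBlockList::Free(void* InPtr, uint32 InPoolIndex, uint32 InBlockSize)
{
	// if PartialBundle hit its node-count or byte cap, promote it to FullBundle first
	if (PartialBundle.Count >= (uint32)GMallocBinned2BundleCount || PartialBundle.Count * InBlockSize >= (uint32)GMallocBinned2BundleSize)
	{
		if (FullBundle.Head)
		{
			return false; // both bundles full: the caller must recycle FullBundle before caching more
		}
		FullBundle = PartialBundle;
		PartialBundle.Reset();
	}
	PartialBundle.PushHead((FBundleNode*)InPtr);
	return true;
}

FBundleNode* FFreeBlockList::RecycleFull(uint32 InPoolIndex)
{
	FBundleNode* Result = nullptr;
	if (FullBundle.Head)
	{
		FullBundle.Head->Count = FullBundle.Count;
		if (!GGlobalRecycler.PushBundle(InPoolIndex, FullBundle.Head))
		{
			Result = FullBundle.Head; // no free slot: hand the bundle back for a real free
			Result->NextBundle = nullptr;
		}
		FullBundle.Reset();
	}
	return Result;
}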
The detailed TLS cache flow when allocating memory is as follows. Malloc allocating memory:
FORCEINLINE void* MallocSelect(SIZE_T Size, uint32 Alignment)
{
	void* Result;
	if (UseSmallAlloc(Size, Alignment)) // Size <= BINNED2_MAX_SMALL_POOL_SIZE && Alignment <= BINNED2_MINIMUM_ALIGNMENT
	{
		Result = MallocExternalSmall(Size, Alignment); // allocate from the memory pools
	}
	else
	{
		Result = MallocExternalLarge(Size, Alignment); // allocate directly from the OS and record it in the HashBuckets table
	}
	return Result;
}

void* FMallocBinned2::MallocExternalSmall(SIZE_T Size, uint32 Alignment)
{
	uint32 PoolIndex = BoundSizeToPoolIndex(Size); // map Size to its index into FPoolTable SmallPoolTables

	// Prefer grabbing a cached block from the TLS cache
	FPerThreadFreeBlockLists* Lists = GMallocBinned2PerThreadCaches ? FPerThreadFreeBlockLists::Get() : nullptr;
	if (Lists)
	{
		// If this bin's PartialBundle.Head is empty, pop a bundle from GGlobalRecycler's slot array for this bin
		// and install it as PartialBundle.Head; then re-check whether PartialBundle.Head is non-empty
		if (Lists->ObtainRecycledPartial(PoolIndex))
		{
			if (void* Result = Lists->Malloc(PoolIndex)) // pop one FBundleNode* off the head of PartialBundle
			{
#if BINNED2_ALLOCATOR_STATS
				uint32 BlockSize = PoolIndexToBlockSize(PoolIndex);
				Lists->AllocatedMemory += BlockSize;
#endif
				return Result; // the TLS cache had a block for this PoolIndex; hand it out directly
			}
		}
	}

	FScopeLock Lock(&Mutex); // take the mutex; released automatically when the scope ends

	// Allocate from small object pool.
	FPoolTable& Table = SmallPoolTables[PoolIndex]; // find the pool table for PoolIndex

	FPoolInfo* Pool;
	if (!Table.ActivePools.IsEmpty()) // this table's ActivePools list is not empty
	{
		Pool = &Table.ActivePools.GetFrontPool(); // take the first FPoolInfo
	}
	else
	{
		Pool = &Table.ActivePools.PushNewPoolToFront(*this, Table.BlockSize, PoolIndex); // create a new FPoolInfo
	}

	// Carve one block out of Pool; note: the call checks Pool for exhaustion and moves it to ExhaustedPools when no free block remains
	void* Result = Pool->AllocateRegularBlock();
#if BINNED2_ALLOCATOR_STATS
	AllocatedSmallPoolMemory += PoolIndexToBlockSize(PoolIndex);
#endif // BINNED2_ALLOCATOR_STATS

	if (GMallocBinned2AllocExtra) // GMallocBinned2AllocExtra is 32; this is the TLS cache warm-up path
	{
		if (Lists)
		{
			// prefill the free list with some allocations so we are less likely to hit this slow path with the mutex
			for (int32 Index = 0; Index < GMallocBinned2AllocExtra && Pool->HasFreeRegularBlock(); Index++)
			{
				if (!Lists->Free(Result, PoolIndex, Table.BlockSize)) // on success, Result is pushed onto the head of PartialBundle
				{
					break;
				}
				Result = Pool->AllocateRegularBlock(); // carve another block out of Pool
			}
		}
	}
	if (!Pool->HasFreeRegularBlock()) // does Pool still have a free block?
	{
		Table.ExhaustedPools.LinkToFront(Pool); // no: move it onto the FPoolList ExhaustedPools list
	}

	return Result;
}
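ObtainRecycledPartial and Lists->Malloc forward to the per-bin FFreeBlockList. A sketch (condensed from the engine header; details may vary by version):

bool FFreeBlockList::ObtainPartial(uint32 InPoolIndex)
{
	if (!PartialBundle.Head)
	{
		// refill the partial bundle from the global recycler's slot array for this bin
		PartialBundle.Count = 0;
		PartialBundle.Head = GGlobalRecycler.PopBundle(InPoolIndex);
		if (PartialBundle.Head)
		{
			PartialBundle.Count = PartialBundle.Head->Count;
			PartialBundle.Head->NextBundle = nullptr;
			return true;
		}
		return false;
	}
	return true;
}

FORCEINLINE void* FFreeBlockList::PopFromFront(uint32 InPoolIndex)
{
	if ((!PartialBundle.Head) & (!!FullBundle.Head))
	{
		// partial bundle empty but a full one is cached: swap it in
		PartialBundle = FullBundle;
		FullBundle.Reset();
	}
	return PartialBundle.Head ? PartialBundle.PopHead() : nullptr;
}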
During GC Mark, FMemory::Trim(bool bTrimThreadCaches) -> FMallocBinned2::FlushCurrentThreadCache() is called to reclaim the memory held by the TLS caches:
void FMallocBinned2::FlushCurrentThreadCache()
{
	double StartTimeInner = FPlatformTime::Seconds();
	QUICK_SCOPE_CYCLE_COUNTER(STAT_FMallocBinned2_FlushCurrentThreadCache);
	FPerThreadFreeBlockLists* Lists = FPerThreadFreeBlockLists::Get(); // fetch this thread's FPerThreadFreeBlockLists from TLS

	double WaitForMutexTime = 0.0;
	double WaitForMutexAndTrimTime = 0.0;

	if (Lists)
	{
		FScopeLock Lock(&Mutex);
		WaitForMutexTime = FPlatformTime::Seconds() - StartTimeInner;
		for (int32 PoolIndex = 0; PoolIndex != BINNED2_SMALL_POOL_COUNT; ++PoolIndex) // walk all 45 block-size bins
		{
			// Chain this bin's PartialBundle and FullBundle together (PartialBundle->NextBundle = FullBundle)
			// so FreeBundles below can release both in one loop
			FBundleNode* Bundles = Lists->PopBundles(PoolIndex);
			if (Bundles)
			{
				Private::FreeBundles(*this, Bundles, PoolIndexToBlockSize(PoolIndex), PoolIndex); // release the memory cached by this bin's PartialBundle and FullBundle
			}
		}
		WaitForMutexAndTrimTime = FPlatformTime::Seconds() - StartTimeInner;
	}

	// These logs must happen outside the above mutex to avoid deadlocks
	if (WaitForMutexTime > GMallocBinned2FlushThreadCacheMaxWaitTime)
	{
		UE_LOG(LogMemory, Warning, TEXT("FMallocBinned2 took %6.2fms to wait for mutex for trim."), WaitForMutexTime * 1000.0f);
	}
	if (WaitForMutexAndTrimTime > GMallocBinned2FlushThreadCacheMaxWaitTime)
	{
		UE_LOG(LogMemory, Warning, TEXT("FMallocBinned2 took %6.2fms to wait for mutex AND trim."), WaitForMutexAndTrimTime * 1000.0f);
	}
}
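PopBundles is where the PartialBundle->NextBundle = FullBundle chaining mentioned above actually happens. A sketch (condensed from the engine source):

FBundleNode* FFreeBlockList::PopBundles(uint32 InPoolIndex)
{
	FBundleNode* Partial = PartialBundle.Head;
	if (Partial)
	{
		PartialBundle.Reset();
		Partial->NextBundle = nullptr;
	}

	FBundleNode* Full = FullBundle.Head;
	if (Full)
	{
		FullBundle.Reset();
		Full->NextBundle = nullptr;
	}

	FBundleNode* Result = Partial;
	if (Result)
	{
		Result->NextBundle = Full; // chain: Partial -> Full
	}
	else
	{
		Result = Full;
	}
	return Result;
}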
Note: Private::FreeBundles does not release the up-to-8 FBundleNode bundles cached in GGlobalRecycler.Bundles[PoolIndex].FreeBundles for the given PoolIndex.
It only releases the memory cached by the current thread's FPerThreadFreeBlockLists (its PartialBundle and FullBundle); bundles already handed off to GGlobalRecycler keep holding their memory.
The call stack:
UE4Editor-Core-Win64-Debug.dll!FMallocBinned2::FlushCurrentThreadCache() Line 1070
UE4Editor-Core-Win64-Debug.dll!UE4Function_Private::TFunctionRefCaller<<lambda_7e8e4a5f748f352266e45175872bd5ce>,void __cdecl(enum ENamedThreads::Type)>::Call(void * Obj=0x000000afec167308, ENamedThreads::Type & <Params_0>=GameThread_Local) Line 549
UE4Editor-Core-Win64-Debug.dll!UE4Function_Private::TFunctionRefBase<UE4Function_Private::TFunctionStorage<0>,void __cdecl(enum ENamedThreads::Type)>::operator()(ENamedThreads::Type <Params_0>=GameThread_Local) Line 677
UE4Editor-Core-Win64-Debug.dll!FTaskGraphInterface::BroadcastSlow_OnlyUseForSpecialPurposes(bool bDoTaskThreads=false, bool bDoBackgroundThreads=false, TFunction<void __cdecl(enum ENamedThreads::Type)> & Callback={...}) Line 2047
UE4Editor-Core-Win64-Debug.dll!FMallocBinned2::Trim(bool bTrimThreadCaches=true) Line 1114
UE4Editor-Core-Win64-Debug.dll!FMemory::Trim(bool bTrimThreadCaches=true) Line 529
UE4Editor-CoreUObject-Win64-Debug.dll!CollectGarbageInternal(EObjectFlags KeepFlags=RF_NoFlags, bool bPerformFullPurge=false) Line 2066
UE4Editor-CoreUObject-Win64-Debug.dll!TryCollectGarbage(EObjectFlags KeepFlags=RF_NoFlags, bool bPerformFullPurge=false) Line 2177
UE4Editor-Engine-Win64-Debug.dll!UEngine::PerformGarbageCollectionAndCleanupActors() Line 1427
UE4Editor-Engine-Win64-Debug.dll!UEngine::ConditionalCollectGarbage() Line 1401 // GC Mark
UE4Editor-Engine-Win64-Debug.dll!UWorld::Tick(ELevelTick TickType=LEVELTICK_All, float DeltaSeconds=0.0134023018) Line 1657
UE4Editor-Engine-Win64-Debug.dll!UGameEngine::Tick(float DeltaSeconds=0.0134023018, bool bIdleMode=false) Line 1794
UE4Editor-Win64-Debug.exe!FEngineLoop::Tick() Line 4836
UE4Editor-Win64-Debug.exe!EngineTick() Line 63
UE4Editor-Win64-Debug.exe!GuardedMain(const wchar_t * CmdLine=0x000002964c338500) Line 172
UE4Editor-Win64-Debug.exe!WinMain(HINSTANCE__ * hInInstance=0x00007ff768840000, HINSTANCE__ * hPrevInstance=0x0000000000000000, char * __formal=0x0000029648594866, int nCmdShow=10) Line 257
DumpPlatformAndAllocatorStats output on Android (Xiaomi Mi 10):
[2021.05.27-15.59.49:152][ 66]LogMemory: Platform Memory Stats for Android
[2021.05.27-15.59.49:152][ 66]LogMemory: Process Physical Memory: 1207.08 MB used, 1254.11 MB peak
[2021.05.27-15.59.49:152][ 66]LogMemory: Process Virtual Memory: 8984.62 MB used, 9077.56 MB peak
[2021.05.27-15.59.49:152][ 66]LogMemory: Physical Memory: 5445.78 MB used, 2177.80 MB free, 7623.57 MB total
[2021.05.27-15.59.49:153][ 66]LogMemory: Virtual Memory: 608.03 MB used, 1439.97 MB free, 2048.00 MB total
[2021.05.27-15.59.49:153][ 66]LogMemory: PageSize: 4096, BinnedPageSize: 65536, BinnedAllocationGranularity: 4096, AddressLimit: 8589934592
[2021.05.27-15.59.49:154][ 66]FMallocBinned2 Mem report
[2021.05.27-15.59.49:154][ 66]Constants.BinnedPageSize = 65536
[2021.05.27-15.59.49:154][ 66]Constants.BinnedAllocationGranularity = 4096
[2021.05.27-15.59.49:154][ 66]Small Pool Allocations: 388.752121mb (including block size padding)
[2021.05.27-15.59.49:155][ 66]Small Pool OS Allocated: 419.000000mb
[2021.05.27-15.59.49:155][ 66]Large Pool Requested Allocations: 204.530167mb
[2021.05.27-15.59.49:155][ 66]Large Pool OS Allocated: 205.332031mb
[2021.05.27-15.59.49:155][ 66]Requested Allocations: 204.530167mb
[2021.05.27-15.59.49:155][ 66]OS Allocated: 205.332031mb
[2021.05.27-15.59.49:155][ 66]PoolInfo: 1.687500mb
[2021.05.27-15.59.49:155][ 66]Hash: 0.003906mb
[2021.05.27-15.59.49:156][ 66]TLS: 0.066406mb
[2021.05.27-15.59.49:156][ 66]Total allocated from OS: 626.089844mb
[2021.05.27-15.59.49:156][ 66]Cached free OS pages: 3.894531mb
DumpPlatformAndAllocatorStats output on PC:
[2021.06.04-06.12.34:488][748]LogMemory: Platform Memory Stats for Windows
[2021.06.04-06.12.34:488][748]LogMemory: Process Physical Memory: 704.69 MB used, 775.71 MB peak
[2021.06.04-06.12.34:488][748]LogMemory: Process Virtual Memory: 784.52 MB used, 888.80 MB peak
[2021.06.04-06.12.34:488][748]LogMemory: Physical Memory: 24035.47 MB used, 8565.63 MB free, 32601.11 MB total
[2021.06.04-06.12.34:488][748]LogMemory: Virtual Memory: 134206408.00 MB used, 11316.60 MB free, 134217728.00 MB total
[2021.06.04-06.12.34:489][748]FMallocBinned2 Mem report
[2021.06.04-06.12.34:489][748]Constants.BinnedPageSize = 65536
[2021.06.04-06.12.34:489][748]Constants.BinnedAllocationGranularity = 4096
[2021.06.04-06.12.34:489][748]Small Pool Allocations: 130.058121mb (including block size padding)
[2021.06.04-06.12.34:489][748]Small Pool OS Allocated: 157.312500mb
[2021.06.04-06.12.34:489][748]Large Pool Requested Allocations: 141.529739mb
[2021.06.04-06.12.34:489][748]Large Pool OS Allocated: 141.667969mb
[2021.06.04-06.12.34:489][748]Requested Allocations: 141.529739mb
[2021.06.04-06.12.34:489][748]OS Allocated: 141.667969mb
[2021.06.04-06.12.34:489][748]PoolInfo: 0.500000mb
[2021.06.04-06.12.34:489][748]Hash: 0.007813mb
[2021.06.04-06.12.34:489][748]TLS: 0.128906mb
[2021.06.04-06.12.34:489][748]Total allocated from OS: 299.617188mb
[2021.06.04-06.12.34:490][748]Cached free OS pages: 34.992188mb