Unity的Forward+ FPTL光照剔除解析(四)
序言
看完上一节基本上HDRP的光照剔除数据的流程就写完了,这一节主要是解析适用于透明队列的Cluster光照剔除。
为了让Cluster的划分更加均匀,Cluster的光照剔除也同样借助了PreDepth的深度图。
RenderGraph Dispatch
这里的buildPerVoxelLightListKernel有很多不同的变体,取决于是否读取Depth(用来计算suggestedBase以划分Cluster)、是否使用Big Tile Prepass、是否开启MSAA,以及投影是否为斜投影(Oblique)。
这里主要解析的Kernel是TileLightListGen_DepthRT_SrcBigTile。
// Tile size used by the clustered light list; the screen width is divided by it to get the tile count.
public static int s_TileSizeClustered = 32;
/// <summary>
/// Returns the number of clustered tiles along the X axis for the given camera,
/// i.e. the camera's screen width divided by the clustered tile size via HDUtils.DivRoundUp.
/// </summary>
/// <param name="hdCamera">Camera whose screen size is used.</param>
static int GetNumTileClusteredX(HDCamera hdCamera)
{
    int screenWidth = (int)hdCamera.screenSize.x;
    int tileSize = LightDefinitions.s_TileSizeClustered;
    return HDUtils.DivRoundUp(screenWidth, tileSize);
}
// Excerpt: fills the clustered-light-list fields of passData. Unrelated parts of the method are elided ("...").
unsafe void PrepareBuildGPULightListPassData(
RenderGraph renderGraph,
RenderGraphBuilder builder,
HDCamera hdCamera,
TileAndClusterData tileAndClusterData,
ref ShaderVariablesLightList constantBuffer,
int totalLightCount,
TextureHandle depthStencilBuffer,
TextureHandle stencilBufferCopy,
GBufferOutput gBuffer,
BuildGPULightListPassData passData)
{
...
// Cluster
bool msaa = hdCamera.msaaEnabled;
// The cluster pass uses the big-tile list as its prepass source only when the BigTilePrepass frame setting is enabled.
var clustPrepassSourceIdx = hdCamera.frameSettings.IsEnabled(FrameSettingsField.BigTilePrepass) ? ClusterPrepassSource.BigTile : ClusterPrepassSource.None;
// Default to no depth input; when the cluster pass needs depth, pick the MSAA or non-MSAA depth source.
var clustDepthSourceIdx = ClusterDepthSource.NoDepth;
if (tileAndClusterData.clusterNeedsDepth)
clustDepthSourceIdx = msaa ? ClusterDepthSource.MSAA_Depth : ClusterDepthSource.Depth;
passData.buildPerVoxelLightListShader = buildPerVoxelLightListShader;
passData.clearClusterAtomicIndexShader = clearClusterAtomicIndexShader;
// Pick the kernel variant by (prepass source, depth source); oblique projections use their own kernel table.
// Author's analogy: comparable to enabling shader macros/permutations in UE.
passData.buildPerVoxelLightListKernel = isProjectionOblique ? s_ClusterObliqueKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx] : s_ClusterKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx];
// Clustered tile counts along X and Y; VoxelLightListGeneration passes them as the dispatch group counts.
passData.numTilesClusterX = GetNumTileClusteredX(hdCamera);
passData.numTilesClusterY = GetNumTileClusteredY(hdCamera);
passData.clusterNeedsDepth = tileAndClusterData.clusterNeedsDepth;
...
}
/// <summary>
/// Records the compute work that builds the per-voxel (clustered) light list:
/// clears the global atomic allocation index, binds the build kernel's resources,
/// and dispatches one thread group per clustered tile and per view.
/// Does nothing when <c>data.runLightList</c> is false.
/// </summary>
/// <param name="data">Pass data holding the shaders, kernel index, buffers and tile counts.</param>
/// <param name="cmd">Command buffer the commands are recorded into.</param>
static void VoxelLightListGeneration(BuildGPULightListPassData data, CommandBuffer cmd)
{
    if (!data.runLightList)
        return;

    // Clear the atomic offset index with the dedicated clear shader.
    cmd.SetComputeBufferParam(data.clearClusterAtomicIndexShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);
    cmd.DispatchCompute(data.clearClusterAtomicIndexShader, s_ClearVoxelAtomicKernel, 1, 1, 1);

    var shader = data.buildPerVoxelLightListShader;
    var kernel = data.buildPerVoxelLightListKernel;

    // Every binding below targets the kernel that is actually dispatched. The previous code additionally bound
    // g_LayeredSingleIdxBuffer on this shader using s_ClearVoxelAtomicKernel, a kernel index that belongs to
    // clearClusterAtomicIndexShader; that stray binding is dropped because the correct one follows.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vLayeredLightList, data.output.perVoxelLightLists);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_LayeredOffset, data.output.perVoxelOffset);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);

    // The big-tile light list is only bound when the big-tile prepass ran.
    if (data.runBigTilePrepass)
        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);

    // Depth-dependent variants read the depth texture and write the per-tile log-base buffer.
    if (data.clusterNeedsDepth)
    {
        cmd.SetComputeTextureParam(shader, kernel, HDShaderIDs.g_depth_tex, data.depthBuffer);
        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_logBaseBuffer, data.output.perTileLogBaseTweak);
    }

    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

    ConstantBuffer.Push(cmd, data.lightListCB, shader, HDShaderIDs._ShaderVariablesLightList);

    // One thread group per clustered tile (X, Y) and per view (Z).
    cmd.DispatchCompute(shader, kernel, data.numTilesClusterX, data.numTilesClusterY, data.viewCount);
}
Initialize
首先依旧是跟之前的TileLightListGen类似,计算当前线程的Tile的映射关系要用到的数据.(Tile的X/Y轴上的数量,当前线程组对应的TileID)
#define TILE_SIZE_CLUSTERED (32)
// When data.clusterNeedsDepth == true, the kernel variant used is
// TileLightListGen_DepthRT_SrcBigTile (LIGHTLISTGEN=TileLightListGen_DepthRT_SrcBigTile ENABLE_DEPTH_TEXTURE_BACKPLANE),
// which amounts to: #define ENABLE_DEPTH_TEXTURE_BACKPLANE
#define ENABLE_DEPTH_TEXTURE_BACKPLANE
// Group-shared counter; the coarse-list build increments it once per light accepted for this tile.
groupshared uint lightOffs;
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
// Group-shared maximum depth of the tile, stored as the uint bit pattern of a float.
groupshared uint ldsZMax;
#endif
// Excerpt (initialization): derives the tile grid, this group's tile rectangle, and resets the group-shared state.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
// The group's z coordinate is used as the eye index and its xy as the tile index.
uint eyeIndex = u3GroupID.z;
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
// firstbithigh(32) = 5
const uint log2TileSize = firstbithigh(TILE_SIZE_CLUSTERED);
uint nrTilesX = ((uint)g_screenSize.x +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;// DivRoundUp(g_screenSize.x, 32)
uint nrTilesY = ((uint)g_screenSize.y +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;// DivRoundUp(g_screenSize.y, 32)
// Screen space coordinates of clustered tile
// Lower-left screen coordinate of the current tile.
uint2 viTilLL = TILE_SIZE_CLUSTERED*tileIDX;
// Upper-right screen coordinate of the current tile, clamped to the screen size.
uint2 viTilUR = min( viTilLL+uint2(TILE_SIZE_CLUSTERED,TILE_SIZE_CLUSTERED), uint2(g_screenSize.x, g_screenSize.y) ); // not width and height minus 1 since viTilUR represents the end of the tile corner.
// Thread 0 resets lightOffs and ldsZMax (as in TileLightListGen, the tile's max Z is computed later).
if(t==0)
{
lightOffs = 0;
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
ldsZMax = 0;
#endif
}
// Group sync after the reset; compiled in only when NR_THREADS exceeds PLATFORM_LANE_COUNT.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
...
}
ldsZMax
通过遍历Tile内的深度,得到linMaDist,最后再InterlockedMax Resolve得到ldsZMax(Tile内的Max Z)
// Clustered tile size; the depth loop below visits TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED pixel slots per tile.
#define TILE_SIZE_CLUSTERED (32)
// Depth values not below this threshold are treated as sky ("skydome") by the depth loop.
#define VIEWPORT_SCALE_Z (1)
// Same approach as lightlistbuild.compute: converts a depth-buffer value at a screen position
// into a linear depth using the selected eye's inverse screen projection matrix.
//   pixXY        - screen-space pixel position
//   zDptBufSpace - depth-buffer value (0 is near, 1 is far)
//   eyeIndex     - index into g_mInvScrProjectionArr
float GetLinearDepth(float2 pixXY, float zDptBufSpace, uint eyeIndex) // 0 is near 1 is far
{
    const float4x4 invScrProj = g_mInvScrProjectionArr[eyeIndex];

#ifdef USE_OBLIQUE_MODE
    // Oblique mode: run the full transform and divide z by w.
    const float2 zw = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0)).zw;
    return zw.x / zw.y;
#else
    // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
    // however this function must also work for orthographic projection so we keep it like this.
    const float2 zTerms = invScrProj[2].zw; // (m22, m23)
    const float2 wTerms = invScrProj[3].zw; // (m32, m33)
    const float numerator = zTerms.x * zDptBufSpace + zTerms.y;
    const float denominator = wTerms.x * zDptBufSpace + wTerms.y;
    return numerator / denominator;
#endif
}
// Excerpt (tile max depth): each thread scans part of the tile's depth values, then the per-thread
// maxima are merged into the group-shared ldsZMax.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
// establish max depth first
float linMaDist = 0.0;
// TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED = 32 * 32
// Walk the depth of the pixels inside the cluster tile; each thread starts at t and strides by NR_THREADS.
for (int idx = t; idx < (TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED); idx += NR_THREADS)
{
// Pixel coordinate inside the tile, clamped to the last valid screen pixel.
uint2 uPixCrd = min(uint2(viTilLL.x + (idx & (TILE_SIZE_CLUSTERED - 1)), viTilLL.y + (idx >> log2TileSize)), uint2(g_screenSize.x - 1, g_screenSize.y - 1));
// The MSAA path is commented out in this excerpt; only the single-sample path is shown.
//#ifdef MSAA_ENABLED
//for(int i=0; i<g_iNumSamplesMSAA; i++)
//{
//const float fDpth = FetchDepthMSAA(uPixCrd, i);
//const float2 fracSampleCoord = g_depth_tex.GetSamplePosition(i).xy; // this is optimized away when USE_OBLIQUE_MODE is NOT set.
//#else
const float fDpth = FetchDepth(uPixCrd);
const float2 fracSampleCoord = float2(0.5, 0.5);
//#endif
if (fDpth < VIEWPORT_SCALE_Z) // if not skydome
{
float linZ = GetLinearDepth(uPixCrd + fracSampleCoord, fDpth, eyeIndex);
#if USE_LEFT_HAND_CAMERA_SPACE
float linDistZ = linZ;
#else
float linDistZ = -linZ;
#endif
// Keep the largest linear distance seen by this thread.
linMaDist = max(linDistZ, linMaDist);
}
//#ifdef MSAA_ENABLED
//}
//#endif
}
// Merge every thread's linMaDist into ldsZMax with InterlockedMax.
// linMaDist is clamped to >= 0 first and its bits are then compared as a uint.
linMaDist = max(linMaDist, 0.0);
InterlockedMax(ldsZMax, asuint(linMaDist));
// Group sync between the InterlockedMax writes above and the ldsZMax read below.
// (The article's author found this barrier puzzling.)
// NOTE(review): presumably required so that all threads' maxima have landed before the read - confirm.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
linMaDist = asfloat(ldsZMax);
//if (fDpth < VIEWPORT_SCALE_Z)
// A tile max of zero means no pixel passed the sky test, so fall back to the far plane.
if (linMaDist <= 0.0)
linMaDist = g_fFarPlane; // assume sky pixel
#endif
...
}
Build coarse list,SphericalIntersectionTests
跟lightlistbuild.compute一样,这里也同样可以借用Big Tile的计算结果(g_vBigTileLightList),只遍历Big Tile内的灯光列表来Build coarseList
然后SphericalIntersectionTests Tile内的灯光,剔除掉并没有与Tile相交的灯光(DoesSphereOverlapTile)
// Excerpt (coarse list): collects the lights whose screen-space AABB overlaps this tile into coarseList,
// then optionally refines the list with the spherical intersection test.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
// 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
float2 vTileLL = float2(viTilLL.x / g_screenSize.x, viTilLL.y / g_screenSize.y);
float2 vTileUR = float2(viTilUR.x / g_screenSize.x, viTilUR.y / g_screenSize.y);
// build coarse list using AABB
#ifdef USE_TWO_PASS_TILED_LIGHTING
// Map tileIDX to the index of the big tile that contains it.
const uint log2BigTileToClustTileRatio = firstbithigh(TILE_SIZE_BIG_TILE) - log2TileSize;
int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
const int bigTileBase = eyeIndex * NrBigTilesX * NrBigTilesY;
const int bigTileIdx = bigTileBase + ((tileIDX.y >> log2BigTileToClustTileRatio) * NrBigTilesX) + (tileIDX.x >> log2BigTileToClustTileRatio); // map the idx to 64x64 tiles
// Entry 0 of a big tile's list is its light count; the light indices follow from entry 1.
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + 0];
for (int l0 = (int)t; l0 < (int)nrBigTileLights; l0 += NR_THREADS)
{
int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + l0 + 1];
#else
// Without the big-tile prepass, every visible light is a candidate.
for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
{
#endif
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;
// The light's AABB overlaps the tile rectangle.
if (all(vMa > vTileLL) && all(vMi < vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
// Reserve a slot; lights beyond MAX_NR_COARSE_ENTRIES are counted but not stored.
InterlockedAdd(lightOffs, uInc, uIndex);
if (uIndex < MAX_NR_COARSE_ENTRIES)
coarseList[uIndex] = l; // add to light list
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
// Pass the tile's centre pixel (clamped to the screen) to the spherical test; it returns the new light count.
iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(TILE_SIZE_CLUSTERED / 2,TILE_SIZE_CLUSTERED / 2), uint2(g_screenSize.x - 1, g_screenSize.y - 1))),
eyeIndex);
#endif
...
}
根据Tile内的linMaDist分割Cluster
以Tile内的最大线性深度linMaDist(即fTileFarPlane)为依据来划分Cluster:SuggestLogBase50会生成一个对数底数suggestedBase,使得大约一半的Cluster落在近平面到Tile内最远不透明深度之间。fTileFarPlane离得越近,suggestedBase越大,越多的Cluster Index会分布在靠前的深度上(提高Cluster的利用率)。
函数图像:SuggestLogBase50
float suggested_base = pow((1.0 + sqrt(max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance)))) / (2.0 * rangeFittedDistance), 2.0 / C);
可以化简为:令d=rangeFittedDistance
\(\begin{cases}
suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.5 \\
suggestedBase=1 & \text{ if } d\ge0.5
\end{cases}\)
再经过max(g_fClustBase, suggested_base)(其中g_fClustBase=1.02f)之后,suggestedBase变为:
\(\begin{cases}
suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.3466 \\
suggestedBase=1.02 & \text{ if } d\ge0.3466
\end{cases}\)
函数图像:SnapToClusterIdxFlex
f1(x,t)被限制在了[1.02,1.68]
1.68是代入rangeFittedDistance的最小值FLT_EPS计算得到的suggestedBase。
f2(x),f3(x)就是在演示suggestedBase在[1.02,1.68]之间滑动对SnapToClusterIdxFlex的影响。
可以看到当f1(x,t)从1.02变化到1.68的时候,由原本接近线性分布,变成了log曲线一样,使得更多的Index分布了在前面的深度。
// Logarithm of x in base b, computed as log2(x) / log2(b).
float LogBase(float x, float b)
{
    const float log2OfX = log2(x);
    const float log2OfBase = log2(b);
    return log2OfX / log2OfBase;
}
// Maps a view-space depth to a cluster slice index in [0, C - 1], where C = 1 << g_iLog2NumClusters.
// The depth is normalised to the near/far range, used to interpolate between 1 and suggestedBase^C,
// and the cluster index is the base-suggestedBase logarithm of that value.
// logBasePerTile is not read by the current formula; only the superseded, commented-out variant used it.
int SnapToClusterIdxFlex(float z_in, float suggestedBase, bool logBasePerTile)
{
#if USE_LEFT_HAND_CAMERA_SPACE
    const float viewDepth = z_in;
#else
    const float viewDepth = -z_in;
#endif

    // Superseded formulation (using the inverse of the geometric series), kept for reference:
    //float userscale = g_fClustScale;
    //if (logBasePerTile)
    //    userscale = GetScaleFromBase(suggestedBase);
    //const float dist = max(0, z - g_fNearPlane);
    //return (int)clamp(log2(dist * userscale * (suggestedBase - 1.0f) + 1) / log2(suggestedBase), 0.0, (float)((1 << g_iLog2NumClusters) - 1));

    const int clusterCount = 1 << g_iLog2NumClusters;
    const float depthPastNear = max(0, viewDepth - g_fNearPlane);
    const float rangeFittedDistance = depthPastNear / (g_fFarPlane - g_fNearPlane);
    const float baseToClusterCount = PositivePow(suggestedBase, (float) clusterCount);
    const float fittedValue = lerp(1.0, baseToClusterCount, rangeFittedDistance);
    const float unclampedIdx = LogBase(fittedValue, suggestedBase);
    return (int)clamp(unclampedIdx, 0.0, (float)(clusterCount - 1));
}
// Convenience wrapper around SnapToClusterIdxFlex: the per-tile log-base flag is fixed at compile time
// by whether ENABLE_DEPTH_TEXTURE_BACKPLANE is defined.
int SnapToClusterIdx(float z_in, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    return SnapToClusterIdxFlex(z_in, suggestedBase, true);
#else
    return SnapToClusterIdxFlex(z_in, suggestedBase, false);
#endif
}
// generate a log-base value such that half of the clusters are consumed from near plane to max. opaque depth of tile.
// The result is never smaller than g_fClustBase (1.02f according to the article's author).
float SuggestLogBase50(float tileFarPlane)
{
    const float C = (float)(1 << g_iLog2NumClusters);

    // Tile far plane normalised to the near/far range, kept away from zero because it is a divisor below.
    const float normalizedFar = (tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane);
    const float rangeFittedDistance = clamp(normalizedFar, FLT_EPS, 1.0);

    const float underRoot = max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance));
    const float ratio = (1.0 + sqrt(underRoot)) / (2.0 * rangeFittedDistance);
    const float suggested_base = pow(ratio, 2.0 / C);

    return max(g_fClustBase, suggested_base);
}
#define MAX_NR_COARSE_ENTRIES 128
// The min/max cluster indices of two lights are packed into one uint, hence 128 / 2 entries.
groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES / 2];
// Excerpt (cluster ranges): chooses the tile's log base, sorts the coarse list and stores, for every coarse
// light, the min/max cluster index it spans in depth.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
// With a depth back-plane, the tile's max depth drives the per-tile log base.
float fTileFarPlane = linMaDist;
float suggestedBase = SuggestLogBase50(fTileFarPlane);
#else // ENABLE_DEPTH_TEXTURE_BACKPLANE
float fTileFarPlane = g_fFarPlane;
float suggestedBase = g_fClustBase;
#endif
// EXACT_EDGE_TESTS is not enabled (its #define is commented out).
// //#define EXACT_EDGE_TESTS
#ifdef EXACT_EDGE_TESTS
iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane, eyeIndex);
#endif
// Per the article's author, the light-index sort here is the same bitonic sort described in the previous part.
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if NR_THREADS > PLATFORM_LANE_COUNT
SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
#endif
//////////// cell specific code
// Convert the .w of each light's min/max bounds entry (view-space z per the article's author) into a
// cluster index with SnapToClusterIdx and the suggestedBase chosen above.
{
// Each iteration handles a pair of lights; for an odd count the last pair repeats the final light.
for (int l = (int)t; l < ((iNrCoarseLights + 1) >> 1); l += NR_THREADS)
{
const int l0 = coarseList[2 * l + 0], l1 = coarseList[min(2 * l + 1, iNrCoarseLights - 1)];
const ScreenSpaceBoundsIndices l0Bounds = GenerateScreenSpaceBoundsIndices(l0, g_iNrVisibLights, eyeIndex);
const ScreenSpaceBoundsIndices l1Bounds = GenerateScreenSpaceBoundsIndices(l1, g_iNrVisibLights, eyeIndex);
// Indices are capped at 255 so that each fits in 8 bits.
const unsigned int clustIdxMi0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.min].w, suggestedBase));
const unsigned int clustIdxMa0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.max].w, suggestedBase));
const unsigned int clustIdxMi1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.min].w, suggestedBase));
const unsigned int clustIdxMa1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.max].w, suggestedBase));
// Pack both lights' (min, max) cluster indices into one uint: light 0 in the low 16 bits, light 1 in the high 16 bits.
clusterIdxs[l] = (clustIdxMa1 << 24) | (clustIdxMi1 << 16) | (clustIdxMa0 << 8) | (clustIdxMi0 << 0);
}
}
// Group sync before clusterIdxs is read.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
...
}
统计各个Cluster内的灯光数量[iSpaceAvail]
上面只是简单地Test了灯光是否在Cluster内,这还不够精准,还需要检测构成Cluster的顶点是否与灯光Volume相交(CheckIntersection)。
如果相交了,灯光的Index才最终加入到g_vLayeredLightList里。
相对应的lightCategory也要计数+1
#define NR_THREADS 64
#define LIGHTCATEGORY_COUNT 5
// One thread handles one cluster, so this scratch holds one light count per (thread, light category).
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
// Per (thread, light category) index shift; the kernel fills it from
// _EnvLightIndexShift / _DecalIndexShift / _LocalVolumetricFogIndexShift.
// Author's remark: nothing else is done with these values, so they look redundant; reading the
// constant-buffer variables directly might be enough.
groupshared int shiftIndexScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
// 4 lights x 6 planes; each plane is a float4(vN.xyz, -dot(vN, p0)).
groupshared float4 lightPlanes[4 * 6];// Each plane is defined by a float4. 6 planes per light, 4 lights (24 planes)
// Coarse depth test: true when cluster index k lies inside the [min, max] cluster range stored for
// coarse light l. Two lights share one clusterIdxs entry, 16 bits each (min in the low byte, max in the high byte).
bool CheckIntersectionBasic(int l, int k)
{
    const unsigned int packedPair = clusterIdxs[l >> 1];
    const unsigned int lightRange = (packedPair >> (16 * (l & 1))) & 0xffff;
    const unsigned int minCluster = (lightRange >> 0) & 0xff;
    const unsigned int maxCluster = (lightRange >> 8) & 0xff;
    const uint cluster = (uint)k;
    return minCluster <= cluster && cluster <= maxCluster;
}
// Clears the calling thread's per-category light counters and index shifts.
void ZeroCategoryListCountAndShiftIndex(uint threadIdx)
{
    const uint firstSlot = threadIdx * LIGHTCATEGORY_COUNT;
    for (int category = 0; category < LIGHTCATEGORY_COUNT; ++category)
    {
        const uint slot = firstSlot + category;
        categoryListCountScratch[slot] = 0;
        shiftIndexScratch[slot] = 0;
    }
}
// Stores the index shift for one light category (index) in the calling thread's scratch row.
void WriteShiftIndex(uint threadIdx, uint index, int value)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    shiftIndexScratch[slot] = value;
}
// Adds one to the calling thread's light counter for the given light category (index).
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}
// Excerpt (per-cluster lists): every thread owns one cluster, reserves space for it in the global light
// list and writes the lights that pass the intersection test.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
// Per the article's author: g_iLog2NumClusters = 6, so nrClusters = 64.
int nrClusters = (1 << g_iLog2NumClusters);
//////////////////////////////////////////////////////////
uint start = 0;
int i = (int)t;
int iSpaceAvail = 0;
int iSum = 0;
if (i < nrClusters)
{
// Each thread checks it's respective cluster against all coarse lights for intersection.
// At the end, 'iSum' represents the number of lights that intersect this cluster!
for (int l = 0; l < iNrCoarseLights; l++)
{
iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
}
// We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
// want to allocate out of g_LayeredSingleIdxBuffer.
iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory
// The InterlockedAdd returns the counter's previous value in 'start': the amount already reserved by
// other clusters, which becomes this cluster's first slot in g_vLayeredLightList.
// iSpaceAvail is the number of slots reserved for the current cluster.
// Author's remark: no other use of g_LayeredSingleIdxBuffer[0] was found; it accumulates the total
// number of light indices over all clusters.
}
// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
// to make it work correctly
ZeroCategoryListCountAndShiftIndex(t);
WriteShiftIndex(t, LIGHTCATEGORY_ENV, _EnvLightIndexShift);
WriteShiftIndex(t, LIGHTCATEGORY_DECAL, _DecalIndexShift);
WriteShiftIndex(t, LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG, _LocalVolumetricFogIndexShift);
uint offs = start;
// Walk the tile's coarse light list four lights at a time and append the intersecting ones to this cluster's list.
// Recall: int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES); so at most 128 lights.
for (int ll = 0; ll < iNrCoarseLights; ll += 4)
{
// Only the first 24 threads of the group fetch a plane: 4 lights x 6 planes per loop iteration.
int p = i >> 2;
int m = i & 3;
if (i < 24)
lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);
// Group sync before lightPlanes is read.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
// Test the lights whose planes were just fetched against this cluster (CheckIntersection tests the
// cluster's 8 corners against the light's planes).
for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
{
if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
{
const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
// Bump this cluster's counter for the light's category; these group-shared counters are the
// per-category light counts of the cluster.
IncrementCategoryListCount(t, lightCategory);
// As at the end of BuildPerTileLightList in the previous part, subtract the category's shift so the
// stored value indexes into that category's own data.
g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
}
}
// Group sync before the next iteration overwrites lightPlanes.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
}
...
}
Fetch Plane
这里的FetchPlane函数依旧是使用LightingConvexHullUtils.hlsl里面的,
[GetHullPlane]根据不同面序号返回对应的平面上的一点以及平面的法向。
[GetHullPlaneEq]后续为了判断点与平面的朝向(ToLeftTest)就预先构成float4(vN, -dot(vN,p0))这样的表示平面方式。
//LightingConvexHullUtils.hlsl
// Outputs a point (p0) on the hull face selected by sideIndex and that face's normal (n0).
// The hull is described by the box axes boxX/boxY/boxZ around center; scaleXY scales some of the
// face's edge vectors (see bIsTopQuad / bIsSideQuad below).
void GetHullPlane(out float3 p0, out float3 n0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
// Pairs of side indices share an axis: 0/1 -> 0, 2/3 -> 1, everything else -> 2.
//const int iAbsSide = (sideIndex == 0 || sideIndex == 1) ? 0 : ((sideIndex == 2 || sideIndex == 3) ? 1 : 2);
const int iAbsSide = min(sideIndex>>1, 2);
// Odd side indices use sign +1, even ones -1.
const float fS = (sideIndex & 1) != 0 ? 1 : (-1);
float3 vA = fS*(iAbsSide == 0 ? boxX : (iAbsSide == 1 ? (-boxY) : boxZ));
float3 vB = fS*(iAbsSide == 0 ? (-boxY) : (iAbsSide == 1 ? (-boxX) : (-boxY)));
float3 vC = iAbsSide == 0 ? boxZ : (iAbsSide == 1 ? boxZ : (-boxX));
// Author's remark: the top-quad condition is fairly abstract; simulating it once in C# is an easy way to follow it.
bool bIsTopQuad = iAbsSide == 2 && (sideIndex & 1) != 0; // in this case all 4 verts get scaled.
bool bIsSideQuad = (iAbsSide == 0 || iAbsSide == 1); // if side quad only two verts get scaled (impacts q1 and q2)
if (bIsTopQuad)
{
vB *= scaleXY.y;
vC *= scaleXY.x;
}
float3 vA2 = vA;
float3 vB2 = vB;
if (bIsSideQuad)
{
vA2 *= (iAbsSide == 0 ? scaleXY.x : scaleXY.y);
vB2 *= (iAbsSide == 0 ? scaleXY.y : scaleXY.x);
}
float3 vN = cross(vB2, 0.5 * (vA - vA2) - vC); // +/- normal
float3 v0 = vA + vB - vC; // vector from center to p0
p0 = center + v0; // center + vA is center of face when scaleXY is 1.0
// Flip vN when it points against v0 (the vector from center to p0), so the normal faces away from the center.
n0 = dot(vN,v0) < 0.0 ? (-vN) : vN;
}
// Returns the plane of the hull face selected by sideIndex as float4(n, -dot(n, p)), where n and p are
// the normal and the point produced by GetHullPlane.
float4 GetHullPlaneEq(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    float3 pointOnPlane;
    float3 normal;
    GetHullPlane(pointOnPlane, normal, boxX, boxY, boxZ, center, scaleXY, sideIndex);

    const float planeOffset = -dot(normal, pointOnPlane);
    return float4(normal, planeOffset);
}
CheckIntersection
判断Cluster与灯光是否相交,有两个判断方式,
一个就是用前面计算的Cluster Index范围做简单的判断,
第二个就是用Cluster ID计算出构成Cluster的八个顶点与灯光平面的几何关系。
// Returns the Z at which cluster slice k starts, for the given log base (the per-tile suggestedBase
// derived from the tile's ldsZMax). The start of slice k + 1 is the far plane of slice k.
// The sign of the result follows the camera-space handedness.
// logBasePerTile is not read by the current formula; only the superseded, commented-out variant used it.
float ClusterIdxToZFlex(int k, float suggestedBase, bool logBasePerTile)
{
    // Superseded formulation, kept for reference:
    //float userscale = g_fClustScale;
    //if (logBasePerTile)
    //    userscale = GetScaleFromBase(suggestedBase);
    //float dist = (PositivePow(suggestedBase, (float)k) - 1.0) / (userscale * (suggestedBase - 1.0f));
    //res = dist + g_fNearPlane;

    const float C = (float)(1 << g_iLog2NumClusters);
    const float sliceTerm = PositivePow(suggestedBase, (float)k) - 1.0;
    const float fullTerm = PositivePow(suggestedBase, C) - 1.0;
    const float rangeFittedDistance = sliceTerm / fullTerm;
    const float depth = lerp(g_fNearPlane, g_fFarPlane, rangeFittedDistance);

#if USE_LEFT_HAND_CAMERA_SPACE
    return depth;
#else
    return -depth;
#endif
}
// Convenience wrapper around ClusterIdxToZFlex: the per-tile log-base flag is fixed at compile time
// by whether ENABLE_DEPTH_TEXTURE_BACKPLANE is defined.
float ClusterIdxToZ(int k, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    return ClusterIdxToZFlex(k, suggestedBase, true);
#else
    return ClusterIdxToZFlex(k, suggestedBase, false);
#endif
}
// Decides whether coarse light l intersects cluster k of the tile spanning viTilLL..viTilUR.
// Step 1: cheap range test - k must lie inside the light's stored [min, max] cluster range.
// Step 2 (CONV_HULL_TEST_ENABLED only): the light is rejected if all 8 corners of the cluster lie on
//         the positive side of any one of the light's 6 planes held in lightPlanes.
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase, uint eyeIndex)
{
    // If this light's screen space depth bounds intersect this cluster...simple cluster test
    const unsigned int lightRange = (clusterIdxs[l >> 1] >> (16 * (l & 1))) & 0xffff;
    const uint cluster = (uint)k;
    bool intersects = ((lightRange >> 0) & 0xff) <= cluster && cluster <= ((lightRange >> 8) & 0xff);

#ifdef CONV_HULL_TEST_ENABLED
    if (intersects)
    {
        const float nearZ = ClusterIdxToZ(k, suggestedBase);
        const float farZ = ClusterIdxToZ(k + 1, suggestedBase);

        for (int planeIdx = 0; planeIdx < 6; planeIdx++)
        {
            // plane = float4(vN.xyz, -dot(vN, p0))
            const float4 plane = lightPlanes[6 * (l & 3) + planeIdx];

            bool allCornersOutside = true;
            for (int corner = 0; corner < 8; corner++)
            {
                // Bits 0/1/2 of the corner index select the low or high end along x/y/z.
                const float cornerX = (corner & 1) == 0 ? viTilLL.x : viTilUR.x;
                const float cornerY = (corner & 2) == 0 ? viTilLL.y : viTilUR.y;
                const float cornerZ = (corner & 4) == 0 ? nearZ : farZ;

                // View-space position of the corner from its screen coordinate and linear depth.
                const float3 cornerViewPos = GetViewPosFromLinDepth(float2(cornerX, cornerY), cornerZ, eyeIndex);

                // Test each corner of the cluster against the light bounding box planes:
                // dot(plane, float4(P, 1)) > 0 is equivalent to dot(vN, P) > dot(vN, p0).
                allCornersOutside = allCornersOutside && dot(plane, float4(cornerViewPos, 1.0)) > 0;
            }

            // One plane with every corner on its positive side is enough to reject the light.
            if (allCornersOutside)
                intersects = false;
        }
    }
#endif

    return intersects;
}
Final Resolve
上面的start值记录的是当前Cluster在g_vLayeredLightList记录LightData的起始Index。
categoryListCountScratch也记录了Cluster不同的Category的LightData Count,
所以我们可以通过start以及对应的Category的LightData Count就可以在g_vLayeredLightList中寻址得到对应的Category LightData Index
// Index into the layered offset buffer for (eye, light category, cluster, tile).
// Each eye is split into category, cluster, y, x - with x varying fastest.
uint GenerateLayeredOffsetBufferIndex(uint lightCategory, uint2 tileIndex, uint clusterIndex, uint numTilesX, uint numTilesY, int numClusters, uint eyeIndex)
{
    // Offset inside one eye's block.
    const uint layer = lightCategory * numClusters + clusterIndex;
    const uint row = layer * numTilesY + tileIndex.y;
    int lightOffset = row * numTilesX + tileIndex.x;

    // Size of one eye's block times the eye index.
    uint eyeOffset = eyeIndex * LIGHTCATEGORY_COUNT * numClusters * numTilesX * numTilesY;

    return (eyeOffset + lightOffset);
}
// 67108863 = (1 << 26) - 1: mask for the low 26 bits, which hold the offset.
#define LIGHT_CLUSTER_PACKING_OFFSET_MASK (67108863)
// 63: largest count that can be stored; the packer clamps the count to it.
#define LIGHT_CLUSTER_PACKING_COUNT_MASK (63)
// Number of bits used by the offset; the count is stored above them.
#define LIGHT_CLUSTER_PACKING_OFFSET_BITS (26)
// Packs a list offset and a light count into one uint: the offset is masked with
// LIGHT_CLUSTER_PACKING_OFFSET_MASK and the count, clamped to LIGHT_CLUSTER_PACKING_COUNT_MASK,
// is shifted left by LIGHT_CLUSTER_PACKING_OFFSET_BITS.
uint PackClusterLayeredOffset(uint offset, uint count)
{
    const uint offsetField = offset & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
    const uint clampedCount = min(count, LIGHT_CLUSTER_PACKING_COUNT_MASK);
    const uint countField = clampedCount << LIGHT_CLUSTER_PACKING_OFFSET_BITS;
    return offsetField | countField;
}
// Per-category light counters for the clusters of the current tile (64 clusters per the article's author),
// laid out as one row of LIGHTCATEGORY_COUNT counters per thread.
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
// Adds one to the calling thread's light counter for the given light category (index).
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}
// Returns the calling thread's light counter for the given light category (index).
int ReadCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    return categoryListCountScratch[slot];
}
// Index into the log-base buffer, which holds one suggestedBase per tile and per eye:
// row-major tile index inside the eye plus eyeIndex * (tiles per eye).
uint GenerateLogBaseBufferIndex(uint2 tileIndex, uint numTilesX, uint numTilesY, uint eyeIndex)
{
    const uint tilesPerEye = numTilesX * numTilesY;
    const uint tileInEye = (tileIndex.y * numTilesX) + tileIndex.x;
    return eyeIndex * tilesPerEye + tileInEye;
}
// Excerpt (final resolve): after the per-cluster light lists are written, stores for every
// (category, cluster) the packed start offset and light count in g_LayeredOffset, and the tile's
// log base in g_logBaseBuffer.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
uint start = 0;
int i = (int)t;
int iSpaceAvail = 0;
int iSum = 0;
if (i < nrClusters)
{
// Each thread checks it's respective cluster against all coarse lights for intersection.
// At the end, 'iSum' represents the number of lights that intersect this cluster!
for (int l = 0; l < iNrCoarseLights; l++)
{
iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
}
// We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
// want to allocate out of g_LayeredSingleIdxBuffer.
iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
// Conceptually the InterlockedAdd below does the following two steps as one atomic operation:
//start = g_LayeredSingleIdxBuffer[0];
//InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail);
InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory
// 'start' receives the counter's previous value: the amount already reserved by other clusters.
// iSpaceAvail is the number of slots reserved for the current cluster.
// Author's remark: no other use of g_LayeredSingleIdxBuffer[0] was found; it accumulates the total
// number of light indices over all clusters.
}
...
// 'start' is this cluster's first slot in g_vLayeredLightList; it is also what the jump table
// (g_LayeredOffset) stores further below.
uint offs = start;
for (int ll = 0; ll < iNrCoarseLights; ll += 4)
{
// The first 24 threads fetch the 6 planes of the next 4 lights.
int p = i >> 2;
int m = i & 3;
if (i < 24)
lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
{
// iSpaceAvail is the number of lights that passed CheckIntersectionBasic, so it bounds the writes.
if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
{
// Lights of the different categories are stored contiguously; the per-category counts read back
// through ReadCategoryListCount are what later separates the categories.
const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
IncrementCategoryListCount(t, lightCategory);
g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
}
...
uint localOffs = 0;
// The cluster's first g_LayeredOffset index is computed for category 0 (LIGHTCATEGORY_PUNCTUAL per the
// article's author); the loop then steps to the next category with
// offs += (nrClusters * nrTilesX * nrTilesY).
// nrClusters = 64 per the article's author.
offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);
for (int category = 0; category < LIGHTCATEGORY_COUNT; category++)
{
// Light count of this category in the current cluster.
int numLights = ReadCategoryListCount(t, category);
if (i < nrClusters)
{
// start + localOffs : first index of this category's lights in g_vLayeredLightList
// numLights         : number of lights of this category in the cluster
// g_vLayeredLightList stores the light indices; g_LayeredOffset is the jump table into it.
g_LayeredOffset[offs] = PackClusterLayeredOffset((start + localOffs), (uint)numLights);
offs += (nrClusters * nrTilesX * nrTilesY);
localOffs += numLights; // use unclamped count for localOffs
}
}
// The light loop needs the tile's cluster-partition parameter (suggestedBase) to map a depth to a
// cluster index, so thread 0 stores it per tile.
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIDX, nrTilesX, nrTilesY, eyeIndex);
if (threadID == 0)
g_logBaseBuffer[logBaseIndex] = suggestedBase;
#endif
...
}
LightLoop
类似FPTL,Cluster这里也同样调用的是GetCountAndStart接口获取当前Cluster的lightData Count以及start的Index,
FetchIndex获取最终的LightData Index。
最终的流程如下:
1.positionInput的TileCoord + Depth.z计算出当前片元归属的clusterIdx
2.clusterIdx + 当前要计算的category (GenerateLayeredOffsetBufferIndex)计算当前cluster在g_vLayeredOffsetsBuffer的idx
3.g_vLayeredOffsetsBuffer[idx]即为上面最后记录的用来跳转用的dataPair(PackClusterLayeredOffset)
4.UnpackClusterLayeredOffset得到start和lightCount
5.Lighting的时候从start开始FetchIndex就可以得到lightData的真正Index。
//LightLoopDef.hlsl
...
#elif defined(USE_CLUSTERED_LIGHTLIST)
#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ClusteredUtils.hlsl"
// Tile size for the clustered light list path: returns TILE_SIZE_CLUSTERED.
uint GetTileSize()
{
return TILE_SIZE_CLUSTERED;
}
// Returns the cluster slice index for a pixel at the given linear depth inside the given tile.
// When the log-base buffer is enabled the tile's own log base is read from g_logBaseBuffer;
// otherwise the global g_fClustBase is used.
uint GetLightClusterIndex(uint2 tileIndex, float linearDepth)
{
    float clusterBase = g_fClustBase;

    if (g_isLogBaseBufferEnabled)
    {
        const uint tileSlot = GenerateLogBaseBufferIndex(tileIndex, _NumTileClusteredX, _NumTileClusteredY, unity_StereoEyeIndex);
        clusterBase = g_logBaseBuffer[tileSlot];
    }

    const bool logBasePerTile = g_isLogBaseBufferEnabled != 0;
    return SnapToClusterIdxFlex(linearDepth, clusterBase, logBasePerTile);
}
// Splits a packed value into its two fields: the count is everything above
// LIGHT_CLUSTER_PACKING_OFFSET_BITS, the offset is the bits selected by LIGHT_CLUSTER_PACKING_OFFSET_MASK.
void UnpackClusterLayeredOffset(uint packedValue, out uint offset, out uint count)
{
    count = packedValue >> LIGHT_CLUSTER_PACKING_OFFSET_BITS;
    offset = packedValue & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
}
// Looks up the packed (start, count) entry of g_vLayeredOffsetsBuffer for the given tile, cluster and
// light category (current stereo eye) and unpacks it into start and lightCount.
void GetCountAndStartCluster(uint2 tileIndex, uint clusterIndex, uint lightCategory, out uint start, out uint lightCount)
{
    const int clusterCount = (1 << g_iLog2NumClusters);
    const int entryIdx = GenerateLayeredOffsetBufferIndex(lightCategory, tileIndex, clusterIndex, _NumTileClusteredX, _NumTileClusteredY, clusterCount, unity_StereoEyeIndex);

    const uint packedEntry = g_vLayeredOffsetsBuffer[entryIdx];
    UnpackClusterLayeredOffset(packedEntry, start, lightCount);
}
// PositionInputs overload: derives the tile from posInput.tileCoord and the cluster from
// posInput.linearDepth, then forwards to the index-based overload.
void GetCountAndStartCluster(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    // Note: XR depends on unity_StereoEyeIndex already being defined,
    // which means ShaderVariables.hlsl needs to be defined ahead of this!
    const uint2 tile = posInput.tileCoord;
    const uint cluster = GetLightClusterIndex(tile, posInput.linearDepth);

    GetCountAndStartCluster(tile, cluster, lightCategory, start, lightCount);
}
// Entry point used by the light loop on the clustered path: forwards to GetCountAndStartCluster,
// which outputs the first list index (start) and the number of lights (lightCount) of the given
// category for the cluster containing posInput.
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
GetCountAndStartCluster(posInput, lightCategory, start, lightCount);
}
// Reads the light index stored at position lightStart + lightOffset of g_vLightListCluster.
uint FetchIndex(uint lightStart, uint lightOffset)
{
    const uint listPosition = lightStart + lightOffset;
    return g_vLightListCluster[listPosition];
}
...