Unity的Forward+ FPTL光照剔除解析(四)

序言

看完上一节基本上HDRP的光照剔除数据的流程就写完了,这一节主要是解析适用于透明队列的Cluster光照剔除。
为了让Cluster的分布更加合理(提高Cluster的利用率),Cluster的光照剔除也同样借助了PreDepth的深度图。

RenderGraph Dispatch

这里的buildPerVoxelLightListKernel有很多不同的变体:根据是否开启Big Tile预处理(BigTilePrepass)、是否读取Depth(用于逐Tile计算suggestedBase来划分Cluster,开启MSAA时读取MSAA Depth)以及投影矩阵是否为Oblique来选择对应的Kernel。
这里主要解析的Kernel是TileLightListGen_DepthRT_SrcBigTile

// Tile size used by the clustered light list; the screen width is divided by it (rounded up) to get the tile count.
public static int s_TileSizeClustered = 32;

/// <summary>
/// Returns how many clustered tiles are needed to cover the camera's screen width,
/// rounding up so a partially covered last tile is still counted.
/// </summary>
/// <param name="hdCamera">Camera whose screenSize.x is divided into tiles.</param>
/// <returns>The tile count along X.</returns>
static int GetNumTileClusteredX(HDCamera hdCamera)
{
    int screenWidth = (int) hdCamera.screenSize.x;
    int tileSize = LightDefinitions.s_TileSizeClustered;
    return HDUtils.DivRoundUp(screenWidth, tileSize);
}

// Fills 'passData' for the GPU light list build pass. Only the clustered (per-voxel) part is shown;
// the rest of the method is elided ("...") in this excerpt.
unsafe void PrepareBuildGPULightListPassData(
    RenderGraph renderGraph,
    RenderGraphBuilder builder,
    HDCamera hdCamera,
    TileAndClusterData tileAndClusterData,
    ref ShaderVariablesLightList constantBuffer,
    int totalLightCount,
    TextureHandle depthStencilBuffer,
    TextureHandle stencilBufferCopy,
    GBufferOutput gBuffer,
    BuildGPULightListPassData passData)
{
    ...
    // Cluster
    bool msaa = hdCamera.msaaEnabled;
    // Prepass source: the big-tile light list when BigTilePrepass is enabled in the frame settings, otherwise none.
    var clustPrepassSourceIdx = hdCamera.frameSettings.IsEnabled(FrameSettingsField.BigTilePrepass) ? ClusterPrepassSource.BigTile : ClusterPrepassSource.None;
    // Depth source: no depth by default; when the cluster needs depth, pick the MSAA or the regular depth variant.
    var clustDepthSourceIdx = ClusterDepthSource.NoDepth;
    if (tileAndClusterData.clusterNeedsDepth)
        clustDepthSourceIdx = msaa ? ClusterDepthSource.MSAA_Depth : ClusterDepthSource.Depth;

    passData.buildPerVoxelLightListShader = buildPerVoxelLightListShader;
    passData.clearClusterAtomicIndexShader = clearClusterAtomicIndexShader;
    // The kernel variant is looked up in a 2D table indexed by (prepass source, depth source);
    // a separate table is used for oblique projections. Conceptually similar to selecting a shader permutation.
    // NOTE(review): isProjectionOblique is presumably computed in the elided code above - confirm.
    passData.buildPerVoxelLightListKernel = isProjectionOblique ? s_ClusterObliqueKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx] : s_ClusterKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx];
    // Dispatch dimensions: one thread group per clustered tile.
    passData.numTilesClusterX = GetNumTileClusteredX(hdCamera);
    passData.numTilesClusterY = GetNumTileClusteredY(hdCamera);
    passData.clusterNeedsDepth = tileAndClusterData.clusterNeedsDepth;

    ...
}


/// <summary>
/// Records the commands that build the clustered (per-voxel) light list: first the global
/// atomic allocation index is cleared, then the build kernel is bound and dispatched with
/// one thread group per clustered tile and per view.
/// </summary>
/// <param name="data">Pass data holding the shaders, kernel index, buffers and tile counts.</param>
/// <param name="cmd">Command buffer the commands are recorded into.</param>
static void VoxelLightListGeneration(BuildGPULightListPassData data, CommandBuffer cmd)
{
    // Nothing is recorded when light list generation is disabled.
    if (!data.runLightList)
        return;

    var clearShader = data.clearClusterAtomicIndexShader;
    var buildShader = data.buildPerVoxelLightListShader;
    var buildKernel = data.buildPerVoxelLightListKernel;

    // Reset the atomic offset index with a single-group dispatch.
    cmd.SetComputeBufferParam(clearShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);
    cmd.DispatchCompute(clearShader, s_ClearVoxelAtomicKernel, 1, 1, 1);

    // NOTE(review): this binding uses the clear kernel's index on the build shader; the same
    // buffer is bound again below with the build kernel. Kept as in the original - confirm intent.
    cmd.SetComputeBufferParam(buildShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);

    // Outputs of the build kernel plus the atomic index it allocates from.
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_vLayeredLightList, data.output.perVoxelLightLists);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_LayeredOffset, data.output.perVoxelOffset);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);

    // The big-tile light list is only bound when the big-tile prepass ran.
    if (data.runBigTilePrepass)
        cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);

    // Depth texture and per-tile log-base buffer are only bound when the cluster needs depth.
    if (data.clusterNeedsDepth)
    {
        cmd.SetComputeTextureParam(buildShader, buildKernel, HDShaderIDs.g_depth_tex, data.depthBuffer);
        cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_logBaseBuffer, data.output.perTileLogBaseTweak);
    }

    // Light bounds / volume inputs.
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
    cmd.SetComputeBufferParam(buildShader, buildKernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

    ConstantBuffer.Push(cmd, data.lightListCB, buildShader, HDShaderIDs._ShaderVariablesLightList);

    cmd.DispatchCompute(buildShader, buildKernel, data.numTilesClusterX, data.numTilesClusterY, data.viewCount);
}

Initialize

首先依旧是跟之前的TileLightListGen类似,计算当前线程的Tile的映射关系要用到的数据.(Tile的X/Y轴上的数量,当前线程组对应的TileID)

#define TILE_SIZE_CLUSTERED (32)
// Per the article: when data.clusterNeedsDepth is true, the TileLightListGen_DepthRT_SrcBigTile kernel is used.
// Its kernel declaration maps LIGHTLISTGEN to TileLightListGen_DepthRT_SrcBigTile and enables
// ENABLE_DEPTH_TEXTURE_BACKPLANE, which is equivalent to the #define below.
#define ENABLE_DEPTH_TEXTURE_BACKPLANE

// Group-shared counter, incremented atomically as lights are appended to the coarse list.
groupshared uint lightOffs;

#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
// Group-shared maximum linear depth of the tile, stored as the uint bit pattern of a float.
groupshared uint ldsZMax;
#endif

// Kernel entry point (excerpt: initialization). One thread group handles one clustered tile;
// u3GroupID.xy is the tile index and u3GroupID.z the eye/view index.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    uint eyeIndex = u3GroupID.z;

    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    //firstbithigh(32)=5
    const uint log2TileSize = firstbithigh(TILE_SIZE_CLUSTERED);
    // Number of tiles along each axis, rounded up.
    uint nrTilesX = ((uint)g_screenSize.x +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;//DivRoundUp(g_screenSize.x,32)
    uint nrTilesY = ((uint)g_screenSize.y +(TILE_SIZE_CLUSTERED-1))>>log2TileSize;//DivRoundUp(g_screenSize.y,32)

    // Screen space coordinates of clustered tile
    // Lower-left corner of the current tile, in pixels.
    uint2 viTilLL = TILE_SIZE_CLUSTERED*tileIDX;
    // Upper-right corner of the current tile, in pixels, clamped to the screen size.
    uint2 viTilUR = min( viTilLL+uint2(TILE_SIZE_CLUSTERED,TILE_SIZE_CLUSTERED), uint2(g_screenSize.x, g_screenSize.y) );       // not width and height minus 1 since viTilUR represents the end of the tile corner.
    
    // Thread 0 resets the group-shared accumulators (lightOffs, and ldsZMax when the tile max depth is needed).
    if(t==0)
    {
        lightOffs = 0;

#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
        ldsZMax = 0;
#endif
    }

    // Make the reset visible to the whole group; only compiled in when the group is larger than PLATFORM_LANE_COUNT.
#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    ...
}

ldsZMax

通过遍历Tile内的深度,得到linMaDist,最后再InterlockedMax Resolve得到ldsZMax(Tile内的Max Z)

// Side length of a clustered tile, in pixels.
#define TILE_SIZE_CLUSTERED (32)
// Depth values below this threshold are treated as geometry ("not skydome") by the kernel's depth loop.
#define VIEWPORT_SCALE_Z (1)

// Converts a depth-buffer value sampled at screen position pixXY into a linear depth,
// using the inverse screen projection matrix of the given eye.
// zDptBufSpace: depth-buffer value, 0 is near and 1 is far.
float GetLinearDepth(float2 pixXY, float zDptBufSpace, uint eyeIndex)
{
    const float4x4 invScrProj = g_mInvScrProjectionArr[eyeIndex];

#ifdef USE_OBLIQUE_MODE
    // Oblique projection: run the full matrix multiply and divide z by w.
    const float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
    return unprojected.z / unprojected.w;
#else
    // Only the z/w entries of rows 2 and 3 are used. For a perspective projection m22 is zero
    // and m23 is +1/-1 (depending on handedness), but the general form is kept so that
    // orthographic projection works as well.
    const float zNumerator = invScrProj[2].z * zDptBufSpace + invScrProj[2].w;
    const float wDenominator = invScrProj[3].z * zDptBufSpace + invScrProj[3].w;
    return zNumerator / wDenominator;
#endif
}

// Kernel entry point (excerpt: tile max depth). Each thread scans a strided subset of the tile's
// pixels, then the per-thread maxima are merged into the group-shared ldsZMax.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    // establish max depth first
    float linMaDist = 0.0;

    // TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED = 32 * 32 pixels per tile.
    // Walk the tile's pixels; thread t handles pixels t, t + NR_THREADS, ...
    for (int idx = t; idx < (TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED); idx += NR_THREADS)
    {
        // idx -> (x, y) inside the tile, clamped to the last valid screen pixel.
        uint2 uPixCrd = min(uint2(viTilLL.x + (idx & (TILE_SIZE_CLUSTERED - 1)), viTilLL.y + (idx >> log2TileSize)), uint2(g_screenSize.x - 1, g_screenSize.y - 1));

        // The MSAA path is commented out in this excerpt; only the non-MSAA fetch is active.
        //#ifdef MSAA_ENABLED
        //for(int i=0; i<g_iNumSamplesMSAA; i++)
        //{
        //const float fDpth = FetchDepthMSAA(uPixCrd, i);
        //const float2 fracSampleCoord = g_depth_tex.GetSamplePosition(i).xy;     // this is optimized away when USE_OBLIQUE_MODE is NOT set.
        //#else
        const float fDpth = FetchDepth(uPixCrd);
        const float2 fracSampleCoord = float2(0.5, 0.5);
        //#endif

        if (fDpth < VIEWPORT_SCALE_Z) // if not skydome
        {
            float linZ = GetLinearDepth(uPixCrd + fracSampleCoord, fDpth, eyeIndex);
            // Flip the sign for right-handed camera space so the distance is positive.
            #if USE_LEFT_HAND_CAMERA_SPACE
            float linDistZ = linZ;
            #else
            float linDistZ = -linZ;
            #endif

            // Track this thread's maximum linear depth.
            linMaDist = max(linDistZ, linMaDist);
        }
        //#ifdef MSAA_ENABLED
        //}
        //#endif
    }

    // Merge the per-thread maxima into ldsZMax. The value is clamped to >= 0 first so that
    // comparing the float's bit pattern as a uint gives the same ordering as comparing the floats.
    linMaDist = max(linMaDist, 0.0);
    InterlockedMax(ldsZMax, asuint(linMaDist));

    // Wait until every thread has issued its InterlockedMax before ldsZMax is read back below.
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    linMaDist = asfloat(ldsZMax);
    //if (fDpth < VIEWPORT_SCALE_Z)
    // A max of 0 means no pixel passed the "not skydome" test; fall back to the far plane.
    if (linMaDist <= 0.0) 
        linMaDist = g_fFarPlane; // assume sky pixel
    #endif
    ...
}

Build coarse list,SphericalIntersectionTests

跟lightlistbuild.compute一样,这里也同样可以借用Big Tile的计算结果(g_vBigTileLightList),只遍历Big Tile内的灯光列表来Build coarseList
然后SphericalIntersectionTests Tile内的灯光,剔除掉并没有与Tile相交的灯光(DoesSphereOverlapTile)

// Kernel entry point (excerpt: coarse list). Lights whose screen-space AABB overlaps the tile
// are appended to the group-shared coarseList, then optionally refined by a spherical test.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    // 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
    float2 vTileLL = float2(viTilLL.x / g_screenSize.x, viTilLL.y / g_screenSize.y);
    float2 vTileUR = float2(viTilUR.x / g_screenSize.x, viTilUR.y / g_screenSize.y);

    // build coarse list using AABB
    #ifdef USE_TWO_PASS_TILED_LIGHTING

    // Map this clustered tile index to the big tile that contains it.
    const uint log2BigTileToClustTileRatio = firstbithigh(TILE_SIZE_BIG_TILE) - log2TileSize;

    int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
    int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
    const int bigTileBase = eyeIndex * NrBigTilesX * NrBigTilesY;
    const int bigTileIdx = bigTileBase + ((tileIDX.y >> log2BigTileToClustTileRatio) * NrBigTilesX) + (tileIDX.x >> log2BigTileToClustTileRatio); // map the idx to 64x64 tiles

    // Entry 0 of a big tile's list is its light count; the light indices follow from entry 1.
    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + 0];
    for (int l0 = (int)t; l0 < (int)nrBigTileLights; l0 += NR_THREADS)
    {
        int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + l0 + 1];

    #else

    // Without the big-tile prepass, every visible light is considered.
    for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
    {
        #endif
        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
        const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
        const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;

        // The light's 2D bounds overlap the tile rectangle.
        if (all(vMa > vTileLL) && all(vMi < vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;
            // Reserve a slot; lightOffs keeps counting even when the list is full.
            InterlockedAdd(lightOffs, uInc, uIndex);
            if (uIndex < MAX_NR_COARSE_ENTRIES) 
                coarseList[uIndex] = l; // add to light list
        }
    }

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    // lightOffs may exceed the list capacity, so clamp the usable count.
    int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);

    // Optional refinement; the third argument is the tile's center pixel, clamped to the screen.
    #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(TILE_SIZE_CLUSTERED / 2,TILE_SIZE_CLUSTERED / 2), uint2(g_screenSize.x - 1, g_screenSize.y - 1))),
                                                 eyeIndex);
    #endif


    ...
}

根据Tile内的linMaDist分割Cluster

以Tile内的linMaDist(fTileFarPlane)作为划分Cluster的依据,即根据Tile内最远的深度进行划分:fTileFarPlane离相机越近,分布在靠前深度的Cluster Index就越多(提高Cluster的利用率)。

函数图像:SuggestLogBase50
float suggested_base = pow((1.0 + sqrt(max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance)))) / (2.0 * rangeFittedDistance), 2.0 / C);
可以化简为:令d=rangeFittedDistance
\(\begin{cases} suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.5 \\ suggestedBase=1 & \text{ if } d\ge0.5 \end{cases}\)
再经过max(g_fClustBase, suggested_base)之后(g_fClustBase=1.02f,由\(\frac{1}{d}-1=1.02^{32}\approx 1.8845\)解得分界点\(d\approx 0.3466\)),suggestedBase变为:
\(\begin{cases} suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.3466 \\ suggestedBase=1.02 & \text{ if } d\ge0.3466 \end{cases}\)
函数图像:SnapToClusterIdxFlex
f1(x,t)被限制在了[1.02,1.68]
1.68是代入rangeFittedDistance的最小值FLT_EPS计算得到的suggestedBase。
f2(x),f3(x)就是在演示suggestedBase在[1.02,1.68]之间滑动对SnapToClusterIdxFlex的影响。
可以看到当f1(x,t)(即suggestedBase)从1.02变化到1.68的时候,SnapToClusterIdxFlex由原本接近线性的分布变成了类似log曲线的分布,使得更多的Index分布在了靠前的深度。


// Logarithm of x in base b, computed through log2 with the change-of-base identity.
float LogBase(float x, float b)
{
    const float log2OfValue = log2(x);
    const float log2OfBase = log2(b);
    return log2OfValue / log2OfBase;
}

// Maps a view-space z to a cluster slice index in [0, C - 1], where C = 1 << g_iLog2NumClusters.
// The [near, far] range is split into slices that grow geometrically with ratio suggestedBase;
// this is the inverse of that geometric series.
// logBasePerTile is not read by the current implementation.
int SnapToClusterIdxFlex(float z_in, float suggestedBase, bool logBasePerTile)
{
    // Use a positive distance regardless of camera-space handedness.
#if USE_LEFT_HAND_CAMERA_SPACE
    const float viewZ = z_in;
#else
    const float viewZ = -z_in;
#endif

    const int C = 1 << g_iLog2NumClusters;
    const float lastClusterIdx = (float)(C - 1);

    // Fraction of the near..far range covered by viewZ (0 for anything in front of the near plane).
    const float rangeFittedDistance = max(0, viewZ - g_fNearPlane) / (g_fFarPlane - g_fNearPlane);

    // Interpolate between base^0 and base^C, then take the logarithm to recover the exponent.
    const float baseToTheC = PositivePow(suggestedBase, (float) C);
    const float sliceExponent = LogBase(lerp(1.0, baseToTheC, rangeFittedDistance), suggestedBase);

    return (int)clamp(sliceExponent, 0.0, lastClusterIdx);
}

// Convenience wrapper around SnapToClusterIdxFlex: the per-tile log-base flag is decided at
// compile time by whether ENABLE_DEPTH_TEXTURE_BACKPLANE is defined.
int SnapToClusterIdx(float z_in, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    return SnapToClusterIdxFlex(z_in, suggestedBase, true);
#else
    return SnapToClusterIdxFlex(z_in, suggestedBase, false);
#endif
}

// Generates a log-base value such that half of the clusters are consumed between the near plane
// and the tile's maximum opaque depth. The result is never smaller than g_fClustBase.
float SuggestLogBase50(float tileFarPlane)
{
    const float C = (float)(1 << g_iLog2NumClusters);

    // Tile far plane as a fraction of the near..far range, kept away from zero to avoid dividing by 0.
    const float d = clamp((tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane), FLT_EPS, 1.0);

    // Clamped at 0 so rounding can never feed a negative value to sqrt.
    const float discriminant = max(0.0, 1.0 - 4.0 * d * (1.0 - d));
    const float ratio = (1.0 + sqrt(discriminant)) / (2.0 * d);
    const float suggested_base = pow(ratio, 2.0 / C);

    return max(g_fClustBase, suggested_base);
}

// Capacity of the per-tile coarse light list.
#define MAX_NR_COARSE_ENTRIES       128
// Min/max cluster indices of two lights are packed into one uint (four 8-bit fields),
// hence half as many entries as coarse lights.
groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES / 2];

// Kernel entry point (excerpt: per-light cluster range). Picks the tile's log base, sorts the
// coarse list, and stores each light's min/max cluster index in the group-shared clusterIdxs.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    // With a depth back-plane the log base is fitted to the tile's max depth; otherwise the global base is used.
    #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    float fTileFarPlane = linMaDist;
    float suggestedBase = SuggestLogBase50(fTileFarPlane);
    #else // ENABLE_DEPTH_TEXTURE_BACKPLANE
    float fTileFarPlane = g_fFarPlane;
    float suggestedBase = g_fClustBase;
    #endif

    // Per the article's note, EXACT_EDGE_TESTS is not enabled, so this block is compiled out.
    #ifdef EXACT_EDGE_TESTS
    iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane, eyeIndex);
    #endif

    // Per the article, SORTLIST is the same bitonic sort described in the previous part of the series.
    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
    #if NR_THREADS > PLATFORM_LANE_COUNT
    SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
    #endif

    //////////// cell specific code
    // Convert the .w component of each light's min/max bounds into cluster indices using
    // the suggestedBase computed above.
    {
        // Each iteration handles a pair of lights; (iNrCoarseLights + 1) >> 1 rounds the pair count up.
        for (int l = (int)t; l < ((iNrCoarseLights + 1) >> 1); l += NR_THREADS)
        {
            // For an odd light count, the last pair repeats the final light instead of reading past the list.
            const int l0 = coarseList[2 * l + 0], l1 = coarseList[min(2 * l + 1, iNrCoarseLights - 1)];
            const ScreenSpaceBoundsIndices l0Bounds = GenerateScreenSpaceBoundsIndices(l0, g_iNrVisibLights, eyeIndex);
            const ScreenSpaceBoundsIndices l1Bounds = GenerateScreenSpaceBoundsIndices(l1, g_iNrVisibLights, eyeIndex);

            // Each index is capped at 255 so that it fits in 8 bits.
            const unsigned int clustIdxMi0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.min].w, suggestedBase));
            const unsigned int clustIdxMa0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.max].w, suggestedBase));
            const unsigned int clustIdxMi1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.min].w, suggestedBase));
            const unsigned int clustIdxMa1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.max].w, suggestedBase));
            // Pack both lights' ranges into one uint: [max1 | min1 | max0 | min0], 8 bits each.
            clusterIdxs[l] = (clustIdxMa1 << 24) | (clustIdxMi1 << 16) | (clustIdxMa0 << 8) | (clustIdxMi0 << 0);
        }
    }

    // Make clusterIdxs visible to the whole group before it is read.
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif
    ...
}

统计各个Cluster内的灯光数量[iSpaceAvail]

上面只是通过Cluster Index的范围简单地测试了灯光是否在Cluster内,这是不够精准的,还需要检测构成Cluster的顶点是否与灯光Volume相交(CheckIntersection)
如果相交了,灯光的Index才最终加入到g_vLayeredLightList里。
相对应的lightCategory也要计数+1

#define NR_THREADS       64
#define LIGHTCATEGORY_COUNT       5

// Per-thread, per-category light counters. In the kernel each thread handles one cluster,
// so this holds the per-category light count of every cluster in the tile.
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];
// Per-thread copy of each category's index shift (_EnvLightIndexShift / _DecalIndexShift /
// _LocalVolumetricFogIndexShift), subtracted from light indices when they are written out.
// NOTE(review): the article's author considers this redundant with reading the constant buffer directly - confirm.
groupshared int shiftIndexScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

// Planes of the 4 lights currently being tested: 6 planes per light, each a float4(vN.xyz, -dot(vN, p0)).
groupshared float4 lightPlanes[4 * 6];// Each plane is defined by a float4. 6 planes per light, 4 lights (24 planes)

// Coarse test: is cluster k inside the [min, max] cluster range recorded for coarse light l?
bool CheckIntersectionBasic(int l, int k)
{
    // Two lights share one uint: even l uses the low 16 bits, odd l the high 16 bits.
    const unsigned int packedPair = clusterIdxs[l >> 1];
    const unsigned int range = (packedPair >> (16 * (l & 1))) & 0xffff;

    const uint firstCluster = (range >> 0) & 0xff;
    const uint lastCluster = (range >> 8) & 0xff;
    const uint cluster = (uint)k;

    return firstCluster <= cluster && cluster <= lastCluster;
}

// Clears every per-category counter and index shift belonging to the given thread.
void ZeroCategoryListCountAndShiftIndex(uint threadIdx)
{
    const uint firstSlot = threadIdx * LIGHTCATEGORY_COUNT;

    for (int category = 0; category < LIGHTCATEGORY_COUNT; ++category)
    {
        const uint slot = firstSlot + category;
        categoryListCountScratch[slot] = 0;
        shiftIndexScratch[slot] = 0;
    }
}

// Stores the index shift 'value' for category 'index' in the given thread's scratch slots.
void WriteShiftIndex(uint threadIdx, uint index, int value)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    shiftIndexScratch[slot] = value;
}

// Adds one to the given thread's light counter for category 'index'.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}

// Kernel entry point (excerpt: per-cluster allocation and fill). Each thread owns one cluster:
// it counts candidate lights, reserves space in the global list, then writes the lights that
// pass the full intersection test.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    // Per the article, g_iLog2NumClusters = 6, giving nrClusters = 64.
    int nrClusters = (1 << g_iLog2NumClusters);


    //////////////////////////////////////////////////////////

    uint start = 0;
    int i = (int)t;
    int iSpaceAvail = 0;
    int iSum = 0;
    if (i < nrClusters)
    {
        // Each thread checks it's respective cluster against all coarse lights for intersection.
        // At the end, 'iSum' represents the number of lights that intersect this cluster!
        for (int l = 0; l < iNrCoarseLights; l++)
        {
            iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
        }

        // We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
        // want to allocate out of g_LayeredSingleIdxBuffer.
        iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
        InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory

        // g_LayeredSingleIdxBuffer[0] is a running allocation counter: InterlockedAdd returns its
        // previous value in 'start', i.e. the total space reserved by the clusters that allocated earlier.
        // iSpaceAvail is the number of entries reserved for this cluster.
    }

    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
    // to make it work correctly
    ZeroCategoryListCountAndShiftIndex(t);

    WriteShiftIndex(t, LIGHTCATEGORY_ENV, _EnvLightIndexShift);
    WriteShiftIndex(t, LIGHTCATEGORY_DECAL, _DecalIndexShift);
    WriteShiftIndex(t, LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG, _LocalVolumetricFogIndexShift);

    uint offs = start;
    // Walk the tile's coarse light list four lights at a time and append the lights that hit this
    // cluster to its reserved range of g_vLayeredLightList.
    // iNrCoarseLights = min(lightOffs, MAX_NR_COARSE_ENTRIES), so it is at most 128.
    for (int ll = 0; ll < iNrCoarseLights; ll += 4)
    {
        // Only the first 24 threads fetch: thread i loads plane p = i >> 2 of light m = i & 3,
        // i.e. 6 planes for each of the 4 lights handled by this iteration.
        int p = i >> 2;
        int m = i & 3;
        if (i < 24)
            lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);

        // Make the fetched planes visible to the whole group.
        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif

        // Test the lights whose planes were just fetched against this thread's cluster.
        for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
        {
            // Write only while reserved space remains, for valid clusters, and when the full test passes.
            if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
            {
                const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
                uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;

                // Count this light under its category for this cluster; these group-shared counters
                // are the per-category light counts of the cluster.
                IncrementCategoryListCount(t, lightCategory);

                // Subtract the category's index shift so the stored index is relative to that category's data.
                g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
            }
        }

        // Keep lightPlanes intact until every thread is done with it; the next iteration overwrites it.
        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif
    }
    ...
}

Fetch Plane

这里的FetchPlane函数依旧是使用LightingConvexHullUtils.hlsl里面的,
[GetHullPlane]根据不同面序号返回对应的平面上的一点以及平面的法向
[GetHullPlaneEq]后续为了判断点与平面的朝向(ToLeftTest)预先构成float4(vN, -dot(vN,p0))这样的表示平面方式

_D9F6E79F-8A33-47ed-B15D-A01A967A5788_.png

图中标红Cube的是Hull的顶点,红线是Hull的平面法向
//LightingConvexHullUtils.hlsl

// Returns a point p0 on hull side 'sideIndex' and that side's normal n0 (not normalized).
// The hull is described by the axes boxX/boxY/boxZ around 'center'; scaleXY scales some of the
// quad's vertices (see bIsTopQuad / bIsSideQuad below).
void GetHullPlane(out float3 p0, out float3 n0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    // Sides 0,1 -> axis group 0; sides 2,3 -> group 1; sides 4 and above -> group 2.
    //const int iAbsSide = (sideIndex == 0 || sideIndex == 1) ? 0 : ((sideIndex == 2 || sideIndex == 3) ? 1 : 2);
    const int iAbsSide = min(sideIndex>>1, 2);
    // Odd side indices use +1, even ones -1.
    const float fS = (sideIndex & 1) != 0 ? 1 : (-1);

    float3 vA = fS*(iAbsSide == 0 ? boxX : (iAbsSide == 1 ? (-boxY) : boxZ));
    float3 vB = fS*(iAbsSide == 0 ? (-boxY) : (iAbsSide == 1 ? (-boxX) : (-boxY)));
    float3 vC = iAbsSide == 0 ? boxZ : (iAbsSide == 1 ? boxZ : (-boxX));

    // The article's author notes these conditions are easiest to understand by simulating them (e.g. in C#).
    bool bIsTopQuad = iAbsSide == 2 && (sideIndex & 1) != 0;        // in this case all 4 verts get scaled.
    bool bIsSideQuad = (iAbsSide == 0 || iAbsSide == 1);        // if side quad only two verts get scaled (impacts q1 and q2)

    if (bIsTopQuad) 
    { 
        vB *= scaleXY.y; 
        vC *= scaleXY.x; 
    }

    float3 vA2 = vA;
    float3 vB2 = vB;

    if (bIsSideQuad) 
    {
        vA2 *= (iAbsSide == 0 ? scaleXY.x : scaleXY.y); 
        vB2 *= (iAbsSide == 0 ? scaleXY.y : scaleXY.x); 
    }

    float3 vN = cross(vB2, 0.5 * (vA - vA2) - vC);  // +/- normal
    float3 v0 = vA + vB - vC;   // vector from center to p0
    p0 = center + v0;           // center + vA is center of face when scaleXY is 1.0
    // Flip the normal when it points against v0, so that n0 faces away from the center.
    n0 = dot(vN,v0) < 0.0 ? (-vN) : vN;
}

// Returns hull side 'sideIndex' as a plane equation float4(n, d) with d = -dot(n, p0),
// so that dot(plane, float4(p, 1)) is positive for points p on the normal's side.
float4 GetHullPlaneEq(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    float3 pointOnPlane, planeNormal;
    GetHullPlane(pointOnPlane, planeNormal, boxX, boxY, boxZ, center, scaleXY, sideIndex);

    const float planeOffset = -dot(planeNormal, pointOnPlane);
    return float4(planeNormal, planeOffset);
}

CheckIntersection

判断Cluster与灯光是否相交,有两个判断方式,
一个就是用前面计算的Cluster Index范围做简单的判断,
第二个就是用Cluster ID计算出构成Cluster的八个顶点与灯光平面的几何关系

// Returns the view-space z of the near boundary of cluster slice k for the given log base
// (the inverse mapping of SnapToClusterIdxFlex). The near boundary of slice k + 1 is the far
// boundary of slice k. The result is negated for right-handed camera space.
// logBasePerTile is not read by the current implementation.
float ClusterIdxToZFlex(int k, float suggestedBase, bool logBasePerTile)
{
    const float C = (float)(1 << g_iLog2NumClusters);

    // Position of slice k within the geometric series, normalized to [0, 1] over all C slices.
    const float sliceTerm = PositivePow(suggestedBase, (float)k) - 1.0;
    const float fullRangeTerm = PositivePow(suggestedBase, C) - 1.0;
    const float rangeFittedDistance = sliceTerm / fullRangeTerm;

    const float linearZ = lerp(g_fNearPlane, g_fFarPlane, rangeFittedDistance);

#if USE_LEFT_HAND_CAMERA_SPACE
    return linearZ;
#else
    return -linearZ;
#endif
}

// Convenience wrapper around ClusterIdxToZFlex: the per-tile log-base flag is decided at
// compile time by whether ENABLE_DEPTH_TEXTURE_BACKPLANE is defined.
float ClusterIdxToZ(int k, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    return ClusterIdxToZFlex(k, suggestedBase, true);
#else
    return ClusterIdxToZFlex(k, suggestedBase, false);
#endif
}

// Tests whether coarse light l touches cluster k of the tile spanning viTilLL..viTilUR.
// Step 1: cluster k must lie inside the light's packed [min, max] cluster range.
// Step 2 (only with CONV_HULL_TEST_ENABLED): the light is rejected when some light plane has
// all 8 corners of the cluster on its positive side.
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase, uint eyeIndex)
{
    // Two lights share one uint: even l uses the low 16 bits, odd l the high 16 bits.
    const unsigned int packedRange = (clusterIdxs[l >> 1] >> (16 * (l & 1))) & 0xffff;
    const uint firstCluster = (packedRange >> 0) & 0xff;
    const uint lastCluster = (packedRange >> 8) & 0xff;
    const uint cluster = (uint)k;

    // Simple cluster test: outside the light's depth range means no hit.
    if (cluster < firstCluster || lastCluster < cluster)
        return false;

    bool bIsHit = true;

#ifdef CONV_HULL_TEST_ENABLED
    const float clusterNearZ = ClusterIdxToZ(k, suggestedBase);
    const float clusterFarZ = ClusterIdxToZ(k + 1, suggestedBase);

    // lightPlanes holds 6 planes for each of the 4 lights of the current batch.
    const int firstPlane = 6 * (l & 3);

    for (int side = 0; side < 6; side++)
    {
        const float4 plane = lightPlanes[firstPlane + side];

        bool allCornersOutside = true;

        // Bits 0, 1 and 2 of 'corner' select the x, y and z extreme of the cluster.
        for (int corner = 0; corner < 8; corner++)
        {
            float2 pixel;
            pixel.x = (corner & 1) == 0 ? viTilLL.x : viTilUR.x;
            pixel.y = (corner & 2) == 0 ? viTilLL.y : viTilUR.y;
            const float cornerZ = (corner & 4) == 0 ? clusterNearZ : clusterFarZ;

            // View-space position of the corner, from its screen position and linear depth.
            const float3 cornerVS = GetViewPosFromLinDepth(pixel, cornerZ, eyeIndex);

            // plane = float4(vN, -dot(vN, p0)); a positive result puts the corner on the normal's side.
            const bool isOutside = dot(plane, float4(cornerVS, 1.0)) > 0;
            allCornersOutside = allCornersOutside && isOutside;
        }

        // This plane separates the whole cluster from the light.
        if (allCornersOutside)
            bIsHit = false;
    }
#endif

    return bIsHit;
}

Final Resolve

上面的start值记录的是当前Cluster在g_vLayeredLightList记录LightData的起始Index
categoryListCountScratch也记录了Cluster不同的Category的LightData Count,
所以我们可以通过start以及对应的Category的LightData Count就可以在g_vLayeredLightList中寻址得到对应的Category LightData Index


// Computes the index into the layered offset buffer. Layout, from outermost to innermost:
// eye, light category, cluster, tile row (y), tile column (x).
uint GenerateLayeredOffsetBufferIndex(uint lightCategory, uint2 tileIndex, uint clusterIndex, uint numTilesX, uint numTilesY, int numClusters, uint eyeIndex)
{
    // Entries occupied by all the preceding eyes.
    const uint entriesPerEye = LIGHTCATEGORY_COUNT * numClusters * numTilesX * numTilesY;
    const uint eyeOffset = eyeIndex * entriesPerEye;

    // Offset inside this eye's block.
    const uint layer = lightCategory * numClusters + clusterIndex;
    const uint row = layer * numTilesY + tileIndex.y;
    int lightOffset = row * numTilesX + tileIndex.x;

    return (eyeOffset + lightOffset);
}


// 67108863 = (1 << 26) - 1: the low 26 bits hold the offset.
#define LIGHT_CLUSTER_PACKING_OFFSET_MASK (67108863)

// 63 = (1 << 6) - 1: the count is clamped to this value and stored above the offset bits.
#define LIGHT_CLUSTER_PACKING_COUNT_MASK (63)
#define LIGHT_CLUSTER_PACKING_OFFSET_BITS (26)

// Packs a list start offset and a light count into one uint: the offset goes in the low
// LIGHT_CLUSTER_PACKING_OFFSET_BITS bits, the count (clamped to LIGHT_CLUSTER_PACKING_COUNT_MASK) above them.
uint PackClusterLayeredOffset(uint offset, uint count)
{
    const uint offsetBits = offset & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
    const uint clampedCount = min(count, LIGHT_CLUSTER_PACKING_COUNT_MASK);
    return offsetBits | (clampedCount << LIGHT_CLUSTER_PACKING_OFFSET_BITS);
}

// Per-thread, per-category light counters. In the kernel each thread handles one cluster of the
// current tile (64 clusters per the article), so this holds each cluster's per-category light count.
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

// Adds one to the given thread's light counter for category 'index'.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    categoryListCountScratch[slot] += 1;
}

// Reads the given thread's light counter for category 'index'.
int ReadCategoryListCount(uint threadIdx, uint index)
{
    const uint slot = threadIdx * LIGHTCATEGORY_COUNT + index;
    return categoryListCountScratch[slot];
}

// Index into the per-tile log-base buffer: one entry per tile, stored row by row, with each
// eye occupying its own block of numTilesX * numTilesY entries.
uint GenerateLogBaseBufferIndex(uint2 tileIndex, uint numTilesX, uint numTilesY, uint eyeIndex)
{
    const uint tilesPerEye = numTilesX * numTilesY;
    const uint eyeOffset = eyeIndex * tilesPerEye;
    const uint rowOffset = tileIndex.y * numTilesX;
    return (eyeOffset + rowOffset + tileIndex.x);
}


// Kernel entry point (excerpt: final resolve). After the per-cluster light lists are written,
// each cluster stores one packed (start, count) entry per light category in g_LayeredOffset,
// and the tile's log base is saved for use at lighting time.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    uint start = 0;
    int i = (int)t;
    int iSpaceAvail = 0;
    int iSum = 0;
    if (i < nrClusters)
    {
        // Each thread checks it's respective cluster against all coarse lights for intersection.
        // At the end, 'iSum' represents the number of lights that intersect this cluster!
        for (int l = 0; l < iNrCoarseLights; l++)
        {
            iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
        }

        // We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
        // want to allocate out of g_LayeredSingleIdxBuffer.
        iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
        
        // The three-argument InterlockedAdd below does atomically what these two lines sketch:
        //start = g_LayeredSingleIdxBuffer[0];
        //InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail);
        InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory

        // g_LayeredSingleIdxBuffer[0] is a running allocation counter: 'start' receives its previous
        // value, i.e. the total space reserved by the clusters that allocated earlier.
        // iSpaceAvail is the number of entries reserved for this cluster.
    }

    ...
    // 'start' is the first index of this cluster's range in g_vLayeredLightList; it is also the
    // value later stored in the jump table (g_LayeredOffset).
    uint offs = start;
    for (int ll = 0; ll < iNrCoarseLights; ll += 4)
    {
        // Threads 0..23 fetch the 6 planes of each of the 4 lights handled by this iteration.
        int p = i >> 2;
        int m = i & 3;
        if (i < 24)
            lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);

        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif

        for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
        {
            // iSpaceAvail is the (clamped) number of lights that passed CheckIntersectionBasic,
            // so the full test can never write more entries than were reserved.
            if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
            {
                // Lights of every category share one contiguous range; the per-category counts read
                // back through ReadCategoryListCount are what split that range by category afterwards.
                const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
                uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
                IncrementCategoryListCount(t, lightCategory);
                g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
            }
        }

        #if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
        #endif
    }

    ...
    uint localOffs = 0;

    // The offset index is computed for category 0 first; inside the loop it advances by
    // (nrClusters * nrTilesX * nrTilesY) to reach the same cluster in the next category.

    // Per the article, nrClusters = 64.
    offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);

    for (int category = 0; category < LIGHTCATEGORY_COUNT; category++)
    {
        // Number of lights of this category in the current cluster.
        int numLights = ReadCategoryListCount(t, category);
        if (i < nrClusters)
        {
            // start + localOffs : first index of this category's lights in g_vLayeredLightList
            // numLights         : number of lights of this category in the cluster
            // g_vLayeredLightList stores the light indices; g_LayeredOffset is the jump table into it.

            g_LayeredOffset[offs] = PackClusterLayeredOffset((start + localOffs), (uint)numLights);
            offs += (nrClusters * nrTilesX * nrTilesY);
            localOffs += numLights; // use unclamped count for localOffs
        }
    }

    // Store the tile's suggestedBase so the lighting pass can map depth to the same cluster index.
    #ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIDX, nrTilesX, nrTilesY, eyeIndex);
    if (threadID == 0) 
        g_logBaseBuffer[logBaseIndex] = suggestedBase;
    #endif
    ...
}

LightLoop

类似FPTL,Cluster这里也同样调用的是GetCountAndStart接口获取当前Cluster的lightData Count以及start的Index,
FetchIndex获取最终的LightData Index
最终的流程如下:
1.positionInput的TileCoord + Depth.z计算出当前片元归属的clusterIdx
2.clusterIdx + 当前要计算的category (GenerateLayeredOffsetBufferIndex)计算当前cluster在g_vLayeredOffsetsBuffer的idx
3.g_vLayeredOffsetsBuffer[idx]即为上面最后记录的用来跳转用的dataPair(PackClusterLayeredOffset)
4.UnpackClusterLayeredOffset得到start和lightCount
5.Lighting的时候从start开始FetchIndex就可以得到lightData的真正Index

//LightLoopDef.hlsl

...

#elif defined(USE_CLUSTERED_LIGHTLIST)

#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ClusteredUtils.hlsl"

// Tile size used by the clustered light list path.
uint GetTileSize()
{
    const uint clusteredTileSize = TILE_SIZE_CLUSTERED;
    return clusteredTileSize;
}

// Returns the cluster slice index for a linear depth inside the given tile. The tile's own log
// base is read from g_logBaseBuffer when that buffer is enabled; otherwise g_fClustBase is used.
uint GetLightClusterIndex(uint2 tileIndex, float linearDepth)
{
    float logBase;
    if (g_isLogBaseBufferEnabled)
    {
        const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIndex, _NumTileClusteredX, _NumTileClusteredY, unity_StereoEyeIndex);
        logBase = g_logBaseBuffer[logBaseIndex];
    }
    else
    {
        logBase = g_fClustBase;
    }

    return SnapToClusterIdxFlex(linearDepth, logBase, g_isLogBaseBufferEnabled != 0);
}

// Splits a packed value into its parts: the count lives above LIGHT_CLUSTER_PACKING_OFFSET_BITS,
// the offset in the bits selected by LIGHT_CLUSTER_PACKING_OFFSET_MASK.
void UnpackClusterLayeredOffset(uint packedValue, out uint offset, out uint count)
{
    count = packedValue >> LIGHT_CLUSTER_PACKING_OFFSET_BITS;
    offset = packedValue & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
}

// Looks up the packed (start, count) entry for one tile / cluster / light category in
// g_vLayeredOffsetsBuffer and unpacks it into 'start' and 'lightCount'.
void GetCountAndStartCluster(uint2 tileIndex, uint clusterIndex, uint lightCategory, out uint start, out uint lightCount)
{
    const int clusterCount = (1 << g_iLog2NumClusters);

    const int offsetIndex = GenerateLayeredOffsetBufferIndex(lightCategory, tileIndex, clusterIndex, _NumTileClusteredX, _NumTileClusteredY, clusterCount, unity_StereoEyeIndex);

    UnpackClusterLayeredOffset(g_vLayeredOffsetsBuffer[offsetIndex], start, lightCount);
}

// Position-based overload: derives the cluster index from posInput's tile coordinate and linear
// depth, then forwards to the index-based overload.
// Note: XR depends on unity_StereoEyeIndex already being defined,
// which means ShaderVariables.hlsl needs to be defined ahead of this!
void GetCountAndStartCluster(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    const uint2 tile = posInput.tileCoord;
    const uint cluster = GetLightClusterIndex(tile, posInput.linearDepth);

    GetCountAndStartCluster(tile, cluster, lightCategory, start, lightCount);
}

// Light-loop entry point for the clustered path; forwards to GetCountAndStartCluster.
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    GetCountAndStartCluster(posInput, lightCategory, start, lightCount);
}

// Reads the light index stored 'lightOffset' entries after 'lightStart' in g_vLightListCluster.
uint FetchIndex(uint lightStart, uint lightOffset)
{
    const uint listIndex = lightStart + lightOffset;
    return g_vLightListCluster[listIndex];
}

...

posted @ 2024-04-08 02:23  凶恶的真实  阅读(77)  评论(0编辑  收藏  举报