Unity的Forward+ FPTL光照剔除解析(三)

序言

如果看了前面的BigTileLightList的建立,这一章会简单一点。
因为如果启用了BigTile之后,这里的BuildPerTileLightList就需要从BigTileLightList里面读取LightList。
否则还是需要像BigTile一样去先走同样的灯光剔除流程(NDCAABBBoundTest,SphericalIntersectionTests)
然后才到最后的FinePruneLightsTest

LightListBuild

RenderGraph Dispatch

下面是RenderGraph中Dispatch时需要的Buffer以及ConstantBuffer

//HDRenderPipeline.LightLoop.cs
/// <summary>
/// Records the compute dispatch that builds the per-tile (FPTL) light list.
/// Nothing is recorded unless both <c>data.runLightList</c> and <c>data.runFPTL</c> are set.
/// </summary>
/// <param name="data">Pass data providing the shader, kernel, buffers and constant-buffer values.</param>
/// <param name="tileFlagsWritten">Set to true when the tile feature-flag buffer is bound as an output.</param>
/// <param name="cmd">Command buffer that receives the resource bindings and the dispatch.</param>
static void BuildPerTileLightList(BuildGPULightListPassData data, ref bool tileFlagsWritten, CommandBuffer cmd)
{
    // optimized for opaques only
    if (!(data.runLightList && data.runFPTL))
        return;

    var shader = data.buildPerTileLightListShader;
    var kernel = data.buildPerTileLightListKernel;

    // Light AABB bounds buffer.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);

    // Per-light volume data and convex bounds buffers.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

    // Depth texture read by the kernel.
    cmd.SetComputeTextureParam(shader, kernel, HDShaderIDs.g_depth_tex, data.depthBuffer);

    // Output light list.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vLightList, data.output.lightList);

    // The big-tile light list is only bound when the big-tile prepass ran.
    if (data.runBigTilePrepass)
        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);

    var lightListConstants = data.lightListCB;

    // Feature variants: compute the flags that apply to every tile and bind the per-tile flag output.
    if (data.enableFeatureVariants)
    {
        uint featureMask = 0;
        featureMask |= data.directionalLightCount > 0 ? (uint)LightFeatureFlags.Directional : 0u;
        featureMask |= data.skyEnabled ? (uint)LightFeatureFlags.Sky : 0u;

        // Without per-tile material variants, every material feature bit is enabled.
        if (!data.computeMaterialVariants)
            featureMask |= LightDefinitions.s_MaterialFeatureMaskFlags;

        lightListConstants.g_BaseFeatureFlags = featureMask;

        cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_TileFeatureFlags, data.output.tileFeatureFlags);
        tileFlagsWritten = true;
    }

    ConstantBuffer.Push(cmd, lightListConstants, shader, HDShaderIDs._ShaderVariablesLightList);

    // One thread group per FPTL tile in X/Y, one Z slice per view.
    cmd.DispatchCompute(shader, kernel, data.numTilesFPTLX, data.numTilesFPTLY, data.viewCount);
}

Initialize

与BigTile类似,先计算当前线程的Tile映射关系要用到的数据(Tile在X/Y轴上的数量,当前线程组对应的TileID)。

//FPTL这一步的Tile Size为16*16
#define TILE_SIZE_FPTL (16)

// Kernel excerpt (initialization). The tile coordinate is taken from the group ID and the
// flattened in-group thread index from SV_GroupIndex; the rest of the kernel is elided.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    // The Z component of the dispatch thread ID is handed to the XR view-index macro.
    UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    // Zero the pruned list entries handled by this thread.
    if(t<LIGHT_LIST_MAX_COARSE_ENTRIES)
        prunedList[t]=0;

    // Number of 16-pixel tiles along each axis, rounded up.
    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;
    uint nrTilesX = (iWidth+15)/16;
    uint nrTilesY = (iHeight+15)/16;

    // build tile scr boundary
    const uint uFltMax = 0x7f7fffff;  // FLT_MAX as a uint
    // Thread 0 resets the shared min/max depth and the coarse-list counter.
    if(t==0)
    {
        ldsZMin = uFltMax;
        ldsZMax = 0;
        lightOffs = 0;
    }

    // The barrier is only compiled in when the group is larger than PLATFORM_LANE_COUNT.
#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

...
}

Hi-z剔除

读取PreDepth深度图获取4个像素内的Min、MaxDepth,通过线程同步计算得出一个线程组内的Min/MaxDepth 即(64*4=16*16),建立起当前Tile的Bounds(ldsZMin/ldsZMax)

读取深度图,计算viewPosition.z

LinearDepth

camera.projectionMatrix是右手坐标系的(OpenGL约定),为了统一整个剔除流程的轴向,ScrProjection乘上了s_FlipMatrixLHSRHS来翻转z轴,改用左手坐标系

groupshared uint ldsZMin;
groupshared uint ldsZMax;

TEXTURE2D_X(g_depth_tex) : register( t0 );

// Loads the depth value at pixCoord and returns it in the "0 is near, 1 is far" convention.
float FetchDepth(uint2 pixCoord)
{
    const float rawDepth = LOAD_TEXTURE2D_X(g_depth_tex, pixCoord.xy).x;
    // Background reading: https://zhuanlan.zhihu.com/p/389971233
    // With reversed Z the stored value is flipped, so it is flipped back here.
#if UNITY_REVERSED_Z
    return 1.0 - rawDepth;
#else
    return rawDepth;
#endif
}

// Author's note: USE_OBLIQUE_MODE corresponds to
//   m_LightListProjMatrices.m20 != 0 || m_LightListProjMatrices.m21 != 0,
// i.e. r+l != 0 or t+b != 0 in the projection matrix.
// Result: linear depth, the view-space Z coordinate in [Near, Far].
// Reversed-Z reference: -z_eye = 1/((n-f)/(n*f)*depth + 1/n)  (https://zhuanlan.zhihu.com/p/393643084)
float GetLinearDepth(float2 pixXY, float zDptBufSpace)    // 0 is near 1 is far
{
    const float4x4 invScrProj = g_mInvScrProjectionArr[unity_StereoEyeIndex];

#ifdef USE_OBLIQUE_MODE
    // Full unprojection of the pixel, followed by the perspective divide of z by w.
    const float4 unprojected = mul(invScrProj, float4(pixXY, zDptBufSpace, 1.0));
    return unprojected.z / unprojected.w;
#else
    // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
    // however this function must also work for orthographic projection so we keep it like this.
    const float2 zRow = invScrProj[2].zw; // (m22, m23)
    const float2 wRow = invScrProj[3].zw; // (m32, m33)

    const float viewZ = zRow.x * zDptBufSpace + zRow.y;
    const float viewW = wRow.x * zDptBufSpace + wRow.y;
    return viewZ / viewW;
#endif
}
GetViewPosFromLinDepth

这里简单的以x轴的推导为例
由投影矩阵中的相似三角形易得
[Unity Shader入门精要]
{92766D46-7FAD-4038-AAD9-06849DBD8F6B}.png
\(\frac{(ScreenPos.x-pixWidth/2)}{pixWidth/2}=\frac{clipPos.x}{clipPos.w}\)

\(\frac{clipPos.x}{clipPos.w}=\frac{viewPos.x*\frac{cotFOV}{Aspect}}{-viewPos.z}\)

\(viewPos.x = \frac{ScreenPos.x-pixWidth/2}{pixWidth/2*\frac{cotFOV}{Aspect}}*(-viewPos.z)\)

由于之前ScrProjection已经FlipZ,所以可以直接fLinDepth * p.xy

// Excerpt: for every view, stores the screen projection matrix and its inverse into the
// light-list constant buffer as 16 consecutive floats per view.
// NOTE(review): `camera`, `temp` and `cb` are declared in the elided part of the method.
// Presumably `cb` aliases `constantBuffer` and `temp` maps clip space to pixel coordinates;
// confirm against the full source.
unsafe void PrepareBuildGPULightListPassData(
    RenderGraph renderGraph,
    RenderGraphBuilder builder,
    HQCamera hqCamera,
    TileAndClusterData tileAndClusterData,
    ref ShaderVariablesLightList constantBuffer,
    int totalLightCount,
    TextureHandle depthStencilBuffer,
    TextureHandle stencilBufferCopy,
    BuildGPULightListPassData passData)
{
    ....

    // camera to screen matrix (and it's inverse)
    for (int viewIndex = 0; viewIndex < hqCamera.viewCount; ++viewIndex)
    {
        var proj = camera.projectionMatrix;
        // Note: we need to take into account the TAA jitter when indexing the light list
        proj = hqCamera.RequiresCameraJitter() ? hqCamera.GetJitteredProjectionMatrix(proj) : proj;

        // Multiply by the handedness flip matrix before storing the per-view projection.
        m_LightListProjMatrices[viewIndex] = proj * s_FlipMatrixLHSRHS;

        var tempMatrix = temp * m_LightListProjMatrices[viewIndex];
        var invTempMatrix = tempMatrix.inverse;

        // Flatten both matrices into the constant-buffer arrays, 16 floats per view.
        for (int i = 0; i < 16; ++i)
        {
            cb.g_mScrProjectionArr[viewIndex * 16 + i] = tempMatrix[i];
            cb.g_mInvScrProjectionArr[viewIndex * 16 + i] = invTempMatrix[i];
        }
    }
}

\(pixWidth/2*\frac{cotFOV}{Aspect}=fSx\)
\(pixHeight/2*cotFOV=fSy\)
\(pixWidth/2=fCx\)
\(pixHeight/2=fCy\)

//
// Reconstructs a view-space position from a screen position (in pixels) and a linear depth.
// The scale (fSx, fSy) and centre (fCx, fCy) terms are read from the current eye's screen projection matrix.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
    const float4x4 scrProj = g_mScrProjectionArr[unity_StereoEyeIndex];
    const bool ortho = (g_isOrthographic != 0);

    // Diagonal terms give the scale; the centre comes from column w (orthographic) or column z (perspective).
    const float2 scale = float2(scrProj[0].x, scrProj[1].y);
    const float centerX = ortho ? scrProj[0].w : scrProj[0].z;
    const float centerY = ortho ? scrProj[1].w : scrProj[1].z;

    #if USE_LEFT_HAND_CAMERA_SPACE
    const bool leftHanded = true;
    #else
    // Orthographic projection always takes the left-hand path.
    const bool leftHanded = ortho;
    #endif

    const float handSign = leftHanded ? 1 : (-1);
    const float2 p = float2((handSign * v2ScrPos.x - centerX) / scale.x,
                            (handSign * v2ScrPos.y - centerY) / scale.y);

    // Perspective: xy scales with depth. Orthographic: xy is independent of depth.
    return float3(ortho ? p.xy : (fLinDepth * p.xy), fLinDepth);
}

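这里读取深度图:一方面把每个像素的深度转换为线性深度(View Space z,范围[Near,Far])存入vLinDepths,供后面的FinePruneLights逐像素判交使用;另一方面用翻转后的深度缓冲值(0为近,1为远,跳过天空像素)统计当前线程负责的4个像素的MinDepth,MaxDepth,
然后通过LDS上的原子操作(InterlockedMax/InterlockedMin)归并出线程组内(Tile内)的MinDepth,MaxDepth
注:FPTL的Tile Size为16*16,64线程一组,一个线程计算4个像素(16*16=64*4)。ldsZMin/ldsZMax是groupshared变量,所以这里的归并不会影响到别的线程组(Tile)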

#define NR_THREADS              64
#define TILE_SIZE_FPTL (16)
#define VIEWPORT_SCALE_Z (1)
#define PIXEL_PER_THREAD      ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4
// With the values above: 16*16/64 = 4 pixels per thread.
// Kernel excerpt (depth bounds): every thread fetches PIXEL_PER_THREAD depths, keeps their linear
// depth in vLinDepths and merges its local min/max depth-buffer value into ldsZMin/ldsZMax.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    // Pixel coordinate of the tile's first pixel.
    uint2 viTilLL = 16*tileIDX;

    // establish min and max depth first
    float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;


#if PIXEL_PER_THREAD == 4
    float4 vLinDepths;
#else
    float vLinDepths[PIXEL_PER_THREAD];
#endif
    {

        // Depth values at or beyond VIEWPORT_SCALE_Z are treated as sky and skipped below.

        // Fetch depths and calculate min/max
        UNITY_UNROLL
        for(int i = 0; i < PIXEL_PER_THREAD; i++)
        {
            // Flattened pixel index inside the tile: low 4 bits are x, the remaining bits are y.
            int idx = i * NR_THREADS + t;
            // Clamp to the last valid pixel so partial tiles at the screen edge stay in range.
            uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
            const float fDepth = FetchDepth(uCrd);
            // Linear depth is sampled at the pixel centre.
            vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
            if(fDepth<VIEWPORT_SCALE_Z)     // if not skydome
            {
                dpt_mi = min(fDepth, dpt_mi);
                dpt_ma = max(fDepth, dpt_ma);
            }
        }

        
        // Merge the per-thread bounds into the shared ones; the floats are compared through their bit patterns.
        InterlockedMax(ldsZMax, asuint(dpt_ma));
        InterlockedMin(ldsZMin, asuint(dpt_mi));

#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without)
        GroupMemoryBarrierWithGroupSync();
#endif
    }

    ...
}

NDCAABBBoundTest

若启用了BigTile预计算,则通过映射读取当前Tile所在Big Tile内的灯光数量(nrBigTileLights)以及对应的lightIndex
若没有启用,则直接遍历所有可见灯光;两种情况下都用g_vBoundsBuffer中的AABB判断当前灯光是否与Tile相交,若相交则加入到coarseList中

// Kernel excerpt (coarse list): tests light AABBs against the tile bounds and appends the
// overlapping light indices to coarseList. The candidate lights come either from the big-tile
// light list or from the full visible-light range; both loop headers share the same body.
// NOTE(review): vTileLL / vTileUR are computed in the elided part of the kernel.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    // build coarse list using AABB
    // Path taken when the big-tile prepass is enabled.


    #ifdef USE_TWO_PASS_TILED_LIGHTING
    // firstbithigh(64) = 6 (big tile is 64x64)
    // firstbithigh(16) = 4 (FPTL tile is 16x16)
    // so log2BigTileToTileRatio = 2
    //#define TILE_SIZE_FPTL (16)
    //#define TILE_SIZE_BIG_TILE (64)
    // i.e. 4x4 FPTL tiles make up one big tile, and log2(4) = 2
    const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);

    // Number of big tiles along X/Y.
    // (1 << log2BigTileToTileRatio) - 1 = 3, so this is a round-up division of nrTilesX / nrTilesY by 4.
    int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio;
    int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio;
    // Index of the first big tile of the current eye (eye index * big tiles per eye).
    const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY;
    // Big tile that contains the current FPTL tile.
    const int bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio);       // map the idx to 64x64 tiles
    // Entry 0 of a big-tile record holds the number of lights in that big tile.
    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
    // Walk the big tile's light list, NR_THREADS entries at a time; entries start at offset 1.
    for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
    {
        int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+l0+1];
    
    #else
    // Without the big-tile prepass, walk every visible light and test its AABB against the 16x16 tile.
    for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
    {
        #endif
        // Skip Local Volumetric Fog (lights are sorted by category). TODO: improve data locality

        // if (_LightVolumeData[l].lightCategory == LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG) { break; }

        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, unity_StereoEyeIndex);
        const float3 vMi = g_vBoundsBuffer[boundsIndices.min].xyz;
        const float3 vMa = g_vBoundsBuffer[boundsIndices.max].xyz;

        // AABB overlap test on all three axes.
        if (all(vMa > vTileLL) && all(vMi < vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;
            // Reserve a slot; lights beyond the coarse-list capacity are dropped.
            InterlockedAdd(lightOffs, uInc, uIndex);
            if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
        }
    }
    ...
}

将ldsDoesLightIntersect清零初始化,并同步线程组

#define FINE_PRUNING_ENABLED
#define LIGHT_LIST_MAX_COARSE_ENTRIES (64)// capacity of the coarseList / prunedList LDS arrays
// One 32-bit uint cannot hold an intersection bit for each of the 64 lights, so two uints are used.
groupshared uint ldsDoesLightIntersect[2];
// Kernel excerpt: clears the intersection bit masks, synchronizes, then clamps the coarse light count.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifdef FINE_PRUNING_ENABLED
    // Threads 0 and 1 each clear one mask word.
    if (t < 2)
        ldsDoesLightIntersect[t] = 0;
    #endif

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    // lightOffs may have counted more lights than were stored, so clamp to the list capacity.
    int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES);
    ...
}

SphericalIntersectionTest,FinePruneLights

SphericalIntersectionTest

这里的SphericalIntersectionTest与BigTile中的SphericalIntersectionTest唯一的区别,是需要先把coarseList拷贝到prunedList暂存
检测到灯光与Tile相交之后,再把暂存在prunedList中的lightIndex写回coarseList(coarseList被原地压缩)

#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

// Compacts coarseList in place, keeping only the lights for which DoesSphereOverlapTile succeeds.
// prunedList serves as a temporary backup of coarseList while coarseList is rewritten.
// Returns the value of lightOffsSph after the final barrier, i.e. the number of lights kept.
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
    // Thread 0 resets the shared output counter.
    if (threadID == 0) lightOffsSph = 0;

    // make a copy of coarseList in prunedList.
    int l;
    for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
        prunedList[l] = coarseList[l];

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    // View-space position for screenCoordinate at unit depth; the depth sign follows the camera-space handedness.
    #if USE_LEFT_HAND_CAMERA_SPACE
    float3 V = GetViewPosFromLinDepth(screenCoordinate, 1.0);
    #else
    float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
    #endif

    float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
    float halfTileSizeAtZDistOne = 8 * onePixDiagDist; // scale by half a tile

    for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
    {
        const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[l], g_iNrVisibLights, unity_StereoEyeIndex);
        SFiniteLightBound lightData = g_data[lightBoundIndex];

        if (DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius, g_isOrthographic != 0))
        {
            unsigned int uIndex;
            // Reserve the next output slot in coarseList.
            InterlockedAdd(lightOffsSph, 1, uIndex);
            coarseList[uIndex] = prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
        }
    }

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    return lightOffsSph;
}

// Kernel excerpt: runs the spherical intersection test and replaces the coarse light count with its result.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    // The screen coordinate passed in is the tile centre (tile origin + 8 pixels), clamped to the last valid pixel.
    iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(16 / 2, 16 / 2), uint2(iWidth - 1, iHeight - 1))));
    #endif
    ...
}

FinePruneLights

s_lightVolumesCache LDS

在开始计算FinePruneLights之前,需要预先记录coarseList对应的LightVolume进LDS s_lightVolumesCache(StoreLightVolumeCache)中

#define FINE_PRUNING_ENABLED

// Kernel excerpt: produces prunedList and ldsNrLightsFinal, either by a plain copy of the
// coarse list (fine pruning disabled) or through FinePruneLights.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifndef FINE_PRUNING_ENABLED
        {
            // No fine pruning: each thread copies one coarse entry and thread 0 publishes the count.
            if((int)t<iNrCoarseLights)
                prunedList[t] = coarseList[t];
            if(t==0)
                ldsNrLightsFinal=iNrCoarseLights;
        }
    #else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
    #endif
    ...
}

//

//eyeIndex==0时,GetCoarseLightIndex=>coarseList[l]
// Maps a light index to its slot in the per-eye light cull data.
// For monoscopic rendering there is a single set of structs; in stereo the left-eye
// structs come first, followed by the right-eye structs.
uint GenerateLightCullDataIndex(uint lightIndex, uint numVisibleLights, uint eyeIndex)
{
    // Stay within bounds
    const uint clampedIndex = min(lightIndex, numVisibleLights - 1);
    return eyeIndex * numVisibleLights + clampedIndex;
}

// Returns the light cull data index of coarse-list entry l, or 0 when l is past the end of the list.
int GetCoarseLightIndex(int l, int iNrCoarseLights)
{
    if (l >= iNrCoarseLights)
        return 0;

    return GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex);
}

//

groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];

// Packs a volume type and a coarse light index into one cache entry.
void StoreLightVolumeCache(int lightIndex, int coarseIndex, uint volumeType)
{
    // 3 bits for the volume type, in case we have a corrupted one we can early out of the switch statement.
    const uint typeBits = volumeType & 0x7;
    // 29 bits for a coarse light index.
    const uint indexBits = (uint)(coarseIndex << 3);

    s_lightVolumesCache[lightIndex] = typeBits | indexBits;
}

// Unpacks a cache entry: the low 3 bits are the volume type, the remaining bits the coarse light index.
void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
{
    const int packed = s_lightVolumesCache[lightIndex];
    volumeType = packed & 0x7;
    coarseIndex = packed >> 3;
}


// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
// Excerpt (cache fill): each of the first iNrCoarseLights threads stores the cull data index and
// volume type of one coarse light into the s_lightVolumesCache LDS array.
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
    uint t = threadID;
    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;

    // Per-thread intersection bits: word 0 covers lights 0..31, word 1 covers lights 32..63.
    uint uLightsFlags[2] = {0, 0};
    int l = 0;
    // need this outer loop even on xb1 and ps4 since direct lights and
    // reflection lights are kept in separate regions.

    if (threadID < (uint)iNrCoarseLights)
    {
        int idxCoarse = GetCoarseLightIndex((int)threadID, iNrCoarseLights);
        int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume;
        // The cache slot is the thread's position in the coarse list.
        StoreLightVolumeCache(threadID, idxCoarse, uLightVolume);
    }

    // Make the cache visible to the whole group before it is read.
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    ....
}


判交计算

读取LDS中缓存的volumeData,并且利用之前逐像素计算的线性深度(vLinDepths)重建每个像素的View Space位置,逐像素进行判交

#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

// Number of lights accepted for the tile.
groupshared int ldsNrLightsFinal;

// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
// Excerpt (intersection tests): for every coarse light, each thread tests its own pixels
// against the light volume (cone, sphere or box) and records the result as a bit in uLightsFlags.
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
    ...

    //When using LDS to cache the volume data, this produces the best most optimal code.
    //Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
    for (; l < iNrCoarseLights; ++l)
    {
        int idxCoarse;
        int uLightVolume;
        // Read back the cull data index and volume type cached for light l.
        LoadLightVolumeCache(l, idxCoarse, uLightVolume);
        bool lightValid = false;
        if (uLightVolume == LIGHTVOLUMETYPE_CONE)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
            for (int i = 0; i < PIXEL_PER_THREAD; i++)
            {
                int idx = t + i * NR_THREADS;
                // Reconstruct the view-space position of this pixel from its linear depth.
                uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);

                // NOTE(review): lightData.lightPos is presumably in view space, matching vVPos - confirm.
                // check pixel
                // Use the light-to-pixel vector and the light axes to decide whether the pixel lies inside the cone.
                float3 fromLight = vVPos - lightData.lightPos.xyz;
                float distSq = dot(fromLight, fromLight);
                const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz

                float2 V = abs(float2(dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz)));
                // bIsSpotDisc is true, so
                // fDist2D = sqrt(dot(fromLight, lightAxisX)^2 + dot(fromLight, lightAxisY)^2)
                float fDist2D = bIsSpotDisc ? length(V) : max(V.x, V.y);



                // The pixel is inside the cone when both conditions hold:
                //   lightData.radiusSq > distSq                 (within range)
                //   fSclProj > fDist2D * lightData.cotan        (fSclProj / fDist2D > cotan, i.e. within the cone angle)

                // How lightVolumeData.radiusSq is computed on the C# side:
                //lightVolumeData.radiusSq = range * range;

                // How lightVolumeData.cotan is computed on the C# side:
                // var sa = light.spotAngle;
                // var cs = Mathf.Cos(0.5f * sa * Mathf.Deg2Rad);
                // var si = Mathf.Sin(0.5f * sa * Mathf.Deg2Rad);
                //  if (gpuLightType == GPULightType.ProjectorPyramid)
                //    {
                //        Vector3 lightPosToProjWindowCorner = (0.5f * lightDimensions.x) * vx + (0.5f * lightDimensions.y) * vy + 1.0f * vz;
                //        cs = Vector3.Dot(vz, Vector3.Normalize(lightPosToProjWindowCorner));
                //        si = Mathf.Sqrt(1.0f - cs * cs);
                //   }
                //   const float FltMax = 3.402823466e+38F;
                //   var ta = cs > 0.0f ? (si / cs) : FltMax;
                //   var cota = si > 0.0f ? (cs / si) : FltMax;
                //   lightVolumeData.cotan = cota;


                bool validInPixel = all(float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D * lightData.cotan));
                #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                //a wave is on the same tile, and the loop is uniform for the wave.
                // thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
                lightValid = WaveActiveAnyTrue(validInPixel);
                #else
                lightValid = validInPixel;
                #endif
                if (lightValid)
                    break;
            }
        }
        else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            for (int i = 0; i < PIXEL_PER_THREAD; i++)
            {
                int idx = t + i * NR_THREADS;
                // Reconstruct the view-space position of this pixel from its linear depth.
                uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);

                // Plain point-in-sphere test using squared distances.
                // check pixel
                float3 vLp = lightData.lightPos.xyz;
                float3 toLight = vLp - vVPos;
                float distSq = dot(toLight, toLight);

                bool validInPixel = lightData.radiusSq > distSq;
                #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                lightValid = WaveActiveAnyTrue(validInPixel);
                #else
                lightValid = validInPixel;
                #endif
                if (lightValid)
                    break;
            }
        }
        else if (uLightVolume == LIGHTVOLUMETYPE_BOX)
        {
            LightVolumeData lightData = _LightVolumeData[idxCoarse];
            for (int i = 0; i < PIXEL_PER_THREAD; i++)
            {
                int idx = t + i * NR_THREADS;
                // Reconstruct the view-space position of this pixel from its linear depth.
                uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
                float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);

                // check pixel
                float3 toLight = lightData.lightPos.xyz - vVPos;

                // Project toLight onto the box axes and test the normalized distance on every axis.
                float3 dist = float3(dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ));
                dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
                bool validInPixel = max(max(dist.x, dist.y), dist.z) < 1; // but allows us to not write out OuterDists
                #ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
                lightValid = WaveActiveAnyTrue(validInPixel);
                #else
                lightValid = validInPixel;
                #endif
                if (lightValid)
                    break;
            }
        }
        else
            // Any other volume type ends the loop over the coarse lights.
            break;

        // Record the result for light l: lights 0..31 go to uLightsFlags[0], the rest to uLightsFlags[1].
        uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
    }

    ...
}



// Kernel excerpt: call site that fills prunedList and ldsNrLightsFinal, by a plain copy of the
// coarse list when fine pruning is disabled, otherwise through FinePruneLights.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifndef FINE_PRUNING_ENABLED
        {
            // Fallback path: one coarse entry per thread, count written by thread 0.
            if((int)t<iNrCoarseLights)
                prunedList[t] = coarseList[t];
            if(t==0)
                ldsNrLightsFinal=iNrCoarseLights;
        }
    #else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
    #endif
    ...
}
Resolve Pruned List

遍历ldsDoesLightIntersect的Flag,重新结算Tile内的灯光数量,并把对应灯光Index(coarseList)加入到prunedList

#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS

// Number of lights accepted for the tile.
groupshared int ldsNrLightsFinal;

// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
// Excerpt (resolve): merges the per-thread intersection bits into LDS, then every thread whose
// light has its bit set appends that light to prunedList.
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
    ....

    //When using LDS to cache the volume data, this produces the best most optimal code.
    //Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
    for (; l < iNrCoarseLights; ++l)
    {
        // Intersection tests (elided).
        ....
        // Record the result for light l: lights 0..31 go to uLightsFlags[0], the rest to uLightsFlags[1].
        uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
    }

    // OR every thread's bits into the shared masks.
    InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
    InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
    // Thread 0 resets the accepted-light counter.
    if (t == 0) 
        ldsNrLightsFinal = 0;

    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    // Thread t owns coarse light t: if its bit is set in the merged mask, append it to prunedList.
    if (t < (uint)iNrCoarseLights && (ldsDoesLightIntersect[t < 32 ? 0 : 1] & (1 << (t & 31))) != 0)
    {
        unsigned int uInc = 1;
        unsigned int uIndex;
        InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
        if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES) 
            prunedList[uIndex] = coarseList[t]; // we allow up to 64 pruned lights while stored in LDS.
    }
}



// Kernel excerpt: the step that delivers prunedList and ldsNrLightsFinal to the rest of the kernel.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    #ifndef FINE_PRUNING_ENABLED
        {
            // Without fine pruning the coarse list is copied as-is and its size becomes the final count.
            if((int)t<iNrCoarseLights)
                prunedList[t] = coarseList[t];
            if(t==0)
                ldsNrLightsFinal=iNrCoarseLights;
        }
    #else
    {
        // initializes ldsNrLightsFinal with the number of accepted lights.
        // all accepted entries delivered in prunedList[].
        FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
    }
    #endif
    ...
}

遍历PruneList根据不同的LightCategory进行划分,Resolve FeatureFlag

根据上一步FinePruneLights得到的prunedList,可以通过遍历prunedList对应的灯光,拿到灯光的lightCategory以及featureFlags
这样就可以计算出Tile内CategoryCount以及光照涉及的Light Feature Flag(Punctual/Env/Decal....)

Feature Flag主要用于Deferred的Material classification:https://www.cnblogs.com/OneStargazer/p/18174135

groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES]; 

// Per-category light counters.
groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE];

// Light feature flags accumulated for the tile.
#ifdef USE_FEATURE_FLAGS
groupshared uint ldsFeatureFlags;
RWStructuredBuffer<uint> g_TileFeatureFlags;
#endif

// Kernel excerpt: counts the pruned lights per category, accumulates their feature flags,
// sorts prunedList and writes the tile's feature flags.
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    // Reset the per-category counters.
    if (t < CATEGORY_LIST_SIZE)
        ldsCategoryListCount[t] = 0;
    // Reset the shared feature flags.
    #ifdef USE_FEATURE_FLAGS
        if(t==0)
            ldsFeatureFlags=0;
    #endif

    // Group synchronization.
    #if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
    #endif

    // ldsNrLightsFinal is the light count produced by the pruning step, clamped to the list capacity.
    int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES);
    // Walk prunedList: InterlockedAdd counts each light under its category,
    // InterlockedOr merges its feature flags into the tile's flags.
    for (int i = t; i < nrLightsCombinedList; i += NR_THREADS)
    {
        const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[i], g_iNrVisibLights, unity_StereoEyeIndex);

        InterlockedAdd(ldsCategoryListCount[_LightVolumeData[lightBoundIndex].lightCategory], 1);
        #ifdef USE_FEATURE_FLAGS
            InterlockedOr(ldsFeatureFlags, _LightVolumeData[lightBoundIndex].featureFlags);
        #endif
    }

    // Sort prunedList.
    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
    #if NR_THREADS > PLATFORM_LANE_COUNT
    SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS);
    //MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
    #endif

    // Thread 0 writes this tile's entry of g_TileFeatureFlags.
    #ifdef USE_FEATURE_FLAGS
        if(t == 0)
        {
            uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;
            // In case of a background tile
            if(ldsZMax < ldsZMin)   // is background pixel
                {
                // There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case. 
                // It will still execute but will do nothing
                featureFlags = 0;
                }

            // Flat tile index, offset by one full set of tiles per eye.
            g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags;
        }
    #endif

    ...
}

根据不同的LightCategory使用对应的Offset压入到g_vLightList

这最后一步就是把prunedList放进g_vLightList(大象装进冰箱)
由于LightIndex用不到UInt32那么多的位数,所以写入时把两个Index打包进一个uint,用的时候再Unpack出来.


/////////HDRenderPipeline.LightLoop.cs PrepareBuildGPULightListPassData
//灯光数量作为EnvLightIndex起始点
//cb._EnvLightIndexShift = (uint)m_GpuLightsBuilder.lightsCount;

//灯光数量以及反射探针数量之和作为DecalIndex起始点
//cb._DecalIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count);

//灯光数量,反射探针数量以及贴花数量之和作为Local Volumetric Fog Index起始点
//cb._LocalVolumetricFogIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count + decalDatasCount);
/////////End of HDRenderPipeline.LightLoop.cs

CBUFFER_START(ShaderVariablesLightList)
    ...
    uint _EnvLightIndexShift;
    uint _DecalIndexShift;
    uint _LocalVolumetricFogIndexShift;
    ...
CBUFFER_END
// A per-tile list holds up to 64 16-bit entries (1 count slot + up to 63 light indices).
// Two entries are packed into each uint, so a tile list occupies 32 DWORDs.
#define LIGHT_DWORD_PER_FPTL_TILE (32)

//ShaderConfig.cs.hlsl
// Emitted by the GenerateHLSL script: upper bound on the number of light indices stored in one FPTL tile list.
#define SHADEROPTIONS_FPTLMAX_LIGHT_COUNT (63)

// Excerpt of the per-tile light list kernel: the final stage that packs the pruned,
// per-category light indices of this tile into g_vLightList (two 16-bit entries per uint).
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...

    // Compute the index of this tile's first list (category 0).
    // write lights to global buffers
    int localOffs = 0;
    int offs = tileIDX.y * nrTilesX + tileIDX.x;

    #if defined(UNITY_STEREO_INSTANCING_ENABLED)
        // Eye base offset must match code in GetCountAndStartTile()
        offs += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT;
    #endif

    // Fill shiftIndex from the constant buffer values.
    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
    // to make it work correctly
    int shiftIndex[CATEGORY_LIST_SIZE];
    ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);
    
    shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
    shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;

    // ldsCategoryListCount gives, for each category (Punctual/Area/Env/Decal), how many entries of prunedList belong to it.
    for (int category = 0; category < CATEGORY_LIST_SIZE; category++)
    {
        int nrLightsFinal = ldsCategoryListCount[category];
        int nrLightsFinalClamped = nrLightsFinal < SHADEROPTIONS_FPTLMAX_LIGHT_COUNT ? nrLightsFinal : SHADEROPTIONS_FPTLMAX_LIGHT_COUNT;

        // A light index does not need all 32 bits of a uint, so two entries (uLow/uHigh) are packed into one DWORD.
        // nrLightsFinalClamped + 1: the extra entry is the slot that stores the count itself.
        // ((nrLightsFinalClamped + 1) + 1) >> 1 is DivRoundUp(nrLightsFinalClamped + 1, 2).
        const int nrDWords = ((nrLightsFinalClamped + 1) + 1) >> 1;
        for (int l = (int)t; l < (int)nrDWords; l += NR_THREADS)
        {
            // prunedList holds indices into the combined list shared by all categories, so the
            // category's shiftIndex is subtracted to get the index inside that category's own buffer.

            // We remap the prunedList index to the original LightData / EnvLightData indices
            // DWORD 0 stores the clamped count in its low half; every other DWORD stores entry (2*l - 1) there.
            uint uLow = l == 0 ? nrLightsFinalClamped : prunedList[max(0, 2 * l - 1 + localOffs)] - shiftIndex[category];
            uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];

            // uHigh goes into the upper 16 bits, uLow stays in the lower 16 bits.
            g_vLightList[LIGHT_DWORD_PER_FPTL_TILE * offs + l] = (uLow & 0xffff) | (uHigh << 16);
        }

        // Advance to the first prunedList entry of the next category (unclamped count).
        localOffs += nrLightsFinal;
        // Lists of consecutive categories are one full tile grid (nrTilesX * nrTilesY) apart.
        // NOTE(review): the article attributes this layout to better cache hits during traversal — not verifiable from this excerpt.
        offs += (nrTilesX * nrTilesY);
    }

    ...
}

Lighting Loop

最终来到了LightLoop中应用g_vLightList计算的结果。
应用的流程如下:
1.首先需要根据当前像素PositionSS计算出Tile的Index
2.根据Tile的Index以及当前计算的Category得到g_vLightList的偏移tileOffset
3.Tile List的起始点的Index(start)就是tileOffset,该Tile第一个DWORD的低16bit(用&0xffff取出)即为LightCount
4.后续获取LightData的Index时,从第一个DWORD的高16bit(uHigh)开始,按每个DWORD存放两个16bit Index的方式依次遍历即可

#define LIGHT_DWORD_PER_FPTL_TILE (32)

//渲染不透明队列时启用
#ifdef USE_FPTL_LIGHTLIST

//计算PositionInputs的tileIndex
//uint2 tileIndex = uint2(fragInput.positionSS.xy) / TILE_SIZE_FPTL;
//PositionInputs posInput = GetPositionInput(fragInput.positionSS.xy, _ScreenSize.zw, fragInput.positionSS.z, fragInput.positionSS.w, input.positionWS.xyz, tileIndex);

// Returns the index of the tile list for `lightCategory` that covers this pixel.
// Categories are stacked one tile grid after another, so the category selects a
// block of _NumTileFtplY rows and the tile coordinate selects the entry inside it.
int GetTileOffset(PositionInputs posInput, uint lightCategory)
{
    const uint2 coord = posInput.tileCoord;
    return coord.x + _NumTileFtplX * (lightCategory * _NumTileFtplY + coord.y);
}

// Looks up the tile list of `lightCategory` for this pixel.
//   start      - index of the tile list (to be passed to FetchIndex as tileOffset)
//   lightCount - number of lights stored in that list
void GetCountAndStartTile(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    int listIndex = GetTileOffset(posInput, lightCategory);

    #if defined(UNITY_STEREO_INSTANCING_ENABLED)
    // Eye base offset must match code in lightlistbuild.compute
    listIndex += unity_StereoEyeIndex * _NumTileFtplX * _NumTileFtplY * LIGHTCATEGORY_COUNT;
    #endif

    start = listIndex;

    // The light count lives in the low 16 bits of the list's first DWORD.
    lightCount = g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * listIndex] & 0xffff;
}

// Tile size used by the FPTL light list variant.
uint GetTileSize() { return TILE_SIZE_FPTL; }

// FPTL variant of the count/start query: forwards straight to the per-tile lookup.
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    GetCountAndStartTile(posInput,
                         lightCategory,
                         start,
                         lightCount);
}

// Reads the light index number `lightOffset` from the tile list `tileOffset`.
// Each DWORD of a tile list packs two 16-bit entries, and entry 0 is the light
// count, so light N is entry N + 1.
uint FetchIndex(uint tileOffset, uint lightOffset)
{
    // Skip the count slot.
    const uint entry = lightOffset + 1;

    // entry / 2 selects the DWORD; the entry's parity selects the low or high half.
    const uint packedPair = g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * tileOffset + (entry >> 1)];
    const uint bitShift = (entry & 1) * 16;

    return (packedPair >> bitShift) & 0xffff;
}

//渲染透明队列时启用
#elif defined(USE_CLUSTERED_LIGHTLIST)
...
    //LightingLoop.hlsl

    // This struct is define in the material. the Lightloop must not access it
    // PostEvaluateBSDF call at the end will convert Lighting to diffuse and specular lighting
    AggregateLighting aggregateLighting;
    ZERO_INITIALIZE(AggregateLighting, aggregateLighting); // LightLoop is in charge of initializing the struct

    
    // Punctual lights: only evaluated when the feature flags request them.
    if (featureFlags & LIGHTFEATUREFLAGS_PUNCTUAL)
    {
        uint lightCount, lightStart;

// Enabled by default (the non-tiled fallback branch is left commented out in this excerpt).
//#ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
        GetCountAndStart(posInput, LIGHTCATEGORY_PUNCTUAL, lightStart, lightCount);
//#else   // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
        //lightCount = _PunctualLightCount;
        //lightStart = 0;
//#endif

        bool fastPath = false;

        // SCALARIZE_LIGHT_LOOP involves wave instructions; for a detailed walkthrough see https://zhuanlan.zhihu.com/p/469436345
        #if SCALARIZE_LIGHT_LOOP
            uint lightStartLane0;
            fastPath = IsFastPath(lightStart, lightStartLane0);

            if (fastPath)
            {
                lightStart = lightStartLane0;
            }
        #endif

        // Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), only the one relevant to current thread/pixel are processed.
        // For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),
        // v_ are variables that might have different value for each thread in the wave (meant for vector registers).
        // This will perform more loads than it is supposed to, however, the benefits should offset the downside, especially given that light data accessed should be largely coherent.
        // Note that the above is valid only if wave intriniscs are supported.
        uint v_lightListOffset = 0;
        uint v_lightIdx = lightStart;

        while (v_lightListOffset < lightCount)
        {
            v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
#if SCALARIZE_LIGHT_LOOP
            uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
#else
            uint s_lightIdx = v_lightIdx;
#endif
            if (s_lightIdx == -1)
                break;

            // Fetch the LightData for this index.
            LightData s_lightData = FetchLight(s_lightIdx);

            ...
        }
    }
posted @ 2024-04-06 04:53  凶恶的真实  阅读(307)  评论(0编辑  收藏  举报