Unity的Forward+ FPTL光照剔除解析(二)


在建立更细粒度的Tile Light List的时候,可以通过屏幕坐标映射找到对应的BigTile,只需对BigTile内的Light做剔除计算,从而减少剔除时的运算量。



        // Records the coarse "big tile" light-list pass: binds the culling buffers, pushes the
        // light-list constant buffer and dispatches one thread group per big tile per view.
        // The group counts used by the dispatch are filled in on the pass data as:
        //passData.numBigTilesX = (w + 63) / 64; HDUtils.DivRoundUp(hdCamera.width,64)
        //passData.numBigTilesY = (h + 63) / 64; HDUtils.DivRoundUp(hdCamera.height,64)
        // enable coarse 2D pass on 64x64 tiles (used for both fptl and clustered).
        static void BigTilePrepass(BuildGPULightListPassData data, CommandBuffer cmd)
            // Nothing is recorded unless both the light-list build and the big-tile prepass are enabled.
            if (data.runLightList && data.runBigTilePrepass)
                // g_vLightList is bound to the pass output (the per-big-tile light list).
                cmd.SetComputeBufferParam(data.bigTilePrepassShader, data.bigTilePrepassKernel, HDShaderIDs.g_vLightList, data.output.bigTileLightList);
                // Remaining bindings: screen-space AABB bounds, light volume data and convex bounds.
                cmd.SetComputeBufferParam(data.bigTilePrepassShader, data.bigTilePrepassKernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
                cmd.SetComputeBufferParam(data.bigTilePrepassShader, data.bigTilePrepassKernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
                cmd.SetComputeBufferParam(data.bigTilePrepassShader, data.bigTilePrepassKernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

                // Upload the shared light-list constants for this shader.
                ConstantBuffer.Push(cmd, data.lightListCB, data.bigTilePrepassShader, HDShaderIDs._ShaderVariablesLightList);

                // Group grid: X/Y = big-tile counts, Z = view count (read in the kernel as the eye index).
                cmd.DispatchCompute(data.bigTilePrepassShader, data.bigTilePrepassKernel, data.numBigTilesX, data.numBigTilesY, data.viewCount);



// Per-group scratch list of light indices for the current big tile, plus its running element count.
groupshared unsigned int lightsListLDS[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE];
groupshared uint lightOffs;

// Kernel excerpt (first stage): builds the coarse per-tile light list by testing each visible
// light's screen-space AABB against the 64x64 tile handled by this thread group.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
    // Group id: xy = big-tile coordinate, z = eye (view) index.
    uint eyeIndex = u3GroupID.z;
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;
    // Number of 64-pixel tiles needed to cover the screen, rounded up.
    uint nrBigTilesX = (iWidth+63)/64;
    uint nrBigTilesY = (iHeight+63)/64;

    // A single thread resets the shared counter.
    if(t==0) lightOffs = 0;


    // Raw pixel coordinates of tile
    uint2 viTilLL = 64*tileIDX;
    uint2 viTilUR = min( viTilLL+uint2(64,64), uint2(iWidth, iHeight) );            // not width and height minus 1 since viTilUR represents the end of the tile corner.

    // 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
    float2 vTileLL = float2(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight);
    float2 vTileUR = float2(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight);

    // Iterate over g_vBoundsBuffer, the visible-light bounds buffer built at the end of part one of this article series.
    // build coarse list using AABB
    for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
        // The buffer offsets are derived from eyeIndex and g_iNrVisibLights (the visible light count).
        // For reference, the helper called below is quoted here:

        //ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(uint lightIndex, uint numVisibleLights, uint eyeIndex)
            //const uint eyeRelativeBase = eyeIndex * 2 * numVisibleLights;

            //ScreenSpaceBoundsIndices indices;
            //indices.min = eyeRelativeBase + lightIndex;
            //indices.max = indices.min + numVisibleLights;

            //return indices;

        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
        const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
        const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;

        // The light is kept when its AABB overlaps the tile rectangle on both axes.
        if( all(vMa>vTileLL) && all(vMi<vTileUR))
            unsigned int uInc = 1;
            unsigned int uIndex;

            // Atomically reserve one slot; uIndex receives the counter value before the add.
            //InterlockedAdd(lightOffs, uInc);
            InterlockedAdd(lightOffs, uInc, uIndex);

                // NOTE(review): no check that uIndex fits inside lightsListLDS is visible in this excerpt;
                // the extra indentation suggests a guarding `if` line was dropped - confirm against the full shader.
                lightsListLDS[uIndex] = l;     // add to light list
// NOTE(review): the body of the following #if and its #endif are not part of this excerpt.
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_XBOXONE) || defined(SHADER_API_GAMECORE) || defined(SHADER_API_SWITCH) // not sure why XB1 and Switch need the barrier (it will not be correct without)

    // Number of coarse lights, clamped to MAX_NR_BIGTILE_LIGHTS.
    int iNrCoarseLights = min(lightOffs,MAX_NR_BIGTILE_LIGHTS);





// Kernel excerpt (second stage): refine the coarse list with a per-light bounding-sphere test.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
    // screenCoordinate = float2(min(viTilLL.xy+uint2(64/2,64/2), uint2(iWidth-1, iHeight-1))) is the
    // tile centre in pixels, clamped to the last pixel of the screen.
    SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(64/2,64/2), uint2(iWidth-1, iHeight-1))), eyeIndex );

// Converts a screen-space position plus a linear depth into a view-space position, using the
// per-eye screen projection matrix g_mScrProjectionArr[eyeIndex].
//   v2ScrPos  : screen-space position
//   fLinDepth : linear depth; returned unchanged as the z component
//   returns   : float3(x, y, fLinDepth); x and y are scaled by fLinDepth unless the camera is orthographic
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex)
    float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex];

    bool isOrthographic = g_isOrthographic!=0;
    // Scale terms of the projection.
    float fSx = g_mScrProjection[0].x;
    float fSy = g_mScrProjection[1].y;
    // Offset terms: read from the .w component for orthographic, from .z for perspective.
    float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z;
    float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z;

    // The left-handed variant is hard-coded here; the alternative is kept as a comment.
    bool useLeftHandVersion = true;
    //bool useLeftHandVersion = isOrthographic;

    float s = useLeftHandVersion ? 1 : (-1);
    // Undo the projection's offset and scale.
    float2 p = float2( (s*v2ScrPos.x-fCx)/fSx, (s*v2ScrPos.y-fCy)/fSy);

    return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth);

// Refines the coarse light list: every light whose bounding sphere does not overlap the tile
// (per DoesSphereOverlapTile) has its lightsListLDS slot overwritten with UINT_MAX so that it
// can be dropped later.
//   threadID         : index of this thread inside the group; lights are strided by NR_THREADS
//   iNrCoarseLights  : number of valid entries in lightsListLDS
//   screenCoordinate : pixel coordinate the test ray passes through (the caller passes the tile centre)
//   eyeIndex         : view index used to address the per-eye data
void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate, uint eyeIndex)
{
    // View-space position of screenCoordinate at linear depth 1.0; it serves as the
    // (unnormalized) direction of the ray that is intersected with each light's bounding sphere.
    float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0, eyeIndex);
    //float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0, eyeIndex);

    float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne(eyeIndex);
    float halfTileSizeAtZDistOne = 32*onePixDiagDist;       // scale by half a tile

    for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
    {
        const int boundIndex = GenerateLightCullDataIndex(lightsListLDS[l], g_iNrVisibLights, eyeIndex);
        SFiniteLightBound lgtDat = g_data[boundIndex];

        // Mark the light as removed when its sphere misses the tile.
        if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius, g_isOrthographic!=0) )
            lightsListLDS[l]=UINT_MAX;
    }
}



// Ray-versus-sphere test used for coarse tile culling.
// Returns true when the origin lies inside the enlarged sphere (c<0), or when the ray along dir
// hits it (positive discriminant and the sphere centre lies in front along the ray).
// The sphere is enlarged by half a tile's size at the sphere's depth, so that a sphere which
// overlaps the tile also overlaps the ray through the tile centre.
//   dir                    : ray direction through the tile centre (does not need to be normalized)
//   halfTileSizeAtZDistOne : half the tile size at depth one
//   sphCen_in, sphRadiusIn : bounding sphere of the light
//   isOrthographic         : only referenced by the commented-out variants below
bool DoesSphereOverlapTile(float3 dir, float halfTileSizeAtZDistOne, float3 sphCen_in, float sphRadiusIn, bool isOrthographic)
    //ray direction down center of tile (does not need to be normalized).
    //float3 V = float3(isOrthographic ? 0.0 : dir.x, isOrthographic ? 0.0 : dir.y, dir.z);
    //float3 sphCen = float3(sphCen_in.x - (isOrthographic ? dir.x : 0.0), sphCen_in.y - (isOrthographic ? dir.y : 0.0), sphCen_in.z);
    float3 V =  float3(dir.x, dir.y, dir.z);
    float3 sphCen =  float3(sphCen_in.x, sphCen_in.y, sphCen_in.z);
//#if 1
    float3 maxZdir = float3(-sphCen.z*sphCen.x, -sphCen.z*sphCen.y, sphCen.x*sphCen.x + sphCen.y*sphCen.y);     // cross(sphCen,cross(Zaxis,sphCen))
    float len = length(maxZdir);
    float scalarProj = len>0.0001 ? (maxZdir.z/len) : len;  // if len<=0.0001 then either |sphCen|<sphRadius or sphCen is very closely aligned with Z axis in which case little to no additional offs needed.
    float offs = scalarProj*sphRadiusIn;
    //float offs = sphRadiusIn;       // more false positives due to larger radius but works too

    // enlarge sphere so it overlaps the center of the tile assuming it overlaps the tile to begin with.
    float s = sphCen.z+offs;
    //float s = -(sphCen.z-offs);

    // A z value multiplied by halfTileSizeAtZDistOne gives half the tile size at that depth,
    // so the radius below is the sphere radius after growing it by half a tile:
    //sphRadius = sphRadiusIn+(sphCen.z + ((maxZdir.z / len) * sphRadiusIn))* halfTileSizeAtZDistOne
    float sphRadius = sphRadiusIn + s * halfTileSizeAtZDistOne;
    //float sphRadius = sphRadiusIn + (isOrthographic ? 1.0 : s)*halfTileSizeAtZDistOne;

    // Coefficients of the ray/sphere quadratic (ray origin at the view-space origin).
    float a = dot(V,V);
    float CdotV = dot(sphCen,V);
    float c = dot(sphCen,sphCen) - sphRadius*sphRadius;

    // Quarter of the discriminant.
    float fDescDivFour = CdotV*CdotV - a*c;

    return c<0 || (fDescDivFour>0 && CdotV>0);      // if ray hits bounding sphere



// Kernel excerpt (third stage): exact edge tests between each remaining light hull and the tile,
// called with the tile's pixel-space corners.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
    CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, eyeIndex);

// Converts a screen-space position plus a linear depth into a view-space position, using the
// per-eye screen projection matrix g_mScrProjectionArr[eyeIndex].
//   v2ScrPos  : screen-space position
//   fLinDepth : linear depth; returned unchanged as the z component
//   returns   : float3(x, y, fLinDepth); x and y are scaled by fLinDepth unless the camera is orthographic
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex)
    float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex];

    bool isOrthographic = g_isOrthographic!=0;
    // Scale terms of the projection.
    float fSx = g_mScrProjection[0].x;
    float fSy = g_mScrProjection[1].y;
    // Offset terms: read from the .w component for orthographic, from .z for perspective.
    float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z;
    float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z;

    // The left-handed variant is hard-coded here; the alternative is kept as a comment.
    bool useLeftHandVersion = true;
    //bool useLeftHandVersion = isOrthographic;

    float s = useLeftHandVersion ? 1 : (-1);
    // Undo the projection's offset and scale.
    float2 p = float2( (s*v2ScrPos.x-fCx)/fSx, (s*v2ScrPos.y-fCy)/fSy);

    return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth);

// Returns corner i (0..7) of the tile frustum in view space.
//   bit 0 of i : x = viTilLL.x (clear) or viTilUR.x (set)
//   bit 1 of i : y = viTilLL.y (clear) or viTilUR.y (set)
//   bit 2 of i : depth = g_fNearPlane (clear) or fTileFarPlane (set)
float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane, uint eyeIndex)
{
    float x = (i&1)==0 ? viTilLL.x : viTilUR.x;
    float y = (i&2)==0 ? viTilLL.y : viTilUR.y;
    float z = (i&4)==0 ? g_fNearPlane : fTileFarPlane;
    // Depth stays positive: this walkthrough uses the left-handed view space
    // (useLeftHandVersion = true in GetViewPosFromLinDepth). The negation belongs to the
    // right-handed variant only and is kept here as a comment.
    //z = -z;
    return GetViewPosFromLinDepth( float2(x, y), z, eyeIndex);
}
// Returns edge e0 of the tile frustum as a start point and a direction.
//   vP0 (out)     : start vertex of the edge, in view space
//   vE0 (out)     : edge direction (not normalized)
//   e0            : edge index; e0>>2 selects the section, e0&3 the edge within the section
//   viTilLL/UR    : tile corners in pixels
//   fTileFarPlane : far depth of the tile frustum
void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane, uint eyeIndex)
{
    int iSection = e0>>2;       // section 0 is side edges, section 1 is near edges and section 2 is far edges
    int iSwizzle = e0&0x3;

    int i=iSwizzle + (2*(iSection&0x2));    // offset by 4 at section 2
    // The y components of the two corners are exchanged before the vertex lookup.
    vP0 = GetTileVertex(uint2(viTilLL.x, viTilUR.y), uint2(viTilUR.x, viTilLL.y), i, fTileFarPlane, eyeIndex);

    // Side-edge direction: with a perspective camera the edge passes through the origin, so the
    // vertex itself is the direction; with an orthographic camera it is the view axis.
    // Exactly one declaration may be active; the left-handed one (+Z) is kept and the
    // right-handed alternative is left as a comment.
    float3 edgeSectionZero = g_isOrthographic==0 ? vP0 : float3(0.0,0.0,1.0);
    //float3 edgeSectionZero = g_isOrthographic==0 ? vP0 : float3(0.0,0.0,-1.0);

    // Near/far edges are axis aligned: +-X or +-Y depending on iSwizzle.
    vE0 = iSection == 0 ? edgeSectionZero : (((iSwizzle & 0x2) == 0 ? 1.0f : (-1.0f)) * ((int)(iSwizzle & 0x1) == (iSwizzle >> 1) ? float3(1, 0, 0) : float3(0, 1, 0)));
}

// Returns corner p (0..7) of a light's bounding hull.
//   boxX/boxY/boxZ : hull axes
//   center         : hull centre
//   scaleXY        : extra X/Y scale applied only to the four "top" vertices (bit 2 of p set)
//   p              : bit 0 = +X/-X, bit 1 = +Y/-Y, bit 2 = +Z/-Z
float3 GetHullVertex(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int p)
    const bool bIsTopVertex = (p&4)!=0;
    // Signed per-axis weights; top vertices additionally scale X and Y by scaleXY.
    float3 vScales = float3(
        ((p&1)!=0 ? 1.0f : (-1.0f))*(bIsTopVertex ? scaleXY.x : 1.0),
        ((p&2)!=0 ? 1.0f : (-1.0f))*(bIsTopVertex ? scaleXY.y : 1.0),
        (p&4)!=0 ? 1.0f : (-1.0f) );
    return (vScales.x*boxX + vScales.y*boxY + vScales.z*boxZ) + center;

// Returns edge e0 (0..11) of a light's bounding hull.
//   idx0 (out)     : vertex index at which the edge starts
//   idx_twin (out) : vertex index at which the edge ends
//   vP0 (out)      : position of the start vertex
//   vE0 (out)      : edge vector from the start vertex to the end vertex
//   remaining parameters are forwarded to GetHullVertex
void GetHullEdge(out int idx0, out int idx_twin, out float3 vP0, out float3 vE0, const int e0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY)
    // Values for e0 = 0..11:
    int iAxis = e0>>2;// iAxis => [0,0,0,0,1,1,1,1,2,2,2,2]
    int iSwizzle = e0&0x3;// iSwizzle => [0,1,2,3,0,1,2,3,0,1,2,3]
    bool bIsSwizzleOneOrTwo = ((iSwizzle-1)&0x2)==0;// bIsSwizzleOneOrTwo => [0,1,1,0,0,1,1,0,0,1,1,0]

    // i0 and i1 are the two vertex indices of the edge; they differ only in bit iAxis.
    const int i0 = iAxis==0 ? (2*iSwizzle+0) : ( iAxis==1 ? (iSwizzle+(iSwizzle&2)) : iSwizzle);
    const int i1 = i0 + (1<<iAxis);
    // bSwap reverses the direction of the edge.
    const bool bSwap = iAxis==0 ? (!bIsSwizzleOneOrTwo) : (iAxis==1 ? false : bIsSwizzleOneOrTwo);

    idx0 = bSwap ? i1 : i0;
    idx_twin = bSwap ? i0 : i1;
    float3 p0 = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, idx0);
    float3 p1 = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, idx_twin);

    vP0 = p0;
    vE0 = p1-p0;

// Exact culling of the coarse light list against the tile frustum with a separating-plane test.
// For every non-sphere light, each pair (hull edge, frustum edge) defines a candidate plane through
// the hull edge with normal cross(hullEdge, frustumEdge). If all hull vertices lie on one side of
// it and all frustum vertices on the other, the light cannot touch the tile and its slot in
// lightsListLDS is overwritten with UINT_MAX.
//   threadID        : index of this thread inside the group; edge pairs are strided by NR_THREADS
//   iNrCoarseLights : number of entries in lightsListLDS
//   viTilLL/UR      : tile corners in pixels
//   eyeIndex        : view index used to address the per-eye data
void CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, uint eyeIndex)
{
    const bool bOnlyNeedFrustumSideEdges = true;
    const int nrFrustEdges = bOnlyNeedFrustumSideEdges ? 4 : 8; // max 8 since we never need to test 4 far edges of frustum since they are identical vectors to near edges and plane is placed at vP0 on light hull.

    const int totNrEdgePairs = 12*nrFrustEdges;
    // Walk the coarse light list held in LDS.
    for(int l=0; l<iNrCoarseLights; l++)
    {
        const uint idxCoarse = lightsListLDS[l];
        const int bufIdxCoarse = GenerateLightCullDataIndex(idxCoarse, g_iNrVisibLights, eyeIndex);

        // Skip slots that do not hold a valid light index (for example one already set to UINT_MAX).
        // The volume type is only read for valid indices, so the range check is not overwritten.
        bool canEnter = idxCoarse<(uint) g_iNrVisibLights;
        // don't bother doing edge tests for sphere lights since these have camera aligned bboxes.
        if(canEnter) canEnter = _LightVolumeData[bufIdxCoarse].lightVolume != LIGHTVOLUMETYPE_SPHERE;

        UNITY_BRANCH if(canEnter)
        {
            SFiniteLightBound lgtDat = g_data[bufIdxCoarse];

            const float3 boxX = lgtDat.boxAxisX.xyz;
            const float3 boxY = lgtDat.boxAxisY.xyz;
            const float3 boxZ = -lgtDat.boxAxisZ.xyz;   // flip axis (so it points away from the light direction for a spot-light)
            const float3 center = lgtDat.center.xyz;
            const float2 scaleXY = lgtDat.scaleXY;

            for(int i=threadID; i<totNrEdgePairs; i+=NR_THREADS)
            {
                // e0 is the hull edge index, range [0,11]. With nrFrustEdges == 4 the division is
                // equivalent to i>>2.
                int e0 = (int) (((uint)i)/((uint) nrFrustEdges)); // should become a shift right

                // e1 is the frustum edge index, range [0,nrFrustEdges).
                int e1 = i - e0*nrFrustEdges;

                int idx_cur=0, idx_twin=0;
                float3 vP0, vE0;
                GetHullEdge(idx_cur, idx_twin, vP0, vE0, e0, boxX, boxY, boxZ, center, scaleXY);

                float3 vP1, vE1;
                GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, g_fFarPlane, eyeIndex);

                // potential separation plane
                float3 vN = cross(vE0, vE1);

                // Side of the plane for the hull: count vertices in front of and behind it.
                int positive=0, negative=0;
                for(int k=1; k<8; k++)      // only need to test 7 verts (technically just 6).
                {
                    int j = (idx_cur+k)&0x7;
                    float3 vPh = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, j);
                    // The far end of the hull edge lies on the plane by construction.
                    float fSignDist = idx_twin==j ? 0.0 : dot(vN, vPh-vP0);

                    if(fSignDist>0) ++positive;
                    else if(fSignDist<0) ++negative;
                }
                // +1 / -1 when all vertices are on one side, 0 when the plane cuts the hull.
                int resh = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));

                // The same sidedness test for the eight vertices of the tile frustum.
                positive=0; negative=0;
                for(int j=0; j<8; j++)
                {
                    float3 vPf = GetTileVertex(viTilLL, viTilUR, j, g_fFarPlane, eyeIndex);
                    float fSignDist = dot(vN, vPf-vP0);

                    if(fSignDist>0) ++positive;
                    else if(fSignDist<0) ++negative;
                }
                int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));

                // Hull and frustum lie strictly on opposite sides: the light does not touch the tile.
                bool bFoundSepPlane = (resh*resf)<0;
                if(bFoundSepPlane) lightsListLDS[l]=UINT_MAX;
            }
        }
    }
}


这一步主要是通过排序,把前面被剔除(标记为UINT_MAX)的Light的Index排到LDS的末尾,然后统计真正位于Big Tile内的灯光数量(iNrCoarseLights)。

// Kernel excerpt (final stage): recount the lights left in the list and write the tile's list to
// g_vLightList.
// NOTE(review): this excerpt is heavily elided - the sort call and the conditions guarding the
// indented statements below are not shown; confirm against the full shader.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
    // sort lights

        // Reset the shared counter before recounting.
        lightOffs = 0;

    int i;
    // Each thread visits its strided share of the list and bumps the shared counter.
    for(i=t; i<iNrCoarseLights; i+=NR_THREADS) 
            InterlockedAdd(lightOffs, 1);
    iNrCoarseLights = lightOffs;

    // Linear index of this tile, offset by one full tile grid per eye.
    int offs = tileIDX.y*nrBigTilesX + tileIDX.x + (eyeIndex * nrBigTilesX * nrBigTilesY);

    //#define MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE (512)
    // Each tile owns MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE slots in g_vLightList: slot 0 stores the
    // number of lights in the tile, slots 1..count store the light indices.
    for(i=t; i<(iNrCoarseLights+1); i+=NR_THREADS)
        g_vLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*offs + i] = i==0 ? iNrCoarseLights : lightsListLDS[max(i-1, 0)];
Bitonic Sort

3.mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
比如当k为4,排序元素的下标i为0,1,2,3,4,5,6,7时,按(i&k)的值分成两个序列:(i&k)==0的0,1,2,3;(i&k)!=0(即i&k==4)的4,5,6,7。
在第一个序列中(i&k)!=0为false,只有当Avalue>Bvalue时异或结果才为true,才会交换元素(升序);在第二个序列中(i&k)!=0为true,只有当Avalue<Bvalue时才会交换元素(降序)。Avalue==Bvalue时不交换。

// Rounds value_in up to the next power of two and clamps the result to maxValue.
// Returns 0 when value_in is 0.
//   value_in : value to round up
//   maxValue : upper bound of the result
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
#if 0
    // Reference implementation (disabled): double until value_in is reached or the bound is hit.
    unsigned int value = 1;

    while(value<value_in && (value<<1)<=maxValue)
        value<<=1;

    return value_in==0 ? 0 : value;
#else
    // Largest power of two that is <= value_in.
    uint valpw2 = value_in==0 ? 0 : (1<<firstbithigh(value_in));        // firstbithigh(0) returns -1
    // Step up once more when value_in is not already a power of two.
    valpw2 = max(valpw2, valpw2<<(valpw2!=value_in ? 1 : 0));   // max() just in case of overflow
    return min(valpw2, maxValue);
#endif
}
// Bitonic sort, ascending, of data[0..length_in), run cooperatively by nrthreads_in threads.
//   data              : array to sort in place
//   length_in         : number of valid elements
//   maxcapacity_in    : capacity of data; the padded length never exceeds it
//   localThreadID_in  : index of the calling thread
//   nrthreads_in      : number of threads taking part
// The list is first padded with UINT_MAX up to N = LimitPow2AndClamp(length, maxcapacity), so the
// padding (and any entry already set to UINT_MAX) ends up at the tail. Every (k, j) pass
// compare-exchanges data[i] with data[i^j] and ends with a group barrier, so all threads of the
// group have to execute the macro.
#define SORTLIST(data, length_in, maxcapacity_in, localThreadID_in, nrthreads_in)   \
{   \
    int length=(int) length_in, maxcapacity=(int) maxcapacity_in, localThreadID=(int) localThreadID_in, nrthreads=(int) nrthreads_in;   \
    const int N = (const int) LimitPow2AndClamp((unsigned int) length, (uint) maxcapacity); \
    for(int t=length+localThreadID; t<N; t+=nrthreads) { data[t]=UINT_MAX; }              \
    GroupMemoryBarrierWithGroupSync();                                                      \
    for(int k=2; k<=N; k=2*k)                                                               \
    {                                                                                       \
        for(int j=k>>1; j>0; j=j>>1)                                                        \
        {                                                                                   \
            for(int i=localThreadID; i<N; i+=nrthreads)                                     \
            {                                                                               \
                int ixj=i^j;                                                                \
                if((ixj)>i)                                                                 \
                {                                                                           \
                    const unsigned int Avalue = data[i];                                    \
                    const unsigned int Bvalue = data[ixj];                                  \
                    const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;     \
                    if(mustSwap)                        \
                    {                                   \
                        data[i]=Bvalue;                 \
                        data[ixj]=Avalue;               \
                    }                   \
                }                       \
            }                           \
            GroupMemoryBarrierWithGroupSync();      \
        }       \
    }       \
}
