Unity的Forward+ FPTL光照剔除解析(二)

序言

这一节主要解析BigTile的流程,BigTile主要的作用是:
在进行更细粒度的剔除之前,建立起来屏幕空间上的LightList(BigTile)。
在建立更细粒度的Tile Light List的时候,可以通过屏幕坐标映射找到对应的BigTile,只需对BigTile内的Light做剔除计算,从而减少剔除时的运算量。

BigTileLightList

BigTile是64线程为一个线程组,一个BigTile是64x64大小。自然,线程组ID(SV_GroupID)就相当于tileIDX

        //passData.numBigTilesX = (w + 63) / 64; HDUtils.DivRoundUp(hdCamera.width,64)
        //passData.numBigTilesY = (h + 63) / 64; HDUtils.DivRoundUp(hdCamera.height,64)
        
        // enable coarse 2D pass on 64x64 tiles (used for both fptl and clustered).
        static void BigTilePrepass(BuildGPULightListPassData data, CommandBuffer cmd)
        {
            // Skip the coarse pass unless both the light-list build and the big-tile prepass are enabled.
            if (!data.runLightList || !data.runBigTilePrepass)
                return;

            var shader = data.bigTilePrepassShader;
            var kernel = data.bigTilePrepassKernel;

            // Bind the output big-tile light list and the per-light input buffers read by the kernel.
            cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vLightList, data.output.bigTileLightList);
            cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
            cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
            cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

            ConstantBuffer.Push(cmd, data.lightListCB, shader, HDShaderIDs._ShaderVariablesLightList);

            // Dispatch numBigTilesX x numBigTilesY thread groups, with one Z slice per view.
            cmd.DispatchCompute(shader, kernel, data.numBigTilesX, data.numBigTilesY, data.viewCount);
        }

利用scrbound阶段计算得到的AABB数据,计算屏幕空间上每个BigTile的灯光列表。

这里没有用深度图进行(Hi-Z)遮挡剔除,剔除的粒度也更大,这是跟LightListBuild最主要的区别。

// Group-shared scratch list holding the indices of the lights collected for the current big tile.
groupshared unsigned int lightsListLDS[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE];
// Group-shared counter; the kernel atomically increments it to hand out slots in lightsListLDS.
groupshared uint lightOffs;


// Big-tile light list kernel (excerpt). Each thread group handles one 64x64 pixel tile and
// gathers into lightsListLDS every visible light whose screen-space AABB overlaps that tile.
// The remaining stages of the kernel are elided ("...") in this excerpt.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    uint eyeIndex = u3GroupID.z;
    // The group ID doubles as the big-tile coordinate.
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    // Screen dimensions (compared against pixel coordinates below).
    uint iWidth = g_viDimensions.x;
    uint iHeight = g_viDimensions.y;
    // Number of big tiles along X and Y: divide by 64, rounding up.
    uint nrBigTilesX = (iWidth+63)/64;
    uint nrBigTilesY = (iHeight+63)/64;

    // Reset the shared append counter.
    if(t==0) lightOffs = 0;

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    // Tile corners derived from tileIDX: viTilLL is the lower-left corner, viTilUR the upper-right
    // corner clamped to the screen dimensions.
    // Raw pixel coordinates of tile
    uint2 viTilLL = 64*tileIDX;
    uint2 viTilUR = min( viTilLL+uint2(64,64), uint2(iWidth, iHeight) );            // not width and height minus 1 since viTilUR represents the end of the tile corner.

    // Divide by iWidth / iHeight to bring the corners into the [0,1] range.
    // 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
    float2 vTileLL = float2(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight);
    float2 vTileUR = float2(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight);

    // Each thread walks the visible lights with a stride of NR_THREADS and reads their
    // screen-space bounds from g_vBoundsBuffer.
    // build coarse list using AABB
    for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
    {
        // The min/max indices are offset by eyeIndex and g_iNrVisibLights (the visible light count).
        // Per eye, g_vBoundsBuffer is laid out as [light0.min, light1.min, ...][light0.max, light1.max, ...].
        // Reference implementation of the index helper, as quoted by the article:

        //ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(uint lightIndex, uint numVisibleLights, uint eyeIndex)
        //{
            //const uint eyeRelativeBase = eyeIndex * 2 * numVisibleLights;

            //ScreenSpaceBoundsIndices indices;
            //indices.min = eyeRelativeBase + lightIndex;
            //indices.max = indices.min + numVisibleLights;

            //return indices;
        //}

        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
        const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
        const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;

        // Append the light to lightsListLDS when its screen-space bounds overlap the tile rectangle.
        if( all(vMa>vTileLL) && all(vMi<vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;

            // Atomic fetch-and-add: uIndex receives the value of lightOffs before the increment,
            // i.e. the equivalent of
            //   uIndex = lightOffs;
            //   InterlockedAdd(lightOffs, uInc);
            InterlockedAdd(lightOffs, uInc, uIndex);

            if(uIndex<MAX_NR_BIGTILE_LIGHTS) 
                lightsListLDS[uIndex] = l;     // add to light list
        }
    }
// Make the LDS writes above visible to the whole group before the list is read.
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_XBOXONE) || defined(SHADER_API_GAMECORE) || defined(SHADER_API_SWITCH) // not sure why XB1 and Switch need the barrier (it will not be correct without)
    GroupMemoryBarrierWithGroupSync();
#endif

    // Clamp the coarse light count to MAX_NR_BIGTILE_LIGHTS
    // (presumably MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE - 1 = 511; the definition is not shown here).
    int iNrCoarseLights = min(lightOffs,MAX_NR_BIGTILE_LIGHTS);

    ...
}

SphericalIntersectionTest

遍历刚才lightsListLDS(BigTile内的灯光Index),计算灯光的球体是否跟BigTile重叠
若没有重叠,需要将LDS对应的灯光Index设置为UINT_MAX,留待后续排序时,把UINT_MAX排到最后。(相当于List的Remove)

#define USE_LEFT_HAND_CAMERA_SPACE (1)

// Excerpt of the kernel showing only the optional spherical intersection stage;
// the surrounding code is elided ("...").
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    // The screenCoordinate argument is the tile center: the lower-left corner plus half a tile (32,32),
    // clamped to (iWidth-1, iHeight-1).
    SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(64/2,64/2), uint2(iWidth-1, iHeight-1))), eyeIndex );
#endif
...
}

// Reconstructs a view-space position from a screen position and a linear depth.
//   v2ScrPos  : screen-space position (callers in this file pass pixel coordinates).
//   fLinDepth : linear depth; returned unchanged as the z component.
//   eyeIndex  : selects the per-eye matrix in g_mScrProjectionArr.
// For a perspective projection x/y are scaled by fLinDepth; for an orthographic one they are not.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex)
{
    float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex];

    bool isOrthographic = g_isOrthographic!=0;
    // Scale terms of the screen projection.
    float fSx = g_mScrProjection[0].x;
    float fSy = g_mScrProjection[1].y;
    // Offset terms: taken from .w when orthographic, from .z when perspective.
    float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z;
    float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z;

    // Follow the file-wide handedness switch (as GetTileVertex and GetFrustEdge do) instead of
    // hard-coding the left-handed path. With USE_LEFT_HAND_CAMERA_SPACE set to 1 the result is unchanged.
#if USE_LEFT_HAND_CAMERA_SPACE
    bool useLeftHandVersion = true;
#else
    bool useLeftHandVersion = isOrthographic;
#endif

    float s = useLeftHandVersion ? 1 : (-1);
    // Undo the projection's offset and scale.
    float2 p = float2( (s*v2ScrPos.x-fCx)/fSx, (s*v2ScrPos.y-fCy)/fSy);

    return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth);
}

// Rejects coarse-list lights whose bounding sphere does not overlap the tile.
//   threadID         : index of the calling thread inside the group.
//   iNrCoarseLights  : number of valid entries in lightsListLDS.
//   screenCoordinate : screen position of the tile center.
//   eyeIndex         : eye/view index used to address per-eye data.
// Rejected entries are overwritten with UINT_MAX (a "remove" marker) rather than compacted.
void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate, uint eyeIndex)
{
    // View-space position at unit linear depth under the tile center; used as the ray direction
    // for the sphere test. The depth sign follows the file-wide handedness switch rather than being
    // hard-coded (identical result while USE_LEFT_HAND_CAMERA_SPACE is 1).
#if USE_LEFT_HAND_CAMERA_SPACE
    float3 V = GetViewPosFromLinDepth( screenCoordinate, 1.0, eyeIndex);
#else
    float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0, eyeIndex);
#endif

    float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne(eyeIndex);
    float halfTileSizeAtZDistOne = 32*onePixDiagDist;       // scale by half a tile

    // Threads stride over lightsListLDS; a thread handles more than one entry when
    // iNrCoarseLights exceeds NR_THREADS.
    for(int l=threadID; l<iNrCoarseLights; l+=NR_THREADS)
    {
        const int boundIndex = GenerateLightCullDataIndex(lightsListLDS[l], g_iNrVisibLights, eyeIndex);
        SFiniteLightBound lgtDat = g_data[boundIndex];

        if( !DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lgtDat.center.xyz, lgtDat.radius, g_isOrthographic!=0) )
            lightsListLDS[l]=UINT_MAX;      // mark as removed
    }

    // Synchronize the group so the updated lightsListLDS is visible to every thread.
#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif
}

DoesSphereOverlapTile

这里主要测试灯光的球体是否跟BigTile重叠,但是这里没用Tile的四个点做测试(左右对角),而是用了中心点,所以需要扩大球体的半径进行测试


// Conservative sphere-vs-tile test that casts a single ray through the tile center.
//   dir                    : view-space point under the tile center (ray direction for perspective).
//   halfTileSizeAtZDistOne : half the tile size measured at depth 1.
//   sphCen_in, sphRadiusIn : bounding sphere of the light in view space.
//   isOrthographic         : true when the camera projection is orthographic.
// Because only the center ray is tested (not the tile corners), the sphere radius is enlarged by
// the half tile size at the sphere's depth so that any sphere touching the tile also hits the ray.
// Fix: isOrthographic was previously ignored, so orthographic cameras ran the perspective math.
bool DoesSphereOverlapTile(float3 dir, float halfTileSizeAtZDistOne, float3 sphCen_in, float sphRadiusIn, bool isOrthographic)
{
    // Ray direction down the center of the tile (does not need to be normalized).
    // Orthographic: the ray runs along Z only, and dir.xy is subtracted from the sphere center instead.
    float3 V = float3(isOrthographic ? 0.0 : dir.x, isOrthographic ? 0.0 : dir.y, dir.z);
    float3 sphCen = float3(sphCen_in.x - (isOrthographic ? dir.x : 0.0), sphCen_in.y - (isOrthographic ? dir.y : 0.0), sphCen_in.z);

    // Z extent of the sphere along maxZdir: scalarProj is the z component of the normalized maxZdir,
    // and offs is the radius scaled by it.
    // (Using offs = sphRadiusIn instead also works but gives more false positives due to the larger radius.)
    float3 maxZdir = float3(-sphCen.z*sphCen.x, -sphCen.z*sphCen.y, sphCen.x*sphCen.x + sphCen.y*sphCen.y);     // cross(sphCen,cross(Zaxis,sphCen))
    float len = length(maxZdir);
    float scalarProj = len>0.0001 ? (maxZdir.z/len) : len;  // if len<=0.0001 then either |sphCen|<sphRadius or sphCen is very closely aligned with Z axis in which case little to no additional offs needed.
    float offs = scalarProj*sphRadiusIn;

    // enlarge sphere so it overlaps the center of the tile assuming it overlaps the tile to begin with.
    // s is the depth used to scale the half tile size; its sign convention follows the handedness switch.
#if USE_LEFT_HAND_CAMERA_SPACE
    float s = sphCen.z+offs;
#else
    float s = -(sphCen.z-offs);
#endif

    // Perspective: the tile footprint grows linearly with depth, so scale the half tile size by s.
    // Orthographic: the footprint does not depend on depth, so the factor is 1.
    float sphRadius = sphRadiusIn + (isOrthographic ? 1.0 : s)*halfTileSizeAtZDistOne;

    // Standard ray/sphere intersection with the ray origin at (0,0,0): the sign of the
    // (quarter) discriminant tells whether the ray hits the enlarged sphere.
    float a = dot(V,V);
    float CdotV = dot(sphCen,V);
    float c = dot(sphCen,sphCen) - sphRadius*sphRadius;

    float fDescDivFour = CdotV*CdotV - a*c;

    // c<0: the origin is inside the sphere. Otherwise require a real intersection in front of the origin.
    return c<0 || (fDescDivFour>0 && CdotV>0);      // if ray hits bounding sphere
}

CullByExactEdgeTest

这一步判断Tile构成的棱台边(FrustEdges)是否与灯光体积(Hull)相交,若不相交(FoundSepPlane)则将该灯光移除出LDS
需要注意的是,这一步只对LightVolumeType不是Sphere的灯光进行判交计算。
(这里要用到的函数GetHullEdge/GetHullVertex/GetFrustEdge的计算比较抽象,建议在C#端模拟一遍以帮助理解)



// Excerpt of the kernel showing only the optional exact edge-test stage;
// the surrounding code is elided ("...").
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef EXACT_EDGE_TESTS
    // Tests the tile's pixel-space corners (viTilLL / viTilUR) against each remaining coarse light.
    CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, eyeIndex);
#endif
...
}

// Reconstructs a view-space position from a screen position and a linear depth.
//   v2ScrPos  : screen-space position (callers in this file pass pixel coordinates).
//   fLinDepth : linear depth; returned unchanged as the z component.
//   eyeIndex  : selects the per-eye matrix in g_mScrProjectionArr.
// For a perspective projection x/y are scaled by fLinDepth; for an orthographic one they are not.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex)
{
    float4x4 g_mScrProjection = g_mScrProjectionArr[eyeIndex];

    bool isOrthographic = g_isOrthographic!=0;
    // Scale terms of the screen projection.
    float fSx = g_mScrProjection[0].x;
    float fSy = g_mScrProjection[1].y;
    // Offset terms: taken from .w when orthographic, from .z when perspective.
    float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z;
    float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z;

    // Follow the file-wide handedness switch (as GetTileVertex and GetFrustEdge do) instead of
    // hard-coding the left-handed path. With USE_LEFT_HAND_CAMERA_SPACE set to 1 the result is unchanged.
#if USE_LEFT_HAND_CAMERA_SPACE
    bool useLeftHandVersion = true;
#else
    bool useLeftHandVersion = isOrthographic;
#endif

    float s = useLeftHandVersion ? 1 : (-1);
    // Undo the projection's offset and scale.
    float2 p = float2( (s*v2ScrPos.x-fCx)/fSx, (s*v2ScrPos.y-fCy)/fSy);

    return float3(isOrthographic ? p.xy : (fLinDepth*p.xy), fLinDepth);
}

// Returns the view-space position of corner i of the tile frustum.
//   bit 0 of i : X taken from viTilLL (clear) or viTilUR (set)
//   bit 1 of i : Y taken from viTilLL (clear) or viTilUR (set)
//   bit 2 of i : depth is g_fNearPlane (clear) or fTileFarPlane (set)
float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane, uint eyeIndex)
{
    float2 cornerPos;
    cornerPos.x = ((i & 1) != 0) ? viTilUR.x : viTilLL.x;
    cornerPos.y = ((i & 2) != 0) ? viTilUR.y : viTilLL.y;

    float depth = fTileFarPlane;
    if ((i & 4) == 0)
        depth = g_fNearPlane;

    // Depth is negated when the left-handed camera space is not in use.
#if !USE_LEFT_HAND_CAMERA_SPACE
    depth = -depth;
#endif

    return GetViewPosFromLinDepth(cornerPos, depth, eyeIndex);
}
// Returns one edge of the tile frustum as an origin (vP0) plus a direction (vE0).
//   e0 >> 2 selects the section: 0 = side edges, 1 = near edges, 2 = far edges.
//   e0 & 3  selects the edge within that section.
void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane, uint eyeIndex)
{
    const int section = e0 >> 2;
    const int swz = e0 & 0x3;

    // Far-section edges start at the far corners, i.e. vertex index + 4.
    const int vertexIdx = swz + (2 * (section & 0x2));

    // Note: the corners handed to GetTileVertex combine LL.x with UR.y and UR.x with LL.y.
    const uint2 cornerA = uint2(viTilLL.x, viTilUR.y);
    const uint2 cornerB = uint2(viTilUR.x, viTilLL.y);
    const float3 origin = GetTileVertex(cornerA, cornerB, vertexIdx, fTileFarPlane, eyeIndex);
    vP0 = origin;

    if (section == 0)
    {
        // Side edge: for perspective the direction is the corner position itself;
        // for orthographic it is the Z axis (sign depends on handedness).
#if USE_LEFT_HAND_CAMERA_SPACE
        const float3 orthoDir = float3(0.0,0.0,1.0);
#else
        const float3 orthoDir = float3(0.0,0.0,-1.0);
#endif
        vE0 = g_isOrthographic==0 ? origin : orthoDir;
    }
    else
    {
        // Near/far edge: a signed unit X or Y axis chosen from the swizzle bits.
        const float edgeSign = ((swz & 0x2) == 0) ? 1.0f : (-1.0f);
        const bool alongX = ((int)(swz & 0x1) == (swz >> 1));
        vE0 = edgeSign * (alongX ? float3(1, 0, 0) : float3(0, 1, 0));
    }
}

// Returns vertex p (0..7) of the light's bounding hull.
//   bit 0 / bit 1 / bit 2 of p choose the + or - side along boxX / boxY / boxZ.
//   Vertices with bit 2 set ("top" vertices) additionally have their X/Y extents multiplied by scaleXY.
// NOTE(review): the original annotation mentions scaleXY=0.01 — presumably an example value; confirm.
float3 GetHullVertex(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int p)
{
    const bool isTop = (p & 4) != 0;

    const float signX = ((p & 1) != 0) ? 1.0f : (-1.0f);
    const float signY = ((p & 2) != 0) ? 1.0f : (-1.0f);
    const float signZ = isTop ? 1.0f : (-1.0f);

    const float extentX = signX * (isTop ? scaleXY.x : 1.0);
    const float extentY = signY * (isTop ? scaleXY.y : 1.0);

    const float3 offset = (extentX * boxX + extentY * boxY + signZ * boxZ);
    return offset + center;
}

// Returns edge e0 (0..11) of the light's bounding hull.
//   idx0, idx_twin : hull vertex indices of the edge's start and end point.
//   vP0, vE0       : start position and edge vector (end - start).
// The index arithmetic is easiest to verify by evaluating it for e0 = 0..11 in a small script.
void GetHullEdge(out int idx0, out int idx_twin, out float3 vP0, out float3 vE0, const int e0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY)
{
    const int axis = e0 >> 2;                                   // 0,0,0,0,1,1,1,1,2,2,2,2 for e0 = 0..11
    const int swz = e0 & 0x3;                                   // 0,1,2,3 repeating
    const bool swzIsOneOrTwo = ((swz - 1) & 0x2) == 0;          // false,true,true,false repeating

    // Pick the lower vertex index of the edge and whether the two endpoints are swapped.
    int lowIdx;
    bool swapEnds;
    if (axis == 0)
    {
        lowIdx = 2 * swz + 0;
        swapEnds = !swzIsOneOrTwo;
    }
    else if (axis == 1)
    {
        lowIdx = swz + (swz & 2);
        swapEnds = false;
    }
    else
    {
        lowIdx = swz;
        swapEnds = swzIsOneOrTwo;
    }

    // The other endpoint differs only in the bit that belongs to this axis.
    const int highIdx = lowIdx + (1 << axis);

    int startIdx = lowIdx;
    int endIdx = highIdx;
    if (swapEnds)
    {
        startIdx = highIdx;
        endIdx = lowIdx;
    }
    idx0 = startIdx;
    idx_twin = endIdx;

    const float3 startPos = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, startIdx);
    const float3 endPos = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, endIdx);

    vP0 = startPos;
    vE0 = endPos - startPos;
}

// Separating-plane test between each remaining coarse light's hull and the tile frustum.
//   threadID         : index of the calling thread inside the group.
//   iNrCoarseLights  : number of entries of lightsListLDS to visit.
//   viTilLL, viTilUR : tile corners in pixel coordinates.
//   eyeIndex         : eye/view index used to address per-eye data.
// For every (hull edge, frustum edge) pair the cross product of the two edges is used as a candidate
// separating-plane normal. When a separating plane is found the light's LDS entry is set to UINT_MAX.
void CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, uint eyeIndex)
{
    // Only the side edges of the tile frustum are tested against the light hull.
    const bool bOnlyNeedFrustumSideEdges = true;
    const int nrFrustEdges = bOnlyNeedFrustumSideEdges ? 4 : 8; // max 8 since we never need to test 4 far edges of frustum since they are identical vectors to near edges and plane is placed at vP0 on light hull.

    // A light hull has 12 edges.
    const int totNrEdgePairs = 12*nrFrustEdges;
    // Walk the coarse light list held in LDS.
    for(int l=0; l<iNrCoarseLights; l++)
    {
        const uint idxCoarse = lightsListLDS[l];
        const int bufIdxCoarse = GenerateLightCullDataIndex(idxCoarse, g_iNrVisibLights, eyeIndex);

        // Skips entries that are not valid light indices (e.g. ones already set to UINT_MAX).
        bool canEnter = idxCoarse<(uint) g_iNrVisibLights;

        // Sphere light volumes are skipped here; the spherical intersection test already covers them.
        // The edge test is meant for the other volume types (e.g. LIGHTVOLUMETYPE_CONE, LIGHTVOLUMETYPE_BOX).
        if(canEnter) 
        canEnter = _LightVolumeData[bufIdxCoarse].lightVolume != LIGHTVOLUMETYPE_SPHERE;      
         // don't bother doing edge tests for sphere lights since these have camera aligned bboxes.
        UNITY_BRANCH if(canEnter)
        {
            SFiniteLightBound lgtDat = g_data[bufIdxCoarse];

            const float3 boxX = lgtDat.boxAxisX.xyz;
            const float3 boxY = lgtDat.boxAxisY.xyz;
            const float3 boxZ = -lgtDat.boxAxisZ.xyz;   // flip axis (so it points away from the light direction for a spot-light)
            const float3 center = lgtDat.center.xyz;
            const float2 scaleXY = lgtDat.scaleXY;

            // The edge pairs are distributed over the threads of the group with a stride of NR_THREADS.
            for(int i=threadID; i<totNrEdgePairs; i+=NR_THREADS)
            {
                // e0 is the hull edge index, range [0,11]. With nrFrustEdges == 4 the division is
                // equivalent to i>>2.
                int e0 = (int) (((uint)i)/((uint) nrFrustEdges)); // should become a shift right

                // e1 is the tile-frustum side edge handled by this iteration, range [0,3].
                int e1 = i - e0*nrFrustEdges;

                int idx_cur=0, idx_twin=0;
                float3 vP0, vE0;
                GetHullEdge(idx_cur, idx_twin, vP0, vE0, e0, boxX, boxY, boxZ, center, scaleXY);


                float3 vP1, vE1;
                GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, g_fFarPlane, eyeIndex);

                // vN = cross(vE0, vE1) is the normal of the candidate plane, which passes through vP0.
                // The other hull vertices are classified by their signed distance to that plane:
                //   vertices on both sides            -> resh =  0 (the hull straddles the plane)
                //   only positive distances           -> resh =  1 (the hull lies entirely on one side)
                //   only negative distances           -> resh = -1 (the hull lies entirely on the other side)
                // The eight tile-frustum vertices are classified the same way into resf.
                // If the hull and the frustum end up strictly on opposite sides (resh*resf < 0),
                // the plane separates them and the light cannot touch the tile.
                // potential separation plane
                float3 vN = cross(vE0, vE1);

                int positive=0, negative=0;
                for(int k=1; k<8; k++)      // only need to test 7 verts (technically just 6).
                {
                    int j = (idx_cur+k)&0x7;
                    float3 vPh = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, j);
                    float fSignDist = idx_twin==j ? 0.0 : dot(vN, vPh-vP0);

                    if(fSignDist>0) 
                        ++positive; 
                    else if(fSignDist<0) 
                        ++negative;
                }
                int resh = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));

                positive=0; negative=0;
                for(int j=0; j<8; j++)
                {
                    float3 vPf = GetTileVertex(viTilLL, viTilUR, j, g_fFarPlane, eyeIndex);
                    float fSignDist = dot(vN, vPf-vP0);
                    if(fSignDist>0) 
                        ++positive; 
                    else if(fSignDist<0) 
                        ++negative;
                }
                int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));

                // Separating plane found: mark the light as removed.
                bool bFoundSepPlane = (resh*resf)<0;
                if(bFoundSepPlane) lightsListLDS[l]=UINT_MAX;
            }
        }
    }
    // Synchronize the group so the updated lightsListLDS is visible to every thread.
#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif
}

对LDS进行排序,并且计算有效灯光数量的lightOffset,将Tile内的灯光放到g_vLightList中

这一步主要是把前面移除出去的Light(UINT_MAX)通过排序,把Index排到LDS的末尾。然后统计真正在Big Tile内灯光数量(iNrCoarseLights)
然后再把LDS的数据存放到g_vLightList(BigTileLightList)中,留到下一步BuildPerTileLightList使用。

// Excerpt of the kernel showing the final stage: sort the LDS list, count the surviving lights
// and write the tile's list to g_vLightList. Earlier code is elided ("...").
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
    // sort lights
    SORTLIST(lightsListLDS, iNrCoarseLights, MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE, t, NR_THREADS);

    // After sorting, the UINT_MAX markers (removed lights) sit at the end of the list,
    // so the lights that really remain in the tile can now be counted.
    if(t==0) 
        lightOffs = 0;
    GroupMemoryBarrierWithGroupSync();

    // Count the entries that are valid light indices (i.e. below g_iNrVisibLights).
    int i;
    for(i=t; i<iNrCoarseLights; i+=NR_THREADS) 
        if(lightsListLDS[i]<(uint)g_iNrVisibLights) 
            InterlockedAdd(lightOffs, 1);
    GroupMemoryBarrierWithGroupSync();
    iNrCoarseLights = lightOffs;

    // Linear index of this big tile: row-major within an eye, with eyes stored one after another.
    int offs = tileIDX.y*nrBigTilesX + tileIDX.x + (eyeIndex * nrBigTilesX * nrBigTilesY);

    // Each big tile owns MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE consecutive entries of g_vLightList
    // (512 according to the article: #define MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE (512)),
    // so MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*offs is the start of this tile's list.
    // Entry 0 stores the number of lights in the tile; the light indices follow.
    for(i=t; i<(iNrCoarseLights+1); i+=NR_THREADS)
        g_vLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*offs + i] = i==0 ? iNrCoarseLights : lightsListLDS[max(i-1, 0)];
}
Bitonic Sort

双调排序是顺序无关的算法,特别适合用于并行运算。
具体的算法原理:http://t.csdnimg.cn/dXxs0
这个SORTLIST只能对长度为2的幂的序列进行排序,lightsListLDS的长度为512(MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE)。
1.首先先对空余的空间赋值为UINT_MAX,然后GroupSync线程同步。
2.(ixj)>i用于保证单调序列的配对。
3.mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
(i&k)!=0用于分割待排序元素成单调序列,
比如当k为4,排序元素的下标为0,1,2,3,4,5,6,7时,会分割成((i&k)==0)0,1,2,3;((i&k)!=0)4,5,6,7;
在第一个序列,当A>B时,才能够互换元素(i&k=0),即(i&k)!=0^(Avalue>Bvalue) == true


// Rounds value_in up to a power of two and clamps the result to maxValue.
// Returns 0 when value_in is 0.
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
    // Largest power of two that is <= value_in. Zero is special-cased because firstbithigh(0) returns -1.
    uint floorPow2 = 0;
    if (value_in != 0)
        floorPow2 = (1 << firstbithigh(value_in));

    // Step up to the next power of two unless value_in already is one.
    const int shiftAmount = (floorPow2 != value_in) ? 1 : 0;
    const uint ceilPow2 = max(floorPow2, floorPow2 << shiftAmount);    // max() just in case of overflow

    return min(ceilPow2, maxValue);
}
// SORTLIST: bitonic sort of `data`, executed cooperatively by `nrthreads_in` threads.
//   data             : array of unsigned ints to sort in place (used here with group-shared memory).
//   length_in        : number of valid entries.
//   maxcapacity_in   : capacity of `data`; the padded length N never exceeds it.
//   localThreadID_in : index of the calling thread.
//   nrthreads_in     : number of cooperating threads.
// N = LimitPow2AndClamp(length, maxcapacity). Entries [length, N) are first filled with UINT_MAX so
// they end up at the back. For each (k, j) stage a thread visits indices i, i+nrthreads, ...; the
// `(i^j) > i` check makes each pair (i, i^j) be handled once, by its lower index. `(i&k)!=0` flips the
// comparison direction per block of size k; in the last pass (k == N) it is 0 for every i, so the
// final order is ascending. A group barrier follows the padding step and every (k, j) stage.
// No comments are placed inside the macro body because `//` on a backslash-continued line would
// swallow the lines that follow.
#define SORTLIST(data, length_in, maxcapacity_in, localThreadID_in, nrthreads_in)   \
{   \
    int length=(int) length_in, maxcapacity=(int) maxcapacity_in, localThreadID=(int) localThreadID_in, nrthreads=(int) nrthreads_in;   \
                                                                                            \
    const int N = (const int) LimitPow2AndClamp((unsigned int) length, (uint) maxcapacity); \
    for(int t=length+localThreadID; t<N; t+=nrthreads) { data[t]=UINT_MAX; }              \
    GroupMemoryBarrierWithGroupSync();                                                      \
                                                                                            \
    for(int k=2; k<=N; k=2*k)                                                               \
    {                                                                                       \
        for(int j=k>>1; j>0; j=j>>1)                                                        \
        {                                                                                   \
            for(int i=localThreadID; i<N; i+=nrthreads)                                     \
            {                                                                               \
                int ixj=i^j;                                                                \
                if((ixj)>i)                                                                 \
                {                                                                           \
                    const unsigned int Avalue = data[i];                                    \
                    const unsigned int Bvalue = data[ixj];                                  \
                                                                                            \
                    const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;     \
                    if(mustSwap)                        \
                    {                                   \
                        data[i]=Bvalue;                 \
                        data[ixj]=Avalue;               \
                    }                   \
                }                       \
            }                           \
                                        \
            GroupMemoryBarrierWithGroupSync();      \
        }       \
    }       \
}
posted @ 2024-04-04 15:12  凶恶的真实  阅读(134)  评论(0编辑  收藏  举报