Unity的Forward+ FPTL光照剔除解析(二)
序言
这一节主要解析BigTile的流程,BigTile主要的作用是:
在进行更细粒度的剔除之前,建立起来屏幕空间上的LightList(BigTile)。
在建立更细粒度的Tile Light List的时候,可以通过屏幕坐标映射找到对应的BigTile,只需对BigTile内的Light做剔除计算,从而减少剔除时的运算量。
BigTileLightList
每个BigTile由一个64线程的线程组负责,覆盖64x64像素的屏幕区域。因此,线程组ID(SV_GroupID)的xy分量就相当于tileIDX。
// Coarse 2D culling pass over 64x64 tiles (its result is used for both fptl and clustered).
// The dispatch size is the camera size divided by 64, rounded up:
//   passData.numBigTilesX = (w + 63) / 64;   i.e. HDUtils.DivRoundUp(hdCamera.width, 64)
//   passData.numBigTilesY = (h + 63) / 64;   i.e. HDUtils.DivRoundUp(hdCamera.height, 64)
static void BigTilePrepass(BuildGPULightListPassData data, CommandBuffer cmd)
{
    // Skip the pass unless both the light list build and the big-tile prepass are enabled.
    if (!data.runLightList || !data.runBigTilePrepass)
        return;

    var shader = data.bigTilePrepassShader;
    var kernel = data.bigTilePrepassKernel;

    // Output: the per-big-tile light list.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vLightList, data.output.bigTileLightList);

    // Inputs: screen-space AABBs, light volume data and convex bounds of the lights.
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
    cmd.SetComputeBufferParam(shader, kernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

    ConstantBuffer.Push(cmd, data.lightListCB, shader, HDShaderIDs._ShaderVariablesLightList);

    // One thread group per big tile, one Z slice per view.
    cmd.DispatchCompute(shader, kernel, data.numBigTilesX, data.numBigTilesY, data.viewCount);
}
根据scrbound阶段计算得到的AABB数据,生成屏幕空间的BigTile灯光列表。
与LightListBuild最主要的区别是:这里没有用深度图进行(Hi-z)遮挡剔除,剔除的粒度也更大。
// Group-shared scratch list holding the indices of the lights kept for the current big tile.
groupshared unsigned int lightsListLDS[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE];
// Group-shared counter; the kernel uses it as the append cursor into lightsListLDS.
groupshared uint lightOffs;
// Coarse stage of the big-tile kernel (excerpt; the remainder is elided by "...").
// Each thread group builds the light list of one big tile by testing the screen-space
// AABB of every visible light against the tile rectangle.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
uint eyeIndex = u3GroupID.z;
// The xy part of the group ID is used directly as the big-tile index.
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
// Screen size in pixels.
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
// Round-up division: total number of big tiles along X and Y.
uint nrBigTilesX = (iWidth+63)/64;
uint nrBigTilesY = (iHeight+63)/64;
// Reset the shared append counter (thread 0 only).
if(t==0) lightOffs = 0;
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
// From tileIDX derive the tile's lower-left (viTilLL) and upper-right (viTilUR) pixel corners, both within [0, g_viDimensions].
// Raw pixel coordinates of tile
uint2 viTilLL = 64*tileIDX;
uint2 viTilUR = min( viTilLL+uint2(64,64), uint2(iWidth, iHeight) ); // not width and height minus 1 since viTilUR represents the end of the tile corner.
// Divide by iWidth/iHeight to bring the corners into [0,1].
// 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
float2 vTileLL = float2(viTilLL.x/(float) iWidth, viTilLL.y/(float) iHeight);
float2 vTileUR = float2(viTilUR.x/(float) iWidth, viTilUR.y/(float) iHeight);
// Walk the visible-light bounds stored in g_vBoundsBuffer; each thread starts at its own index and strides by NR_THREADS.
// build coarse list using AABB
for(int l=(int) t; l<(int) g_iNrVisibLights; l += NR_THREADS)
{
// The min/max slots are located from eyeIndex and g_iNrVisibLights (the visible light count).
// Per eye, g_vBoundsBuffer is laid out as [light0.min, light1.min, ...][light0.max, light1.max, ...]:
//ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(uint lightIndex, uint numVisibleLights, uint eyeIndex)
//{
//const uint eyeRelativeBase = eyeIndex * 2 * numVisibleLights;
//ScreenSpaceBoundsIndices indices;
//indices.min = eyeRelativeBase + lightIndex;
//indices.max = indices.min + numVisibleLights;
//return indices;
//}
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;
// Append the light to lightsListLDS when its screen-space bounds overlap the tile rectangle.
if( all(vMa>vTileLL) && all(vMi<vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
// Conceptually the same as the two steps below, performed as a single atomic operation:
//uIndex=lightOffs;
//InterlockedAdd(lightOffs, uInc);
InterlockedAdd(lightOffs, uInc, uIndex);
if(uIndex<MAX_NR_BIGTILE_LIGHTS)
lightsListLDS[uIndex] = l; // add to light list
}
}
// Synchronize the group so every thread sees the finished list and counter.
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_XBOXONE) || defined(SHADER_API_GAMECORE) || defined(SHADER_API_SWITCH) // not sure why XB1 and Switch need the barrier (it will not be correct without)
GroupMemoryBarrierWithGroupSync();
#endif
// Clamp the light count of the big tile to MAX_NR_BIGTILE_LIGHTS (512-1 according to the article).
int iNrCoarseLights = min(lightOffs,MAX_NR_BIGTILE_LIGHTS);
...
}
SphericalIntersectionTest
遍历刚才lightsListLDS(BigTile内的灯光Index),计算灯光的球体是否跟BigTile重叠
若没有重叠,需要将LDS对应的灯光Index设置为UINT_MAX,留待后续排序时,把UINT_MAX排到最后。(相当于List的Remove)
#define USE_LEFT_HAND_CAMERA_SPACE (1)
// Excerpt of BigTileLightListGen (surrounding code elided by "..."): the optional
// sphere-versus-tile refinement of the coarse light list.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
// The screenCoordinate argument is the tile centre: the lower-left corner plus half a tile (64/2 pixels), clamped to (iWidth-1, iHeight-1).
SphericalIntersectionTests( t, iNrCoarseLights, float2(min(viTilLL.xy+uint2(64/2,64/2), uint2(iWidth-1, iHeight-1))), eyeIndex );
#endif
...
}
// Converts a screen-space position plus a linear depth into a view-space position,
// using the screen projection matrix of the given eye.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex)
{
    const float4x4 scrProj = g_mScrProjectionArr[eyeIndex];
    const bool ortho = (g_isOrthographic != 0);

    // Scale terms come from the diagonal of rows 0 and 1.
    const float2 scale = float2(scrProj[0].x, scrProj[1].y);

    // Offset terms: the .w component of rows 0/1 for orthographic, the .z component otherwise.
    float2 offset;
    if (ortho)
        offset = float2(scrProj[0].w, scrProj[1].w);
    else
        offset = float2(scrProj[0].z, scrProj[1].z);

    // The left-handed camera space variant is always used here, so the sign is +1.
    // (The alternative would use -1 unless the projection is orthographic.)
    const float handedness = 1;
    const float2 viewXY = (handedness * v2ScrPos - offset) / scale;

    // Orthographic xy does not depend on depth; otherwise xy scales with the linear depth.
    if (ortho)
        return float3(viewXY, fLinDepth);
    return float3(fLinDepth * viewXY, fLinDepth);
}
// Removes from lightsListLDS every coarse light whose bounding sphere does not overlap the tile.
// A removed entry is overwritten with UINT_MAX so that a later sort pushes it to the end of the list.
void SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate, uint eyeIndex)
{
    // View-space position under screenCoordinate at linear depth 1; it is passed on as the
    // direction of the ray through the tile. Left-handed camera space uses +1.0 here
    // (the right-handed variant would pass -1.0).
    const float3 tileRay = GetViewPosFromLinDepth(screenCoordinate, 1.0, eyeIndex);

    // Half a tile (32 pixels) expressed at depth one.
    const float pixelDiagAtDepthOne = GetOnePixDiagWorldDistAtDepthOne(eyeIndex);
    const float halfTileAtDepthOne = 32*pixelDiagAtDepthOne; // scale by half a tile

    // Each thread handles entries threadID, threadID + NR_THREADS, ... of the coarse list.
    for (int slot = threadID; slot < iNrCoarseLights; slot += NR_THREADS)
    {
        const int cullDataIndex = GenerateLightCullDataIndex(lightsListLDS[slot], g_iNrVisibLights, eyeIndex);
        const SFiniteLightBound bound = g_data[cullDataIndex];

        const bool overlapsTile = DoesSphereOverlapTile(tileRay, halfTileAtDepthOne, bound.center.xyz, bound.radius, g_isOrthographic!=0);
        if (!overlapsTile)
        {
            lightsListLDS[slot] = UINT_MAX; // acts as "remove from list"
        }
    }

    // Make the updated lightsListLDS visible to the whole group.
#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif
}
DoesSphereOverlapTile
这里主要测试灯光的球体是否跟BigTile重叠,但是这里没用Tile的四个点做测试(左右对角),而是用了中心点,所以需要扩大球体的半径进行测试。
// Tests whether a light's bounding sphere overlaps a tile, using only the ray through the
// tile centre: the sphere radius is enlarged by half a tile so that a sphere touching the
// tile also touches the centre ray.
//
// dir                     view-space position of the tile centre at depth one (ray through the tile).
// halfTileSizeAtZDistOne  half the tile size measured at depth one.
// sphCen_in, sphRadiusIn  centre and radius of the light's bounding sphere.
// isOrthographic          true for an orthographic projection.
//
// Returns true when the enlarged sphere contains the ray origin or is hit by the ray.
bool DoesSphereOverlapTile(float3 dir, float halfTileSizeAtZDistOne, float3 sphCen_in, float sphRadiusIn, bool isOrthographic)
{
    // Ray direction down the centre of the tile (does not need to be normalized).
    // Perspective: the ray starts at the origin and points along dir.
    // Orthographic: the ray is parallel to the Z axis and passes through (dir.x, dir.y), so the
    // sphere centre is shifted by that xy offset and the ray is treated as starting at the origin.
    float3 V = float3(isOrthographic ? 0.0 : dir.x, isOrthographic ? 0.0 : dir.y, dir.z);
    float3 sphCen = float3(sphCen_in.x - (isOrthographic ? dir.x : 0.0), sphCen_in.y - (isOrthographic ? dir.y : 0.0), sphCen_in.z);

    // maxZdir = cross(sphCen, cross(Zaxis, sphCen)); its normalized z component, scaled by the
    // radius, is the extra depth offset applied to the sphere centre.
    // (Using offs = sphRadiusIn instead also works but gives more false positives.)
    float3 maxZdir = float3(-sphCen.z*sphCen.x, -sphCen.z*sphCen.y, sphCen.x*sphCen.x + sphCen.y*sphCen.y);
    float len = length(maxZdir);
    float scalarProj = len>0.0001 ? (maxZdir.z/len) : len; // if len<=0.0001 then either |sphCen|<sphRadius or sphCen is very closely aligned with Z axis in which case little to no additional offs needed.
    float offs = scalarProj*sphRadiusIn;

    // Enlarge sphere so it overlaps the center of the tile assuming it overlaps the tile to begin with.
    // Left-handed camera space form; the right-handed form would be -(sphCen.z-offs).
    float s = sphCen.z+offs;

    // Perspective: half a tile grows linearly with depth, so scale it by s.
    // Orthographic: the tile size is independent of depth, so the factor is 1.
    float sphRadius = sphRadiusIn + (isOrthographic ? 1.0 : s)*halfTileSizeAtZDistOne;

    // Standard ray/sphere intersection: check the sign of the discriminant.
    float a = dot(V,V);
    float CdotV = dot(sphCen,V);
    float c = dot(sphCen,sphCen) - sphRadius*sphRadius;
    float fDescDivFour = CdotV*CdotV - a*c;

    return c<0 || (fDescDivFour>0 && CdotV>0); // if ray hits bounding sphere
}
CullByExactEdgeTest
这一步判断Tile构成的棱台边(FrustEdges)是否与灯光体积(Hull)相交,若不相交(FoundSepPlane)则将该灯光移除出LDS。
需要注意的是,这一步只对LightVolumeType不是Sphere的灯光进行判交计算。
(这里用到的函数GetHullEdge/GetHullVertex/GetFrustEdge的计算比较抽象,建议在C#端模拟一遍来验证。)
// Excerpt of BigTileLightListGen (surrounding code elided by "..."): the optional exact
// edge test between the tile frustum and each light volume.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef EXACT_EDGE_TESTS
CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, eyeIndex);
#endif
...
}
// Converts a screen-space position plus a linear depth into a view-space position,
// using the screen projection matrix of the given eye.
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth, uint eyeIndex)
{
    const float4x4 scrProj = g_mScrProjectionArr[eyeIndex];
    const bool ortho = (g_isOrthographic != 0);

    // Scale terms come from the diagonal of rows 0 and 1.
    const float2 scale = float2(scrProj[0].x, scrProj[1].y);

    // Offset terms: the .w component of rows 0/1 for orthographic, the .z component otherwise.
    float2 offset;
    if (ortho)
        offset = float2(scrProj[0].w, scrProj[1].w);
    else
        offset = float2(scrProj[0].z, scrProj[1].z);

    // The left-handed camera space variant is always used here, so the sign is +1.
    // (The alternative would use -1 unless the projection is orthographic.)
    const float handedness = 1;
    const float2 viewXY = (handedness * v2ScrPos - offset) / scale;

    // Orthographic xy does not depend on depth; otherwise xy scales with the linear depth.
    if (ortho)
        return float3(viewXY, fLinDepth);
    return float3(fLinDepth * viewXY, fLinDepth);
}
// Returns one of the eight view-space vertices of the tile frustum.
// Bit 0 of i picks the x corner, bit 1 the y corner and bit 2 the near/far plane.
float3 GetTileVertex(uint2 viTilLL, uint2 viTilUR, int i, float fTileFarPlane, uint eyeIndex)
{
    const bool takeUpperX = (i & 1) != 0;
    const bool takeUpperY = (i & 2) != 0;
    const bool takeFar    = (i & 4) != 0;

    float2 pixel;
    pixel.x = takeUpperX ? viTilUR.x : viTilLL.x;
    pixel.y = takeUpperY ? viTilUR.y : viTilLL.y;

    float depth = takeFar ? fTileFarPlane : g_fNearPlane;
#if !USE_LEFT_HAND_CAMERA_SPACE
    // Without left-handed camera space the depth is negated.
    depth = -depth;
#endif

    return GetViewPosFromLinDepth(pixel, depth, eyeIndex);
}
// Returns one edge of the tile frustum as a start point (vP0) and a direction (vE0).
// e0 >> 2 is the section (0 = side edges, 1 = near edges, 2 = far edges);
// e0 & 3 selects the edge within that section.
void GetFrustEdge(out float3 vP0, out float3 vE0, const int e0, uint2 viTilLL, uint2 viTilUR, float fTileFarPlane, uint eyeIndex)
{
    const int section = e0 >> 2;
    const int corner  = e0 & 0x3;

    // Section 2 starts from the far-plane vertices, i.e. the vertex index is offset by 4.
    const int vertexIndex = corner + (2 * (section & 0x2));

    // The two corners are handed to GetTileVertex with their y components exchanged.
    const uint2 cornerA = uint2(viTilLL.x, viTilUR.y);
    const uint2 cornerB = uint2(viTilUR.x, viTilLL.y);
    vP0 = GetTileVertex(cornerA, cornerB, vertexIndex, fTileFarPlane, eyeIndex);

    if (section == 0)
    {
        // Side edge: the vertex position itself in perspective, the Z axis in orthographic.
#if USE_LEFT_HAND_CAMERA_SPACE
        const float3 orthoAxis = float3(0.0,0.0,1.0);
#else
        const float3 orthoAxis = float3(0.0,0.0,-1.0);
#endif
        vE0 = (g_isOrthographic == 0) ? vP0 : orthoAxis;
    }
    else
    {
        // Near/far edge: a signed unit X or Y axis chosen from the two bits of the corner index.
        const float edgeSign = ((corner & 0x2) == 0) ? 1.0f : (-1.0f);
        const bool alongX = ((int)(corner & 0x1) == (corner >> 1));
        const float3 axis = alongX ? float3(1, 0, 0) : float3(0, 1, 0);
        vE0 = edgeSign * axis;
    }
}
// Returns vertex p (0..7) of the light volume hull.
// Bits 0, 1 and 2 of p choose the sign along boxX, boxY and boxZ; vertices with bit 2 set
// ("top" vertices) additionally have their x/y extents multiplied by scaleXY.
float3 GetHullVertex(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int p)
{
    const bool onTop = (p & 4) != 0;

    const float signX = ((p & 1) != 0) ? 1.0f : (-1.0f);
    const float signY = ((p & 2) != 0) ? 1.0f : (-1.0f);
    const float signZ = ((p & 4) != 0) ? 1.0f : (-1.0f);

    const float taperX = onTop ? scaleXY.x : 1.0;
    const float taperY = onTop ? scaleXY.y : 1.0;

    const float3 weights = float3(signX * taperX, signY * taperY, signZ);
    return (weights.x*boxX + weights.y*boxY + weights.z*boxZ) + center;
}
// Returns edge e0 (0..11) of the light volume hull.
// Outputs the indices of its two end vertices (idx0, idx_twin), the start point vP0 and
// the edge vector vE0 = vertex(idx_twin) - vertex(idx0).
void GetHullEdge(out int idx0, out int idx_twin, out float3 vP0, out float3 vE0, const int e0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY)
{
    const int axis    = e0 >> 2;    // 0,0,0,0, 1,1,1,1, 2,2,2,2
    const int swizzle = e0 & 0x3;   // 0,1,2,3, 0,1,2,3, 0,1,2,3
    const bool swizzleIsOneOrTwo = ((swizzle - 1) & 0x2) == 0;

    // First end vertex and whether the two ends are exchanged, per axis.
    int first;
    bool exchangeEnds;
    if (axis == 0)
    {
        first = 2*swizzle + 0;
        exchangeEnds = !swizzleIsOneOrTwo;
    }
    else if (axis == 1)
    {
        first = swizzle + (swizzle & 2);
        exchangeEnds = false;
    }
    else
    {
        first = swizzle;
        exchangeEnds = swizzleIsOneOrTwo;
    }

    // The other end differs from the first in exactly the bit of the edge axis.
    const int second = first + (1 << axis);

    if (exchangeEnds)
    {
        idx0 = second;
        idx_twin = first;
    }
    else
    {
        idx0 = first;
        idx_twin = second;
    }

    const float3 startPoint = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, idx0);
    const float3 endPoint   = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, idx_twin);
    vP0 = startPoint;
    vE0 = endPoint - startPoint;
}
// Removes from lightsListLDS every non-sphere light for which a separating plane between the
// light volume hull and the tile frustum is found. Candidate planes are built from each pair
// (hull edge, frustum side edge). A removed entry is overwritten with UINT_MAX.
void CullByExactEdgeTests(uint threadID, int iNrCoarseLights, uint2 viTilLL, uint2 viTilUR, uint eyeIndex)
{
// Only the side edges of the tile frustum are paired with the light hull edges.
const bool bOnlyNeedFrustumSideEdges = true;
const int nrFrustEdges = bOnlyNeedFrustumSideEdges ? 4 : 8; // max 8 since we never need to test 4 far edges of frustum since they are identical vectors to near edges and plane is placed at vP0 on light hull.
// The light hull has 12 edges.
const int totNrEdgePairs = 12*nrFrustEdges;
// Every thread walks the whole coarse light list held in group-shared memory.
for(int l=0; l<iNrCoarseLights; l++)
{
const uint idxCoarse = lightsListLDS[l];
const int bufIdxCoarse = GenerateLightCullDataIndex(idxCoarse, g_iNrVisibLights, eyeIndex);
// Entries that are not valid light indices (e.g. already set to UINT_MAX) are skipped.
bool canEnter = idxCoarse<(uint) g_iNrVisibLights;
// Sphere light volumes are not tested here; SphericalIntersectionTests already covers them.
// The edge tests are meant for LIGHTVOLUMETYPE_CONE and LIGHTVOLUMETYPE_BOX.
if(canEnter)
canEnter = _LightVolumeData[bufIdxCoarse].lightVolume != LIGHTVOLUMETYPE_SPHERE;
// don't bother doing edge tests for sphere lights since these have camera aligned bboxes.
UNITY_BRANCH if(canEnter)
{
SFiniteLightBound lgtDat = g_data[bufIdxCoarse];
const float3 boxX = lgtDat.boxAxisX.xyz;
const float3 boxY = lgtDat.boxAxisY.xyz;
const float3 boxZ = -lgtDat.boxAxisZ.xyz; // flip axis (so it points away from the light direction for a spot-light)
const float3 center = lgtDat.center.xyz;
const float2 scaleXY = lgtDat.scaleXY;
// The edge pairs are spread over the threads of the group: thread i handles pairs i, i+NR_THREADS, ...
for(int i=threadID; i<totNrEdgePairs; i+=NR_THREADS)
{
// e0 is the light hull edge index, in [0,11]; with nrFrustEdges == 4 the division amounts to i>>2.
int e0 = (int) (((uint)i)/((uint) nrFrustEdges)); // should become a shift right
// e1 is the tile frustum side edge tested by this thread, in [0,3].
int e1 = i - e0*nrFrustEdges;
int idx_cur=0, idx_twin=0;
float3 vP0, vE0;
GetHullEdge(idx_cur, idx_twin, vP0, vE0, e0, boxX, boxY, boxZ, center, scaleXY);
float3 vP1, vE1;
GetFrustEdge(vP1, vE1, e1, viTilLL, viTilUR, g_fFarPlane, eyeIndex);
// vN = cross(vE0, vE1) is the normal of a candidate separating plane through vP0.
// Hull vertices are classified against that plane:
//   resh =  0 : vertices on both sides (the plane cuts the hull), or none off the plane
//   resh =  1 : every off-plane vertex is on the positive side
//   resh = -1 : every off-plane vertex is on the negative side
// The eight tile frustum vertices are classified the same way into resf.
// When hull and frustum lie on opposite sides (resh*resf < 0) the plane separates them,
// so the light volume does not intersect the tile.
// potential separation plane
float3 vN = cross(vE0, vE1);
int positive=0, negative=0;
for(int k=1; k<8; k++) // only need to test 7 verts (technically just 6).
{
int j = (idx_cur+k)&0x7;
float3 vPh = GetHullVertex(boxX, boxY, boxZ, center, scaleXY, j);
// The other end of the hull edge lies on the plane by construction, so its distance is forced to 0.
float fSignDist = idx_twin==j ? 0.0 : dot(vN, vPh-vP0);
if(fSignDist>0)
++positive;
else if(fSignDist<0)
++negative;
}
int resh = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));
positive=0; negative=0;
for(int j=0; j<8; j++)
{
float3 vPf = GetTileVertex(viTilLL, viTilUR, j, g_fFarPlane, eyeIndex);
float fSignDist = dot(vN, vPf-vP0);
if(fSignDist>0)
++positive;
else if(fSignDist<0)
++negative;
}
int resf = (positive>0 && negative>0) ? 0 : (positive>0 ? 1 : (negative>0 ? (-1) : 0));
bool bFoundSepPlane = (resh*resf)<0;
// Separating plane found: drop the light from the list.
if(bFoundSepPlane) lightsListLDS[l]=UINT_MAX;
}
}
}
// Make the updated lightsListLDS visible to the whole group.
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
}
对LDS进行排序,并且计算有效灯光数量的lightOffset,将Tile内的灯光放到g_vLightList中
这一步主要是把前面移除出去的Light(UINT_MAX)通过排序,把Index排到LDS的末尾。然后统计真正在Big Tile内灯光数量(iNrCoarseLights)
然后再把LDS的数据存放到g_vLightList(BigTileLightList)中,留到下一步BuildPerTileLightList使用。
// Final stage of BigTileLightListGen (excerpt; earlier code elided by "..."): sorts the
// group-shared list, counts the lights that survived culling and writes the result to g_vLightList.
[numthreads(NR_THREADS, 1, 1)]
void BigTileLightListGen(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
// sort lights
SORTLIST(lightsListLDS, iNrCoarseLights, MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE, t, NR_THREADS);
// After sorting, the UINT_MAX entries (removed lights) sit at the end of the list,
// so the lights that really remain in the tile can now be counted.
if(t==0)
lightOffs = 0;
GroupMemoryBarrierWithGroupSync();
int i;
// Count the entries that are still valid light indices.
for(i=t; i<iNrCoarseLights; i+=NR_THREADS)
if(lightsListLDS[i]<(uint)g_iNrVisibLights)
InterlockedAdd(lightOffs, 1);
GroupMemoryBarrierWithGroupSync();
iNrCoarseLights = lightOffs;
// Linear index of this big tile, including the per-eye offset.
int offs = tileIDX.y*nrBigTilesX + tileIDX.x + (eyeIndex * nrBigTilesX * nrBigTilesY);
//#define MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE (512)
// Each big tile owns MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE slots, so MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*offs is where this tile's list starts.
// Slot 0 stores the light count (i==0 ? iNrCoarseLights); the light indices follow from slot 1.
for(i=t; i<(iNrCoarseLights+1); i+=NR_THREADS)
g_vLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*offs + i] = i==0 ? iNrCoarseLights : lightsListLDS[max(i-1, 0)];
}
Bitonic Sort
双调排序的比较序列是固定的、与数据内容无关,因此特别适合并行运算。
具体的算法原理:http://t.csdnimg.cn/dXxs0
这个SORTLIST只能对长度为2的幂的序列进行排序,lightsListLDS的容量为512(MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE)。
1.首先把[length, N)范围内的空余位置赋值为UINT_MAX,然后用GroupSync做线程同步。
2.(ixj)>i保证每一对(i, ixj)只由下标较小的一方处理一次,避免重复比较和交换。
3.mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue;
(i&k)!=0用于把待排序元素按k分段,相邻两段的排序方向相反。
比如当k为4,排序元素下标为0,1,2,3,4,5,6,7时,分割成((i&k)==0)0,1,2,3; ((i&k)!=0)4,5,6,7;
在第一个序列((i&k)==0)中,只有当A>B时才互换元素,即((i&k)!=0)^(Avalue>Bvalue) == true
// Rounds value_in up to the next power of two and clamps the result to maxValue.
// An input of 0 yields 0.
unsigned int LimitPow2AndClamp(unsigned int value_in, unsigned int maxValue)
{
    // Largest power of two not exceeding value_in (0 stays 0, since firstbithigh(0) returns -1).
    uint pow2 = 0;
    if (value_in != 0)
        pow2 = (1<<firstbithigh(value_in));

    // Step up to the next power of two when value_in is not one already.
    const uint step = (pow2 != value_in) ? 1 : 0;
    const uint roundedUp = max(pow2, pow2<<step); // max() just in case of overflow

    return min(roundedUp, maxValue);
}
// SORTLIST: in-place parallel bitonic sort of data[0 .. length_in) executed by one thread group.
//   data              array to sort (unsigned int elements)
//   length_in         number of valid elements
//   maxcapacity_in    capacity of data; the padded length N never exceeds it
//   localThreadID_in  index of the calling thread inside the group
//   nrthreads_in      number of threads taking part
// Steps:
//   1. N = LimitPow2AndClamp(length, maxcapacity); the slots [length, N) are filled with UINT_MAX,
//      followed by a group barrier.
//   2. For every stage k = 2, 4, ..., N and every step j = k/2, ..., 1, each thread visits
//      i = localThreadID, localThreadID + nrthreads, ... and pairs i with i^j; the pair is
//      handled only when (i^j) > i, i.e. once, by its lower index.
//   3. The swap direction depends on (i&k): with (i&k)==0 the pair is swapped when A > B,
//      otherwise when A < B; equal values are never swapped.
//   4. A group barrier follows every j step.
// In the last stage (k == N) every i has (i&k)==0, so the final order is ascending and the
// UINT_MAX padding ends up at the tail.
#define SORTLIST(data, length_in, maxcapacity_in, localThreadID_in, nrthreads_in) \
{ \
int length=(int) length_in, maxcapacity=(int) maxcapacity_in, localThreadID=(int) localThreadID_in, nrthreads=(int) nrthreads_in; \
\
const int N = (const int) LimitPow2AndClamp((unsigned int) length, (uint) maxcapacity); \
for(int t=length+localThreadID; t<N; t+=nrthreads) { data[t]=UINT_MAX; } \
GroupMemoryBarrierWithGroupSync(); \
\
for(int k=2; k<=N; k=2*k) \
{ \
for(int j=k>>1; j>0; j=j>>1) \
{ \
for(int i=localThreadID; i<N; i+=nrthreads) \
{ \
int ixj=i^j; \
if((ixj)>i) \
{ \
const unsigned int Avalue = data[i]; \
const unsigned int Bvalue = data[ixj]; \
\
const bool mustSwap = ((i&k)!=0^(Avalue>Bvalue)) && Avalue!=Bvalue; \
if(mustSwap) \
{ \
data[i]=Bvalue; \
data[ixj]=Avalue; \
} \
} \
} \
\
GroupMemoryBarrierWithGroupSync(); \
} \
} \
}