Unity的Forward+ FPTL光照剔除解析(三)
序言
如果看了前面的BigTileLightList的建立,这一章会简单一点。
因为如果启用了BigTile之后,这里的BuildPerTileLightList就需要从BigTileLightList里面读取LightList。
否则还是需要像BigTile一样去先走同样的灯光剔除流程(NDCAABBBoundTest,SphericalIntersectionTests),
然后才到最后的FinePruneLightsTest。
LightListBuild
RenderGraph Dispatch
下面是RenderGraph中Dispatch时需要的Buffer以及ConstantBuffer
//HDRenderPipeline.LightLoop.cs
static void BuildPerTileLightList(BuildGPULightListPassData data, ref bool tileFlagsWritten, CommandBuffer cmd)
{
// optimized for opaques only
if (data.runLightList && data.runFPTL)
{
//第一步计算的灯光AABB Buffer
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
//LightVolumeData与SFiniteLightBound的ComputeBuffer
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_data, data.convexBoundsBuffer);
//用作Hiz剔除的深度图
cmd.SetComputeTextureParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_depth_tex, data.depthBuffer);
//最终输出的lightList
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_vLightList, data.output.lightList);
//Big Tile Light List
if (data.runBigTilePrepass)
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);
var localLightListCB = data.lightListCB;
//计算Tile内需要计算的FeatureVariant
//LightLoop中用于控制着色光照计算流程,baseFeatureFlags就是最基础的Flag
//LightLoop的时候获取TileFeatureFlag,就可以知道当前Tile是否需要计算Punctual/Area/Directional/Env的光照
if (data.enableFeatureVariants)
{
uint baseFeatureFlags = 0;
if (data.directionalLightCount > 0)
{
baseFeatureFlags |= (uint)LightFeatureFlags.Directional;
}
if (data.skyEnabled)
{
baseFeatureFlags |= (uint)LightFeatureFlags.Sky;
}
if (!data.computeMaterialVariants)
{
baseFeatureFlags |= LightDefinitions.s_MaterialFeatureMaskFlags;
}
localLightListCB.g_BaseFeatureFlags = baseFeatureFlags;
cmd.SetComputeBufferParam(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, HDShaderIDs.g_TileFeatureFlags, data.output.tileFeatureFlags);
tileFlagsWritten = true;
}
ConstantBuffer.Push(cmd, localLightListCB, data.buildPerTileLightListShader, HDShaderIDs._ShaderVariablesLightList);
cmd.DispatchCompute(data.buildPerTileLightListShader, data.buildPerTileLightListKernel, data.numTilesFPTLX, data.numTilesFPTLY, data.viewCount);
}
}
Initialize
跟BigTile类似,计算当前线程的Tile的映射关系要用到的数据.(Tile的X/Y轴上的数量,当前线程组对应的TileID)
//FPTL这一步的Tile Size为16*16
#define TILE_SIZE_FPTL (16)
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
UNITY_XR_ASSIGN_VIEW_INDEX(dispatchThreadId.z);
uint2 tileIDX = u3GroupID.xy;
uint t=threadID;
if(t<LIGHT_LIST_MAX_COARSE_ENTRIES)
prunedList[t]=0;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint nrTilesX = (iWidth+15)/16;
uint nrTilesY = (iHeight+15)/16;
// build tile scr boundary
const uint uFltMax = 0x7f7fffff; // FLT_MAX as a uint
if(t==0)
{
ldsZMin = uFltMax;
ldsZMax = 0;
lightOffs = 0;
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
...
}
Hi-z剔除
读取PreDepth深度图获取4个像素内的Min、MaxDepth,通过线程同步计算得出一个线程组内的Min/MaxDepth 即(64*4=16*16),建立起当前Tile的Bounds(ldsZMin/ldsZMax)。
读取深度图图,计算viewPostion.z
LinearDepth
camera.projectMatrix是右手坐标系的(OpenGL),为了统一整个剔除流程的轴向,ScrProjection翻转了z轴,采用左手坐标系。
groupshared uint ldsZMin;
groupshared uint ldsZMax;
TEXTURE2D_X(g_depth_tex) : register( t0 );
float FetchDepth(uint2 pixCoord)
{
float zdpth = LOAD_TEXTURE2D_X(g_depth_tex, pixCoord.xy).x;
//https://zhuanlan.zhihu.com/p/389971233
//投影矩阵的Z Flip了,读取时也要翻转过来(匹配 0 is near 1 is far)
#if UNITY_REVERSED_Z
zdpth = 1.0 - zdpth;
#endif
return zdpth;
}
//USE_OBLIQUE_MODE m_LightListProjMatrices.m20!= 0 || m_LightListProjMatrices.m21 != 0;
//即投影矩阵的r+l!=0,t+b!=0
//linearDepth; // View space Z coordinate : [Near, Far]
//Reverse z :-z_eye=1/((n-f)/(n*f)*depth+1/n); //https://zhuanlan.zhihu.com/p/393643084
float GetLinearDepth(float2 pixXY, float zDptBufSpace) // 0 is near 1 is far
{
float4x4 g_mInvScrProjection = g_mInvScrProjectionArr[unity_StereoEyeIndex];
#ifdef USE_OBLIQUE_MODE
float2 res2 = mul(g_mInvScrProjection, float4(pixXY, zDptBufSpace, 1.0)).zw;
return res2.x / res2.y;
#else
//正交矩阵用(m22*zDptBufSpace+m23),透视用(m32*zDptBufSpace+m33)
// for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
// however this function must also work for orthographic projection so we keep it like this.
float m22 = g_mInvScrProjection[2].z, m23 = g_mInvScrProjection[2].w;
float m32 = g_mInvScrProjection[3].z, m33 = g_mInvScrProjection[3].w;
return (m22*zDptBufSpace+m23) / (m32*zDptBufSpace+m33);
#endif
}
GetViewPosFromLinDepth
这里简单的以x轴的推导为例
由投影矩阵中的相似三角形易得
[Unity Shader入门精要]
\(\frac{(ScreenPos.x-pixWidth/2)}{pixWidth/2}=\frac{clipPos.x}{clipPos.w}\)
\(\frac{clipPos.x}{clipPos.w}=\frac{viewPos.x*\frac{cotFOV}{Aspect}}{-viewPos.z}\)
\(viewPos.x = \frac{Screen.x-pixWidth/2}{pixWidth/2*\frac{cotFOV}{Aspect}}*-viewPos.z\)
由于之前ScrProjection已经FlipZ,所以可以直接fLinDepth * p.xy
unsafe void PrepareBuildGPULightListPassData(
RenderGraph renderGraph,
RenderGraphBuilder builder,
HQCamera hqCamera,
TileAndClusterData tileAndClusterData,
ref ShaderVariablesLightList constantBuffer,
int totalLightCount,
TextureHandle depthStencilBuffer,
TextureHandle stencilBufferCopy,
BuildGPULightListPassData passData)
{
....
// camera to screen matrix (and it's inverse)
for (int viewIndex = 0; viewIndex < hqCamera.viewCount; ++viewIndex)
{
var proj = camera.projectionMatrix;
// Note: we need to take into account the TAA jitter when indexing the light list
proj = hqCamera.RequiresCameraJitter() ? hqCamera.GetJitteredProjectionMatrix(proj) : proj;
m_LightListProjMatrices[viewIndex] = proj * s_FlipMatrixLHSRHS;
var tempMatrix = temp * m_LightListProjMatrices[viewIndex];
var invTempMatrix = tempMatrix.inverse;
for (int i = 0; i < 16; ++i)
{
cb.g_mScrProjectionArr[viewIndex * 16 + i] = tempMatrix[i];
cb.g_mInvScrProjectionArr[viewIndex * 16 + i] = invTempMatrix[i];
}
}
}
\(pixWidth/2*\frac{cotFOV}{Aspect}=fSx\)
\(pixHeight/2*\frac{cotFOV}{Aspect}=fSy\)
\(pixWidth/2=fCx\)
\(pixHeight/2=fCy\)
//
float3 GetViewPosFromLinDepth(float2 v2ScrPos, float fLinDepth)
{
float4x4 g_mScrProjection = g_mScrProjectionArr[unity_StereoEyeIndex];
bool isOrthographic = g_isOrthographic != 0;
float fSx = g_mScrProjection[0].x;
float fSy = g_mScrProjection[1].y;
float fCx = isOrthographic ? g_mScrProjection[0].w : g_mScrProjection[0].z;
float fCy = isOrthographic ? g_mScrProjection[1].w : g_mScrProjection[1].z;
#if USE_LEFT_HAND_CAMERA_SPACE
bool useLeftHandVersion = true;
#else
bool useLeftHandVersion = isOrthographic;
#endif
float s = useLeftHandVersion ? 1 : (-1);
float2 p = float2((s * v2ScrPos.x - fCx) / fSx, (s * v2ScrPos.y - fCy) / fSy);
return float3(isOrthographic ? p.xy : (fLinDepth * p.xy), fLinDepth);
}
这里读取深度图并将其转换到[Near,Far],然后计算出2*2像素中的MinDepth,MaxDepth,
然后通过线程同步(InterlockedMax/InterlockedMin)计算线程组内(Tile内)的MinDepth,MaxDepth。
注:FPTL的Tile Size为16*16,64线程一组,一个线程计算4个像素。(16*16=64*4),这里的同步计算不会影响到别的线程组(Tile)
#define NR_THREADS 64
#define TILE_SIZE_FPTL (16)
#define VIEWPORT_SCALE_Z (1)
#define PIXEL_PER_THREAD ((TILE_SIZE_FPTL*TILE_SIZE_FPTL) / NR_THREADS) // 8 or 4
//16*16/64=4
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
uint2 viTilLL = 16*tileIDX;
// establish min and max depth first
float dpt_mi=asfloat(uFltMax), dpt_ma=0.0;
#if PIXEL_PER_THREAD == 4
float4 vLinDepths;
#else
float vLinDepths[PIXEL_PER_THREAD];
#endif
{
//VIEWPORT_SCALE_Z
// Fetch depths and calculate min/max
UNITY_UNROLL
for(int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = i * NR_THREADS + t;
uint2 uCrd = min( uint2(viTilLL.x+(idx&0xf), viTilLL.y+(idx>>4)), uint2(iWidth-1, iHeight-1) );
const float fDepth = FetchDepth(uCrd);
vLinDepths[i] = GetLinearDepth(uCrd+float2(0.5,0.5), fDepth);
if(fDepth<VIEWPORT_SCALE_Z) // if not skydome
{
dpt_mi = min(fDepth, dpt_mi);
dpt_ma = max(fDepth, dpt_ma);
}
}
InterlockedMax(ldsZMax, asuint(dpt_ma));
InterlockedMin(ldsZMin, asuint(dpt_mi));
#if NR_THREADS > PLATFORM_LANE_COUNT || defined(SHADER_API_SWITCH) // not sure why Switch needs the barrier (it will not be correct without)
GroupMemoryBarrierWithGroupSync();
#endif
}
...
}
NDCAABBBoundTest
若启用了BigTile预计算,则通过映射读取LightOffset(Big-tile内的灯光数量)以及对应的lightIndex
若没有则按原样直接遍历g_vBoundBuffer,用AABB计算当前灯光是否在Tile内,若在则加入到CoareList中
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
// build coarse list using AABB
//若启用了BigTile计算
#ifdef USE_TWO_PASS_TILED_LIGHTING
//firstbithigh(64)=6 64*64
//firstbithigh(16)=4 16*16
//log2BigTileToTileRatio=2
//#define TILE_SIZE_FPTL (16)
//#define TILE_SIZE_BIG_TILE (64)
//即4*4个FTPL Tile构成一个Big Tile log2_4=2
const uint log2BigTileToTileRatio = firstbithigh(64) - firstbithigh(16);
//计算Big Tile X/Y的数量
//((1 << log2BigTileToTileRatio) -1 ) 1<<2-1 = 3 相当于DivRoundUp(nrTilesX,4)
int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToTileRatio) -1 )) >> log2BigTileToTileRatio;
int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToTileRatio) - 1)) >> log2BigTileToTileRatio;
//BigTile总数
const int bigTileBase = unity_StereoEyeIndex * NrBigTilesX * NrBigTilesY;
//计算当前Tile对应的BigTile
const int bigTileIdx = bigTileBase + (tileIDX.y>>log2BigTileToTileRatio)*NrBigTilesX + (tileIDX.x>>log2BigTileToTileRatio); // map the idx to 64x64 tiles
//第一位记录当前Tile的灯光数量
int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+0];
//遍历BigTileLightList
for(int l0=(int) t; l0<(int) nrBigTileLights; l0 += NR_THREADS)
{
int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE*bigTileIdx+l0+1];
#else
//若没有启用Big Tile,则直接遍历所有的灯光,利用Scrbound计算的AABBBound计算Tile(16*16)的当前灯光列表(coarseList)
for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
{
#endif
// Skip Local Volumetric Fog (lights are sorted by category). TODO: improve data locality
// if (_LightVolumeData[l].lightCategory == LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG) { break; }
const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, unity_StereoEyeIndex);
const float3 vMi = g_vBoundsBuffer[boundsIndices.min].xyz;
const float3 vMa = g_vBoundsBuffer[boundsIndices.max].xyz;
if (all(vMa > vTileLL) && all(vMi < vTileUR))
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(lightOffs, uInc, uIndex);
if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
}
}
...
}
清空ldsDoesLightInterset初始化,并同步线程组
#define FINE_PRUNING_ENABLED
#define LIGHT_LIST_MAX_COARSE_ENTRIES (64)//coarseList/prunedList LDS的最大容量为64
//uint 32Bit容纳不了64盏灯的灯光与当前Tile相交情况,所以加多了一个记录另外32盏灯.
groupshared uint ldsDoesLightIntersect[2];
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef FINE_PRUNING_ENABLED
if (t < 2)
ldsDoesLightIntersect[t] = 0;
#endif
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
//coarseList/prunedList LDS的最大容量为64
int iNrCoarseLights = min(lightOffs,LIGHT_LIST_MAX_COARSE_ENTRIES);
...
}
SphericalIntersectionTest,FinePruneLights
SphericalIntersectionTest
这里的SphericalIntersectionTest与BigTile中的SphericalIntersectionTest唯一不同的区别是就是需要把coarseList拷贝到prunedList暂存。
检测到灯光Overlap Tile之后再暂存在coarseList的lightIndex加入到prunedList中。
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
int SphericalIntersectionTests(uint threadID, int iNrCoarseLights, float2 screenCoordinate)
{
if (threadID == 0) lightOffsSph = 0;
// make a copy of coarseList in prunedList.
int l;
for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
prunedList[l] = coarseList[l];
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
#if USE_LEFT_HAND_CAMERA_SPACE
float3 V = GetViewPosFromLinDepth(screenCoordinate, 1.0);
#else
float3 V = GetViewPosFromLinDepth( screenCoordinate, -1.0);
#endif
float onePixDiagDist = GetOnePixDiagWorldDistAtDepthOne();
float halfTileSizeAtZDistOne = 8 * onePixDiagDist; // scale by half a tile
for (l = threadID; l < iNrCoarseLights; l += NR_THREADS)
{
const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[l], g_iNrVisibLights, unity_StereoEyeIndex);
SFiniteLightBound lightData = g_data[lightBoundIndex];
if (DoesSphereOverlapTile(V, halfTileSizeAtZDistOne, lightData.center.xyz, lightData.radius, g_isOrthographic != 0))
{
unsigned int uIndex;
InterlockedAdd(lightOffsSph, 1, uIndex);
coarseList[uIndex] = prunedList[l]; // read from the original copy of coarseList which is backed up in prunedList
}
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
return lightOffsSph;
}
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(16 / 2, 16 / 2), uint2(iWidth - 1, iHeight - 1))));
#endif
...
}
FinePruneLights
s_lightVolumesCache LDS
在开始计算FinePruneLights之前,需要预先记录coarseList对应的LightVolume进LDS s_lightVolumesCache(StoreLightVolumeCache)中
#define FINE_PRUNING_ENABLED
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifndef FINE_PRUNING_ENABLED
{
if((int)t<iNrCoarseLights)
prunedList[t] = coarseList[t];
if(t==0)
ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
...
}
//
//eyeIndex==0时,GetCoarseLightIndex=>coarseList[l]
uint GenerateLightCullDataIndex(uint lightIndex, uint numVisibleLights, uint eyeIndex)
{
lightIndex = min(lightIndex, numVisibleLights - 1); // Stay within bounds
// For monoscopic, there is just one set of light cull data structs.
// In stereo, all of the left eye structs are first, followed by the right eye structs.
const uint perEyeBaseIndex = eyeIndex * numVisibleLights;
return (perEyeBaseIndex + lightIndex);
}
int GetCoarseLightIndex(int l, int iNrCoarseLights)
{
return l < iNrCoarseLights ? GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, unity_StereoEyeIndex) : 0;
}
//
groupshared uint s_lightVolumesCache[LIGHT_LIST_MAX_COARSE_ENTRIES];
void StoreLightVolumeCache(int lightIndex, int coarseIndex, uint volumeType)
{
// 3 bits for the volume type, in case we have a corrupted one we can early out of the switch statement.
// 29 bits for a coarse light index.
s_lightVolumesCache[lightIndex] = (volumeType & 0x7) | (uint)(coarseIndex << 3);
}
void LoadLightVolumeCache(int lightIndex, out int coarseIndex, out int volumeType)
{
int data = s_lightVolumesCache[lightIndex];
coarseIndex = data >> 3;
volumeType = data & 0x7;
}
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
uint t = threadID;
uint iWidth = g_viDimensions.x;
uint iHeight = g_viDimensions.y;
uint uLightsFlags[2] = {0, 0};
int l = 0;
// need this outer loop even on xb1 and ps4 since direct lights and
// reflection lights are kept in separate regions.
if (threadID < (uint)iNrCoarseLights)
{
int idxCoarse = GetCoarseLightIndex((int)threadID, iNrCoarseLights);
int uLightVolume = (int)_LightVolumeData[idxCoarse].lightVolume;
StoreLightVolumeCache(threadID, idxCoarse, uLightVolume);
}
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
....
}
判交计算
读取LDS中的volumeData,并且利用之前的DepthBound(vLinDepths)逐像素进行判交。
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
//(记录Tile内灯光数量)
groupshared int ldsNrLightsFinal;
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
...
//When using LDS to cache the volume data, this produces the best most optimal code.
//Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
for (; l < iNrCoarseLights; ++l)
{
int idxCoarse;
int uLightVolume;
//读取之前的LightVolumeData
LoadLightVolumeCache(l, idxCoarse, uLightVolume);
bool lightValid = false;
if (uLightVolume == LIGHTVOLUMETYPE_CONE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
const bool bIsSpotDisc = true; // (lightData.flags&IS_CIRCULAR_SPOT_SHAPE) != 0;
for (int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = t + i * NR_THREADS;
//先计算当前像素的深度对应的ViewPosition
uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);
//LightData.lightPos是View Space
// check pixel
//用当前像素到LightPos的向量fromLight以及LightAxis判断fromLight在Cone内部
float3 fromLight = vVPos - lightData.lightPos.xyz;
float distSq = dot(fromLight, fromLight);
const float fSclProj = dot(fromLight, lightData.lightAxisZ.xyz); // spotDir = lightData.lightAxisZ.xyz
float2 V = abs(float2(dot(fromLight, lightData.lightAxisX.xyz), dot(fromLight, lightData.lightAxisY.xyz)));
//bIsSpotDisc=true
//即fDist2D=dot(fromLight, lightData.lightAxisX)^2+dot(fromLight, lightData.lightAxisX)^2;
float fDist2D = bIsSpotDisc ? length(V) : max(V.x, V.y);
//lightData.radiusSq>distSq
//fSclProj>fDist2D * lightData.cotan即fSclProj/fDist2D>lightData.cotan,用fromLight和Axis计算夹角的cot
//lightVolumeData radiusSq的计算
//lightVolumeData.radiusSq = range * range;
//lightVolumeData cotan的计算
// var sa = light.spotAngle;
// var cs = Mathf.Cos(0.5f * sa * Mathf.Deg2Rad);
// var si = Mathf.Sin(0.5f * sa * Mathf.Deg2Rad);
// if (gpuLightType == GPULightType.ProjectorPyramid)
// {
// Vector3 lightPosToProjWindowCorner = (0.5f * lightDimensions.x) * vx + (0.5f * lightDimensions.y) * vy + 1.0f * vz;
// cs = Vector3.Dot(vz, Vector3.Normalize(lightPosToProjWindowCorner));
// si = Mathf.Sqrt(1.0f - cs * cs);
// }
// const float FltMax = 3.402823466e+38F;
// var ta = cs > 0.0f ? (si / cs) : FltMax;
// var cota = si > 0.0f ? (cs / si) : FltMax;
// lightVolumeData.cotan = cota;
bool validInPixel = all(float2(lightData.radiusSq, fSclProj) > float2(distSq, fDist2D * lightData.cotan));
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
//a wave is on the same tile, and the loop is uniform for the wave.
// thus we early out if at least 1 thread in the wave passed this light, saving some ALU.
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else if (uLightVolume == LIGHTVOLUMETYPE_SPHERE)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
for (int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = t + i * NR_THREADS;
//先计算当前像素的深度对应的ViewPosition
uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);
//简单的球形距离场
// check pixel
float3 vLp = lightData.lightPos.xyz;
float3 toLight = vLp - vVPos;
float distSq = dot(toLight, toLight);
bool validInPixel = lightData.radiusSq > distSq;
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else if (uLightVolume == LIGHTVOLUMETYPE_BOX)
{
LightVolumeData lightData = _LightVolumeData[idxCoarse];
for (int i = 0; i < PIXEL_PER_THREAD; i++)
{
int idx = t + i * NR_THREADS;
//先计算当前像素的深度对应的ViewPosition
uint2 uPixLoc = min(uint2(viTilLL.x + (idx & 0xf), viTilLL.y + (idx >> 4)), uint2(iWidth - 1, iHeight - 1));
float3 vVPos = GetViewPosFromLinDepth(uPixLoc + float2(0.5, 0.5), vLinDepths[i]);
// check pixel
float3 toLight = lightData.lightPos.xyz - vVPos;
//用toLight计算有向距离场判断当前像素是否在Box内部
float3 dist = float3(dot(toLight, lightData.lightAxisX), dot(toLight, lightData.lightAxisY), dot(toLight, lightData.lightAxisZ));
dist = (abs(dist) - lightData.boxInnerDist) * lightData.boxInvRange; // not as efficient as it could be
bool validInPixel = max(max(dist.x, dist.y), dist.z) < 1; // but allows us to not write out OuterDists
#ifdef PLATFORM_SUPPORTS_WAVE_INTRINSICS
lightValid = WaveActiveAnyTrue(validInPixel);
#else
lightValid = validInPixel;
#endif
if (lightValid)
break;
}
}
else
break;
//lightValid记录判交结果,前32盏灯记录到uLightsFlags[0],剩下的记录到uLightsFlags[1]
uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
}
...
}
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifndef FINE_PRUNING_ENABLED
{
if((int)t<iNrCoarseLights)
prunedList[t] = coarseList[t];
if(t==0)
ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
...
}
Resolve Pruned List
遍历ldsDoesLightIntersect的Flag,重新结算Tile内的灯光数量,并把对应灯光Index(coarseList)加入到prunedList中
#define FINE_PRUNING_ENABLED
#define PERFORM_SPHERICAL_INTERSECTION_TESTS
//(记录Tile内灯光数量)
groupshared int ldsNrLightsFinal;
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
#if PIXEL_PER_THREAD == 4
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float4 vLinDepths) // keep float4 vectorization when possible, as shader compiler may generate bad code for array of floats.
#else
void FinePruneLights(uint threadID, int iNrCoarseLights, uint2 viTilLL, float vLinDepths[PIXEL_PER_THREAD])
#endif
{
....
//When using LDS to cache the volume data, this produces the best most optimal code.
//Doing a manual loop like the one below adds an extra cost of .1 ms on ps4 if we use LDS.
for (; l < iNrCoarseLights; ++l)
{
//判交计算
....
//lightValid记录判交结果,前32盏灯记录到uLightsFlags[0],剩下的记录到uLightsFlags[1]
uLightsFlags[l < 32 ? 0 : 1] |= ((lightValid ? 1 : 0) << (l & 31));
}
//线程同步uLightsFlags
InterlockedOr(ldsDoesLightIntersect[0], uLightsFlags[0]);
InterlockedOr(ldsDoesLightIntersect[1], uLightsFlags[1]);
//重置ldsNrLightsFinal(记录Tile内灯光数量)
if (t == 0)
ldsNrLightsFinal = 0;
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
//遍历ldsDoesLightIntersect的Flag
if (t < (uint)iNrCoarseLights && (ldsDoesLightIntersect[t < 32 ? 0 : 1] & (1 << (t & 31))) != 0)
{
unsigned int uInc = 1;
unsigned int uIndex;
InterlockedAdd(ldsNrLightsFinal, uInc, uIndex);
if (uIndex < LIGHT_LIST_MAX_COARSE_ENTRIES)
prunedList[uIndex] = coarseList[t]; // we allow up to 64 pruned lights while stored in LDS.
}
}
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
#ifndef FINE_PRUNING_ENABLED
{
if((int)t<iNrCoarseLights)
prunedList[t] = coarseList[t];
if(t==0)
ldsNrLightsFinal=iNrCoarseLights;
}
#else
{
// initializes ldsNrLightsFinal with the number of accepted lights.
// all accepted entries delivered in prunedList[].
FinePruneLights(t, iNrCoarseLights, viTilLL, vLinDepths);
}
#endif
...
}
遍历PruneList根据不同的LightCategory进行划分,Resolve FeatureFlag
根据上一步FinePruneLights得到的prunedList,可以通过遍历prunedList对应的灯光,拿到灯光的lightCategory以及featureFlags。
这样就可以计算出Tile内CategoryCount以及光照涉及的Light Feature Flag(Punctual/Env/Decal....)
Feature Flag的主要作用于Deferred的Material classification:https://www.cnblogs.com/OneStargazer/p/18174135
groupshared unsigned int prunedList[LIGHT_LIST_MAX_COARSE_ENTRIES];
//LightCategory计数器
groupshared int ldsCategoryListCount[CATEGORY_LIST_SIZE];
//Light Feature Flag
#ifdef USE_FEATURE_FLAGS
groupshared uint ldsFeatureFlags;
RWStructuredBuffer<uint> g_TileFeatureFlags;
#endif
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
//重置LightCategory计数器
if (t < CATEGORY_LIST_SIZE)
ldsCategoryListCount[t] = 0;
//重置ldsFeatureFlags
#ifdef USE_FEATURE_FLAGS
if(t==0)
ldsFeatureFlags=0;
#endif
//线程同步
#if NR_THREADS > PLATFORM_LANE_COUNT
GroupMemoryBarrierWithGroupSync();
#endif
//ldsNrLightsFinal为上一步FinePruneLights计算得出的Tile内灯光数量
int nrLightsCombinedList = min(ldsNrLightsFinal,LIGHT_LIST_MAX_COARSE_ENTRIES);
//遍历prunedList,用InterlockedAdd累计不同的LightCategory到ldsCategoryListCount LDS计数器中
//InterlockedOr合计Tile内所有的Light Feature Flag
for (int i = t; i < nrLightsCombinedList; i += NR_THREADS)
{
const int lightBoundIndex = GenerateLightCullDataIndex(prunedList[i], g_iNrVisibLights, unity_StereoEyeIndex);
InterlockedAdd(ldsCategoryListCount[_LightVolumeData[lightBoundIndex].lightCategory], 1);
#ifdef USE_FEATURE_FLAGS
InterlockedOr(ldsFeatureFlags, _LightVolumeData[lightBoundIndex].featureFlags);
#endif
}
//排序prunedList
// sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if NR_THREADS > PLATFORM_LANE_COUNT
SORTLIST(prunedList, nrLightsCombinedList, LIGHT_LIST_MAX_COARSE_ENTRIES, t, NR_THREADS);
//MERGESORTLIST(prunedList, coarseList, nrLightsCombinedList, t, NR_THREADS);
#endif
//初始化每个Tile对应的g_TileFeatureFlags
#ifdef USE_FEATURE_FLAGS
if(t == 0)
{
uint featureFlags = ldsFeatureFlags | g_BaseFeatureFlags;
// In case of back
if(ldsZMax < ldsZMin) // is background pixel
{
// There is no stencil usage with compute path, featureFlags set to 0 is use to have fast rejection of tile in this case.
// It will still execute but will do nothing
featureFlags = 0;
}
g_TileFeatureFlags[tileIDX.y * nrTilesX + tileIDX.x + unity_StereoEyeIndex * nrTilesX * nrTilesY] = featureFlags;
}
#endif
...
}
根据不同的LightCategory使用对应的Offset压入到g_vLightList
这最后一步就是把prunedList放进g_vLightList(大象装进冰箱)
其中由于Index用不了那么UInt32那么大的精度,所以需要将两个Index合并成一个,用的时候再Unpack出来.
/////////HDRenderPipeline.LightLoop.cs PrepareBuildGPULightListPassData
//灯光数量作为EnvLightIndex起始点
//cb._EnvLightIndexShift = (uint)m_GpuLightsBuilder.lightsCount;
//灯光数量以及反射探针数量之和作为DecalIndex起始点
//cb._DecalIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count);
//灯光数量,反射探针数量以及贴花数量之和作为Local Volumetric Fog Index起始点
//cb._LocalVolumetricFogIndexShift = (uint)(m_GpuLightsBuilder.lightsCount + m_lightList.envLights.Count + decalDatasCount);
/////////End of HDRenderPipeline.LightLoop.cs
CBUFFER_START(ShaderVariablesLightList)
...
uint _EnvLightIndexShift;
uint _DecalIndexShift;
uint _LocalVolumetricFogIndexShift;
...
CBUFFER_END
//原本每个Tile内的元素数量64,由于两个Index合并成一个,所以Tile内元素数量就变成32
#define LIGHT_DWORD_PER_FPTL_TILE (32)
//ShaderConfig.cs.hlsl
//由脚本GenerateHLSL生成控制 FPTL Tile内的LightIndex数量上限
#define SHADEROPTIONS_FPTLMAX_LIGHT_COUNT (63)
[numthreads(NR_THREADS, 1, 1)]
void TileLightListGen(uint3 dispatchThreadId : SV_DispatchThreadID, uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
...
//Tile的起始Index计算
// write lights to global buffers
int localOffs = 0;
int offs = tileIDX.y * nrTilesX + tileIDX.x;
#if defined(UNITY_STEREO_INSTANCING_ENABLED)
// Eye base offset must match code in GetCountAndStartTile()
offs += unity_StereoEyeIndex * nrTilesX * nrTilesY * LIGHTCATEGORY_COUNT;
#endif
//CBUFFER变量初始化shiftIndex
// All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
// to make it work correctly
int shiftIndex[CATEGORY_LIST_SIZE];
ZERO_INITIALIZE_ARRAY(int, shiftIndex, CATEGORY_LIST_SIZE);
shiftIndex[LIGHTCATEGORY_ENV] = _EnvLightIndexShift;
shiftIndex[LIGHTCATEGORY_DECAL] = _DecalIndexShift;
//通过读取ldsCategoryListCount获取不同Category(Punctual/Area/Env/Decal)对应的LightData数量
for (int category = 0; category < CATEGORY_LIST_SIZE; category++)
{
int nrLightsFinal = ldsCategoryListCount[category];
int nrLightsFinalClamped = nrLightsFinal < SHADEROPTIONS_FPTLMAX_LIGHT_COUNT ? nrLightsFinal : SHADEROPTIONS_FPTLMAX_LIGHT_COUNT;
//由于LightIndex用不了uint那么多的位数(32Bit),所以可以对LightList中的Index每两个合并(uLow/uHigh)合并成一个Index
//nrLightsFinalClamped + 1(记录LightList的Index数量的nrLightsFinalClamped)
//((nrLightsFinalClamped + 1) + 1) >> 1 相当于DivRoundUp(nrLightsFinalClamped + 1,2)
const int nrDWords = ((nrLightsFinalClamped + 1) + 1) >> 1;
for (int l = (int)t; l < (int)nrDWords; l += NR_THREADS)
{
//prunedList里存储的Index是:所有的Category(Punctual/Area/Env/Decal)的LightData/LightVolumeData Buffer的Index,
//所以需要减去Category对应的shiftIndex,重新映射得到对应Category在各自Buffer中真正的Index
// We remap the prunedList index to the original LightData / EnvLightData indices
uint uLow = l == 0 ? nrLightsFinalClamped : prunedList[max(0, 2 * l - 1 + localOffs)] - shiftIndex[category];
uint uHigh = prunedList[2 * l + 0 + localOffs] - shiftIndex[category];
//偏移16位
g_vLightList[LIGHT_DWORD_PER_FPTL_TILE * offs + l] = (uLow & 0xffff) | (uHigh << 16);
}
//localOffs滑动到下一个Category
localOffs += nrLightsFinal;
//不同Category存储Index的偏移是Tile的数量。这样有利于遍历时提高Cache Hit.
offs += (nrTilesX * nrTilesY);
}
...
}
Lighting Loop
最终来到了LightLoop中应用g_vLightList计算的结果。
应用的流程如下:
1.首先需要根据当前像素PositionSS计算出Tile的Index。
2.根据Tile的Index以及当前计算的Category得到g_vLightList的偏移tileOffset。
3.Tile List的起始点的Index(start)就是tileOffset,用&0xffff取出第一个元素即为LightCount
4.后续获取LightData的Index时,只需要从start前面的uHigh(前16bit)开始遍历就行了
#define LIGHT_DWORD_PER_FPTL_TILE (32)
//渲染不透明队列时启用
#ifdef USE_FPTL_LIGHTLIST
//计算PositionInputs的tileIndex
//uint2 tileIndex = uint2(fragInput.positionSS.xy) / TILE_SIZE_FPTL;
//PositionInputs posInput = GetPositionInput(fragInput.positionSS.xy, _ScreenSize.zw, fragInput.positionSS.z, fragInput.positionSS.w, input.positionWS.xyz, tileIndex);
//根据tileIndex计算当前lightCategory对应的Tile在g_vLightListTile Buffer中的Offset Index
int GetTileOffset(PositionInputs posInput, uint lightCategory)
{
uint2 tileIndex = posInput.tileCoord;
return (tileIndex.y + lightCategory * _NumTileFtplY) * _NumTileFtplX + tileIndex.x;
}
void GetCountAndStartTile(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
int tileOffset = GetTileOffset(posInput, lightCategory);
#if defined(UNITY_STEREO_INSTANCING_ENABLED)
// Eye base offset must match code in lightlistbuild.compute
tileOffset += unity_StereoEyeIndex * _NumTileFtplX * _NumTileFtplY * LIGHTCATEGORY_COUNT;
#endif
//List的第一个元素就是Light的数量
// The first entry inside a tile is the number of light for lightCategory (thus the +0)
lightCount = g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * tileOffset + 0] & 0xffff;
start = tileOffset;
}
uint GetTileSize()
{
return TILE_SIZE_FPTL;
}
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
GetCountAndStartTile(posInput, lightCategory, start, lightCount);
}
//Loop中读取灯光Index的函数
uint FetchIndex(uint tileOffset, uint lightOffset)
{
//List的第一个元素就是Light的数量
//从start前面的uHigh开始遍历
const uint lightOffsetPlusOne = lightOffset + 1; // Add +1 as first slot is reserved to store number of light
//用32bit存了两个Index
// Light index are store on 16bit
return (g_vLightListTile[LIGHT_DWORD_PER_FPTL_TILE * tileOffset + (lightOffsetPlusOne >> 1)] >> ((lightOffsetPlusOne & 1) * 16)) & 0xffff;
}
//渲染透明队列时启用
#elif defined(USE_CLUSTERED_LIGHTLIST)
...
//LightingLoop.hlsl
// This struct is define in the material. the Lightloop must not access it
// PostEvaluateBSDF call at the end will convert Lighting to diffuse and specular lighting
AggregateLighting aggregateLighting;
ZERO_INITIALIZE(AggregateLighting, aggregateLighting); // LightLoop is in charge of initializing the struct
if (featureFlags & LIGHTFEATUREFLAGS_PUNCTUAL)
{
uint lightCount, lightStart;
//默认开启
//#ifndef LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
GetCountAndStart(posInput, LIGHTCATEGORY_PUNCTUAL, lightStart, lightCount);
//#else // LIGHTLOOP_DISABLE_TILE_AND_CLUSTER
//lightCount = _PunctualLightCount;
//lightStart = 0;
//#endif
bool fastPath = false;
//SCALARIZE_LIGHT_LOOP涉及到Wave相关的指令,详细介绍可以看https://zhuanlan.zhihu.com/p/469436345
#if SCALARIZE_LIGHT_LOOP
uint lightStartLane0;
fastPath = IsFastPath(lightStart, lightStartLane0);
if (fastPath)
{
lightStart = lightStartLane0;
}
#endif
// Scalarized loop. All lights that are in a tile/cluster touched by any pixel in the wave are loaded (scalar load), only the one relevant to current thread/pixel are processed.
// For clarity, the following code will follow the convention: variables starting with s_ are meant to be wave uniform (meant for scalar register),
// v_ are variables that might have different value for each thread in the wave (meant for vector registers).
// This will perform more loads than it is supposed to, however, the benefits should offset the downside, especially given that light data accessed should be largely coherent.
// Note that the above is valid only if wave intriniscs are supported.
uint v_lightListOffset = 0;
uint v_lightIdx = lightStart;
while (v_lightListOffset < lightCount)
{
v_lightIdx = FetchIndex(lightStart, v_lightListOffset);
#if SCALARIZE_LIGHT_LOOP
uint s_lightIdx = ScalarizeElementIndex(v_lightIdx, fastPath);
#else
uint s_lightIdx = v_lightIdx;
#endif
if (s_lightIdx == -1)
break;
//获取LightData
LightData s_lightData = FetchLight(s_lightIdx);
...
}
}