Unity的Forward+ FPTL光照剔除解析(一)
序言
FPTL也叫Fine Pruned Tiled Light Lists。自从我开发HDRP起,就一直想把这个FPTL的实现完全看明白。但是无论是国内还是国外都几乎找不到关于HDRP相关算法的解析,基本上都是寥寥几句话草草带过,于是乎就有了这篇水文。
FPTL的大致的流程
CPU准备灯光数据流程:
使用Jobs ProcessedVisibleLightsBuilder 转换VisibleLight成LightRenderData(HQLightRenderDatabase中的m_LightData),主要目的是排序灯光数据,使其更加紧凑,对GPU遍历性能也更加友好
使用Jobs GpuLightsBuilder,将LightRenderData转成LightData(渲染中实际上要用的数据,剔除要用的数据lightBounds,lightVolumes)
最后Push Light Data到GPU
GPU渲染流程:
预先生成Pre Depth深度图,
清理LightList.(一般是更改分辨率触发)
计算Light的ScrBound,(GenerateLightsScreenSpaceAABBs)
BigTileLightList建立(可选BigTilePrepass),
更细小的Tile的LightList建立。(BuildPerTileLightList)
VoxelLightList建立(透明物体接受光照使用VoxelLightList)
//HDRenderPipeline.LightLoop.cs
// Records the "Build Light List" render-graph pass: clears stale lists, computes per-light
// screen-space AABBs, then builds the big-tile, fine-tile (FPTL) and voxel (cluster) light lists.
// Returns the pass output (GPU buffer handles consumed later by the light loop).
BuildGPULightListOutput BuildGPULightList(
RenderGraph renderGraph,
HDCamera hdCamera,
TileAndClusterData tileAndClusterData,
int totalLightCount,
ref ShaderVariablesLightList constantBuffer,
TextureHandle depthStencilBuffer,
TextureHandle stencilBufferCopy,
GBufferOutput gBuffer)
{
using (var builder = renderGraph.AddRenderPass<BuildGPULightListPassData>("Build Light List", out var passData, ProfilingSampler.Get(HDProfileId.BuildLightList)))
{
// The whole pass may run on the async compute queue when the frame settings allow it.
builder.EnableAsyncCompute(hdCamera.frameSettings.BuildLightListRunsAsync());
PrepareBuildGPULightListPassData(renderGraph, builder, hdCamera, tileAndClusterData, ref constantBuffer, totalLightCount, depthStencilBuffer, stencilBufferCopy, gBuffer, passData);
builder.SetRenderFunc(
(BuildGPULightListPassData data, RenderGraphContext context) =>
{
bool tileFlagsWritten = false;
// Execution order mirrors the FPTL pipeline described in the article above.
ClearLightLists(data, context.cmd);
GenerateLightsScreenSpaceAABBs(data, context.cmd);
BigTilePrepass(data, context.cmd);
BuildPerTileLightList(data, ref tileFlagsWritten, context.cmd);
VoxelLightListGeneration(data, context.cmd);
BuildDispatchIndirectArguments(data, tileFlagsWritten, context.cmd);
});
return passData.output;
}
}
从HDAdditionalLightData(前端)到HDLightRenderDatabase(后端)
创建灯光的时候,Light自动Attach了一个MonoBehaviour HDAdditionalLightData,这一功能实现是在Light的Editor内。
通过继承LightEditor,再加上CustomEditorForRenderPipeline Attribute就可以让BuildIn的灯光Editor在不同的管线切换到管线扩展的灯光Editor。
//HDLightEditor.cs
[CanEditMultipleObjects]
[CustomEditorForRenderPipeline(typeof(Light), typeof(HDRenderPipelineAsset))]
sealed partial class HDLightEditor : LightEditor
{
...
protected override void OnEnable()
{
base.OnEnable();
// 自动Attach逻辑
m_AdditionalLightDatas = CoreEditorUtils.GetAdditionalData<HDAdditionalLightData>(targets, HDAdditionalLightData.InitDefaultHDAdditionalLightData);
m_SerializedHDLight = new SerializedHDLight(m_AdditionalLightDatas, settings);
// Update emissive mesh and light intensity when undo/redo
Undo.undoRedoPerformed += OnUndoRedo;
HDLightUI.RegisterEditor(this);
}
...
}
创建灯光的时候,HDAdditionalLightData(Light创建时自动Attach的MonoBehaviour)就需要在HDLightRenderDatabase中创建出HDLightRenderEntity(用于寻址用的句柄)
可以看到HDLightRenderDatabase灯光数据库,用于方便(容易绕晕)索引灯光数据,采用的是单例模式。
//HDAdditionalLightData.cs
// Lazily creates this light's entity (handle) in the HDLightRenderDatabase, attaches the
// owning GameObject data to it, then pushes the current component state into the database.
internal void CreateHDLightRenderEntity(bool autoDestroy = false)
{
if (!this.lightEntity.valid)
{
HDLightRenderDatabase lightEntities = HDLightRenderDatabase.instance;
this.lightEntity = lightEntities.CreateEntity(autoDestroy);
// Associates the entity with the legacy Light's instance ID so the database can later
// be queried from a plain Light object (see BuildVisibleLightEntities).
lightEntities.AttachGameObjectData(this.lightEntity, legacyLight.GetInstanceID(), this, legacyLight.gameObject);
}
// Sync the render-side data even when the entity already existed.
UpdateRenderEntity();
}
void OnEnable()
{
...
CreateHDLightRenderEntity();
}
//HDLightRenderDatabase.cs
//Light rendering entity. This struct acts as a handle to set / get light render information into the database.
internal struct HDLightRenderEntity
{
// Index into HDLightRenderDatabase.m_LightEntities; InvalidDataIndex marks an unused handle.
public int entityIndex;
public static readonly HDLightRenderEntity Invalid = new HDLightRenderEntity() { entityIndex = HDLightRenderDatabase.InvalidDataIndex };
public bool valid { get { return entityIndex != HDLightRenderDatabase.InvalidDataIndex; } }
}
//HDLightRenderDatabase.cs
internal partial class HDLightRenderDatabase
{
....
static public HDLightRenderDatabase instance
{
get
{
if (s_Instance == null)
s_Instance = new HDLightRenderDatabase();
return s_Instance;
}
}
...
}
在修改灯光相关数据的时候,需要通过lightEntity句柄进行寻址,然后修改HDLightRenderDatabase中的HDLightRenderData
//HDAdditionalLightData.cs
//类似的还有UpdateRenderEntity
public void SetAreaLightSize(Vector2 size)
{
...
if (lightEntity.valid)
{
ref HDLightRenderData lightRenderData = ref HDLightRenderDatabase.instance.EditLightDataAsRef(lightEntity);
lightRenderData.shapeWidth = m_ShapeWidth;
lightRenderData.shapeHeight = m_ShapeHeight;
}
...
}
//HDLightRenderDatabase.cs
....
//Gets and edits a reference. Must be not called during rendering pipeline, only during game object modification.
// Convenience overload: resolves the entity handle to its data index, then edits by index.
public ref HDLightRenderData EditLightDataAsRef(in HDLightRenderEntity entity) => ref EditLightDataAsRef(m_LightEntities[entity.entityIndex].dataIndex);
//Gets and edits a reference. Must be not called during rendering pipeline, only during game object modification.
public ref HDLightRenderData EditLightDataAsRef(int dataIndex)
{
if (dataIndex >= m_LightCount)
throw new Exception("Entity passed in is out of bounds. Index requested " + dataIndex + " and maximum length is " + m_LightCount);
// Returns a mutable reference directly into the native array, avoiding a struct copy.
unsafe
{
HDLightRenderData* data = (HDLightRenderData*)m_LightData.GetUnsafePtr<HDLightRenderData>() + dataIndex;
return ref UnsafeUtility.AsRef<HDLightRenderData>(data);
}
}
...
同样,删除灯光的时候也需要销毁对应的lightEntity
//HDAdditionalLightData.cs
void OnDestroy()
{
...
DestroyHDLightRenderEntity();
}
// Releases this light's entity from the database and resets the handle to Invalid.
// Safe to call when the entity was never created (no-op).
internal void DestroyHDLightRenderEntity()
{
if (!lightEntity.valid)
return;
HDLightRenderDatabase.instance.DestroyEntity(lightEntity);
lightEntity = HDLightRenderEntity.Invalid;
}
//HDLightRenderDatabase.cs
// Destroys an entity: recycles its index, removes the instance-ID lookup entry, and compacts
// the data arrays with a swap-back removal, fixing up the entity that owned the moved slot.
public void DestroyEntity(HDLightRenderEntity lightEntity)
{
Assert.IsTrue(IsValid(lightEntity));
// The entity index becomes reusable for the next CreateEntity call.
m_FreeIndices.Enqueue(lightEntity.entityIndex);
LightEntityInfo entityData = m_LightEntities[lightEntity.entityIndex];
m_LightsToEntityItem.Remove(entityData.lightInstanceID);
if (m_HDAdditionalLightData[entityData.dataIndex] != null)
--m_AttachedGameObjects;
// Swap-back removal: the last element moves into the freed dataIndex slot.
RemoveAtSwapBackArrays(entityData.dataIndex);
if (m_LightCount == 0)
{
DeleteArrays();
}
else
{
// The entity that previously owned the last slot now lives at the freed slot; update its info.
HDLightRenderEntity entityToUpdate = m_OwnerEntity[entityData.dataIndex];
LightEntityInfo dataToUpdate = m_LightEntities[entityToUpdate.entityIndex];
dataToUpdate.dataIndex = entityData.dataIndex;
m_LightEntities[entityToUpdate.entityIndex] = dataToUpdate;
// Keep the instance-ID lookup pointing at the updated record as well.
if (dataToUpdate.lightInstanceID != entityData.lightInstanceID)
m_LightsToEntityItem[dataToUpdate.lightInstanceID] = dataToUpdate;
}
}
而HDLightRenderData就是我们基于BuildIn的灯光扩展数据类型,也是存放在HDLightRenderDatabase最基本的存储数据。
而渲染的时候,我们通常来说只能拿到CullingResults的VisibleLights数组也就是说可以拿到BuildIn的Light对象。
而这时候就需要我们对Light对象操作一次TryGetComponent<HDAdditionalLightData>(极其离谱)。
//HDProcessedVisibleLightsBuilder.LightLoop.cs
private void BuildVisibleLightEntities(in CullingResults cullResults)
{
...
//明知道是bullshit还是不改,TODO了一万年了
//TODO: this should be accelerated by a c++ API
var defaultEntity = HDLightRenderDatabase.instance.GetDefaultLightEntity();
for (int i = 0; i < cullResults.visibleLights.Length; ++i)
{
Light light = cullResults.visibleLights[i].light;
int dataIndex = HDLightRenderDatabase.instance.FindEntityDataIndex(light);
if (dataIndex == HDLightRenderDatabase.InvalidDataIndex)
{
//Shuriken lights bullshit: this happens because shuriken lights dont have the HDAdditionalLightData OnEnabled.
//Because of this, we have to forcefully create a light render entity on the rendering side. Horrible!!!
if (light.TryGetComponent<HDAdditionalLightData>(out var hdAdditionalLightData))
{
if (!hdAdditionalLightData.lightEntity.valid)
hdAdditionalLightData.CreateHDLightRenderEntity(autoDestroy: true);
}
else
dataIndex = HDLightRenderDatabase.instance.GetEntityDataIndex(defaultEntity);
}
m_VisibleLightEntityDataIndices[i] = dataIndex;
m_VisibleLightBakingOutput[i] = light.bakingOutput;
m_VisibleLightShadowCasterMode[i] = light.lightShadowCasterMode;
m_VisibleLightShadows[i] = light.shadows;
}
}
HDProcessedVisibleLightsBuilder,HDGpuLightsBuilder
HDProcessedVisibleLightsBuilder
HDProcessedVisibleLightsBuilder顾名思义,就是用来处理VisibleLight的Builder(采用的是设计模式中的建造者模式)。
由于上面提到渲染管线的CullingResult只能拿到Light对象,
所以为了拿到对应Visible的HDLightRenderData,就需要用HDProcessedVisibleLightsBuilder对灯光进行预处理(拿到灯光数据,排序)。
//LightingLoop.cs
// Compute data that will be used during the light loop for a particular light.
void PreprocessVisibleLights(CommandBuffer cmd, HDCamera hdCamera, in CullingResults cullResults, DebugDisplaySettings debugDisplaySettings, in AOVRequestData aovRequest)
{
using (new ProfilingScope(cmd, ProfilingSampler.Get(HDProfileId.ProcessVisibleLights)))
{
m_ProcessedLightsBuilder.Build(
hdCamera,
cullResults,
m_ShadowManager,
m_ShadowInitParameters,
aovRequest,
lightLoopSettings,
m_CurrentDebugDisplaySettings);
...
}
}
BuildVisibleLightEntities就是上面Light对象操作一次TryGetComponent然后拿到lightEntity的函数,就不再贴代码了。
//HDProcessedVisibleLightsBuilder.cs
//Builds sorted HDProcessedVisibleLight structures.
// Entry point of the pre-process stage: resolves each VisibleLight to its database entity,
// classifies the lights in a parallel job, sorts the packed keys by importance, then
// prepares the shadow requests.
public void Build(
HDCamera hdCamera,
in CullingResults cullingResult,
HDShadowManager shadowManager,
in HDShadowInitParameters inShadowInitParameters,
in AOVRequestData aovRequestData,
in GlobalLightLoopSettings lightLoopSettings,
DebugDisplaySettings debugDisplaySettings)
{
BuildVisibleLightEntities(cullingResult);
// m_Size is presumably the visible-light count gathered above — nothing to do when zero.
if (m_Size == 0)
return;
FilterVisibleLightsByAOV(aovRequestData);
// Job is scheduled then immediately completed; sorting requires the job's outputs.
StartProcessVisibleLightJob(hdCamera, cullingResult.visibleLights, lightLoopSettings, debugDisplaySettings);
CompleteProcessVisibleLightJob();
SortLightKeys();
ProcessShadows(hdCamera, shadowManager, inShadowInitParameters, cullingResult);
}
可以看到Build函数里面开启了一个Jobs,StartProcessVisibleLightJob,这个Jobs主要是用来对不同灯光种类进行分类计数并且按照重要性进行排序
m_ProcessVisibleLightCounts 是分类计数器的数组,数组元素统计了(Directional,Punctual,Area,Shadow,BakedShadow)的数量。
m_ProcessedLightVolumeType 每个VisibleLight对应的LightVolumeType。
m_ProcessedEntities 就是每个VisibleLight对应的HDProcessedVisibleLight
m_SortKeys就是后续用来排序重要性的数组
//HDProcessedVisibleLightsBuilder.Jobs.cs
...
#region output processed lights
[WriteOnly]
public NativeArray<int> processedVisibleLightCountsPtr;
[WriteOnly]
public NativeArray<LightVolumeType> processedLightVolumeType;
[WriteOnly]
public NativeArray<HDProcessedVisibleLight> processedEntities;
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<uint> sortKeys;
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<int> shadowLightsDataIndices;
#endregion
...
public void StartProcessVisibleLightJob(
HDCamera hdCamera,
NativeArray<VisibleLight> visibleLights,
in GlobalLightLoopSettings lightLoopSettings,
DebugDisplaySettings debugDisplaySettings)
{
if (m_Size == 0)
return;
var lightEntityCollection = HDLightRenderDatabase.instance;
var processVisibleLightJob = new ProcessVisibleLightJob()
{
//Parameters.
....
//render light entities.
lightData = lightEntityCollection.lightData,
//data of all visible light entities.
visibleLights = visibleLights,
....
//Output processed lights.
processedVisibleLightCountsPtr = m_ProcessVisibleLightCounts,
processedLightVolumeType = m_ProcessedLightVolumeType,
processedEntities = m_ProcessedEntities,
sortKeys = m_SortKeys,
shadowLightsDataIndices = m_ShadowLightsDataIndices
};
m_ProcessVisibleLightJobHandle = processVisibleLightJob.Schedule(m_Size, 32);
}
HDProcessedVisibleLight是VisibleLight转换到LightData之前的中间载体,
记录了在HDLightRenderDatabase中lightEntity的Index,
渲染时GPULightType,以及对应的HDLightType。
这里为什么要有HDLightType,主要是因为HDRP把面光源也当做了Point Light Type,这样才能够不被剔除(在cullingResult.VisibleLights里面),因为面光在BuildIn里面只能够烘焙,不能设置成RealTime。
其他数据暂时用不上就不说了。
// HDRP-facing light type; values mirror UnityEngine.LightType so the two enums cast directly.
public enum HDLightType
{
/// <summary>Spot Light. Complete this type by setting the SpotLightShape too.</summary>
Spot = LightType.Spot,
/// <summary>Directional Light.</summary>
Directional = LightType.Directional,
/// <summary>Point Light.</summary>
Point = LightType.Point,
/// <summary>Area Light. Complete this type by setting the AreaLightShape too.</summary>
Area = LightType.Area,
}
// Light type as seen by the GPU light loop; area lights are split into Tube/Rectangle/Disc,
// and spots into cone/pyramid/box projector variants.
enum GPULightType
{
Directional,
Point,
Spot,
ProjectorPyramid,
ProjectorBox,
// AreaLight
Tube, // Keep Line lights before Rectangle. This is needed because of a compiler bug (see LightLoop.hlsl)
Rectangle,
// Currently not supported in real time (just use for reference)
Disc,
// Sphere,
};
// Intermediate per-visible-light record produced by the process job: links the VisibleLight
// to its database entry plus the classification/fade data needed to build the final LightData.
internal struct HDProcessedVisibleLight
{
// Index of this light's HDLightRenderData inside HDLightRenderDatabase.
public int dataIndex;
public GPULightType gpuLightType;
public HDLightType lightType;
public float lightDistanceFade;
public float lightVolumetricDistanceFade;
public float distanceToCamera;
public HDProcessedVisibleLightsBuilder.ShadowMapFlags shadowMapFlags;
public bool isBakedShadowMask;
}
LightVolumeType主要是用来描述灯光体积的形状,就可以在ComputeShader内应用不同的的剔除计算
// Shape of a light's culling volume; selects the intersection routine used by the
// culling compute shaders.
internal enum LightVolumeType
{
Cone,
Sphere,
Box,
Count
}
LightCategory这里的命名我认为是有问题的。叫灯光种类,实际上贴花,体积雾都进来了,这些实际上是因为HDRP的贴花和体积雾也想要参与剔除计算(一般体积雾和贴花都能够用LightVolumeType描述自身体积的形状)。
后续剔除之后,LightList里面也会根据不同的LightCategory划分Buffer布局。
// Category used to segment the light-list buffer layout; decals and local volumetric fog
// are included because they participate in the same volume-based culling as lights.
internal enum LightCategory
{
Punctual,
Area,
Env,
Decal,
LocalVolumetricFog, // WARNING: Currently lightlistbuild.compute assumes Local Volumetric Fog is the last element in the LightCategory enum. Do not append new LightCategory types after LocalVolumetricFog. TODO: Fix .compute code.
Count
}
回到ProcessVisibleLightJob上面来,可以看到这里并行地对cullingResult.VisibleLight进行处理,得出light对应的lightCategory,gpuLightType,lightVolumeType。
将lightCategory, gpuLightType, lightVolumeType, index打包成SortKey。从PackLightSortKey看得出来平行光的重要性是最高的。
//HDProcessedVisibleLightsBuilder.Jobs.cs
//对计数器进行原子操作
// Atomically increments the given counter slot and returns the post-increment value.
// Needed because the job processes many lights in parallel.
private int IncrementCounter(HDProcessedVisibleLightsBuilder.ProcessLightsCountSlots counterSlot)
{
int outputIndex = 0;
unsafe
{
// Interlocked.Increment on a raw pointer into the native counter array slot.
int* ptr = (int*)processedVisibleLightCountsPtr.GetUnsafePtr<int>() + (int)counterSlot;
outputIndex = Interlocked.Increment(ref UnsafeUtility.AsRef<int>(ptr));
}
return outputIndex;
}
// Reserves the next free slot in the processed-lights output (0-based, hence the -1).
private int NextOutputIndex() => IncrementCounter(HDProcessedVisibleLightsBuilder.ProcessLightsCountSlots.ProcessedLights) - 1;
//在HDGpuLightsBuilder.cs中
// Key layout (MSB..LSB): 1 bit !isDirectional | 4 bits category | 5 bits gpuLightType
// | 5 bits volumeType | 17 bits lightIndex. Directional lights get MSB = 0 so an
// ascending sort places them at the front of the list.
public static uint PackLightSortKey(LightCategory lightCategory, GPULightType gpuLightType, LightVolumeType lightVolumeType, int lightIndex)
{
//We sort directional lights to be in the beginning of the list.
//This ensures that we can access directional lights very easily after we sort them.
uint isDirectionalMSB = gpuLightType == GPULightType.Directional ? 0u : 1u;
uint sortKey = (uint)isDirectionalMSB << 31 | (uint)lightCategory << 27 | (uint)gpuLightType << 22 | (uint)lightVolumeType << 17 | (uint)lightIndex;
return sortKey;
}
//Unpacks a sort key for a light
// Inverse of PackLightSortKey; shifts/masks mirror the pack layout.
public static void UnpackLightSortKey(uint sortKey, out LightCategory lightCategory, out GPULightType gpuLightType, out LightVolumeType lightVolumeType, out int lightIndex)
{
lightCategory = (LightCategory)((sortKey >> 27) & 0xF);
gpuLightType = (GPULightType)((sortKey >> 22) & 0x1F);
lightVolumeType = (LightVolumeType)((sortKey >> 17) & 0x1F);
// NOTE(review): the pack reserves 17 bits for the index (volumeType << 17) but this mask
// keeps only 16 (0xFFFF) — harmless while light counts stay below 65536, but inconsistent.
lightIndex = (int)(sortKey & 0xFFFF);
}
//End of HDGpuLightsBuilder.cs
public void Execute(int index)
{
VisibleLight visibleLight = visibleLights[index];
int dataIndex = visibleLightEntityDataIndices[index];
LightBakingOutput bakingOutput = visibleLightBakingOutput[index];
LightShadows shadows = visibleLightShadows[index];
if (TrivialRejectLight(visibleLight, dataIndex))
return;
ref HDLightRenderData lightRenderData = ref GetLightData(dataIndex);
...
//防止超出同屏幕(Area,Punctual,Directional)光源限制
if (!IncrementLightCounterAndTestLimit(lightCategory, gpuLightType))
return;
//原子操作
int outputIndex = NextOutputIndex();
sortKeys[outputIndex] = HDGpuLightsBuilder.PackLightSortKey(lightCategory, gpuLightType, lightVolumeType, index);
processedLightVolumeType[index] = lightVolumeType;
processedEntities[index] = new HDProcessedVisibleLight()
{
dataIndex = dataIndex,
gpuLightType = gpuLightType,
lightType = lightType,
lightDistanceFade = lightDistanceFade,
lightVolumetricDistanceFade = volumetricDistanceFade,
distanceToCamera = distanceToCamera,
shadowMapFlags = shadowMapFlags,
isBakedShadowMask = isBakedShadowMaskLight
};
...
}
所以经过HDProcessedVisibleLightsBuilder的ProcessVisibleLightJob调度Jobs对visibleLights进行处理之后,就得到了预处理过后的HDProcessedVisibleLight以及LightVolumeType,以及一个根据重要性排序过的SortKey数组。
而这个SortKey数组则是后续CreateGpuLightDataJob中寻址灯光数据最主要的手段。
(通过UnpackSortKey,我们能够得到当前灯光在m_ProcessedLightVolumeType/m_ProcessedEntities的Index)
HDGpuLightsBuilder
经过 HDProcessedVisibleLightsBuilder的预处理,得到了SortKey,每个VisibleLight对应的HDProcessedVisibleLight以及LightVolumeType。
接下来就是通过Jobs对HDProcessedVisibleLight以及LightVolumeType转化成最终在渲染时的LightData,DirectionalLightData,剔除计算用的SFiniteLightBound,LightVolumeData。
//LightLoop.cs
void PrepareGPULightdata(CommandBuffer cmd, HDCamera hdCamera, CullingResults cullResults)
{
using (new ProfilingScope(cmd, ProfilingSampler.Get(HDProfileId.PrepareGPULightdata)))
{
// 2. Go through all lights, convert them to GPU format.
// Simultaneously create data for culling (LightVolumeData and SFiniteLightBound)
m_GpuLightsBuilder.Build(cmd, hdCamera, cullResults,
m_ProcessedLightsBuilder,
HDLightRenderDatabase.instance, m_ShadowInitParameters, m_CurrentDebugDisplaySettings);
...
}
}
//HDGpuLightsBuilder.LightLoop.cs
public void Build(
CommandBuffer cmd,
HDCamera hdCamera,
in CullingResults cullingResult,
HDProcessedVisibleLightsBuilder visibleLights,
HDLightRenderDatabase lightEntities,
in HDShadowInitParameters shadowInitParams,
DebugDisplaySettings debugDisplaySettings)
{
...
if (totalLightsCount > 0)
{
...
StartCreateGpuLightDataJob(hdCamera, cullingResult, hdShadowSettings, visibleLights, lightEntities);
CompleteGpuLightDataJob();
CalculateAllLightDataTextureInfo(cmd, hdCamera, cullingResult, visibleLights, lightEntities, hdShadowSettings, shadowInitParams, debugDisplaySettings);
}
}
从Execute可以看出来,对于平行光直接转换成GPUFormat(DirectionalLightData),而其他光源执行StoreAndConvertLightToGPUFormat。
//HDGpuLightsBuilder.Jobs.cs
#region output processed lights
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<LightData> lights;
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<DirectionalLightData> directionalLights;
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<LightsPerView> lightsPerView;
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<SFiniteLightBound> lightBounds;
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<LightVolumeData> lightVolumes;
[WriteOnly]
[NativeDisableContainerSafetyRestriction]
public NativeArray<int> gpuLightCounters;
#endregion
// Job entry: one invocation per sorted light. Directional lights go to their own output
// array; all other lights are offset by the directional count (directionals sort first,
// see PackLightSortKey).
public void Execute(int index)
{
var sortKey = sortKeys[index];
HDGpuLightsBuilder.UnpackLightSortKey(sortKey, out var lightCategory, out var gpuLightType, out var lightVolumeType, out var lightIndex);
if (gpuLightType == GPULightType.Directional)
{
// Directionals occupy the front of the sorted list, so 'index' is already the output slot.
int outputIndex = index;
ConvertDirectionalLightToGPUFormat(outputIndex, lightIndex, lightCategory, gpuLightType, lightVolumeType);
}
else
{
int outputIndex = index - directionalSortedLightCounts;
StoreAndConvertLightToGPUFormat(outputIndex, lightIndex, lightCategory, gpuLightType, lightVolumeType);
}
}
public void StartCreateGpuLightDataJob(
HDCamera hdCamera,
in CullingResults cullingResult,
HDShadowSettings hdShadowSettings,
HDProcessedVisibleLightsBuilder visibleLights,
HDLightRenderDatabase lightEntities)
{
...
var createGpuLightDataJob = new CreateGpuLightDataJob()
{
//Parameters
....
//outputs
gpuLightCounters = m_LightTypeCounters,
lights = m_Lights,
directionalLights = m_DirectionalLights,
lightsPerView = m_LightsPerView,
lightBounds = m_LightBounds,
lightVolumes = m_LightVolumes
};
m_CreateGpuLightDataJobHandle = createGpuLightDataJob.Schedule(visibleLights.sortedLightCounts, 32);
}
StoreAndConvertLightToGPUFormat具体的内容就是跟平行光类似,除了需要转换成GPUFormat(LightData),还需要转换出剔除计算用的SFiniteLightBound,LightVolumeData。
需要留意的是,在这里也对LightCategory.Punctual,LightCategory.Area的光源进行计数了。(不启用FPTL或者Cluster的时候进行遍历LightData用)
//HDGpuLightsBuilder.Jobs.cs
private void ComputeLightVolumeDataAndBound(
LightCategory lightCategory, GPULightType gpuLightType, LightVolumeType lightVolumeType,
in VisibleLight light, in LightData lightData, in Vector3 lightDimensions, in Matrix4x4 worldToView, int outputIndex)
{
// Then Culling side
var range = lightDimensions.z;
var lightToWorld = light.localToWorldMatrix;
Vector3 positionWS = lightData.positionRWS;
Vector3 positionVS = worldToView.MultiplyPoint(positionWS);
Vector3 xAxisVS = worldToView.MultiplyVector(lightToWorld.GetColumn(0));
Vector3 yAxisVS = worldToView.MultiplyVector(lightToWorld.GetColumn(1));
Vector3 zAxisVS = worldToView.MultiplyVector(lightToWorld.GetColumn(2));
// Fill bounds
var bound = new SFiniteLightBound();
var lightVolumeData = new LightVolumeData();
lightVolumeData.lightCategory = (uint)lightCategory;
lightVolumeData.lightVolume = (uint)lightVolumeType;
if (gpuLightType == GPULightType.Spot || gpuLightType == GPULightType.ProjectorPyramid)
{
...
}
else if (gpuLightType == GPULightType.Point)
{
...
}
else if (gpuLightType == GPULightType.Tube)
{
...
}
else if (gpuLightType == GPULightType.Rectangle)
{
...
}
else if (gpuLightType == GPULightType.ProjectorBox)
{
...
}
else if (gpuLightType == GPULightType.Disc)
{
//not supported at real time at the moment
}
else
{
Debug.Assert(false, "TODO: encountered an unknown GPULightType.");
}
lightBounds[outputIndex] = bound;
lightVolumes[outputIndex] = lightVolumeData;
}
// Converts one non-directional visible light into its GPU LightData and, per view, the
// culling data (SFiniteLightBound + LightVolumeData) consumed by the tile/cluster passes.
private void StoreAndConvertLightToGPUFormat(
int outputIndex, int lightIndex,
LightCategory lightCategory, GPULightType gpuLightType, LightVolumeType lightVolumeType)
{
var light = visibleLights[lightIndex];
var processedEntity = processedEntities[lightIndex];
var lightData = new LightData();
ref HDLightRenderData lightRenderData = ref GetLightData(processedEntity.dataIndex);
ConvertLightToGPUFormat(
lightCategory, gpuLightType, globalConfig,
visibleLightShadowCasterMode[lightIndex],
visibleLightBakingOutput[lightIndex],
light,
processedEntity,
lightRenderData,
out var lightDimensions,
ref lightData);
// One set of culling bounds per view (XR): each view has its own worldToView matrix
// and writes at its own boundsOffset.
for (int viewId = 0; viewId < viewCounts; ++viewId)
{
var lightsPerViewContainer = lightsPerView[viewId];
ComputeLightVolumeDataAndBound(
lightCategory, gpuLightType, lightVolumeType,
light, lightData, lightDimensions, lightsPerViewContainer.worldToView, outputIndex + lightsPerViewContainer.boundsOffset);
}
// Camera-relative shift is applied only after the bounds were computed above
// (the bounds are built in view space from the unshifted position).
if (useCameraRelativePosition)
lightData.positionRWS -= cameraPos;
// Count punctual/area lights; per the article, these counters let the light loop iterate
// LightData directly when FPTL/cluster lists are disabled.
switch (lightCategory)
{
case LightCategory.Punctual:
IncrementCounter(HDGpuLightsBuilder.GPULightTypeCountSlots.Punctual);
break;
case LightCategory.Area:
IncrementCounter(HDGpuLightsBuilder.GPULightTypeCountSlots.Area);
break;
default:
Debug.Assert(false, "TODO: encountered an unknown LightCategory.");
break;
}
#if DEBUG
if (outputIndex < 0 || outputIndex >= outputLightCounts)
throw new Exception("Trying to access an output index out of bounds. Output index is " + outputIndex + "and max length is " + outputLightCounts);
#endif
lights[outputIndex] = lightData;
}
SFiniteLightBound,LightVolumeData
SFiniteLightBound 记录了灯光AABB数据(轴朝向,View Space中的中心点坐标,半径)
LightVolumeData则是主要用于记录LightVolumeType为Box时的剔除用数据(boxInnerDist/boxInvRange),以及Spot,ProjectorPyramid类型灯光剔除用数据(cotan)
// View-space oriented bounding box (plus circumscribed sphere) of one light,
// consumed by scrbound.compute to build the screen-space AABB.
[GenerateHLSL]
struct SFiniteLightBound
{
public Vector3 boxAxisX; // Scaled by the extents (half-size)
public Vector3 boxAxisY; // Scaled by the extents (half-size)
public Vector3 boxAxisZ; // Scaled by the extents (half-size)
public Vector3 center; // Center of the bounds (box) in camera space
public float scaleXY; // Scale applied to the top of the box to turn it into a truncated pyramid (X = Y)
public float radius; // Circumscribed sphere for the bounds (box)
};
// Per-light volume description used by the fine culling shaders: oriented axes and
// shape-specific data (radiusSq for cone/sphere, cotan for cones, inner/inv-range for boxes).
[GenerateHLSL]
struct LightVolumeData
{
public Vector3 lightPos; // Of light's "origin"
public uint lightVolume; // Type index
public Vector3 lightAxisX; // Normalized
public uint lightCategory; // Category index
public Vector3 lightAxisY; // Normalized
public float radiusSq; // Cone and sphere: light range squared
public Vector3 lightAxisZ; // Normalized
public float cotan; // Cone: cotan of the aperture (half-angle)
public Vector3 boxInnerDist; // Box: extents (half-size) of the inner box
public uint featureFlags;
public Vector3 boxInvRange; // Box: 1 / (OuterBoxExtents - InnerBoxExtents)
public float unused2;
};
先以最简单的Point Light举例,捋一遍剔除的流程,这样后续其他类型的灯光剔除流程也只是计算上的差别了。
Point Light的SFiniteLightBound/LightVolumeData数据
private void ComputeLightVolumeDataAndBound(
LightCategory lightCategory, GPULightType gpuLightType, LightVolumeType lightVolumeType,
in VisibleLight light, in LightData lightData, in Vector3 lightDimensions, in Matrix4x4 worldToView, int outputIndex)
{
...
else if (gpuLightType == GPULightType.Point)
{
// Construct a view-space axis-aligned bounding cube around the bounding sphere.
// This allows us to utilize the same polygon clipping technique for all lights.
// Non-axis-aligned vectors may result in a larger screen-space AABB.
Vector3 vx = new Vector3(1, 0, 0);
Vector3 vy = new Vector3(0, 1, 0);
Vector3 vz = new Vector3(0, 0, 1);
bound.center = positionVS;
bound.boxAxisX = vx * range;
bound.boxAxisY = vy * range;
bound.boxAxisZ = vz * range;
bound.scaleXY = 1.0f;
bound.radius = range;
// fill up ldata
lightVolumeData.lightAxisX = vx;
lightVolumeData.lightAxisY = vy;
lightVolumeData.lightAxisZ = vz;
lightVolumeData.lightPos = bound.center;
lightVolumeData.radiusSq = range * range;
lightVolumeData.featureFlags = (uint)LightFeatureFlags.Punctual;
}
...
}
GenerateLightsScreenSpaceAABBs
在前面介绍FPTL的大致的流程中,
第一步,预先PrePass得到深度图,这个在一般的延迟管线或者是前向管线也可以拿得到,这里就不说了。
第二步,清理LightList也一般是分辨率更改才会触发的,这里先跳过。
第三步,也就是这篇文章的重点。利用前面Jobs得到的SFiniteLightBound,在Scrbound.compute计算灯光在屏幕空间上的AABB。
Scrbound Dispatch
Scrbound Dispatch 64线程为1组,4个线程计算1盏灯。
//HDRenderPipeline.LightLoop.cs
// Dispatches scrbound.compute: 4 threads cooperate per light (64 threads/group => 16 lights
// per group) to compute each light's screen-space AABB into g_vBoundsBuffer.
// The dispatch's Y dimension is one group row per XR view.
static void GenerateLightsScreenSpaceAABBs(BuildGPULightListPassData data, CommandBuffer cmd)
{
if (data.totalLightCount != 0)
{
using (new ProfilingScope(cmd, ProfilingSampler.Get(HDProfileId.GenerateLightAABBs)))
{
// With XR single-pass, we have one set of light bounds per view to iterate over (bounds are in view space for each view)
cmd.SetComputeBufferParam(data.screenSpaceAABBShader, data.screenSpaceAABBKernel, HDShaderIDs.g_data, data.convexBoundsBuffer);
cmd.SetComputeBufferParam(data.screenSpaceAABBShader, data.screenSpaceAABBKernel, HDShaderIDs.outputData, data.debugDataReadBackBuffer);
cmd.SetComputeBufferParam(data.screenSpaceAABBShader, data.screenSpaceAABBKernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
ConstantBuffer.Push(cmd, data.lightListCB, data.screenSpaceAABBShader, HDShaderIDs._ShaderVariablesLightList);
const int threadsPerLight = 4; // Shader: THREADS_PER_LIGHT (4)
const int threadsPerGroup = 64; // Shader: THREADS_PER_GROUP (64)
// Round up so a partially filled group still covers the last lights.
int groupCount = HDUtils.DivRoundUp(data.totalLightCount * threadsPerLight, threadsPerGroup);
cmd.DispatchCompute(data.screenSpaceAABBShader, data.screenSpaceAABBKernel, groupCount, data.viewCount, 1);
}
}
}
//Scrbound.compute
#define MAX_CLIP_VERTS (10)
#define NUM_VERTS (8)
#define NUM_FACES (6)
#define NUM_PLANES (6)
#define THREADS_PER_GROUP (64)
#define THREADS_PER_LIGHT (4) // Set to 1 for debugging
#define LIGHTS_PER_GROUP (THREADS_PER_GROUP / THREADS_PER_LIGHT)
#define VERTS_PER_GROUP (NUM_VERTS * LIGHTS_PER_GROUP)
#define VERTS_PER_THREAD (NUM_VERTS / THREADS_PER_LIGHT)
#define FACES_PER_THREAD DIV_ROUND_UP(NUM_FACES, THREADS_PER_LIGHT)
计算灯光AABB的各个顶点在视锥体的内外情况
1.先拿到刚刚计算的SFiniteLightBound灯光AABB数据
[numthreads(THREADS_PER_GROUP, 1, 1)]
void main(uint threadID : SV_GroupIndex, uint3 groupID : SV_GroupID)
{
const uint t = threadID;
const uint g = groupID.x;
const uint eyeIndex = groupID.y; // Currently, can only be 0 or 1
const uint intraGroupLightIndex = t / THREADS_PER_LIGHT;
const uint globalLightIndex = g * LIGHTS_PER_GROUP + intraGroupLightIndex;
const uint baseVertexOffset = intraGroupLightIndex * NUM_VERTS;
const uint eyeAdjustedInputOffset = GenerateLightCullDataIndex(globalLightIndex, g_iNrVisibLights, eyeIndex);
const SFiniteLightBound cullData = g_data[eyeAdjustedInputOffset];
const float4x4 projMat = g_mProjectionArr[eyeIndex];
const float4x4 invProjMat = g_mInvProjectionArr[eyeIndex];
//gs_CullClipFaceMasks初始化为0,即默认所有的面都在外面
if (t % THREADS_PER_LIGHT == 0)
{
gs_CullClipFaceMasks[intraGroupLightIndex] = 0;
}
// Bounding frustum.
const float3 rbpC = cullData.center.xyz; // View-space
const float3 rbpX = cullData.boxAxisX.xyz; // Pre-scaled
const float3 rbpY = cullData.boxAxisY.xyz; // Pre-scaled
const float3 rbpZ = cullData.boxAxisZ.xyz; // Pre-scaled
const float scale = cullData.scaleXY; // scale.x = scale.y
// Bounding sphere.
const float radius = cullData.radius;
...
}
2.计算各个AABB的顶点,并且把viewSpace的顶点转换到齐次坐标下,
判断顶点是否在视锥体外部。(0 <= x <= w, 0 <= y <= w, 0 <= z <= w)
若behindMask为0,则说明当前遍历的顶点在视锥体内部,需要更新当前AABB的ndcAaBbMinPt,ndcAaBbMaxPt
若behindMask不为0,则说明当前遍历的顶点在视锥体外面,需要记录点相关的面,留到后面计算相关的面与视锥体的交点,再更新AABB的ndcAaBbMinPt,ndcAaBbMaxPt.
遍历顶点结束后,通过原子操作InterlockedOr gs_CullClipFaceMasks,同步LDS,就得到了当前灯光所有在视锥体外面的顶点相关的面(需要进一步计算的所有的面)
需要留意intraGroupLightIndex:由于4个线程计算1盏灯,它表示当前线程所处理灯光在组内的Index,这里的InterlockedOr同步的正是相同intraGroupLightIndex的四个线程的结果。
//VERTS_PER_THREAD=8/4=2,一个线程计算两个顶点,4个线程就能算8个顶点,也就是一盏灯
for (i = 0; i < VERTS_PER_THREAD; i++)
{
uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
// rbpVerts[0] = rbpC - rbpX * scale - rbpY * scale - rbpZ; (-s, -s, -1)
// rbpVerts[1] = rbpC + rbpX * scale - rbpY * scale - rbpZ; (+s, -s, -1)
// rbpVerts[2] = rbpC - rbpX * scale + rbpY * scale - rbpZ; (-s, +s, -1)
// rbpVerts[3] = rbpC + rbpX * scale + rbpY * scale - rbpZ; (+s, +s, -1)
// rbpVerts[4] = rbpC - rbpX - rbpY + rbpZ; (-1, -1, +1)
// rbpVerts[5] = rbpC + rbpX - rbpY + rbpZ; (+1, -1, +1)
// rbpVerts[6] = rbpC - rbpX + rbpY + rbpZ; (-1, +1, +1)
// rbpVerts[7] = rbpC + rbpX + rbpY + rbpZ; (+1, +1, +1)
float3 m = GenerateVertexOfStandardCube(v);
m.xy *= ((v & 4) == 0) ? scale : 1; // X, Y in [-scale, scale]
float3 rbpVertVS = rbpC + m.x * rbpX + m.y * rbpY + m.z * rbpZ;
// Avoid generating (w = 0).
rbpVertVS.z = (abs(rbpVertVS.z) > FLT_MIN) ? rbpVertVS.z : FLT_MIN;
float4 hapVert = mul(projMat, float4(rbpVertVS, 1));
// Warning: the W component may be negative.
// Flipping the -W pyramid by negating all coordinates is incorrect
// and will break both classification and clipping.
// For the orthographic projection, (w = 1).
// Transform the X and Y components: [-w, w] -> [0, w].
hapVert.xy = 0.5 * hapVert.xy + (0.5 * hapVert.w);
// For each vertex, we must determine whether it is within the bounds.
// For culling and clipping, we must know, per culling plane, whether the vertex
// is in the positive or the negative half-space.
uint behindMask = 0; // Initially in front
// Consider the vertex to be inside the view volume if:
// 0 <= x <= w
// 0 <= y <= w <-- include boundary points to avoid clipping them later
// 0 <= z <= w
// w is always valid
// TODO: epsilon for numerical robustness?
//#define NUM_FACES (6) 6/2 一个顶点相关面数为3.
for (uint j = 0; j < (NUM_PLANES / 2); j++)
{
float w = hapVert.w;
behindMask |= (hapVert[j] < 0 ? 1 : 0) << (2 * j + 0); // Planes crossing '0'
behindMask |= (hapVert[j] > w ? 1 : 0) << (2 * j + 1); // Planes crossing 'w'
}
if (behindMask == 0) // Inside?
{
// Clamp to the bounds in case of numerical errors (may still generate -0).
float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));
ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVS.z));
ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVS.z));
}
else // Outside
{
// Mark all the faces of the bounding frustum associated with this vertex.
cullClipFaceMask |= GetFaceMaskOfVertex(v);
}
gs_HapVertsX[baseVertexOffset + v] = hapVert.x;
gs_HapVertsY[baseVertexOffset + v] = hapVert.y;
gs_HapVertsZ[baseVertexOffset + v] = hapVert.z;
gs_HapVertsW[baseVertexOffset + v] = hapVert.w;
gs_BehindMasksOfVerts[baseVertexOffset + v] = behindMask;
}
InterlockedOr(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
GroupMemoryBarrierWithGroupSync();
测试视锥体八个顶点是否在灯光体积内
若上面经过线程同步过后的cullClipFaceMask!=0,即灯光包围体存在顶点在视锥体外(灯光体积部分在视锥体之外),就测试视锥体的八个顶点是否在灯光体积的AABB内。
若在,则将当前测试的顶点也作为视锥体与灯光体积相交的顶点,并用于更新ndcAaBbMinPt,ndcAaBbMaxPt。
// (2) Test the corners of the view volume.
if (cullClipFaceMask != 0)
{
//1.利用之前计算的视角空间的灯光体积center坐标以及轴向rbpX,rbpY,rbpZ(cullData.boxAxisX/Y/Z)重构灯光空间矩阵
//2.GenerateVertexOfStandardCube的顶点坐标[-1,1],z需要*0.5+0.5转换到[0,1]这时候的顶点才算是Project Space下的视锥体的八个顶点
//3.视锥体的八个顶点转换到灯光空间,需要先转到View Space,再转换到灯光空间矩阵
// The light is partially outside the view volume.
// Therefore, some of the corners of the view volume may be inside the light volume.
// We perform aggressive culling, so we must make sure they are accounted for.
// The light volume is a special type of cuboid - a right frustum.
// We can exploit this fact by building a light-space projection matrix.
// P_v = T * (R * S) * P_l
// P_l = (R * S)^{-1} * T^{-1} * P_v
float4x4 invTranslateToLightSpace = Translation4x4(-rbpC);
float4x4 invRotateAndScaleInLightSpace = Homogenize3x3(Invert3x3(ScaledRotation3x3(rbpX, rbpY, rbpZ)));
// TODO: avoid full inversion by using unit vectors and passing magnitudes explicitly.
// This (orthographic) projection matrix maps a view-space point to a light-space [-1, 1]^3 cube.
float4x4 lightSpaceMatrix = mul(invRotateAndScaleInLightSpace, invTranslateToLightSpace);
//GPULightType为Spot和ProjectorPyramid的LightSpace才是透视的,
//其他光源(Point,Rectangle,Tube,ProjectorBox)Scale为1
//只关心点光源流程可以不用看
if (scale != 1) // Perspective light space?
{
//(bound.scaleXY = squeeze ? 0.01f : 1.0f;)s为0.01
//e=-1.0202020f;也就是lightSpace的z从[-1, 1]变成[-2,0],把AABB原点往z轴负方向偏移
//n=0.0202020f,f=0.9797979797f,g=0.9797979797f,a(aspect)=1
//PerspectiveProjection4x4返回一个空间投影矩阵,把正交投影变成透视投影矩阵
// Compute the parameters of the perspective projection.
float s = scale;
float e = -1 - 2 * (s * rcp(1 - s)); // Signed distance from the origin to the eye
float n = -e - 1; // Distance from the eye to the near plane
float f = -e + 1; // Distance from the eye to the far plane
float g = f; // Distance from the eye to the projection plane
float4x4 invTranslateEye = Translation4x4(float3(0, 0, -e));
float4x4 perspProjMatrix = PerspectiveProjection4x4(1, g, n, f);
lightSpaceMatrix = mul(mul(perspProjMatrix, invTranslateEye), lightSpaceMatrix);
}
for (i = 0; i < VERTS_PER_THREAD; i++)
{
uint v = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
float3 rapVertCS = GenerateVertexOfStandardCube(v);
rapVertCS.z = rapVertCS.z * 0.5 + 0.5; // View's projection matrix MUST map Z to [0, 1]
float4 hbpVertVS = mul(invProjMat, float4(rapVertCS, 1)); // Clip to view space
float4 hapVertLS = mul(lightSpaceMatrix, hbpVertVS); // View to light space
// Consider the vertex to be inside the light volume if:
// -w < x < w
// -w < y < w <-- exclude boundary points, as we will not clip using these vertices
// -w < z < w <-- assume that Z-precision is not very important here
// 0 < w
// TODO: epsilon for numerical robustness?
bool inside = Max3(abs(hapVertLS.x), abs(hapVertLS.y), abs(hapVertLS.z)) < hapVertLS.w;
//若视锥体的八个顶点也在灯光体积内部,则将当前测试的顶点也作为视锥体与灯光体积相交的顶点,
//并用于更新ndcAaBbMinPt,ndcAaBbMaxPt。
if (inside)
{
float3 rapVertNDC = float3(rapVertCS.xy * 0.5 + 0.5, rapVertCS.z);
float rbpVertVSz = hbpVertVS.z * rcp(hbpVertVS.w);
ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));
}
}
}
InterlockedAnd(gs_CullClipFaceMasks[intraGroupLightIndex], cullClipFaceMask);
GroupMemoryBarrierWithGroupSync();
cullClipFaceMask = gs_CullClipFaceMasks[intraGroupLightIndex];
用CullClipFaceMasks计算相交的面
在前面第一步的时候LDS的gs_BehindMasksOfVerts记录了灯光体积顶点在视锥体的内外情况。
而CullClipFaceMask只是将在视锥体外面的顶点相关的面记录了下来,
在记录的面里面,可能含有四个顶点都在视锥体外部的面(用gs_BehindMasksOfVerts判断),这种面需要剔除掉。
//////////////////////////////////Tool Functions/////////////////////////////////////////////////////////
// offset:提取位置的偏移量
// numBits:提取Bit的数量
//ex: data = 1111 1111 1111 0101 0101 0101 , offset = 12 , numBits = 12
// mask = 1111 1111 1111
// data >> 12 = 0000 0000 0000 1111 1111 1111
// result = 1111 1111 1111
// Unsigned integer bit field extraction.
// Note that the intrinsic itself generates a vector instruction.
// Wrap this function with WaveReadLaneFirst() to get scalar output.
// Unsigned integer bit field extraction: returns 'numBits' bits of 'data'
// starting at bit 'offset'.
// Note that the intrinsic itself generates a vector instruction.
// Wrap this function with WaveReadLaneFirst() to get scalar output.
uint BitFieldExtract(uint data, uint offset, uint numBits)
{
    // Shift the field down to bit 0, then keep only the low 'numBits' bits.
    return (data >> offset) & ((1u << numBits) - 1u);
}
// Per-face vertex index lists of the standard cube. Each face packs 4 vertex
// indices (3 bits each, LSB first), so one list occupies 12 bits.
#define VERT_LIST_LEFT ((4) << 9 | (6) << 6 | (2) << 3 | (0) << 0)
#define VERT_LIST_RIGHT ((3) << 9 | (7) << 6 | (5) << 3 | (1) << 0)
#define VERT_LIST_BOTTOM ((1) << 9 | (5) << 6 | (4) << 3 | (0) << 0)
#define VERT_LIST_TOP ((6) << 9 | (7) << 6 | (3) << 3 | (2) << 0)
#define VERT_LIST_FRONT ((2) << 9 | (3) << 6 | (1) << 3 | (0) << 0)
#define VERT_LIST_BACK ((5) << 9 | (7) << 6 | (6) << 3 | (4) << 0)
//allVertLists
//VERT_LIST_RIGHT 右边面对应的顶点序列,有4个点,顺时针3 7 5 1
//顶点序号最大为7,所以只需要3位就可以表示点的序号,所以一共需要3*4=12bit表示一个面的顶点序列
//allVertLists[f >> 1]
//f>>1 即f/2 选择allVertLists的xyz
//left,right 0/2,1/2 => 0 0
//bottom,top 2/2,3/2 => 1 1
//front,back 4/2,5/2 => 2 2
//12 * (f & 1)
//f&1判断face序号的奇偶数,控制是否要偏移12bit
//0&1=0 1&1=1
//2&1=0 3&1=1
//偶数面(left=0,bottom=2,front=4)直接取低12bit,不需要offset;奇数面(right=1,top=3,back=5)需要offset 12bit取高12bit.
//比如bottom=>2 拿的是allVertLists[1]中的低12bit,不需要Offset
//top=>3 是奇数,需要Offset 12bit拿高12bit
// Returns the packed 12-bit vertex index list (4 x 3 bits) of cube face 'f'.
uint GetVertexListOfFace(uint f)
{
    // Warning: don't add 'static' here unless you want really bad code gen.
    // Each component packs two 12-bit lists: the odd face of the axis pair in
    // the upper 12 bits, the even face in the lower 12 bits.
    const uint3 packedVertLists = uint3((VERT_LIST_RIGHT << 12) | VERT_LIST_LEFT,
                                        (VERT_LIST_TOP << 12) | VERT_LIST_BOTTOM,
                                        (VERT_LIST_BACK << 12) | VERT_LIST_FRONT);

    uint axis  = f >> 1;       // 0: left/right, 1: bottom/top, 2: front/back
    uint shift = 12 * (f & 1); // Odd faces live in the upper half

    return BitFieldExtract(packedVertLists[axis], shift, 12);
}
// Returns true if face 'f' can be culled, i.e. if all 4 of its vertices lie
// behind at least one common plane of the view volume.
bool TryCullFace(uint f, uint baseOffsetVertex)
{
    // FACE_MASK = (1 << NUM_FACES) - 1 = 0b111111: start by assuming the face
    // is behind every plane, then AND away the planes some vertex is in front of.
    uint cullMaskOfFace = FACE_MASK; // Initially behind
    uint vertListOfFace = GetVertexListOfFace(f);

    for (uint j = 0; j < 4; j++)
    {
        // Extract the 3-bit index of the j-th corner of this face.
        uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);

        // gs_BehindMasksOfVerts holds one bit per view-volume plane, set when
        // the vertex is behind that plane. A vertex fully inside the view
        // volume has mask 0, which clears 'cullMaskOfFace' and keeps the face.
        //
        // Consider the vertex to be inside the view volume if:
        // 0 <= x <= w
        // 0 <= y <= w <-- include boundary points to avoid clipping them later
        // 0 <= z <= w
        cullMaskOfFace &= gs_BehindMasksOfVerts[baseOffsetVertex + v];
    }

    // Non-zero if ALL the vertices are behind any single plane.
    return (cullMaskOfFace != 0);
}
//减去前面n个bit==1的位之后,返回第一个bit==1的index
//ex: value = 111100 n = 3
// result = 5
//firstbitlow(111100) = 2
// Returns the (0-based) position of the n-th (0-based) set bit of 'value',
// or 0xFFFFFFFF (-1) if 'value' has fewer than (n + 1) set bits.
uint NthBitLow(uint value, uint n)
{
    uint b = -1; // Consistent with the behavior of firstbitlow()
    uint c = countbits(value);
    if (n < c) // Validate inputs
    {
        uint r = n + 1; // Compute the number of remaining bits
        // Guess-and-verify: jump ahead assuming the next 'r' bits are all set,
        // then popcount the prefix to see how many were actually set, and
        // retry with the number still missing.
        do
        {
            uint f = firstbitlow(value >> (b + 1)); // Find the next set bit
            b += f + r; // Make a guess (assume all [b+f+1,b+f+r] bits are set)
            c = countbits(value << (32 - (b + 1))); // Count the number of bits actually set
            r = (n + 1) - c; // Compute the number of remaining bits
        }
        while (r > 0);
    }
    return b;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// (3) Cull the faces.
{
const uint cullFaceMask = cullClipFaceMask;
//countbits返回cullFaceMask有多少位为1,第一步时标记了几个面需要进行剔除处理
const uint numFacesToCull = countbits(cullFaceMask); // [0, 6]
//FACES_PER_THREAD 6/4 = 2 每个线程计算两个面
for (i = 0; i < FACES_PER_THREAD; i++)
{
uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
if (n < numFacesToCull)
{
//减去前面n个bit==1的位之后,返回第一个bit==1的index
//即f为cullFaceMask中所有bit==1的面序号
uint f = NthBitLow(cullFaceMask, n);
//若面相关的4个点都在视锥体内(cullMaskOfFace != 0),说明当前面和视锥体不存在相交,可以被剔除。
//只要有一个点是在视锥体外(cullMaskOfFace == 0)而且之前第一步的计算被标记的面是存在点是在视锥体内部的,
//说明当前面不能被完全剔除,面所对应的Bit需要保留。
if (TryCullFace(f, baseVertexOffset))
{
cullClipFaceMask ^= 1 << f; // Clear the bit
}
}
}
}
裁剪相交的面,并计算相交的顶点
上一步得到cullClipFaceMask即灯光体积和视锥体体积相交的面。那么就需要进一步计算面与面之间相交的顶点,然后再更新ndcAaBbMinPt,ndcAaBbMaxPt。
这里跟上一步一样遍历cullClipFaceMask中标记的面
// (4) Clip the faces.
{
const uint clipFaceMask = cullClipFaceMask;
const uint numFacesToClip = countbits(clipFaceMask); // [0, 6]
for (i = 0; i < FACES_PER_THREAD; i++)
{
uint n = i * THREADS_PER_LIGHT + t % THREADS_PER_LIGHT;
if (n < numFacesToClip)
{
uint f = NthBitLow(clipFaceMask, n);
uint srcBegin, srcSize;
ClipFaceAgainstViewVolume(f, baseVertexOffset,
srcBegin, srcSize, t);
UpdateAaBb(srcBegin, srcSize, t, g_isOrthographic != 0, invProjMat,
ndcAaBbMinPt, ndcAaBbMaxPt);
}
}
}
ClipFaceAgainstViewVolume
RingBuffer
RingBuffer是用来存储面被裁剪之后的顶点。面与平面求交时,输入进RingBuffer的除了原本的顶点之外,还会加入新产生的交点;为了保证求交过程正常进行,每个线程都需要独立的Buffer空间,所以Buffer长度为10*64
// Clipping a plane by a cube may produce a hexagon (6-gon).
// Clipping a hexagon by 4 planes may produce a decagon (10-gon).
#define MAX_CLIP_VERTS (10)
#define THREADS_PER_GROUP (64)
// ----------- Use LDS for the vertex ring buffer as otherwise on FXC we create register spilling
// SoA layout; each thread owns a contiguous slice of MAX_CLIP_VERTS entries,
// hence MAX_CLIP_VERTS * THREADS_PER_GROUP floats per component.
groupshared float gs_VertexRingBufferX[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferY[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferZ[MAX_CLIP_VERTS * THREADS_PER_GROUP];
groupshared float gs_VertexRingBufferW[MAX_CLIP_VERTS * THREADS_PER_GROUP];
// Reads vertex 'entry' from the calling thread's private ring-buffer slice.
float4 GetFromRingBuffer(uint threadIdx, uint entry)
{
    // Each thread owns MAX_CLIP_VERTS consecutive entries of the SoA buffers.
    uint idx = threadIdx * MAX_CLIP_VERTS + entry;

    return float4(gs_VertexRingBufferX[idx],
                  gs_VertexRingBufferY[idx],
                  gs_VertexRingBufferZ[idx],
                  gs_VertexRingBufferW[idx]);
}
// Writes 'value' as vertex 'entry' of the calling thread's ring-buffer slice.
void WriteToRingBuffer(uint threadIdx, uint entry, float4 value)
{
    // Each thread owns MAX_CLIP_VERTS consecutive entries of the SoA buffers.
    uint idx = threadIdx * MAX_CLIP_VERTS + entry;

    gs_VertexRingBufferX[idx] = value.x;
    gs_VertexRingBufferY[idx] = value.y;
    gs_VertexRingBufferZ[idx] = value.z;
    gs_VertexRingBufferW[idx] = value.w;
}
///////////////////////////////////////////////////////////////////////////////////////
RingBuffer初始化
// Clips face 'f' of the light's bounding volume against the view volume.
// On output, [srcBegin, srcBegin + srcSize) addresses the clipped polygon's
// vertices inside the calling thread's ring-buffer slice.
void ClipFaceAgainstViewVolume(uint f, uint baseVertexOffset,
out uint srcBegin, out uint srcSize,
uint threadIdx)
{
    srcBegin = 0;
    srcSize  = 4; // A face starts out as a quad

    uint clipMaskOfFace = 0; // Initially in front
    uint vertListOfFace = GetVertexListOfFace(f);

    // Seed the ring buffer with the 4 corners of the face while accumulating
    // their per-plane 'behind' bits.
    for (uint j = 0; j < 4; j++)
    {
        // Extract the 3-bit index of the j-th corner of this face.
        uint v = BitFieldExtract(vertListOfFace, 3 * j, 3);

        // Non-zero if ANY of the vertices are behind any of the planes,
        // i.e. the face needs clipping against those planes.
        clipMaskOfFace |= gs_BehindMasksOfVerts[baseOffsetVertexFix(v)];

        // Not all edges may require clipping. However, filtering the vertex
        // list is somewhat expensive, so we currently don't do it.
        WriteToRingBuffer(threadIdx, j, float4(gs_HapVertsX[baseVertexOffset + v],
                                               gs_HapVertsY[baseVertexOffset + v],
                                               gs_HapVertsZ[baseVertexOffset + v],
                                               gs_HapVertsW[baseVertexOffset + v]));
    }

    // Sutherland-Hodgeman polygon clipping algorithm.
    // It works by clipping the entire polygon against one clipping plane at a time.
    while (clipMaskOfFace != 0)
    {
        uint p = firstbitlow(clipMaskOfFace);

        uint dstBegin, dstSize;
        ClipPolygonAgainstPlane(p, srcBegin, srcSize, threadIdx, dstBegin, dstSize);

        srcBegin = dstBegin;
        srcSize  = dstSize;

        clipMaskOfFace ^= 1 << p; // Clear the bit to continue using firstbitlow()
    }
}
AABB平面与视锥体平面求交(ClipPolygonAgainstPlane)
这一步就是AABB平面与视锥体平面求交并把求交后的顶点加入到RingBuffer中
// A polygon vertex augmented with its signed distance to the current clip plane.
struct ClipVertex
{
    float4 pt; // Homogeneous coordinate after perspective
    float bc; // Boundary coordinate with respect to the plane 'p'
};
// Builds a ClipVertex for view-volume plane 'p'.
// Plane indexing: (p >> 1) selects the axis (0: x, 1: y, 2: z); the low bit
// selects the plane crossing 0 (even) or the plane crossing w (odd).
ClipVertex CreateClipVertex(uint p, float4 v)
{
    ClipVertex cv;
    cv.pt = v;

    float c = v[p >> 1];

    // The clip-space volume is 0 <= x,y,z <= w, so the boundary coordinate is
    // the signed distance to either the '0' plane (c) or the 'w' plane (w - c).
    cv.bc = ((p & 1) == 0) ? c : v.w - c; // dot(PlaneEquation, HapVertex);

    return cv;
}
// Computes the intersection of the edge (v0, v1) with the plane the boundary
// coordinates were evaluated against.
float4 IntersectEdgeAgainstPlane(ClipVertex v0, ClipVertex v1)
{
    // Linear interpolation factor derived from the two signed distances.
    float alpha = saturate(v0.bc * rcp(v0.bc - v1.bc)); // Guaranteed to lie between 0 and 1

    return lerp(v0.pt, v1.pt, alpha);
}
// Clips the polygon stored at [srcBegin, srcBegin + srcSize) of the thread's
// ring buffer against view-volume plane 'p' (Blinn's line clipping per edge).
// The output polygon is appended after the input and addressed by
// [dstBegin, dstBegin + dstSize).
void ClipPolygonAgainstPlane(uint p, uint srcBegin, uint srcSize,
uint threadIdx,
out uint dstBegin, out uint dstSize)
{
    dstBegin = srcBegin + srcSize; // Start at the end; we don't use modular arithmetic here
    dstSize  = 0;

    // 'tailVert' starts as the last input vertex, so each iteration below
    // processes the directed edge (tailVert -> leadVert).
    ClipVertex tailVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, (srcBegin + srcSize - 1) % MAX_CLIP_VERTS));

    uint modSrcIdx = srcBegin % MAX_CLIP_VERTS; // Physical (wrapped) read cursor
    uint modDstIdx = dstBegin % MAX_CLIP_VERTS; // Physical (wrapped) write cursor

    for (uint j = srcBegin; j < (srcBegin + srcSize); j++)
    {
        ClipVertex leadVert = CreateClipVertex(p, GetFromRingBuffer(threadIdx, modSrcIdx));

        // Execute Blinn's line clipping algorithm.
        // Classify the line segment. 4 cases:
        // 0. v0 out, v1 out -> add nothing
        // 1. v0 in, v1 out -> add intersection
        // 2. v0 out, v1 in -> add intersection, add v1
        // 3. v0 in, v1 in -> add v1
        // (bc >= 0) <-> in, (bc < 0) <-> out. Beware of -0.
        if ((tailVert.bc >= 0) != (leadVert.bc >= 0))
        {
            // The line segment is guaranteed to cross the plane.
            WriteToRingBuffer(threadIdx, modDstIdx, IntersectEdgeAgainstPlane(tailVert, leadVert));
            dstSize++;
            modDstIdx = (modDstIdx + 1) % MAX_CLIP_VERTS;
        }

        if (leadVert.bc >= 0) // The leading vertex is on the inner side of the plane
        {
            WriteToRingBuffer(threadIdx, modDstIdx, leadVert.pt);
            dstSize++;
            modDstIdx = (modDstIdx + 1) % MAX_CLIP_VERTS;
        }

        modSrcIdx = (modSrcIdx + 1) % MAX_CLIP_VERTS;

        tailVert = leadVert; // Avoid recomputation and overwriting the vertex in the ring buffer
    }
}
UpdateAaBb
用上一步计算求交得到的RingBuffer更新ndcAaBbMaxPt/ndcAaBbMinPt
// Grows the light's NDC AABB (xy = NDC bounds, w = view-space depth bound)
// with every vertex stored at [srcBegin, srcBegin + srcSize) of the thread's
// ring-buffer slice.
void UpdateAaBb(uint srcBegin, uint srcSize, uint threadIdx,
bool isOrthoProj, float4x4 invProjMat,
inout float4 ndcAaBbMinPt, inout float4 ndcAaBbMaxPt)
{
    uint modSrcIdx = srcBegin % MAX_CLIP_VERTS; // Physical (wrapped) read cursor

    for (uint j = srcBegin; j < (srcBegin + srcSize); j++)
    {
        float4 hapVert = GetFromRingBuffer(threadIdx, modSrcIdx);

        // Perspective division to NDC.
        // Clamp to the bounds in case of numerical errors (may still generate -0).
        float3 rapVertNDC = saturate(hapVert.xyz * rcp(hapVert.w));

        float rbpVertVSz = hapVert.w;

        if (isOrthoProj) // Must replace (w = 1)
        {
            // Orthographic projection leaves w = 1, so recover the view-space
            // depth via the 3rd row of the inverse projection matrix.
            rbpVertVSz = dot(invProjMat[2], hapVert);
        }

        ndcAaBbMinPt = min(ndcAaBbMinPt, float4(rapVertNDC, rbpVertVSz));
        ndcAaBbMaxPt = max(ndcAaBbMaxPt, float4(rapVertNDC, rbpVertVSz));

        modSrcIdx = (modSrcIdx + 1) % MAX_CLIP_VERTS;
    }
}
用BoundingSphere计算NDC上的RectMin/RectMax
这里需要求得BoundingSphere在XOZ/YOZ投影平面上过原点(CameraPosition)的切线OB和OD。
通过
\(cross(OB' , OC') = |OB'| * |OC'| * Sin[a']\)
\(OB' . OC' = |OB'| *|OC'|* Cos[a'].\)
解得:
\(b.z * c.x - b.x * c.z = |OB'| * |OC'| * Sin[a']\)
\(b.x * c.x + b.z * c.z = |OB'| * |OC'| * Cos[a']\)
而实际上不需要求得B的坐标只需要求得x/z(y/z)的比值因为透视投影矩阵只是对xy轴做缩放,转换到齐次坐标还要做透除(除以Z)
//https://www.zhihu.com/question/289794588/answer/466643632
所以等式也就可以化成
\(令(z=t*b.z,x=t*b.x,t=|OC'|/|OB'|)\)
\(z * c.x - x * c.z = |OC'|^3 * Sin[a']\)
\(x * c.x + z * c.z = |OC'|^3 * Cos[a']\)
\(x = -c.z * r + c.x * |OB'|\)
\(z = c.x * r + c.z * |OB'|\)
\(cross(OD' , OC') = -|OD'| * |OC'| * Sin[a']\)
\(OD' . OC' = |OD'| * |OC'| * Cos[a']\)
同理得:
\(x = c.z * r + c.x * |OB'|\)
\(z = -c.x * r + c.z * |OB'|\)
// Computes conservative [min, max] post-projection bounds along one axis for a
// view-space sphere with center 'C' and radius 'r'. 'projScale'/'projOffset'
// are the relevant row entries of the projection matrix; the caller passes a
// swizzled center (e.g. C.xxz or C.yyz) so only the .x and .z components are
// read here.
float2 ComputeBoundsOfSphereOnProjectivePlane(float3 C, float r, float projScale, float projOffset)
{
    float xMin, xMax;
    // See sec. 8.2.1 of https://foundationsofgameenginedev.com/#fged2 for an alternative derivation.
    // Goal: find the planes that pass through the origin O, bound the sphere, and form
    // an axis-aligned rectangle at the intersection with the projection plane.
    // Solution (for the X-coordinate):
    // The intersection of the bounding planes and the projection plane must be vertical lines,
    // which means that the bounding planes must be tangent to the Y-axis.
    // The bounding planes must be also tangent to the sphere.
    // Call the intersection points of the two vertical bounding planes and the bounding
    // sphere B and D. Assume that B is on the left of C; D is on the right of C.
    // Note that C may be behind the origin, so the same generally goes for B and D.
    // BC is normal w.r.t. the bounding plane, so it is normal w.r.t. the Y-axis; |BC| = r.
    // As a consequence, it lies in a plane parallel to the the O-X-Z plane.
    // Consider B'C', which is an orthogonal projection of BC onto the actual O-X-Z plane.
    // (Imagine sliding the sphere up or down between the bounding planes).
    // We then consider a triangle OB'C' that lies entirely in the O-X-Z plane.
    // The coordinates are: OB' = (b.x, 0, b.z), OC' = (c.x, 0, c.z).
    float3 B, D;
    // OBC is a right triangle. So is OB'C'.
    // |BC| = |B'C'| = r.
    // |OB'|^2 = |OC'|^2 - |B'C'|^2.
    float lenSqOC_ = math.dot(C.xz, C.xz);
    float lenSqOB_ = lenSqOC_ - r * r;
    // If |OB'| = 0 or |OC'| = 0, the bounding planes tangent to the sphere do not exist.
    if (lenSqOB_ > 0)
    {
        float lenOB_ = math.sqrt(lenSqOB_);
        // |OB' x OC'| = |OB'| * |OC'| * Sin[a'].
        // OB' . OC' = |OB'| * |OC'| * Cos[a'].
        // We can determine Sin[a'] = |B'C'| / |OC'| = R / |OC'|.
        // Cos[a'] = Sqrt[1 - Sin[a']^2].
        // (OB' x OC') points along Y.
        // (OB' x OC').y = b.z * c.x - b.x * c.z.
        // Therefore, b.z * c.x - b.x * c.z = |OB'| * |OC'| * Sin[a'].
        // OB' . OC' = b.x * c.x + b.z * c.z = |OB'| * |OC'| * Cos[a'].
        // Since we don't care about the scale, and |OB'| != 0 and |OC'| != 0,
        // we can equivalently solve
        // z * c.x - x * c.z = |OC'|^3 * Sin[a'].
        // x * c.x + z * c.z = |OC'|^3 * Cos[a'].
        // With 2 equations and 2 unknowns, we can easily solve this linear system.
        // The solutions is
        // x = -c.z * r + c.x * |OB'|.
        // z = c.x * r + c.z * |OB'|.
        B.x = C.x * lenOB_ - (C.z * r);
        B.z = C.z * lenOB_ + (C.x * r);
        // (OD' x OC') points along Y.
        // (OD' x OC').y = d.z * c.x - d.x * c.z.
        // We must solve
        // z * c.x - x * c.z = -|OC'|^3 * Sin[a'].
        // x * c.x + z * c.z = |OC'|^3 * Cos[a'].
        // The solution is
        // x = c.z * r + c.x * |OB'|.
        // z = -c.x * r + c.z * |OB'|.
        D.x = C.x * lenOB_ + (C.z * r);
        D.z = C.z * lenOB_ - (C.x * r);
        // We can transform OB and OD as direction vectors.
        // For the simplification below, see OptimizeProjectionMatrix.
        float rapBx = (B.x * math.rcp(B.z)) * projScale + projOffset;
        float rapDx = (D.x * math.rcp(D.z)) * projScale + projOffset;
        // One problem with the above is that this direction may, for certain spheres,
        // point behind the origin (B.z <= 0 or D.z <= 0).
        // At this point we know that the sphere at least *partially* in front of the origin,
        // and that it is we are not inside the sphere, so there is at least one valid
        // plane (and one valid direction). We just need the second direction to go "in front"
        // of the first one to extend the bounding box.
        // NOTE(review): (float)0x7F800000 is ~2.14e9, used as a huge sentinel
        // rather than an actual IEEE infinity - confirm this matches the HLSL side.
        xMin = (B.z > 0) ? rapBx : -(float) 0x7F800000;
        xMax = (D.z > 0) ? rapDx : (float) 0x7F800000;
    }
    else
    {
        // Conservative estimate (we do not cull the bounding sphere using the view frustum).
        xMin = -1;
        xMax = 1;
    }
    return new float2(xMin, xMax);
}
// (5) Compute the AABB of the bounding sphere.
if (radius > 0)
{
// Occasionally, an intersection of AABBs of a bounding sphere and a bounding frustum
// results in a tighter AABB when compared to using the AABB of the frustum alone.
// That is the case (mostly) for sphere-capped spot lights with very wide angles.
// Note that, unfortunately, it is not quite as tight as an AABB of a CSG intersection
// of a sphere and frustum. Also note that the algorithm below doesn't clip the bounding
// sphere against the view frustum before computing the bounding box, simply because it is
// too hard/expensive. I will leave it as a TODO in case someone wants to tackle this problem.
if ((rbpC.z + radius) > 0) // Is the sphere at least *partially* in front of the origin?
{
ndcAaBbMinPt.w = max(ndcAaBbMinPt.w, rbpC.z - radius);
ndcAaBbMaxPt.w = min(ndcAaBbMaxPt.w, rbpC.z + radius);
// Computing the 'z' component for an arbitrary projection matrix is hard, so we don't do it.
// See sec. 8.2.2 of https://foundationsofgameenginedev.com/#fged2 for a solution.
float2 rectMin, rectMax;
// For the 'x' and 'y' components, the solution is given below.
//如果是正交投影
if (g_isOrthographic)
{
// Compute the center and the extents (half-diagonal) of the bounding box.
float2 center = mul(projMat, float4(rbpC.xyz, 1)).xy;
float2 extents = mul(projMat, float4(radius.xx, 0, 0)).xy;
rectMin = center - extents;
rectMax = center + extents;
}
else // Perspective
{
//ComputeBoundsOfSphereOnProjectivePlane函数中只用xz分量
float2 xBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.xxz, radius, projMat._m00, projMat._m02); // X-Z plane
float2 yBounds = ComputeBoundsOfSphereOnProjectivePlane(rbpC.yyz, radius, projMat._m11, projMat._m12); // Y-Z plane
rectMin = float2(xBounds.r, yBounds.r);
rectMax = float2(xBounds.g, yBounds.g);
}
// Transform to the NDC coordinates.
rectMin = rectMin * 0.5 + 0.5;
rectMax = rectMax * 0.5 + 0.5;
// Note: separating the X- and Y-computations across 2 threads is not worth it.
ndcAaBbMinPt.xy = max(ndcAaBbMinPt.xy, rectMin);
ndcAaBbMaxPt.xy = min(ndcAaBbMaxPt.xy, rectMax);
}
}
计算出最终的ScrBound(RectMin,RectMax)
最后将计算好的ndcAaBbMinPt以及ndcAaBbMaxPt存放到g_vBoundsBuffer里面,Scrbound的流程就结束了。
正常来说,eyeIndex在没开VR的时候为0;开启VR(立体渲染)后,eyeIndex为0或1,分别对应左右眼。
所以g_vBoundsBuffer的布局就是[light0.min,light1.min.....][light0.max,light1.max.....]
// The returned values are used to index into our AABB screen space bounding box buffer
// Usually named g_vBoundsBuffer. The two values represent the min/max indices.
// The returned values are used to index into our AABB screen space bounding box buffer
// Usually named g_vBoundsBuffer. The two values represent the min/max indices.
ScreenSpaceBoundsIndices GenerateScreenSpaceBoundsIndices(uint lightIndex, uint numVisibleLights, uint eyeIndex)
{
    // Buffer layout: per eye, a run of numVisibleLights mins followed by a run
    // of numVisibleLights maxs. Monoscopic rendering uses eye 0 only; stereo
    // appends a second (min, max) pair of runs for the right eye.
    uint eyeRelativeBase = 2 * numVisibleLights * eyeIndex;

    ScreenSpaceBoundsIndices indices;
    indices.min = eyeRelativeBase + lightIndex;
    indices.max = eyeRelativeBase + lightIndex + numVisibleLights;

    return indices;
}
if ((globalLightIndex < (uint)g_iNrVisibLights) && (t % THREADS_PER_LIGHT == 0)) // Avoid bank conflicts
{
// For stereo, we have two sets of lights. Therefore, each eye has a set of mins
// followed by a set of maxs, and each set is equal to g_iNrVisibLights.
const ScreenSpaceBoundsIndices eyeAdjustedOutputOffsets = GenerateScreenSpaceBoundsIndices(globalLightIndex, g_iNrVisibLights, eyeIndex);
g_vBoundsBuffer[eyeAdjustedOutputOffsets.min] = ndcAaBbMinPt;
g_vBoundsBuffer[eyeAdjustedOutputOffsets.max] = ndcAaBbMaxPt;
}