SRPCore ColorPyramid优化

序言

很早之前就看HDRP里面ColorPyramid的生成方式不爽了：明明有现成的ColorPyramid.compute放着不用，却还是用PixelShader生成。DrawCall数量多，效率也不如Compute的方式高。
这篇文章主要解析ColorPyramid.compute怎么用LDS优化GaussianBlur，以及不同方式Copy mip 0的性能对比。

Color Pyramid

ColorPyramid主要用来模拟毛玻璃折射(Refraction)效果,物体表面越粗糙,折射越模糊。(并不物理正确)

HDRP中根据物体表面的PerceptualRoughness，通过一个经验公式映射出计算透射时采样ColorPyramid所用的Mip Level。

// Excerpt of GetPreLightData (unrelated parts are elided as "..."): fills the
// refraction-related fields of preLightData, including the colour pyramid mip level
// stored in transparentSSMipLevel.
PreLightData GetPreLightData(float3 V, PositionInputs posInput, inout BSDFData bsdfData)
{
...
// refraction (forward only)
#if HAS_REFRACTION
RefractionModelResult refraction = REFRACTION_MODEL(V, posInput, bsdfData);
preLightData.transparentRefractV = refraction.rayWS;
preLightData.transparentPositionWS = refraction.positionWS;
// Transmittance decays exponentially with (absorption coefficient * refraction distance).
preLightData.transparentTransmittance = exp(-bsdfData.absorptionCoefficient * refraction.dist);

// Empirical remap to try to match a bit the refraction probe blurring for the fallback
// Use IblPerceptualRoughness so we can handle approx of clear coat.
// roughness^1.3 is scaled by (LOD count - 1): roughness 0 maps to mip 0 and roughness 1
// to the last mip; the result may be fractional.
preLightData.transparentSSMipLevel = PositivePow(preLightData.iblPerceptualRoughness, 1.3) * uint(max(_ColorPyramidLodCount - 1, 0));
#endif
}

通过trilinear三线性过滤，就能够在不同Mip之间做插值。

// Excerpt of the screen-space refraction evaluation (setup code is elided as "...").
// Samples the colour pyramid at the mip level chosen in preLightData.transparentSSMipLevel
// and writes the result to lighting.specularTransmitted.
// NOTE(review): the "#if" matching the "#else // HAS_REFRACTION" below is presumably in
// the elided part - confirm against the full source.
IndirectLighting EvaluateBSDF_ScreenspaceRefraction(LightLoopContext lightLoopContext,
                                                    float3 V, PositionInputs posInput,
                                                    PreLightData preLightData, BSDFData bsdfData,
                                                    EnvLightData envLightData,
                                                    inout float hierarchyWeight)
{
...
float2 samplingPositionNDC = lerp(posInput.positionNDC, hit.positionNDC, refractionOffsetMultiplier);
// Trilinear filtering: a fractional mip level blends between two adjacent pyramid mips.
float3 preLD = SAMPLE_TEXTURE2D_X_LOD(_ColorPyramidTexture, s_trilinear_clamp_sampler, samplingPositionNDC * _RTHandleScaleHistory.xy, preLightData.transparentSSMipLevel).rgb;
                                    // Offset by half a texel to properly interpolate between this pixel and its mips

// Inverse pre-exposure
preLD *= GetInverseCurrentExposureMultiplier();

// We use specularFGD as an approximation of the fresnel effect (that also handle smoothness)
float3 F = preLightData.specularFGD;
// (1 - F) approximates the Fresnel term for the transmitted (refracted) light.
lighting.specularTransmitted = (1.0 - F) * preLD.rgb * preLightData.transparentTransmittance * weight;

UpdateLightingHierarchyWeights(hierarchyWeight, weight); // Shouldn't be needed, but safer in case we decide to change hierarchy priority

#else // HAS_REFRACTION
// No refraction, no need to go further
hierarchyWeight = 1.0;
#endif

return lighting;
}

一般在透明队列渲染之前就需要生成Color Pyramid(mip level越高越模糊)，留着后续透明队列模拟折射时采样。
如果还有Distortion Pass的话，后处理之后还要再生成一次Color Pyramid。
HDRP生成Color Pyramid的流程比较简单：
首先申请两张临时RT，一张用于DownSample的输出，一张用于Horizontal Blur的输出；
然后读取DownSample的RT，依次做Horizontal Blur和Vertical Blur，最终输出到ColorPyramid上面
(Vertical Blur直接输出到ColorPyramid当前Downsample Size所对应的mip中)。
mip0不需要模糊，直接拷贝即可。

//MipGenerator.cs

// Pixel-shader path (excerpt, setup code elided as "..."): copies source into mip 0 of
// destination, then for every further mip issues three draws - downsample, horizontal
// blur, vertical blur. Returns srcMipLevel + 1 once the loop has finished.
public int RenderColorGaussianPyramid(CommandBuffer cmd, Vector2Int size, Texture source, RenderTexture destination)
{
    ...
    // Mip 0 is a plain copy of the source (no blur).
    m_PropertyBlock.SetTexture(Blitter.BlitShaderIDs._BlitTexture, source);
    m_PropertyBlock.SetVector(Blitter.BlitShaderIDs._BlitScaleBias, new Vector4(sourceScaleX, sourceScaleY, 0f, 0f));
    m_PropertyBlock.SetFloat(Blitter.BlitShaderIDs._BlitMipLevel, 0f);
    cmd.SetRenderTarget(destination, 0, CubemapFace.Unknown, -1);
    cmd.SetViewport(new Rect(0, 0, srcMipWidth, srcMipHeight));
    cmd.DrawProcedural(Matrix4x4.identity, Blitter.GetBlitMaterial(source.dimension), 0, MeshTopology.Triangles, 3, 1, m_PropertyBlock);

    int finalTargetMipWidth = destination.width;
    int finalTargetMipHeight = destination.height;


    // Note: smaller mips are excluded as we don't need them and the gaussian compute works
    // on 8x8 blocks
    while (srcMipWidth >= 8 || srcMipHeight >= 8)
    {
        int dstMipWidth = Mathf.Max(1, srcMipWidth >> 1);
        int dstMipHeight = Mathf.Max(1, srcMipHeight >> 1);

        // Scale for downsample
        float scaleX = ((float) srcMipWidth / finalTargetMipWidth);
        float scaleY = ((float) srcMipHeight / finalTargetMipHeight);

        // Downsample: read the current mip of destination and write the result to a temporary RT.
        m_PropertyBlock.SetTexture(Blitter.BlitShaderIDs._BlitTexture, destination);
        m_PropertyBlock.SetVector(Blitter.BlitShaderIDs._BlitScaleBias, new Vector4(scaleX, scaleY, 0f, 0f));
        m_PropertyBlock.SetFloat(Blitter.BlitShaderIDs._BlitMipLevel, srcMipLevel);
        cmd.SetRenderTarget(m_TempDownsamplePyramid0[rtIndex], 0, CubemapFace.Unknown, -1);
        cmd.SetViewport(new Rect(0, 0, dstMipWidth, dstMipHeight));
        cmd.DrawProcedural(Matrix4x4.identity, Blitter.GetBlitMaterial(source.dimension), 1, MeshTopology.Triangles, 3, 1, m_PropertyBlock);

        ...

        // Blur horizontal
        // _SrcUvLimits.xy clamps the sampling UVs; .zw is the per-tap UV step (x only for this pass).
        m_PropertyBlock.SetTexture(ShaderIDs._Source, m_TempDownsamplePyramid0[rtIndex]);
        m_PropertyBlock.SetVector(ShaderIDs._SrcScaleBias, new Vector4(scaleX, scaleY, 0f, 0f));
        m_PropertyBlock.SetVector(ShaderIDs._SrcUvLimits,
            new Vector4((dstMipWidth - 0.5f) / blurSourceTextureWidth, (dstMipHeight - 0.5f) / blurSourceTextureHeight, 1.0f / blurSourceTextureWidth, 0f));
        m_PropertyBlock.SetFloat(ShaderIDs._SourceMip, 0);
        // Output goes to a second temporary RT.
        cmd.SetRenderTarget(m_TempColorTargets[rtIndex], 0, CubemapFace.Unknown, -1);
        cmd.SetViewport(new Rect(0, 0, dstMipWidth, dstMipHeight));
        cmd.DrawProcedural(Matrix4x4.identity, m_ColorPyramidPSMat, rtIndex, MeshTopology.Triangles, 3, 1, m_PropertyBlock);

        // Blur vertical
        // Same setup as the horizontal pass, but the UV step is in y only.
        m_PropertyBlock.SetTexture(ShaderIDs._Source, m_TempColorTargets[rtIndex]);
        m_PropertyBlock.SetVector(ShaderIDs._SrcScaleBias, new Vector4(scaleX, scaleY, 0f, 0f));
        m_PropertyBlock.SetVector(ShaderIDs._SrcUvLimits,
            new Vector4((dstMipWidth - 0.5f) / blurSourceTextureWidth, (dstMipHeight - 0.5f) / blurSourceTextureHeight, 0f, 1.0f / blurSourceTextureHeight));
        m_PropertyBlock.SetFloat(ShaderIDs._SourceMip, 0);
        // Output goes straight to the colour pyramid mip that matches the current downsample size.
        cmd.SetRenderTarget(destination, srcMipLevel + 1, CubemapFace.Unknown, -1);
        cmd.SetViewport(new Rect(0, 0, dstMipWidth, dstMipHeight));
        cmd.DrawProcedural(Matrix4x4.identity, m_ColorPyramidPSMat, rtIndex, MeshTopology.Triangles, 3, 1, m_PropertyBlock);

        srcMipLevel++;
        srcMipWidth = srcMipWidth >> 1;
        srcMipHeight = srcMipHeight >> 1;

        finalTargetMipWidth = finalTargetMipWidth >> 1;
        finalTargetMipHeight = finalTargetMipHeight >> 1;
    }

    return srcMipLevel + 1;
}

Blur时通过_SrcUvLimits.zw控制采样Offset的方向(Horizontal/Vertical)。

// One pass of a separable 9-tap Gaussian blur, done with 5 bilinear taps.
// _SrcUvLimits.zw is the one-texel UV step of the pass:
//   horizontal pass: (1.0 / blurSourceTextureWidth, 0)
//   vertical pass:   (0, 1.0 / blurSourceTextureHeight)
// _SrcUvLimits.xy is the upper bound used to clamp the positive-side taps.
half4 Frag(Varyings input) : SV_Target
{
    // Gaussian weights for 9 texel kernel from center texel to furthest texel. Keep in sync with ColorPyramid.compute
    const half gaussWeights[] = {0.27343750, 0.21875000, 0.10937500, 0.03125000, 0.00390625};

    // All UV math is done in float. On platforms where half is a real 16-bit type it has
    // only ~11 bits of mantissa, which cannot represent the sub-texel tap offsets below
    // once the target is about 1K texels wide or more.
    const float2 texelStep = _SrcUvLimits.zw;

    // Each outer tap sits between two texels so that one bilinear fetch returns both of
    // them blended in the ratio of their Gaussian weights.
    const float2 offset1 = texelStep * (1.0 + (float(gaussWeights[2]) / (float(gaussWeights[1]) + float(gaussWeights[2]))));
    const float2 offset2 = texelStep * (3.0 + (float(gaussWeights[4]) / (float(gaussWeights[3]) + float(gaussWeights[4]))));

    const float2 uv = input.texcoord.xy;
    const float2 uv_m2 = uv - offset2;
    const float2 uv_m1 = uv - offset1;
    const float2 uv_p0 = uv;
    // Positive-side taps are clamped to the valid region of the source.
    const float2 uv_p1 = min(_SrcUvLimits.xy, uv + offset1);
    const float2 uv_p2 = min(_SrcUvLimits.xy, uv + offset2);

    return
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_m2, _SourceMip) * (gaussWeights[3] + gaussWeights[4])
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_m1, _SourceMip) * (gaussWeights[1] + gaussWeights[2])
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_p0, _SourceMip) * gaussWeights[0]
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_p1, _SourceMip) * (gaussWeights[1] + gaussWeights[2])
        + SAMPLE_TEXTURE2D_LOD(_Source, sampler_LinearClamp, uv_p2, _SourceMip) * (gaussWeights[3] + gaussWeights[4]);
}

Downsample

显然上面除了Mip0之外一个Mip就要调用三次DrawProcedural(Downsample+Horizontal Blur+Vertical Blur)
其中的Horizontal Blur以及Vertical Blur可以在Compute Shader中利用LDS一次Dispatch完成,从而节省DrawCall的消耗。

首先还是得先Downsample,4个像素Down Sample成一个,在第一次Downsample的时候可以顺便把采样的四个像素拷贝到Color Pyramid Mip0中(不需要Blur)

管线调用

/// <summary>
/// Compute-shader path (excerpt, setup code elided as "..."). For every mip it dispatches a
/// downsample kernel into a temporary RT and then a Gaussian blur kernel that writes the
/// next mip of <paramref name="destination"/>. Mip 0 is filled by the first downsample
/// dispatch through the COPY_MIP_0 keyword.
/// </summary>
/// <param name="cmd">Command buffer that records the dispatches.</param>
/// <param name="size">Not referenced in the visible part of this excerpt.</param>
/// <param name="source">Texture that is downsampled first and copied into mip 0.</param>
/// <param name="destination">Colour pyramid that receives mip 0 and every blurred mip.</param>
/// <returns>srcMipLevel + 1 once the loop has finished.</returns>
public int RenderColorGaussianPyramidCS(CommandBuffer cmd, Vector2Int size, Texture source, RenderTexture destination)
{
    ...

    var cs = m_ColorPyramidCS;
    bool isFirstLoop = true;
    bool switchFlag = false;
    // Note: smaller mips are excluded as we don't need them and the gaussian compute works
    // on 8x8 blocks
    while (srcMipWidth >= 8 || srcMipHeight >= 8)
    {
        int dstMipWidth = Mathf.Max(1, srcMipWidth >> 1);
        int dstMipHeight = Mathf.Max(1, srcMipHeight >> 1);

        // Number of threads the downsample dispatch must cover: one per destination texel.
        int downsampleThreadsX = dstMipWidth;
        int downsampleThreadsY = dstMipHeight;

        RenderTargetIdentifier sourceRTI, destinationRTI;
        if (isFirstLoop)
        {
            sourceRTI = source;
            destinationRTI = m_TempDownsamplePyramid0[rtIndex];
            // Enable the COPY_MIP_0 keyword: while downsampling, the first pass also copies
            // mip 0 of source into mip 0 of the colour pyramid.
            cmd.EnableKeyword(cs, this.copyMip0);
            cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, "_Mip0", destination, 0);

            // Every thread copies one 2x2 source quad into Mip0. With src >> 1 threads the
            // last column/row of an odd-sized source would never be written, so round up
            // for this pass. The extra threads clamp their reads in the kernel.
            downsampleThreadsX = Mathf.Max(dstMipWidth, (srcMipWidth + 1) >> 1);
            downsampleThreadsY = Mathf.Max(dstMipHeight, (srcMipHeight + 1) >> 1);
        }
        else
        {
            // Ping-pong between the two temporary downsample RTs.
            if (switchFlag)
            {
                sourceRTI = m_TempDownsamplePyramid1[rtIndex];
                destinationRTI = m_TempDownsamplePyramid0[rtIndex];
            }
            else
            {
                sourceRTI = m_TempDownsamplePyramid0[rtIndex];
                destinationRTI = m_TempDownsamplePyramid1[rtIndex];
            }

            switchFlag = !switchFlag;
        }

        // Downsample. _Size carries the source dimensions used for clamping in the kernel.
        this.size[0] = srcMipWidth;
        this.size[1] = srcMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Source, sourceRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Destination, destinationRTI);
        cmd.DispatchCompute(cs, m_ColorPyramidDownSampleKernel, HQUtils.DivRoundUp(downsampleThreadsX, 8), HQUtils.DivRoundUp(downsampleThreadsY, 8), 1);
        if (isFirstLoop)
        {
            cmd.DisableKeyword(cs, this.copyMip0);
            isFirstLoop = false;
        }

        // Blur. The blurred result is written directly to the matching colour pyramid mip.
        this.size[0] = dstMipWidth;
        this.size[1] = dstMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Source, destinationRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Destination, destination, srcMipLevel + 1);
        cmd.DispatchCompute(cs, m_ColorPyramidGaussianKernel, HQUtils.DivRoundUp(dstMipWidth, 8), HQUtils.DivRoundUp(dstMipHeight, 8), 1);

        srcMipLevel++;
        srcMipWidth = srcMipWidth >> 1;
        srcMipHeight = srcMipHeight >> 1;
    }

    return srcMipLevel + 1;
}

DownSample Compute Shader

COPY_MIP_0的Keyword控制是否拷贝到Mip0中

// Resources used by the downsample kernel.
// With COPY_MIP_0 defined, _Source is declared with TEXTURE2D and an extra RW texture
// _Mip0 receives a copy of the source texels the kernel reads; otherwise _Source is
// declared as a RW texture.
#if COPY_MIP_0
    TEXTURE2D(_Source);
    RW_TEXTURE2D(float4, _Mip0);
#else
RW_TEXTURE2D(float4, _Source);
#endif

// Texture written by the kernel.
RW_TEXTURE2D(float4, _Destination);

SamplerState sampler_LinearClamp;

CBUFFER_START(cb)
float4 _Size; // x: src width, y: src height, zw: unused
CBUFFER_END

// Downsample kernel: one thread per destination texel. Each thread reads a 2x2 quad of
// _Source (clamped to the source bounds given in _Size.xy) and writes the average.
// With COPY_MIP_0 defined the four source texels are also written unchanged to _Mip0.
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_DOWNSAMPLE(uint3 dispatchThreadId : SV_DispatchThreadID)
{
    const uint2 dstCoord = dispatchThreadId.xy;
    const uint2 quadOrigin = dstCoord << 1u;

    // Last valid source texel; quads that hang over the edge are clamped onto it.
    const uint2 lastTexel = uint2(_Size.xy) - 1u;

    const uint2 coord00 = min(quadOrigin, lastTexel);
    const uint2 coord10 = min(quadOrigin + uint2(1u, 0u), lastTexel);
    const uint2 coord11 = min(quadOrigin + uint2(1u, 1u), lastTexel);
    const uint2 coord01 = min(quadOrigin + uint2(0u, 1u), lastTexel);

    const float4 texel00 = _Source[coord00];
    const float4 texel10 = _Source[coord10];
    const float4 texel11 = _Source[coord11];
    const float4 texel01 = _Source[coord01];

#if COPY_MIP_0
    _Mip0[coord00] = texel00;
    _Mip0[coord10] = texel10;
    _Mip0[coord11] = texel11;
    _Mip0[coord01] = texel01;
#endif

    // Box filter: plain average of the four texels.
    const float4 quadSum = texel00 + texel01 + texel11 + texel10;
    _Destination[dstCoord] = quadSum * 0.25;
}

Downsample完之后就可以对DownSample的结果进行Blur处理

Gaussian Blur

Store Pixel Into LDS

这里的threadUL的命名其实有点误导，因为Unity中(0,0)是左下角，这里应该叫LL(Lower-Left)才对，但是影响不大。
可以看到这里先读取了以threadUL为起点的2x2共四个像素：threadUL自身(p00)、右边(p10)、右上角(p11)和上边(p01)。
读取完之后将float32转成16位(f32tof16)，通过位运算把两个像素的同一通道打包进一个uint(第一个像素在低16位，第二个像素在高16位)，r,g,b,a分别塞进gs_cacheR/gs_cacheG/gs_cacheB/gs_cacheA的LDS中。

// 16x16 pixels with an 8x8 center that we will be blurring writing out. Each uint is two color
// channels packed together.
// The reason for separating channels is to reduce bank conflicts in the local data memory
// controller. A large stride will cause more threads to collide on the same memory bank.
// 128 entries per channel = 16 * 16 pixels / 2 pixels per uint while the data is packed
// as two 16-bit halves (see Store2Pixels).
groupshared uint gs_cacheR[128];
groupshared uint gs_cacheG[128];
groupshared uint gs_cacheB[128];
groupshared uint gs_cacheA[128];

// Packs two pixels into one LDS entry per channel: pixel1 is converted to half and stored
// in the low 16 bits, pixel2 is converted to half and stored in the high 16 bits.
void Store2Pixels(uint index, float4 pixel1, float4 pixel2)
{
    const uint4 lowHalves = f32tof16(pixel1);
    const uint4 highHalves = f32tof16(pixel2) << 16;
    const uint4 packed = lowHalves | highHalves;

    gs_cacheR[index] = packed.x;
    gs_cacheG[index] = packed.y;
    gs_cacheB[index] = packed.z;
    gs_cacheA[index] = packed.w;
}

// Gaussian kernel, load stage (the blur stages are elided as "..." in this excerpt).
// Every thread loads a 2x2 quad of _Source and stores it, packed, into LDS.
// KERNEL_SIZE = 8
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_GAUSSIAN(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
    // Upper-left pixel coordinate of quad that this thread will read
    // With an 8x8 group, groupId << 3 is the first pixel of this group's 8x8 output tile.
    // groupThreadId << 1 is groupThreadId * 2: each thread covers a 2x2 quad.
    // The -4 adds a 4-pixel border on every side, so the source tile read by the group is
    // 8 + 4 + 4 = 16 pixels wide.
    // 16 * 16 / 2 = 128 packed entries, the size of each LDS channel array.
    int2 threadUL = (groupThreadId << 1) + (groupId << 3) - 4;
    
    // Border handling.
    // Negative coordinates are clamped to 0: e.g. for groupId 0 the threads with
    // groupThreadId 0, 1 and 2 along an axis all start at coordinate 0 on that axis.
    uint2 uthreadUL = uint2(max(0, threadUL));
    uint2 size = uint2(_Size.xy) - 1u;

    float4 p00 = _Source[(min(uthreadUL + uint2(0u, 0u), size))];
    float4 p10 = _Source[(min(uthreadUL + uint2(1u, 0u), size))];
    float4 p11 = _Source[(min(uthreadUL + uint2(1u, 1u), size))];
    float4 p01 = _Source[(min(uthreadUL + uint2(0u, 1u), size))];

    // Store the 4 downsampled pixels in LDS
    // The quad's first row (p00, p10) goes to destIdx, its second row (p01, p11) to destIdx + 8.
    uint destIdx = groupThreadId.x + (groupThreadId.y << 4u);
    Store2Pixels(destIdx, p00, p10);
    Store2Pixels(destIdx + 8u, p01, p11);
    
    // Wait until every thread of the group has written its pixels to LDS.
    GroupMemoryBarrierWithGroupSync();
    ...
}

LDS映射关系(写得太抽象了，建议自己理一遍Index)

上面的LDS映射关系(destIdx)：
2*2的采样Quad中
p00, p10(offsetY=0)存储在destIdx中，而p01, p11(offsetY=1)存储在destIdx+8的位置中。
也就是说在LDS中:

groupThreadId.y=0：(边界像素)
(destIdx=0,1....,7存储Tile[8*8] y=0的像素)
(destIdx=8,9....,15存储Tile y=1的像素)

groupThreadId.y=1：(边界像素)
(destIdx=16,....,23存储Tile y=0的像素)
(destIdx=24,....,31存储Tile y=1的像素)

groupThreadId.y=2：(边界像素)
(destIdx=32,....,39存储Tile y=0的像素)
(destIdx=40,....,47存储Tile y=1的像素)

groupThreadId.y=3：
(destIdx=48,....,55存储Tile y=2的像素)
(destIdx=56,....,63存储Tile y=3的像素)

groupThreadId.y=4：(以groupId.y=0为例，threadUL.y = 4*2 + 0*8 - 4 = 4)
(destIdx=64,....,71存储Tile y=4的像素)
(destIdx=72,....,79存储Tile y=5的像素)

...

groupThreadId.y=7：(threadUL.y = 7*2 + 0*8 - 4 = 10，已经超出8*8的中心区域，属于上方的Padding；min(uthreadUL + uint2(0u, 0u), size)会把坐标限制在RT的size之内)
(destIdx=112,....,119存储Tile y=10的像素)
(destIdx=120,....,127存储Tile y=11的像素)

对于groupThreadId.x来说也同样有类似的边界限制。
groupThreadId.x=0/1/2依旧以Tile中的(0,0)像素作为左下角进行采样

BlurHorizontally

水平方向上的模糊计算：读取LDS上的像素，计算高斯模糊的结果，然后把Blur结果写回LDS的[outIndex]和[outIndex+1]位置上(此时每个uint只存一个像素的一个通道，不再是两个像素打包)。
leftMostIndex为读取LDS的起始Index。
每个线程一共读取5次LDS，获得10个像素(s0~s9)：用s0~s8(中心为s4)算出第一个Blur结果，用s1~s9(中心为s5)算出相邻的第二个Blur结果，即一次输出两个像素。

ex:
groupThreadId.y=0
groupThreadId.x:0~3 BlurHorizontally tile.y=0的像素保存Blur的结果在LDS中,Index范围为(0~7)
groupThreadId.x:4~7 BlurHorizontally tile.y=1的像素 Index范围为(8~15)
以此类推


// 9-tap blur using fixed weights that approximate a Gaussian curve.
// e is the centre tap; taps at the same distance from the centre share one weight.
// The weights are (70, 56, 28, 8, 1) / 256 and sum to 1 over the nine taps.
float4 BlurPixels(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h, float4 i)
{
    float4 acc = e * 0.27343750;
    acc += (d + f) * 0.21875000;
    acc += (c + g) * 0.10937500;
    acc += (b + h) * 0.03125000;
    acc += (a + i) * 0.00390625;
    return acc;
}

// Reads one packed LDS entry per channel and unpacks it into two pixels:
// pixel1 comes from the low 16 bits, pixel2 from the high 16 bits.
void Load2Pixels(uint index, out float4 pixel1, out float4 pixel2)
{
    const uint4 packed = uint4(gs_cacheR[index], gs_cacheG[index], gs_cacheB[index], gs_cacheA[index]);

    // f16tof32 only looks at the low 16 bits, so the first pixel needs no masking.
    pixel1 = f16tof32(packed);
    pixel2 = f16tof32(packed >> 16);
}

// Stores a single pixel at full 32-bit precision: each channel's float bit pattern is
// written as-is into the matching LDS array.
void Store1Pixel(uint index, float4 pixel)
{
    const uint4 bits = asuint(pixel);

    gs_cacheR[index] = bits.x;
    gs_cacheG[index] = bits.y;
    gs_cacheB[index] = bits.z;
    gs_cacheA[index] = bits.w;
}


// Horizontal pass for two neighbouring pixels at once, which saves LDS reads and unpacking.
// Five packed entries starting at leftMostIndex give ten consecutive pixels t0..t9;
// t0..t8 produce the result stored at outIndex, t1..t9 the one stored at outIndex + 1.
void BlurHorizontally(uint outIndex, uint leftMostIndex)
{
    float4 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    Load2Pixels(leftMostIndex, t0, t1);
    Load2Pixels(leftMostIndex + 1, t2, t3);
    Load2Pixels(leftMostIndex + 2, t4, t5);
    Load2Pixels(leftMostIndex + 3, t6, t7);
    Load2Pixels(leftMostIndex + 4, t8, t9);

    const float4 blurredFirst = BlurPixels(t0, t1, t2, t3, t4, t5, t6, t7, t8);
    const float4 blurredSecond = BlurPixels(t1, t2, t3, t4, t5, t6, t7, t8, t9);

    Store1Pixel(outIndex, blurredFirst);
    Store1Pixel(outIndex + 1, blurredSecond);
}

// Gaussian kernel, horizontal blur stage (the load stage and the vertical stage are
// elided as "..." in this excerpt).
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_GAUSSIAN(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
    ...
    // Horizontally blur the pixels in LDS
    // row: each groupThreadId.y owns 16 LDS entries.
    // Read start: row + x + (x & 4). Threads x = 0..3 start at entries 0..3 of the row,
    // threads x = 4..7 get an extra +4 and start at entries 8..11.
    // Write start: row + 2 * x, because every thread writes two consecutive results.
    uint row = groupThreadId.y << 4u;
    BlurHorizontally(row + (groupThreadId.x << 1u), row + groupThreadId.x + (groupThreadId.x & 4u));

    // All horizontal results must be in LDS before any later stage reads them.
    GroupMemoryBarrierWithGroupSync();
    ...
}

BlurVertically

竖直方向上Blur就用刚刚水平方向上Blur完的LDS继续模糊即可。
以s4的像素作为中心点进行模糊，并把模糊的结果写入到像素对应的位置上。


// 9-tap blur using fixed weights that approximate a Gaussian curve.
// e is the centre tap; taps at the same distance from the centre share one weight.
// The weights are (70, 56, 28, 8, 1) / 256 and sum to 1 over the nine taps.
float4 BlurPixels(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h, float4 i)
{
    float4 acc = e * 0.27343750;
    acc += (d + f) * 0.21875000;
    acc += (c + g) * 0.10937500;
    acc += (b + h) * 0.03125000;
    acc += (a + i) * 0.00390625;
    return acc;
}

// Reads one unpacked pixel from LDS: each channel entry holds the raw bits of a 32-bit float.
void Load1Pixel(uint index, out float4 pixel)
{
    const uint4 bits = uint4(gs_cacheR[index], gs_cacheG[index], gs_cacheB[index], gs_cacheA[index]);
    pixel = asfloat(bits);
}

// Vertical pass: blurs a column of nine horizontally-blurred pixels read from LDS and
// writes the result to _Destination at pixelCoord. The fifth tap (c4) is the centre.
void BlurVertically(uint2 pixelCoord, uint topMostIndex)
{
    // The column is read with a step of 8 LDS entries between consecutive rows.
    const uint rowStride = 8u;

    float4 c0, c1, c2, c3, c4, c5, c6, c7, c8;
    Load1Pixel(topMostIndex, c0);
    Load1Pixel(topMostIndex + 1u * rowStride, c1);
    Load1Pixel(topMostIndex + 2u * rowStride, c2);
    Load1Pixel(topMostIndex + 3u * rowStride, c3);
    Load1Pixel(topMostIndex + 4u * rowStride, c4);
    Load1Pixel(topMostIndex + 5u * rowStride, c5);
    Load1Pixel(topMostIndex + 6u * rowStride, c6);
    Load1Pixel(topMostIndex + 7u * rowStride, c7);
    Load1Pixel(topMostIndex + 8u * rowStride, c8);

    // Write to the final target
    _Destination[pixelCoord] = BlurPixels(c0, c1, c2, c3, c4, c5, c6, c7, c8);
}

// Gaussian kernel, blur stages (the load stage is elided as "..." in this excerpt).
[numthreads(KERNEL_SIZE, KERNEL_SIZE, 1)]
void MAIN_GAUSSIAN(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint3 dispatchThreadId : SV_DispatchThreadID)
{
    ...
    // Horizontally blur the pixels in LDS
    uint row = groupThreadId.y << 4u;
    BlurHorizontally(row + (groupThreadId.x << 1u), row + groupThreadId.x + (groupThreadId.x & 4u));

    GroupMemoryBarrierWithGroupSync();
    
    // Vertically blur the pixels in LDS and write the result to memory
    // (groupThreadId.y << 3u) + groupThreadId.x is the LDS index of the top-most tap of
    // this thread's column. BlurVertically reads nine entries from there in steps of 8,
    // so the centre tap, i.e. this thread's own pixel, is at that index + 32.
    BlurVertically(dispatchThreadId.xy, (groupThreadId.y << 3u) + groupThreadId.x);
}

Copy Mip 0

在Copy Mip 0上面，除了可以用上面提到的第一次DownSample时拷贝的方式之外，还能够用Cmd.CopyTexture的方式。

// Compute-shader path, CopyTexture variant (excerpt, setup code elided as "...").
// Mip 0 is filled with cmd.CopyTexture instead of the COPY_MIP_0 keyword, whose calls are
// left commented out below for comparison. Every further mip is produced by a downsample
// dispatch into a temporary RT followed by a Gaussian blur dispatch into destination.
// Returns srcMipLevel + 1 once the loop has finished.
public int RenderColorGaussianPyramidCS(CommandBuffer cmd, Vector2Int size, Texture source, RenderTexture destination)
{
    ...
    
    // Copies src mip0 to dst mip0
    // A size.x by size.y region starting at (0, 0) is copied.
    cmd.CopyTexture(source, 0, 0, 0, 0, size.x, size.y, destination, 0, 0, 0, 0);

    // NOTE(review): finalTargetMipWidth/Height are only updated, never read, in the
    // visible part of this excerpt.
    int finalTargetMipWidth = destination.width;
    int finalTargetMipHeight = destination.height;

    var cs = m_ColorPyramidCS;
    bool isFirstLoop = true;
    bool switchFlag = false;
    // Note: smaller mips are excluded as we don't need them and the gaussian compute works
    // on 8x8 blocks
    while (srcMipWidth >= 8 || srcMipHeight >= 8)
    {
        int dstMipWidth = Mathf.Max(1, srcMipWidth >> 1);
        int dstMipHeight = Mathf.Max(1, srcMipHeight >> 1);

        // Downsample.
        // The first iteration reads source; later iterations ping-pong between the two
        // temporary downsample RTs.
        RenderTargetIdentifier sourceRTI, destinationRTI;
        if (isFirstLoop)
        {
            sourceRTI = source;
            destinationRTI = m_TempDownsamplePyramid0[rtIndex];
            // cmd.EnableKeyword(cs, this.copyMip0);
            // cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, "_Mip0", destination, 0);
        }
        else
        {
            if (switchFlag)
            {
                sourceRTI = m_TempDownsamplePyramid1[rtIndex];
                destinationRTI = m_TempDownsamplePyramid0[rtIndex];
            }
            else
            {
                sourceRTI = m_TempDownsamplePyramid0[rtIndex];
                destinationRTI = m_TempDownsamplePyramid1[rtIndex];
            }

            switchFlag = !switchFlag;
        }

        // _Size carries the source dimensions for the downsample kernel.
        this.size[0] = srcMipWidth;
        this.size[1] = srcMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Source, sourceRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidDownSampleKernel, ShaderIDs._Destination, destinationRTI);
        cmd.DispatchCompute(cs, m_ColorPyramidDownSampleKernel, HQUtils.DivRoundUp(dstMipWidth, 8), HQUtils.DivRoundUp(dstMipHeight, 8), 1);
        if (isFirstLoop)
        {
            // cmd.DisableKeyword(cs, this.copyMip0);
            isFirstLoop = false;
        }


        // Blur: _Size now carries the dimensions of the downsampled image, and the result
        // is written to the next mip of destination.
        this.size[0] = dstMipWidth;
        this.size[1] = dstMipHeight;
        cmd.SetComputeVectorParam(cs, ShaderIDs._Size, this.size);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Source, destinationRTI);
        cmd.SetComputeTextureParam(cs, m_ColorPyramidGaussianKernel, ShaderIDs._Destination, destination, srcMipLevel + 1);
        cmd.DispatchCompute(cs, m_ColorPyramidGaussianKernel, HQUtils.DivRoundUp(dstMipWidth, 8), HQUtils.DivRoundUp(dstMipHeight, 8), 1);

        srcMipLevel++;
        srcMipWidth = srcMipWidth >> 1;
        srcMipHeight = srcMipHeight >> 1;

        finalTargetMipWidth = finalTargetMipWidth >> 1;
        finalTargetMipHeight = finalTargetMipHeight >> 1;
    }

    return srcMipLevel + 1;
}

性能对比

上面是HDRP原来用PixelShader 生成ColorPyramid的耗时
下面是使用cmd.CopyTexture+Compute Shader Blur生成ColorPyramid的耗时

这个是不用cmd.CopyTexture生成ColorPyramid的耗时

测试用的显卡是RTX3080，一通操作下来减少DrawCall的调用优化了大概0.01ms左右，可以看到相比于不使用Cmd.CopyTexture的方式还能够提升0.004ms左右。以我目前贫瘠的硬件知识，我猜测通过Cmd.CopyTexture的方式减轻了Compute pipeline的压力，从而有所提升。