前些天想根据DX11的SDK自己写一个最简单的Direct Compute示例,中间又病倒了,耽搁了这些天,昨晚才调好。鉴于网上DX11的学习资料太少,在此总结一下。

  首先要说的是我的改写版本主要是为了代码上的简明,去掉了很多硬件检测,以便迅速明白dx11的Direct Compute的架构,所以在很多人的机器上怕一时还运行不了,这个可以先明白基本架构后再进行向下兼容。

  首先说下DirectX11的存储读写模型。在DX11的硬件中主要用buffer存储数据,device来进行计算,shader resource view和unordered access view则作为前两者沟通的桥梁:也就是device通过shader resource view来读buffer,通过unordered access view来读写buffer。

  我们要实现的功能是将CPU中的数据读写入GPU,然后在GPU中执行简单的计算:将两个buffer的数据相加到另一个buffer,再读回到CPU中以验证。在GPU中主要执行的计算是

struct BufType
{
    int i;
    float f;
};

BufferOut[DTid.x].i = Buffer0[DTid.x].i+Buffer1[DTid.x].i;

BufferOut[DTid.x].f = Buffer0[DTid.x].f+Buffer1[DTid.x].f;

 

  下面集中讲解下DX11的代码,虽然有点繁琐,但是记住我们的工作关键点大部分是依次初始化上面提到的几个关键接口:ID3D11Device、ID3D11Buffer、ID3D11ShaderResourceView、ID3D11UnorderedAccessView,抓住这条主线就不会乱了。

 

#include<stdio.h>
#include<crtdbg.h>
#include <d3dcommon.h>
#include <d3d11.h>
#include <d3dcompiler.h>
#include <d3dx11.h>
#include<iostream>
#include <algorithm>
using namespace std;

 

// Release a COM interface pointer and reset it to NULL (safe on NULL input).
#define SAFE_RELEASE(p) {if(p){(p)->Release();(p)=NULL;}}

// Number of elements in each buffer; also the number of thread groups dispatched.
const UINT NUM_ELEMENTS=512;

// For readability, all helper functions are declared up front.
// 1 - Create the D3D11 device (and its immediate context).
HRESULT CreateComputeDevice(ID3D11Device** ppDeviceOut,ID3D11DeviceContext** ppContextDeviceOut);

// 2 - Compile the HLSL file and create the compute-shader interface:
//     mainly D3DX11CompileFromFile() + ID3D11Device::CreateComputeShader().
HRESULT CreateComputeShader(LPCWSTR pSrcFile,LPCSTR pFunctionName,ID3D11Device* pDevice,ID3D11ComputeShader** ppShaderOut);

// 3 - Create a GPU structured buffer from CPU data (void* gInitData):
//     mainly ID3D11Device::CreateBuffer().
HRESULT CreateStructureBuffer(ID3D11Device* gDevice,UINT elementSize,UINT uCount,void* gInitData,ID3D11Buffer** ppBufferOut);

// 4 - Create a ShaderResourceView so the GPU can READ the buffer:
//     mainly ID3D11Device::CreateShaderResourceView().
HRESULT CreateBufferSRV(ID3D11Device* pDevice,ID3D11Buffer* pBuffer,ID3D11ShaderResourceView** ppSRVOut);

// 5 - Create an UnorderedAccessView so the GPU can READ and WRITE the buffer:
//     mainly ID3D11Device::CreateUnorderedAccessView().
HRESULT CreateBufferUAV(ID3D11Device* pDevice,ID3D11Buffer* pBuffer,ID3D11UnorderedAccessView** ppUAVOut);

// 6 - Bind the resources and run the compute shader (Dispatch).
void RunComputeShader(ID3D11DeviceContext* pImmediateContext,ID3D11ComputeShader* pComputeShader,UINT nNumViews,ID3D11ShaderResourceView** pShaderResourceViews,ID3D11UnorderedAccessView* pUnorderedView,
UINT X,UINT Y,UINT Z);

// 7 - Copy the GPU output buffer back into a CPU-readable staging buffer, so the
//     result can be verified: mainly ID3D11DeviceContext::CopyResource().
ID3D11Buffer* CreateAndCopyToDebugBuf(ID3D11Device* pDevice,ID3D11DeviceContext* pd3dImmediateContext,ID3D11Buffer* pBuffer);

// Global variables.

// Device-related interface pointers.
ID3D11Device* g_pDevice=NULL;
ID3D11DeviceContext* g_pContext=NULL;
ID3D11ComputeShader* g_pCS=NULL;

// Buffer pointers: two read-only inputs and one writable output.
ID3D11Buffer* g_pBuffer0=NULL;
ID3D11Buffer* g_pBuffer1=NULL;
ID3D11Buffer* g_pBufferResult=NULL;

// Views through which the shader accesses the buffers above
// (SRVs for the read-only inputs, a UAV for the writable output).
ID3D11ShaderResourceView* g_pBuf0SRV=NULL;
ID3D11ShaderResourceView* g_pBuf1SRV=NULL;
ID3D11UnorderedAccessView* g_pBufResultUAV=NULL;

// Element type stored in the buffers; must match the BufType struct declared
// in the HLSL shader (BasicCompute11.hlsl).
struct BufType
{
 int i;   // integer component
 float f; // float component
};

// CPU-side source data for the two input buffers.
BufType g_vBuf0[NUM_ELEMENTS];
BufType g_vBuf1[NUM_ELEMENTS];

int  main()
{
 //在debug模式下,开启run-time时的内存检查
#if defined(DEBUG)||defined(_DEBUG)
 _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF|_CRTDBG_LEAK_CHECK_DF);
#endif

 

 cout<<"创建设备"<<endl;
 if(FAILED(CreateComputeDevice(&g_pDevice,&g_pContext)))
 {
  printf("初始化设备失败");
  return 1;
 }
 cout<<"创建设备成功"<<endl;

 printf("创建Compute Shader/n");
 if(FAILED(CreateComputeShader(L"BasicCompute11.hlsl","CSMain",g_pDevice,&g_pCS)))
 {
  printf("创建Compute Shader失败");
  return 1;
 }
 cout<<"创建Compute Shader成功"<<endl;


 //初始化计算数据
 for(int i=0;i<NUM_ELEMENTS;i++)
 {
  g_vBuf0[i].i=i;
  g_vBuf0[i].f=(float)i;


  g_vBuf1[i].i=i;
  g_vBuf1[i].f=(float)i;
 }

 

 //为CPU中的数组创建GPU中相应Buffer
  printf("创建用于存储输入数组0的 structured buffer0/n");
 if(FAILED(CreateStructureBuffer(g_pDevice,sizeof(BufType),NUM_ELEMENTS,(void*)g_vBuf0,&g_pBuffer0)))
 {
  printf("创建buffer0失败/n");
  return 1;
 }
 printf("创建buffer0成功/n");

 

 printf("创建用于存储输入数组1的 structured buffer1/n");
 if(FAILED(CreateStructureBuffer(g_pDevice,sizeof(BufType),NUM_ELEMENTS,(void*)g_vBuf1,&g_pBuffer1)))
 {
  printf("创建buffer1失败/n");
  return 1;
 }
 printf("创建buffer1成功/n");

 

 printf("创建用于输出数组bufferOut的Unordered bufferOut/n");
 if(FAILED(CreateStructureBuffer(g_pDevice,sizeof(BufType),NUM_ELEMENTS,NULL,&g_pBufferResult)))
 {
  printf("创建bufferOut失败/n");
  return 1;
 }
 printf("创建bufferOut成功/n");

 

 //为buffer创建相应的resource view,以access buffer。
 printf("创建buffer0 shader resource view/n");
 if(FAILED(CreateBufferSRV(g_pDevice,g_pBuffer0,&g_pBuf0SRV)))
 {
  printf("创建 buffer0 SRV 失败/n");
  return 1;
 }
 printf("创建 buffer0 SRV 成功/n");

 

 printf("创建buffer1 shader resource view/n");
 if(FAILED(CreateBufferSRV(g_pDevice,g_pBuffer1,&g_pBuf1SRV)))
 {
  printf("创建 buffer1 SRV 失败/n");
  return 1;
 }
 printf("创建 buffer1 SRV 成功/n");


 printf("创建bufferOut Unordered Access View/n");
 if(FAILED(CreateBufferUAV(g_pDevice,g_pBufferResult,&g_pBufResultUAV)))
 {
  printf("创建bufferOut Unordered Access View失败/n");
  return 1;
 }
 printf("创建bufferOut Unordered Access View成功/n");


 ID3D11ShaderResourceView* shaderResourceViews[2]={g_pBuf0SRV,g_pBuf1SRV};
 //运行Shader Compute程序
 RunComputeShader(g_pContext,g_pCS,2,shaderResourceViews,g_pBufResultUAV,NUM_ELEMENTS,1,1);

 //将GPU计算的结果写回CPU。
 ID3D11Buffer* debugBuf=NULL;
 debugBuf=CreateAndCopyToDebugBuf(g_pDevice,g_pContext,g_pBufferResult);
 D3D11_MAPPED_SUBRESOURCE MappedResource;
 BufType* p;

 g_pContext->Map(debugBuf,0,D3D11_MAP_READ,0,&MappedResource);
 p=(BufType*)MappedResource.pData;

 

 printf("GPU计算完毕,输出前面30个结果如下:");
 cout<<"Output GPU compute results:"<<endl;
 for(int i=0;i<30;i++)
  cout<<p[i].i<<" "<<p[i].f<<" "<<endl;

 g_pContext->Unmap(debugBuf,0);
 SAFE_RELEASE(debugBuf);


 //释放资源
 cout<<"clean up"<<endl;
 SAFE_RELEASE(g_pBuf0SRV);
 SAFE_RELEASE(g_pBuf1SRV);
 SAFE_RELEASE(g_pBufResultUAV);

 SAFE_RELEASE(g_pBuffer0);
 SAFE_RELEASE(g_pBuffer1);
 SAFE_RELEASE(g_pBufferResult);

 SAFE_RELEASE(g_pCS);
 SAFE_RELEASE(g_pContext);
 SAFE_RELEASE(g_pDevice);

 return 0;

}

 

 

 

//--------------------------------------------------------------------------------------

//该函数其实和D3D11CreateDevice()一样,只是它在运行时动态加载d3d11.dll,方便那些没有安装D3D11.dll的用户

//--------------------------------------------------------------------------------------

//--------------------------------------------------------------------------------------
// Same signature/behavior as D3D11CreateDevice(), but resolves the entry point from
// d3d11.dll at run time, so the program fails gracefully (instead of not loading)
// on machines without the D3D11 runtime.
//--------------------------------------------------------------------------------------
HRESULT WINAPI Dynamic_D3D11CreateDevice( IDXGIAdapter* pAdapter,
                                          D3D_DRIVER_TYPE DriverType,
                                          HMODULE Software,
                                          UINT32 Flags,
                                          CONST D3D_FEATURE_LEVEL* pFeatureLevels,
                                          UINT FeatureLevels,
                                          UINT32 SDKVersion,
                                          ID3D11Device** ppDevice,
                                          D3D_FEATURE_LEVEL* pFeatureLevel,
                                          ID3D11DeviceContext** ppImmediateContext )
{
    typedef HRESULT (WINAPI * LPD3D11CREATEDEVICE)( IDXGIAdapter*, D3D_DRIVER_TYPE, HMODULE, UINT32, CONST D3D_FEATURE_LEVEL*, UINT, UINT32, ID3D11Device**, D3D_FEATURE_LEVEL*, ID3D11DeviceContext** );

    // Resolved once and cached across calls.
    static LPD3D11CREATEDEVICE s_DynamicD3D11CreateDevice = NULL;

    if ( s_DynamicD3D11CreateDevice == NULL )
    {
        // BUG FIX: the original never checked LoadLibrary()/GetProcAddress() and
        // would have called through a NULL pointer when d3d11.dll is missing.
        HMODULE hModD3D11 = LoadLibrary( L"d3d11.dll" );
        if ( hModD3D11 == NULL )
            return HRESULT_FROM_WIN32( GetLastError() );

        s_DynamicD3D11CreateDevice = ( LPD3D11CREATEDEVICE )GetProcAddress( hModD3D11, "D3D11CreateDevice" );
        if ( s_DynamicD3D11CreateDevice == NULL )
            return HRESULT_FROM_WIN32( GetLastError() );
    }

    return s_DynamicD3D11CreateDevice( pAdapter, DriverType, Software, Flags, pFeatureLevels, FeatureLevels,
                                       SDKVersion, ppDevice, pFeatureLevel, ppImmediateContext );
}

 

//--------------------------------------------------------------------------------------

// 下面就是真正的创建D3d11Device了,主要是创建ID3D11Device和ID3D11DeviceContext,为了代码的简明,假设硬件是

//支持D3D11的,免去了硬件的检查的繁琐代码。

//--------------------------------------------------------------------------------------

HRESULT CreateComputeDevice( ID3D11Device** ppDeviceOut, ID3D11DeviceContext** ppContextOut )

{    

BOOL bForceRef=FALSE;

 

*ppDeviceOut = NULL;

*ppContextOut = NULL;

 

HRESULT hr = S_OK;

 

UINT uCreationFlags = D3D11_CREATE_DEVICE_SINGLETHREADED;

#if defined(DEBUG) || defined(_DEBUG)

uCreationFlags |= D3D11_CREATE_DEVICE_DEBUG;

#endif

D3D_FEATURE_LEVEL flOut;

static const D3D_FEATURE_LEVEL flvl[] = { D3D_FEATURE_LEVEL_11_0, D3D_FEATURE_LEVEL_10_1, D3D_FEATURE_LEVEL_10_0 };

 

BOOL bNeedRefDevice = FALSE;

if ( !bForceRef )

{

hr = Dynamic_D3D11CreateDevice( NULL,                        // Use default graphics card

D3D_DRIVER_TYPE_HARDWARE,    // Try to create a hardware accelerated device

NULL,                        // Do not use external software rasterizer module

uCreationFlags,              // Device creation flags

flvl,

sizeof(flvl) / sizeof(D3D_FEATURE_LEVEL),

D3D11_SDK_VERSION,           // SDK version

ppDeviceOut,                 // Device out

&flOut,                      // Actual feature level created

ppContextOut );              // Context out

}

return hr;

}

 

 

//先后利用D3DX11CompileFromFile()和ID3D11Device::CreateComputeShader()来创建ID3D11ComputeShader*

// Compiles entry point pFunctionName from the HLSL file pSrcFile and creates the
// ID3D11ComputeShader via D3DX11CompileFromFile() + ID3D11Device::CreateComputeShader().
// On compile failure the compiler's error text is sent to the debugger output.
HRESULT CreateComputeShader(LPCWSTR pSrcFile,LPCSTR pFunctionName,
                            ID3D11Device* pDevice,ID3D11ComputeShader** ppShaderOut)
{
    DWORD compileFlags = D3DCOMPILE_ENABLE_STRICTNESS;
#if defined( DEBUG ) || defined( _DEBUG )
    // Embed debug information in the shader. This improves the shader debugging
    // experience while still allowing the shader to be optimized and to run
    // exactly as it will in the release configuration.
    compileFlags |= D3DCOMPILE_DEBUG;
#endif

    // Compile-time macro so the shader takes its structured-buffer code path.
    const D3D_SHADER_MACRO shaderDefines[] =
    {
        "USE_STRUCTURED_BUFFERS", "1",
        NULL, NULL
    };

    // Prefer cs_5_0: it performs best on D3D11-class hardware.
    LPCSTR targetProfile = "cs_5_0";

    ID3DBlob* errorBlob = NULL;
    ID3DBlob* codeBlob  = NULL;
    HRESULT hr = D3DX11CompileFromFile( pSrcFile, shaderDefines, NULL, pFunctionName, targetProfile,
                                        compileFlags, NULL, NULL, &codeBlob, &errorBlob, NULL );

    if ( SUCCEEDED(hr) )
    {
        hr = pDevice->CreateComputeShader( codeBlob->GetBufferPointer(), codeBlob->GetBufferSize(), NULL, ppShaderOut );
    }
    else if ( errorBlob )
    {
        // Forward the compiler diagnostics to the debugger output window.
        OutputDebugStringA( (char*)errorBlob->GetBufferPointer() );
    }

    SAFE_RELEASE( errorBlob );
    SAFE_RELEASE( codeBlob );
    return hr;
}

 

 

 

 

//利用ID3D11Device::CreateBuffer()在GPU中创建buffer以存储数据

//--------------------------------------------------------------------------------------
// Creates a GPU structured buffer of uCount elements of elementSize bytes each,
// bindable both as an SRV (read) and a UAV (read/write). When pInitData is non-NULL
// the buffer contents are initialized from it; otherwise the buffer is left
// uninitialized. Uses ID3D11Device::CreateBuffer().
//--------------------------------------------------------------------------------------
HRESULT CreateStructureBuffer(ID3D11Device* pDevice,UINT elementSize,UINT uCount,void* pInitData,ID3D11Buffer** ppBufferOut)
{
    *ppBufferOut=NULL;

    D3D11_BUFFER_DESC desc;
    ZeroMemory(&desc,sizeof(desc));   // Usage stays 0 == D3D11_USAGE_DEFAULT
    desc.BindFlags=D3D11_BIND_UNORDERED_ACCESS|D3D11_BIND_SHADER_RESOURCE;
    desc.ByteWidth=elementSize*uCount;
    desc.MiscFlags=D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
    desc.StructureByteStride=elementSize;

    if(pInitData)
    {
        // BUG FIX: InitData was previously left partially uninitialized
        // (SysMemPitch/SysMemSlicePitch held stack garbage); zero it first.
        D3D11_SUBRESOURCE_DATA InitData;
        ZeroMemory(&InitData,sizeof(InitData));
        InitData.pSysMem=pInitData;
        return pDevice->CreateBuffer(&desc,&InitData,ppBufferOut);
    }

    return pDevice->CreateBuffer(&desc,NULL,ppBufferOut);
}

 

 

 

//利用ID3D11Device::CreateShaderResouceView()来创建GPU中Buffer的resourceView

//--------------------------------------------------------------------------------------
// Creates a shader resource view so the GPU can *read* pBuffer. Only structured
// buffers are supported. Uses ID3D11Device::CreateShaderResourceView().
//--------------------------------------------------------------------------------------
HRESULT CreateBufferSRV(ID3D11Device* pDevice,ID3D11Buffer* pBuffer,ID3D11ShaderResourceView** ppSRVOut)
{
    D3D11_BUFFER_DESC descBuf;
    ZeroMemory(&descBuf,sizeof(descBuf));
    pBuffer->GetDesc(&descBuf);

    // BUG FIX: the original assumed a structured buffer; a non-structured buffer
    // has StructureByteStride==0 and the division below would divide by zero.
    if((descBuf.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_STRUCTURED)==0 || descBuf.StructureByteStride==0)
        return E_INVALIDARG;

    D3D11_SHADER_RESOURCE_VIEW_DESC desc;
    ZeroMemory(&desc,sizeof(desc));
    desc.ViewDimension=D3D11_SRV_DIMENSION_BUFFEREX;
    desc.BufferEx.FirstElement=0;
    desc.Format=DXGI_FORMAT_UNKNOWN;   // required format for structured buffers
    desc.BufferEx.NumElements=descBuf.ByteWidth / descBuf.StructureByteStride;

    return pDevice->CreateShaderResourceView(pBuffer,&desc,ppSRVOut);
}

 

 

 

 

//利用ID3D11Device::CreateUnorderedAccessView()来为buffer创建UnorderedAccessView

//--------------------------------------------------------------------------------------
// Creates an unordered access view so the GPU can *read and write* pBuffer. Only
// structured buffers are supported. Uses ID3D11Device::CreateUnorderedAccessView().
//--------------------------------------------------------------------------------------
HRESULT CreateBufferUAV(ID3D11Device* pDevice,ID3D11Buffer* pBuffer,ID3D11UnorderedAccessView** ppUAVOut)
{
    D3D11_BUFFER_DESC descBuf;
    ZeroMemory(&descBuf,sizeof(descBuf));
    pBuffer->GetDesc(&descBuf);

    // BUG FIX: the original assumed a structured buffer; a non-structured buffer
    // has StructureByteStride==0 and the division below would divide by zero.
    if((descBuf.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_STRUCTURED)==0 || descBuf.StructureByteStride==0)
        return E_INVALIDARG;

    D3D11_UNORDERED_ACCESS_VIEW_DESC desc;
    ZeroMemory(&desc,sizeof(desc));
    desc.ViewDimension=D3D11_UAV_DIMENSION_BUFFER;
    desc.Buffer.FirstElement=0;
    desc.Format=DXGI_FORMAT_UNKNOWN;   // required format for structured buffers
    desc.Buffer.NumElements=descBuf.ByteWidth/descBuf.StructureByteStride;

    return pDevice->CreateUnorderedAccessView(pBuffer,&desc,ppUAVOut);
}

/*下面是代码的核心,调用GPU来进行计算,关键是ID3D11DeviceContext::Dispatch(),不过关于GPU并行计算模型因为有一点点复杂,我们先不细说,大家先知道Direct Compute调用框架,下次有空再细聊其并行计算模型*/
void RunComputeShader(ID3D11DeviceContext* pImmediateContext,ID3D11ComputeShader* pComputeShader,UINT nNumViews,
 ID3D11ShaderResourceView** pShaderResourceViews,ID3D11UnorderedAccessView* pUnorderedView, 
 UINT X,UINT Y,UINT Z)
{
pImmediateContext->CSSetShader(pComputeShader,NULL,0);
pImmediateContext->CSSetShaderResources(0,nNumViews,pShaderResourceViews);
pImmediateContext->CSSetUnorderedAccessViews(0,1,&pUnorderedView,NULL);
pImmediateContext->Dispatch(NUM_ELEMENTS,1,1);
//清空Shader和各个ShaderView以及以及一些Constant Buffer
pImmediateContext->CSSetShader(NULL,NULL,0);
ID3D11UnorderedAccessView* ppUAViewNULL[1]={NULL};
pImmediateContext->CSSetUnorderedAccessViews(0,1,ppUAViewNULL,NULL);
ID3D11ShaderResourceView* ppSRVNULL[2]={NULL,NULL};
pImmediateContext->CSSetShaderResources(0,2,ppSRVNULL);
ID3D11Buffer* ppCBNULL[1]={NULL};
pImmediateContext->CSSetConstantBuffers(0,1,ppCBNULL);
}

 

//创建一个CPU可读的buffer,然后将GPU中的buffer内容导入到其中
//ID3D11DeviceContext::CopyResouce()
// Creates a CPU-readable (staging) clone of pBuffer and copies the GPU contents
// into it via ID3D11DeviceContext::CopyResource(). Returns NULL when the staging
// buffer cannot be created; otherwise the caller owns (and must Release) it.
ID3D11Buffer* CreateAndCopyToDebugBuf(ID3D11Device* pDevice,ID3D11DeviceContext* pd3dImmediateContext,ID3D11Buffer* pBuffer)
{
    // Start from the source buffer's description, then turn it into a staging buffer.
    D3D11_BUFFER_DESC stagingDesc;
    ZeroMemory(&stagingDesc,sizeof(stagingDesc));
    pBuffer->GetDesc(&stagingDesc);
    stagingDesc.CPUAccessFlags=D3D11_CPU_ACCESS_READ;   // CPU will Map() and read it
    stagingDesc.Usage=D3D11_USAGE_STAGING;              // GPU->CPU transfer resource
    stagingDesc.BindFlags=0;                            // staging buffers cannot be bound
    stagingDesc.MiscFlags=0;

    ID3D11Buffer* pStagingBuf=NULL;
    if(SUCCEEDED(pDevice->CreateBuffer(&stagingDesc,NULL,&pStagingBuf)))
    {
        pd3dImmediateContext->CopyResource(pStagingBuf,pBuffer);
    }
    return pStagingBuf;
}
  最后,HLSL端的Direct Compute着色器的详细代码如下:
// Element type shared with the C++ side (must match the C++ BufType layout).
struct BufType
{
    int i;
    float f;
};
// Read-only inputs, bound through shader resource views (slots t0/t1).
StructuredBuffer<BufType> Buffer0 : register(t0);
StructuredBuffer<BufType> Buffer1 : register(t1);
// Read/write output, bound through an unordered access view (slot u0).
RWStructuredBuffer<BufType> BufferOut : register(u0);
// One thread per group; the C++ side dispatches NUM_ELEMENTS groups, so
// SV_DispatchThreadID.x enumerates every element exactly once.
[numthreads(1, 1, 1)]
void CSMain( uint3 DTid : SV_DispatchThreadID )
{
    // Element-wise sum of the two inputs into the output.
    BufferOut[DTid.x].i = Buffer0[DTid.x].i+Buffer1[DTid.x].i;
    BufferOut[DTid.x].f = Buffer0[DTid.x].f+Buffer1[DTid.x].f;
}

 

posted on 2011-01-29 10:29  Bester  阅读(581)  评论(0编辑  收藏  举报