cuda core实现两个128x128 float矩阵乘法demo
#include <stdio.h>
#include <cuda_runtime.h>
// 128 x 128 ->
__global__ void mm(float* a, float* b, float* c) {
// 8 x 8个方块,每个方块16x16
extern __shared__ float buf[];
float* a_local = buf;
float* b_local = buf + 16*128;
for(int i=0; i<8; i++) {
a_local[threadIdx.x + i*16 + threadIdx.y*128] = a[threadIdx.x + i*16 + threadIdx.y*128 + blockIdx.y*128*16];
b_local[(threadIdx.y + i*16)*16 + threadIdx.x] = b[(threadIdx.y + i*16)*128 + threadIdx.x + blockIdx.x*16];
}
__syncthreads();
float tmp = 0.0f;
for(int k=0; k<128; k++) tmp += a_local[threadIdx.y*128+k]*b_local[k*16+threadIdx.x];
c[(blockIdx.y*16+threadIdx.y)*128+blockIdx.x*16+threadIdx.x] = tmp;
}
#define CHECK_ERROR(expr) { \
cudaError_t err = expr; \
if(err != cudaSuccess) { \
fprintf(stderr, "[Error] %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
} \
}
#define A(i,j) a[i*128+j]
#define B(i,j) b[i*128+j]
#define C(i,j) c[i*128+j]
#define G(i,j) golden[i*128+j]
int main() {
// int deviceId = 0;
// CHECK_ERROR(cudaSetDevice(deviceId));
constexpr size_t size = 128*128*sizeof(float);
float* a = (float*)malloc(size);
float* b = (float*)malloc(size);
float* c = (float*)malloc(size);
float* golden = (float*)malloc(size);
// generate input data and golden
for(int i=0; i<128; i++) {
for(int j=0; j<128; j++) {
A(i,j) = (float)(random()%1024);
B(i,j) = (float)(random()%1024);
}
}
for(int i=0; i<128; i++) {
for(int j=0; j<128; j++) {
G(i,j) = 0.0f;
for(int k=0; k<128; k++) {
G(i,j) += A(i,k)*B(k,j);
}
}
}
float *a_d, *b_d, *c_d;
CHECK_ERROR(cudaMalloc((void**)&a_d, size));
CHECK_ERROR(cudaMalloc((void**)&b_d, size));
CHECK_ERROR(cudaMalloc((void**)&c_d, size));
cudaStream_t stream;
CHECK_ERROR( cudaStreamCreate(&stream) );
CHECK_ERROR( cudaMemcpy(a_d, a, size, cudaMemcpyHostToDevice) );
CHECK_ERROR( cudaMemcpy(b_d, b, size, cudaMemcpyHostToDevice) );
mm<<<dim3(8,8,1), dim3(16, 16, 1), 16*128*2*4, stream>>>(a_d, b_d, c_d);
{
cudaError_t err = cudaGetLastError();
if(err!=cudaSuccess) {
fprintf(stderr, "[Error] %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err));
}
}
CHECK_ERROR(cudaMemcpy(c, c_d, size, cudaMemcpyDeviceToHost));
CHECK_ERROR( cudaStreamSynchronize(stream) );
//check result
float res = 0.0f;
for(int i=0; i<128; i++)
for(int j=0; j<128; j++) res += fabs(G(i,j) - C(i,j));
for(int i=0; i<10; i++)
printf("golden[%d]: %f vs real[%d]: %f \n", i, golden[i], i, c[i]);
if(res < 1.0e-2) printf("test pass!\n");
else {
printf("test fail! res = %f\n", res);
}
free(a); free(b); free(c); free(golden);
cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
return 0;
}
采用8x8的block, 每个block中完成c矩阵中16x16
编译执行结果。
$ nvcc mmad.cu -Xptxas -v
$ ./a.out
golden[0]: 32589786.000000 vs real[0]: 32589786.000000
golden[1]: 38473160.000000 vs real[1]: 38473160.000000
golden[2]: 30227116.000000 vs real[2]: 30227116.000000
golden[3]: 28977550.000000 vs real[3]: 28977550.000000
golden[4]: 34897048.000000 vs real[4]: 34897048.000000
golden[5]: 36245064.000000 vs real[5]: 36245064.000000
golden[6]: 31798204.000000 vs real[6]: 31798204.000000
golden[7]: 30707464.000000 vs real[7]: 30707464.000000
golden[8]: 34893612.000000 vs real[8]: 34893612.000000
golden[9]: 36354168.000000 vs real[9]: 36354168.000000
test pass!