A demo: multiplying two 128x128 float matrices with CUDA cores

#include <stdio.h>
#include <stdlib.h>   // random()
#include <math.h>     // fabs()
#include <cuda_runtime.h>

// C = A * B for two 128x128 row-major float matrices
__global__ void mm(float* a, float* b, float* c) {
  // 8 x 8 grid of blocks; each block computes one 16x16 tile of C
  extern __shared__ float buf[];
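  // dynamic shared memory: a 16x128 row-tile of A followed by a 128x16 column-tile of B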
  float* a_local = buf;
  float* b_local = buf + 16*128;
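  // each of the 256 threads loads 8 elements of each tile (16*128 / 256 = 8)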
  for(int i=0; i<8; i++) {
    a_local[threadIdx.x + i*16 + threadIdx.y*128] = a[threadIdx.x + i*16 + threadIdx.y*128 + blockIdx.y*128*16];
    b_local[(threadIdx.y + i*16)*16 + threadIdx.x] = b[(threadIdx.y + i*16)*128 + threadIdx.x + blockIdx.x*16];
  }
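  // wait until both tiles are fully resident in shared memory before computing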
  __syncthreads();
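  // each thread computes one element of the block's 16x16 output tile as a
  // full 128-element dot product of its row of a_local with its column of b_local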
  float tmp = 0.0f;
  for(int k=0; k<128; k++) tmp += a_local[threadIdx.y*128+k]*b_local[k*16+threadIdx.x];
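  // store to C at global row blockIdx.y*16+threadIdx.y, column blockIdx.x*16+threadIdx.x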
  c[(blockIdx.y*16+threadIdx.y)*128+blockIdx.x*16+threadIdx.x] = tmp;
}

#define CHECK_ERROR(expr) do { \
	cudaError_t err = (expr); \
	if(err != cudaSuccess) { \
		fprintf(stderr, "[Error] %s:%d %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
	} \
} while(0)

// row-major accessors for the 128x128 host buffers
#define A(i,j) a[(i)*128+(j)]
#define B(i,j) b[(i)*128+(j)]
#define C(i,j) c[(i)*128+(j)]
#define G(i,j) golden[(i)*128+(j)]

int main() {
	// int deviceId = 0;
	// CHECK_ERROR(cudaSetDevice(deviceId));
	constexpr size_t size = 128*128*sizeof(float);
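	// each 128x128 matrix occupies 128*128*4 = 64 KB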
	float* a = (float*)malloc(size);
	float* b = (float*)malloc(size);
	float* c = (float*)malloc(size);
	float* golden = (float*)malloc(size);


	// generate the input data and the golden reference
	for(int i=0; i<128; i++) {
		for(int j=0; j<128; j++) {
			A(i,j) = (float)(random()%1024);
			B(i,j) = (float)(random()%1024);
		}
	}

	for(int i=0; i<128; i++) {
		for(int j=0; j<128; j++) {
			G(i,j) = 0.0f;
			for(int k=0; k<128; k++) {
				G(i,j) += A(i,k)*B(k,j);
			}
		}
	}

	float *a_d, *b_d, *c_d;
	CHECK_ERROR(cudaMalloc((void**)&a_d, size));
	CHECK_ERROR(cudaMalloc((void**)&b_d, size));
	CHECK_ERROR(cudaMalloc((void**)&c_d, size));
	cudaStream_t stream;
	CHECK_ERROR(cudaStreamCreate(&stream));
	CHECK_ERROR( cudaMemcpy(a_d, a, size, cudaMemcpyHostToDevice) );
	CHECK_ERROR( cudaMemcpy(b_d, b, size, cudaMemcpyHostToDevice) );
    
	// 8x8 grid of 16x16-thread blocks; dynamic shared memory holds a_local + b_local
	mm<<<dim3(8,8,1), dim3(16,16,1), 2*16*128*sizeof(float), stream>>>(a_d, b_d, c_d);
	CHECK_ERROR(cudaGetLastError());
	
	// wait for the kernel on `stream` to finish, then copy the result back
	CHECK_ERROR(cudaStreamSynchronize(stream));
	CHECK_ERROR(cudaMemcpy(c, c_d, size, cudaMemcpyDeviceToHost));
	// check the result against the golden reference
	float res = 0.0f;
	for(int i=0; i<128; i++)
		for(int j=0; j<128; j++) res += fabs(G(i,j) - C(i,j));
	for(int i=0; i<10; i++)
		printf("golden[%d]: %f vs real[%d]: %f \n", i, golden[i], i, c[i]);

	if(res < 1.0e-2) printf("test pass!\n");
	else printf("test fail! res = %f\n", res);

	free(a); free(b); free(c); free(golden);
	cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
	cudaStreamDestroy(stream);
	return 0;
}

The kernel uses an 8x8 grid of blocks; each block computes one 16x16 tile of the C matrix.
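For comparison, a non-tiled kernel with the same thread-to-output mapping could look like the sketch below (mm_naive is a hypothetical name, not part of the demo). The shared-memory version above loads each global element into shared memory once per block, whereas the naive version reads it from global memory 16 times:

__global__ void mm_naive(float* a, float* b, float* c) {
  int row = blockIdx.y*16 + threadIdx.y;  // row of C owned by this thread
  int col = blockIdx.x*16 + threadIdx.x;  // column of C owned by this thread
  float tmp = 0.0f;
  for(int k=0; k<128; k++) tmp += a[row*128+k]*b[k*128+col];
  c[row*128+col] = tmp;
}

It would be launched with the same geometry, e.g. mm_naive<<<dim3(8,8,1), dim3(16,16,1)>>>(a_d, b_d, c_d);, and needs no dynamic shared memory.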
Compile and run:

$ nvcc mmad.cu -Xptxas -v 
$ ./a.out
golden[0]: 32589786.000000 vs real[0]: 32589786.000000 
golden[1]: 38473160.000000 vs real[1]: 38473160.000000 
golden[2]: 30227116.000000 vs real[2]: 30227116.000000 
golden[3]: 28977550.000000 vs real[3]: 28977550.000000 
golden[4]: 34897048.000000 vs real[4]: 34897048.000000 
golden[5]: 36245064.000000 vs real[5]: 36245064.000000 
golden[6]: 31798204.000000 vs real[6]: 31798204.000000 
golden[7]: 30707464.000000 vs real[7]: 30707464.000000 
golden[8]: 34893612.000000 vs real[8]: 34893612.000000 
golden[9]: 36354168.000000 vs real[9]: 36354168.000000 
test pass!