// CUDA matrix multiplication (CUDA矩阵乘法)
// Tiled shared-memory matrix multiply on the GPU, validated against a
// naive CPU reference. Matrices are n x n, row-major, single precision.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Tile edge length: each block of BLOCK_SIZE x BLOCK_SIZE threads
// computes one BLOCK_SIZE x BLOCK_SIZE tile of C.
#define BLOCK_SIZE 32

// Abort with file/line and the runtime's error string on any CUDA failure.
// Kernel launches are checked via cudaGetLastError() after the launch.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/* Fill an n x n matrix (leading dimension lda, in elements) with small
 * pseudo-random values. The second term adds sub-ulp noise so entries are
 * not exact multiples of 1/RAND_MAX.
 * Fix: RAND_MAX * RAND_MAX overflowed int (UB); compute it in double. */
void matgen(float* a, int lda, int n)
{
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            a[i * lda + j] = (float)rand() / RAND_MAX +
                (float)rand() / ((double)RAND_MAX * RAND_MAX);
        }
    }
}

/* Naive triple-loop reference: c = a * b. Accumulates each dot product in
 * double for accuracy. Returns elapsed clock() ticks (NOT milliseconds;
 * callers must scale by CLOCKS_PER_SEC). */
clock_t matmultCPU(const float* a, int lda, const float* b, int ldb,
                   float* c, int ldc, int n)
{
    clock_t start = clock();
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double t = 0;
            for (int k = 0; k < n; k++) {
                t += a[i * lda + k] * b[k * ldb + j];
            }
            c[i * ldc + j] = (float)t;  /* explicit narrowing */
        }
    }
    return clock() - start;
}

/* Print the maximum and mean relative error between a and b.
 * Entries of b that are exactly zero are skipped (no defined relative
 * error); the mean is still taken over all n*n entries, matching the
 * original behavior. */
void compare_mat(const float* a, int lda,
                 const float* b, int ldb, int n)
{
    float max_err = 0;
    float average_err = 0;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            if (b[i * ldb + j] != 0) {
                float err = (float)fabs((a[i * lda + j] - b[i * ldb + j]) /
                                        b[i * ldb + j]);
                if (max_err < err) max_err = err;
                average_err += err;
            }
        }
    }
    printf("Max error: %g Average error: %g\n",
           max_err, average_err / ((double)n * n));
}

/* Tiled multiply kernel: C = A * B, pitches lda/ldb/ldc in ELEMENTS.
 * Expects blockDim = (BLOCK_SIZE, BLOCK_SIZE) and
 * gridDim = (ceil(n/BLOCK_SIZE), ceil(n/BLOCK_SIZE)).
 * Each thread accumulates one C element with Kahan compensated summation
 * to reduce float rounding error. Edge tiles are zero-padded in shared
 * memory so the inner loop needs no bounds checks. */
__global__ static void matMultCUDA(const float* a, size_t lda,
                                   const float* b, size_t ldb,
                                   float* c, size_t ldc, int n)
{
    __shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
    const int tidc = threadIdx.x;              /* column within the tile */
    const int tidr = threadIdx.y;              /* row within the tile    */
    const int bidc = blockIdx.x * BLOCK_SIZE;  /* tile's first column    */
    const int bidr = blockIdx.y * BLOCK_SIZE;  /* tile's first row       */

    float results = 0;  /* running sum for this C element   */
    float comp = 0;     /* Kahan compensation (negated err) */

    for (int j = 0; j < n; j += BLOCK_SIZE) {
        /* Stage one tile of A and one of B; out-of-range elements load
         * as zero so they contribute nothing to the products. */
        matA[tidr][tidc] = (tidr + bidr < n && tidc + j < n)
            ? a[(tidr + bidr) * lda + (tidc + j)] : 0;
        matB[tidr][tidc] = (tidr + j < n && tidc + bidc < n)
            ? b[(tidr + j) * ldb + (tidc + bidc)] : 0;
        __syncthreads();  /* tiles fully written before anyone reads */

        for (int i = 0; i < BLOCK_SIZE; i++) {
            /* Kahan step: comp accumulates the negated rounding error
             * of each addition and feeds it back into the next one. */
            comp -= matA[tidr][i] * matB[i][tidc];
            float t = results - comp;
            comp = (t - results) + comp;
            results = t;
        }
        __syncthreads();  /* done reading before the next load overwrites */
    }

    if (tidr + bidr < n && tidc + bidc < n) {
        c[(tidr + bidr) * ldc + (tidc + bidc)] = results;
    }
}

/* Host wrapper: allocate pitched device buffers, upload a and b, launch
 * the tiled kernel, and download the product into c. Returns elapsed
 * clock() ticks including allocation and transfers (callers must scale
 * by CLOCKS_PER_SEC). All CUDA calls are error-checked. */
clock_t matmultCUDA(const float* a, int lda,
                    const float* b, int ldb, float* c, int ldc, int n)
{
    float *ac, *bc, *cc;
    size_t pitch_a, pitch_b, pitch_c;
    clock_t start = clock();

    /* Pitched allocations keep every device row aligned for coalescing;
     * the pitch (in bytes) replaces the host leading dimension. */
    CUDA_CHECK(cudaMallocPitch((void**)&ac, &pitch_a, sizeof(float) * n, n));
    CUDA_CHECK(cudaMallocPitch((void**)&bc, &pitch_b, sizeof(float) * n, n));
    CUDA_CHECK(cudaMallocPitch((void**)&cc, &pitch_c, sizeof(float) * n, n));

    CUDA_CHECK(cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda,
                            sizeof(float) * n, n, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb,
                            sizeof(float) * n, n, cudaMemcpyHostToDevice));

    int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;  /* ceil(n / BLOCK_SIZE) */
    dim3 blocks(bx, bx);
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    /* Device pitches are passed in elements, as the kernel expects. */
    matMultCUDA<<<blocks, threads>>>(ac, pitch_a / sizeof(float),
                                     bc, pitch_b / sizeof(float),
                                     cc, pitch_c / sizeof(float), n);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configuration */

    /* Blocking D2H copy also synchronizes with the kernel and surfaces
     * any asynchronous execution error. */
    CUDA_CHECK(cudaMemcpy2D(c, sizeof(float) * ldc, cc, pitch_c,
                            sizeof(float) * n, n, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(ac));
    CUDA_CHECK(cudaFree(bc));
    CUDA_CHECK(cudaFree(cc));
    return clock() - start;
}

/* Multiply two random 1024 x 1024 matrices on CPU and GPU, compare the
 * results, and report the relative error and wall times. */
int main(void)
{
    int n = 1024;
    float* a = (float*)malloc(sizeof(float) * n * n);
    float* b = (float*)malloc(sizeof(float) * n * n);
    float* c = (float*)malloc(sizeof(float) * n * n);  /* GPU result */
    float* d = (float*)malloc(sizeof(float) * n * n);  /* CPU result */
    if (!a || !b || !c || !d) {
        fprintf(stderr, "host allocation failed\n");
        free(a); free(b); free(c); free(d);
        return EXIT_FAILURE;
    }

    srand(0);  /* fixed seed: reproducible inputs */
    matgen(a, n, n);
    matgen(b, n, n);

    clock_t cpu_time = matmultCPU(a, n, b, n, d, n, n);
    clock_t gpu_time = matmultCUDA(a, n, b, n, c, n, n);
    compare_mat(c, n, d, n, n);

    /* Fix: raw clock_t ticks were printed with %d and labeled "ms";
     * scale by CLOCKS_PER_SEC to get actual milliseconds. */
    printf("CPU time used: %.0fms \n", cpu_time * 1000.0 / CLOCKS_PER_SEC);
    printf("GPU time used: %.0fms \n", gpu_time * 1000.0 / CLOCKS_PER_SEC);

    free(a);
    free(b);
    free(c);
    free(d);
#ifdef _WIN32
    system("pause");  /* keep the console window open on Windows only */
#endif
    return 0;
}
// Reference: 《深入浅出谈CUDA》 (an introductory CUDA tutorial, in Chinese)