使用CUDA的线性代数库cuBLAS来计算矩阵乘法。这里主要记录调用规则,关于乘法函数中详细的参数说明和调用规则见另一篇随笔。
▶ 源代码:
1 #include <assert.h> 2 #include <helper_string.h> 3 #include <cuda_runtime.h> 4 #include <cublas_v2.h> 5 #include <helper_functions.h> 6 #include <helper_cuda.h> 7 8 #ifndef min 9 #define min(a,b) ((a < b) ? a : b) 10 #endif 11 #ifndef max 12 #define max(a,b) ((a > b) ? a : b) 13 #endif 14 15 // 存放各矩阵维数的结构体 16 typedef struct _matrixSize 17 { 18 unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; 19 } sMatrixSize; 20 21 // CPU 计算矩阵乘法。三个参数分别用于行定位、行程定位、列定位,没有查错机制。 22 void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB) 23 { 24 for (unsigned int i = 0; i < hA; ++i) // 从上往下数 i 行 25 { 26 for (unsigned int j = 0; j < wB; ++j) // 从左往右数 j 列 27 { 28 double sum = 0; 29 for (unsigned int k = 0; k < wA; ++k) // 行程长 30 { 31 double a = A[i * wA + k]; // 中间过程用 double,结果输出 float 32 double b = B[k * wB + j]; 33 sum += a * b; 34 } 35 C[i * wB + j] = (float)sum; 36 } 37 } 38 } 39 40 // 初始化数组 41 void randomInit(float *data, int size) 42 { 43 for (int i = 0; i < size; ++i) 44 data[i] = rand() / (float)RAND_MAX; 45 } 46 47 //输出两个矩阵的不相等的值及其位置,允许容差为 fListTol ,最多输出 iListLength 个 48 void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol) 49 { 50 printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol); 51 int i, j, k; 52 int error_count = 0; 53 54 for (j = 0; j < height; j++) 55 { 56 if (error_count < iListLength) 57 printf("\n Row %d:\n", j); 58 59 for (i = 0; i < width; i++) 60 { 61 k = j * width + i; 62 float fDiff = fabs(data1[k] - data2[k]); 63 if (fDiff > fListTol) 64 { 65 if (error_count < iListLength) 66 printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff); 67 error_count++; 68 } 69 } 70 } 71 printf(" \n Total Errors = %d\n", error_count); 72 } 73 74 // 初始化设备。包括选择设备,设定维数 75 void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size) 76 { 77 cudaError_t error; 78 79 // 选择设备,略去了检查错误部分 80 devID = 0; 81 if (checkCmdLineFlag(argc, (const char **)argv, "device")) 82 { 83 devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 84 error = cudaSetDevice(devID); 85 } 86 error = cudaGetDevice(&devID); 87 if (checkCmdLineFlag(argc, (const char **)argv, "sizemult")) 88 iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult"); 89 iSizeMultiple = max(min(iSizeMultiple, 10), 1); 90 cudaDeviceProp deviceProp; 91 error = cudaGetDeviceProperties(&deviceProp, devID); 92 printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 93 94 // Fermi 以上构架的计算机使用更大的线程块(blockDim),这里用了32 95 int block_size = (deviceProp.major < 2) ? 16 : 32; 96 97 matrix_size.uiWA = 3 * block_size * iSizeMultiple; 98 matrix_size.uiHA = 4 * block_size * iSizeMultiple; 99 matrix_size.uiWB = 2 * block_size * iSizeMultiple; 100 matrix_size.uiHB = 3 * block_size * iSizeMultiple; 101 matrix_size.uiWC = 2 * block_size * iSizeMultiple; 102 matrix_size.uiHC = 4 * block_size * iSizeMultiple; 103 104 printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", 105 matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC, matrix_size.uiWC); 106 if (matrix_size.uiWA != matrix_size.uiHB || matrix_size.uiHA != matrix_size.uiHC || matrix_size.uiWB != matrix_size.uiWC) 107 { 108 printf("ERROR: Matrix sizes do not match!\n"); 109 exit(-1); 110 } 111 } 112 113 // 计算矩阵乘法部分 114 int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) 115 { 116 cudaDeviceProp deviceProp; 117 cudaGetDeviceProperties(&deviceProp, devID); 118 int block_size = (deviceProp.major < 2) ? 16 : 32; 119 120 unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; 121 unsigned int mem_size_A = sizeof(float) * size_A; 122 float *h_A = (float *)malloc(mem_size_A); 123 unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; 124 unsigned int mem_size_B = sizeof(float) * size_B; 125 float *h_B = (float *)malloc(mem_size_B); 126 127 srand(2006); 128 randomInit(h_A, size_A); 129 randomInit(h_B, size_B); 130 131 float *d_A, *d_B, *d_C; 132 unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; 133 unsigned int mem_size_C = sizeof(float) * size_C; 134 //float *h_C = (float *) malloc(mem_size_C); // TM 没有用! 135 float *h_CUBLAS = (float *) malloc(mem_size_C); // 保存 d_C 回传的结果 136 cudaMalloc((void **) &d_A, mem_size_A); 137 cudaMalloc((void **) &d_B, mem_size_B); 138 cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 139 cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 140 cudaMalloc((void **) &d_C, mem_size_C); 141 142 dim3 threads(block_size, block_size); 143 dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); 144 145 printf("Computing result using CUBLAS..."); 146 int nIter = 30; 147 148 //cuBLAS代码块 149 { 150 const float alpha = 1.0f; 151 const float beta = 0.0f; 152 cublasHandle_t handle; 153 cudaEvent_t start, stop; 154 155 cublasCreate(&handle); 156 157 //热身,注意转置的问题,不采用 <<< >>> 调用核函数 158 cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 159 &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); 160 161 cudaEventCreate(&start); 162 cudaEventCreate(&stop); 163 164 cudaEventRecord(start, NULL); 165 166 for (int j = 0; j < nIter; j++) 167 { 168 cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, 169 &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); 170 } 171 printf("done.\n"); 172 cudaEventRecord(stop, NULL); 173 cudaEventSynchronize(stop); 174 float msecTotal = 0.0f; 175 cudaEventElapsedTime(&msecTotal, start, stop); 176 177 // 计算了耗时、操作数以及操作速度 178 float msecPerMatrixMul = msecTotal / nIter; 179 double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiHC * (double)matrix_size.uiWC * (double)matrix_size.uiHB; 180 double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 181 printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); 182 183 cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); 184 cublasDestroy(handle); 185 } 186 187 printf("Computing result using host CPU..."); 188 float *reference = (float *)malloc(mem_size_C); 189 matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); 190 printf("done.\n"); 191 192 bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); 193 194 if (resCUBLAS != true) 195 printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); 196 197 printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); 198 printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); 199 200 free(h_A); 201 free(h_B); 202 free(h_CUBLAS); 203 free(reference); 204 cudaFree(d_A); 205 cudaFree(d_B); 206 cudaFree(d_C); 207 208 if (resCUBLAS == true) 209 return EXIT_SUCCESS; 210 else 211 return EXIT_FAILURE; 212 } 213 214 int main(int argc, char **argv) 215 { 216 printf("[Matrix Multiply CUBLAS] - Starting...\n"); 217 218 int devID = 0, sizeMult = 5; 219 sMatrixSize matrix_size; 220 221 initializeCUDA(argc, argv, devID, sizeMult, matrix_size); 222 223 int matrix_result = matrixMultiply(argc, argv, devID, matrix_size); 224 225 getchar(); 226 return matrix_result; 227 }
▶ 输出结果:
[Matrix Multiply CUBLAS] - Starting... GPU Device 0: "GeForce GTX 1070" with compute capability 6.1 MatrixA(640,480), MatrixB(480,320), MatrixC(640,320) Computing result using CUBLAS...done. Performance= 2887.08 GFlop/s, Time= 0.068 msec, Size= 196608000 Ops Computing result using host CPU...done. Comparing CUBLAS Matrix Multiply with CPU results: PASS NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
▶ 涨姿势:
● 代码依然很烂。
多用了一个 h_C 根本没有用上,其作用被 h_CUBLAS 取代了,而且源代码中有free(h_C)却没有free(h_CUBLAS)。
1 float *h_C = (float *)malloc(mem_size_C); 2 ... 3 free(h_C);
● 句柄的创造与销毁,定义于cublas_api.h 中
1 /*cublas_api.h*/ 2 typedef struct cublasContext *cublasHandle_t; 3 4 /*cublas_v2.h*/ 5 #define cublasCreate cublasCreate_v2 6 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle); 7 8 #define cublasDestroy cublasDestroy_v2 9 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);
● 矩阵乘法计算核心函数。实际上该函数不是用来专门计算矩阵乘法的,而且对应不同的数据类型(实数、复数)和数据精度(单精度、双精度)一共有四个函数。
1 #define cublasSgemm cublasSgemm_v2 2 CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2 3 ( 4 cublasHandle_t handle, 5 cublasOperation_t transa, cublasOperation_t transb, 6 int m, int n, int k, 7 const float *alpha, 8 const float *A, int lda, 9 const float *B, int ldb, 10 const float *beta, 11 float *C, int ldc 12 );
● 定义在 helper_image.h 中的一个函数,用于比较两个长为 len 数组是否相等,允许容差为epsilon
1 inline bool sdkCompareL2fe(const float *reference, const float *data, const unsigned int len, const float epsilon)
● 调用cublas计算矩阵乘法的过程摘要(详细的参数说明和调用规则见另一篇随笔)
1 ...// 准备d_A,d_B,d_C,其中 d_A 和 d_B 中存放了需要相乘的两个矩阵,d_C初始化自动为零矩阵 2 3 // 规定使用的线程块和线程尺寸 4 dim3 threads(1, 1); 5 dim3 grid(1, 1); 6 7 // 常数因子,计算 d_C = d_A * d_B 时设定为 α = 1.0, β = 0.0 8 const float alpha = 1.0f; 9 const float beta = 0.0f; 10 11 // 创建句柄,需要在计算完成后销毁 12 cublasHandle_t handle; 13 cublasCreate(&handle); 14 15 // 调用计算函数,注意参数顺序 16 cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WB, HA, WA, &alpha, d_B, WB, d_A, WA, &beta, d_C, WB); 17 18 cublasDestroy(handle); 19 20 ...// 回收计算结果,顺序可以和销毁句柄交换