CUDA Speed Test
CPU ==> AMD X4 635
GPU ==> GeForce GT 240
Four very simple tests..
1. The dumbest algorithm: just a pile of FOR loops..
2. Four threads (the CPU has 4 physical cores).. each computes one strip of the result
3. GPU, split into a 64*64 grid of BLOCKs.. each BLOCK is 16*16 threads
4. The CUBLAS library
The results:
6687720.500000, 6869132.500000, 6410965.000000, 6952017.500000
TIMES: 47125
6687720.500000, 6869132.500000, 6410965.000000, 6952017.500000
TIMES: 14203
6687720.500000, 6869132.500000, 6410964.500000, 6952017.000000
TIMES: 328
6687720.500000, 6869132.500000, 6410964.500000, 6952017.000000
TIMES: 250
The time ratio is roughly 1885:570:13:10 .... the GPU really is pretty strong..
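Rough back-of-the-envelope: a 1024x1024 multiply is 2*1024^3 ≈ 2.15e9 floating-point ops, so the four times above work out to roughly 0.05, 0.15, 6.5 and 8.6 GFLOPS. Note that the GPU times still include the PCIe copies and context setup/teardown (and the CUBLAS run issues the GEMM twice), so the raw kernels are faster than those figures suggest.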
I didn't test Intel's library.. afraid it wouldn't be well supported on an AMD CPU.. will find another machine and try it some day.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include <conio.h>
#include <windows.h>
#include <process.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <device_launch_parameters.h>

#define TILE_WIDTH 16
#define MAX_DIM 1024

float MatrixA[MAX_DIM][MAX_DIM];
float MatrixB[MAX_DIM][MAX_DIM];
float MatrixC[MAX_DIM][MAX_DIM];

/* number of worker threads that have finished */
volatile LONG thr_run;

/* Fill both input matrices with random values */
void FillMatrix()
{
    register int i, j;
    srand( ( unsigned int )time( NULL ) );

    for ( i = 0; i < MAX_DIM; i ++ )
    {
        for ( j = 0; j < MAX_DIM; j ++ )
        {
            MatrixA[i][j] = ( float )rand() * rand() / 100 / RAND_MAX;
            MatrixB[i][j] = ( float )rand() * rand() / 100 / RAND_MAX;
        }
    }
}


/********************************************************************/

/* Runs on the CPU: the dumbest method, a plain triple loop */
void RunOnCPU()
{
    float sum;
    register int i, j, k;

    for ( i = 0; i < MAX_DIM; ++ i )
    {
        for ( j = 0; j < MAX_DIM; ++ j )
        {
            sum = 0;
            for ( k = 0; k < MAX_DIM; ++ k )
            {
                sum += MatrixA[i][k] * MatrixB[k][j];
            }
            MatrixC[i][j] = sum;
        }
    }
}


/********************************************************************/

/* Worker thread routine: each thread computes one quarter of the rows */
void CPUThread( void* arg )
{
    register int i, j, k;
    int dy, dy1;
    float mulResult;

    dy = ( MAX_DIM >> 2 ) * ( int )( intptr_t )arg;
    dy1 = dy + ( MAX_DIM >> 2 );

    for ( i = dy; i < dy1; i ++ )
    {
        for ( j = 0; j < MAX_DIM; j ++ )
        {
            mulResult = 0;
            for ( k = 0; k < MAX_DIM; k ++ )
            {
                mulResult += MatrixA[i][k] * MatrixB[k][j];
            }

            MatrixC[i][j] = mulResult;
        }
    }

    InterlockedIncrement( &thr_run );

    _endthread();
}


/* Runs on the CPU with exactly 4 threads (the X4 635 has 4 cores); all 4 cores sit at 100% */
void RunOnCPUMulThr()
{
    int i;
    uintptr_t ret;

    thr_run = 0;

    for ( i = 0; i < 4; i ++ )
    {
        ret = _beginthread( CPUThread, 0, ( void* )( intptr_t )i );
        assert( ret != ( uintptr_t )-1L );
    }

    while ( thr_run != 4 )
    {
        Sleep( 1 );
    }
}

/********************************************************************/

/* Runs on the GPU: tiled multiply using shared memory */
__global__ void Matrix_Mul1( float* c, const float* a, const float* b )
{
    unsigned int i, j, bx, by, tx, ty;
    float mulResult;
    __shared__ float d_m[TILE_WIDTH][TILE_WIDTH];
    __shared__ float d_n[TILE_WIDTH][TILE_WIDTH];

    bx = blockIdx.x;
    by = blockIdx.y;
    tx = threadIdx.x;
    ty = threadIdx.y;

    mulResult = 0.0f;

    /* walk across the row of tiles of A and down the column of tiles of B */
    for ( i = 0; i < gridDim.x; ++ i )
    {
        /* each thread stages one element of the current A tile and B tile */
        d_m[ty][tx] = *( a + ( by * blockDim.y + ty ) * MAX_DIM + i * blockDim.x + tx );
        d_n[ty][tx] = *( b + ( i * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx );
        __syncthreads();

        for ( j = 0; j < blockDim.x; ++ j )
        {
            mulResult += d_m[ty][j] * d_n[j][tx];
        }
        __syncthreads();
    }
    c[( by * blockDim.y + ty ) * MAX_DIM + bx * blockDim.x + tx] = mulResult;
}
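
/* A minimal sanity-check sketch (my addition, not part of the timed tests):
   the CPU and GPU results differ in the last bits (6410965.000000 vs
   6410964.500000 above), so a max-abs-diff helper, run against a saved copy
   of the CPU result, confirms the kernel really computes the same product. */
float MaxAbsDiff( const float* x, const float* y, int n )
{
    int i;
    float d, maxd = 0.0f;

    for ( i = 0; i < n; i ++ )
    {
        d = x[i] - y[i];
        if ( d < 0.0f )
        {
            d = -d;
        }
        if ( d > maxd )
        {
            maxd = d;
        }
    }
    return maxd;
}
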
void MatrixMul1( float* c, const float* a, const float* b )
{
    int cnt;
    float* dev_a;
    float* dev_b;
    float* dev_c;
    cudaError_t cudaStatus;
    /* a 64*64 grid of blocks, each block 16*16 threads */
    dim3 grid( MAX_DIM / TILE_WIDTH, MAX_DIM / TILE_WIDTH );
    dim3 blocks( TILE_WIDTH, TILE_WIDTH );

    cnt = MAX_DIM * MAX_DIM;
    dev_a = NULL;
    dev_b = NULL;
    dev_c = NULL;

    /* Select the GPU and build the context */
    cudaStatus = cudaSetDevice( 0 );
    assert( cudaStatus == cudaSuccess );

    /* Allocate device memory */
    cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    /* Copy the inputs from host memory to device memory */
    cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    /* Launch the kernel and check for launch errors */
    Matrix_Mul1 <<< grid, blocks >>> ( dev_c, dev_a, dev_b );
    cudaStatus = cudaGetLastError();
    assert( cudaStatus == cudaSuccess );

    /* Wait for the device to finish */
    cudaStatus = cudaDeviceSynchronize();
    assert( cudaStatus == cudaSuccess );

    /* Copy the result from device memory back to host memory */
    cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
    assert( cudaStatus == cudaSuccess );

    /* Free device memory */
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );

    /* Destroy the context */
    cudaDeviceReset();
}


/********************************************************************/

/* Using the CUBLAS library */
void MatrixMul2( float* c, const float* a, const float* b )
{
    int cnt;
    float* dev_a;
    float* dev_b;
    float* dev_c;
    cublasHandle_t handle;
    cublasStatus_t cuBlasStatus;
    cudaError_t cudaStatus;
    float alpha;
    float beta;

    dev_a = NULL;
    dev_b = NULL;
    dev_c = NULL;

    cnt = MAX_DIM * MAX_DIM;

    alpha = 1.0f;
    beta = 0.0f;

    /* Select the GPU and build the context */
    cudaStatus = cudaSetDevice( 0 );
    assert( cudaStatus == cudaSuccess );

    /* Initialize the CUBLAS library */
    cuBlasStatus = cublasCreate( &handle );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* Allocate device memory */
    cudaStatus = cudaMalloc( ( void** )&dev_c, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_a, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMalloc( ( void** )&dev_b, cnt * sizeof( float ) );
    assert( cudaStatus == cudaSuccess );

    /* Copy the inputs from host memory to device memory */
    cudaStatus = cudaMemcpy( dev_a, a, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    cudaStatus = cudaMemcpy( dev_b, b, cnt * sizeof( float ), cudaMemcpyHostToDevice );
    assert( cudaStatus == cudaSuccess );

    /* Run the GEMM. CUBLAS is column-major, so the row-major product
       C = A * B is computed as the column-major product B * A, which is
       why dev_b is passed first. The call is issued twice, as in the
       original test; with beta = 0 the second call simply overwrites
       dev_c with the same product. */
    cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                MAX_DIM, MAX_DIM, MAX_DIM, &alpha,
                                dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    cuBlasStatus = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                MAX_DIM, MAX_DIM, MAX_DIM, &alpha,
                                dev_b, MAX_DIM, dev_a, MAX_DIM, &beta, dev_c, MAX_DIM );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* Copy the result from device memory back to host memory */
    cudaStatus = cudaMemcpy( c, dev_c, cnt * sizeof( float ), cudaMemcpyDeviceToHost );
    assert( cudaStatus == cudaSuccess );

    /* Free device memory and shut down CUBLAS */
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );

    cuBlasStatus = cublasDestroy( handle );
    assert( cuBlasStatus == CUBLAS_STATUS_SUCCESS );

    /* Destroy the context */
    cudaDeviceReset();
}


/********************************************************************/

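/* GetTickCount() only resolves about 10-16 ms, which is coarse next to the
   328 ms GPU figure. A minimal sketch for timing just the kernel, excluding
   the copies and context setup, would be to wrap the launch in MatrixMul1
   with CUDA events:

    cudaEvent_t evStart, evStop;
    float ms;

    cudaEventCreate( &evStart );
    cudaEventCreate( &evStop );

    cudaEventRecord( evStart, 0 );
    Matrix_Mul1 <<< grid, blocks >>> ( dev_c, dev_a, dev_b );
    cudaEventRecord( evStop, 0 );
    cudaEventSynchronize( evStop );

    cudaEventElapsedTime( &ms, evStart, evStop );

    cudaEventDestroy( evStart );
    cudaEventDestroy( evStop );
*/
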
int main()
{
    DWORD dwTime1, dwTime2;

    FillMatrix();

    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    RunOnCPU();
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    RunOnCPUMulThr();
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    MatrixMul1( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    memset( MatrixC, 0, sizeof( MatrixC ) );
    dwTime1 = GetTickCount();
    MatrixMul2( ( float* )MatrixC, ( const float* )MatrixA, ( const float* )MatrixB );
    dwTime2 = GetTickCount() - dwTime1;
    printf( "%f, %f, %f, %f\nTIMES: %lu\n\n", MatrixC[0][0], MatrixC[512][512], MatrixC[1023][1023], MatrixC[217][13], dwTime2 );

    _getch();

    return 0;
}
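
Build note: the code is Windows-only as written (conio.h, process.h, windows.h). With an older CUDA toolkit that still targets the GT 240's compute capability 1.2, a command line along these lines should work, assuming the source is saved as matmul.cu:

nvcc -O2 -arch=sm_12 matmul.cu -lcublas -o matmul.exe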