CUDA Matrix Multiply
1 /** 2 * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 3 * 4 * Please refer to the NVIDIA end user license agreement (EULA) associated 5 * with this source code for terms and conditions that govern your use of 6 * this software. Any use, reproduction, disclosure, or distribution of 7 * this software and related documentation outside the terms of the EULA 8 * is strictly prohibited. 9 * 10 */ 11 12 /** 13 * Matrix multiplication: C = A * B. 14 * Host code. 15 * 16 * This sample implements matrix multiplication as described in Chapter 3 17 * of the programming guide. 18 * It has been written for clarity of exposition to illustrate various CUDA 19 * programming principles, not with the goal of providing the most 20 * performant generic kernel for matrix multiplication. 21 * 22 * See also: 23 * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra," 24 * in Proc. 2008 ACM/IEEE Conf. on Superconducting (SC '08), 25 * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11. 26 */ 27 28 // System includes 29 #include <stdio.h> 30 #include <assert.h> 31 32 // CUDA runtime 33 #include <cuda_runtime.h> 34 35 // Helper functions and utilities to work with CUDA 36 #include <helper_functions.h> 37 38 /** 39 * Matrix multiplication (CUDA Kernel) on the device: C = A * B 40 * wA is A's width and wB is B's width 41 */ 42 template <int BLOCK_SIZE> __global__ void 43 matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) 44 { 45 // Block index 46 int bx = blockIdx.x; 47 int by = blockIdx.y; 48 49 // Thread index 50 int tx = threadIdx.x; 51 int ty = threadIdx.y; 52 53 // Index of the first sub-matrix of A processed by the block 54 int aBegin = wA * BLOCK_SIZE * by; 55 56 // Index of the last sub-matrix of A processed by the block 57 int aEnd = aBegin + wA - 1; 58 59 // Step size used to iterate through the sub-matrices of A 60 int aStep = BLOCK_SIZE; 61 62 // Index of the first sub-matrix of B processed by the block 63 int bBegin = BLOCK_SIZE * bx; 64 65 // Step size used to iterate through the sub-matrices of B 66 int bStep = BLOCK_SIZE * wB; 67 68 // Csub is used to store the element of the block sub-matrix 69 // that is computed by the thread 70 float Csub = 0; 71 72 // Loop over all the sub-matrices of A and B 73 // required to compute the block sub-matrix 74 for (int a = aBegin, b = bBegin; 75 a <= aEnd; 76 a += aStep, b += bStep) 77 { 78 79 // Declaration of the shared memory array As used to 80 // store the sub-matrix of A 81 __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; 82 83 // Declaration of the shared memory array Bs used to 84 // store the sub-matrix of B 85 __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; 86 87 // Load the matrices from device memory 88 // to shared memory; each thread loads 89 // one element of each matrix 90 As[ty][tx] = A[a + wA * ty + tx]; 91 Bs[ty][tx] = B[b + wB * ty + tx]; 92 93 // Synchronize to make sure the matrices are loaded 94 __syncthreads(); 95 96 // Multiply the two matrices together; 97 // each thread computes one element 98 // of the block sub-matrix 99 #pragma unroll 100 101 for (int k = 0; k < BLOCK_SIZE; ++k) 102 { 103 Csub += As[ty][k] * Bs[k][tx]; 104 } 105 106 // Synchronize to make sure that the preceding 107 // computation is done before loading two new 108 // sub-matrices of A and B in the next iteration 109 __syncthreads(); 110 } 111 112 // Write the block sub-matrix to device memory; 113 // each thread writes one element 114 int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; 115 C[c + wB * ty + tx] = Csub; 116 } 117 118 void constantInit(float *data, int size, float val) 119 { 120 for (int i = 0; i < size; ++i) 121 { 122 data[i] = val; 123 } 124 } 125 126 /** 127 * Run a simple test of matrix multiplication using CUDA 128 */ 129 int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) 130 { 131 // Allocate host memory for matrices A and B 132 unsigned int size_A = dimsA.x * dimsA.y; 133 unsigned int mem_size_A = sizeof(float) * size_A; 134 float *h_A = (float *)malloc(mem_size_A); 135 unsigned int size_B = dimsB.x * dimsB.y; 136 unsigned int mem_size_B = sizeof(float) * size_B; 137 float *h_B = (float *)malloc(mem_size_B); 138 139 // Initialize host memory 140 const float valB = 0.01f; 141 constantInit(h_A, size_A, 1.0f); 142 constantInit(h_B, size_B, valB); 143 144 // Allocate device memory 145 float *d_A, *d_B, *d_C; 146 147 // Allocate host matrix C 148 dim3 dimsC(dimsB.x, dimsA.y, 1); 149 unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); 150 float *h_C = (float *) malloc(mem_size_C); 151 152 if (h_C == NULL) 153 { 154 fprintf(stderr, "Failed to allocate host matrix C!\n"); 155 exit(EXIT_FAILURE); 156 } 157 158 cudaError_t error; 159 160 error = cudaMalloc((void **) &d_A, mem_size_A); 161 162 if (error != cudaSuccess) 163 { 164 printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__); 165 exit(EXIT_FAILURE); 166 } 167 168 error = cudaMalloc((void **) &d_B, mem_size_B); 169 170 if (error != cudaSuccess) 171 { 172 printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__); 173 exit(EXIT_FAILURE); 174 } 175 176 error = cudaMalloc((void **) &d_C, mem_size_C); 177 178 if (error != cudaSuccess) 179 { 180 printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__); 181 exit(EXIT_FAILURE); 182 } 183 184 // copy host memory to device 185 error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 186 187 if (error != cudaSuccess) 188 { 189 printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__); 190 exit(EXIT_FAILURE); 191 } 192 193 error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 194 195 if (error != cudaSuccess) 196 { 197 printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__); 198 exit(EXIT_FAILURE); 199 } 200 201 // Setup execution parameters 202 dim3 threads(block_size, block_size); 203 dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); 204 205 // Create and start timer 206 printf("Computing result using CUDA Kernel...\n"); 207 208 // Performs warmup operation using matrixMul CUDA kernel 209 if (block_size == 16) 210 { 211 matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 212 } 213 else 214 { 215 matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 216 } 217 218 printf("done\n"); 219 220 cudaDeviceSynchronize(); 221 222 // Allocate CUDA events that we'll use for timing 223 cudaEvent_t start; 224 error = cudaEventCreate(&start); 225 226 if (error != cudaSuccess) 227 { 228 fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); 229 exit(EXIT_FAILURE); 230 } 231 232 cudaEvent_t stop; 233 error = cudaEventCreate(&stop); 234 235 if (error != cudaSuccess) 236 { 237 fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); 238 exit(EXIT_FAILURE); 239 } 240 241 // Record the start event 242 error = cudaEventRecord(start, NULL); 243 244 if (error != cudaSuccess) 245 { 246 fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); 247 exit(EXIT_FAILURE); 248 } 249 250 // Execute the kernel 251 int nIter = 300; 252 253 for (int j = 0; j < nIter; j++) 254 { 255 if (block_size == 16) 256 { 257 matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 258 } 259 else 260 { 261 matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); 262 } 263 } 264 265 // Record the stop event 266 error = cudaEventRecord(stop, NULL); 267 268 if (error != cudaSuccess) 269 { 270 fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); 271 exit(EXIT_FAILURE); 272 } 273 274 // Wait for the stop event to complete 275 error = cudaEventSynchronize(stop); 276 277 if (error != cudaSuccess) 278 { 279 fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); 280 exit(EXIT_FAILURE); 281 } 282 283 float msecTotal = 0.0f; 284 error = cudaEventElapsedTime(&msecTotal, start, stop); 285 286 if (error != cudaSuccess) 287 { 288 fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); 289 exit(EXIT_FAILURE); 290 } 291 292 // Compute and print the performance 293 float msecPerMatrixMul = msecTotal / nIter; 294 double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x; 295 double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); 296 printf( 297 "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n", 298 gigaFlops, 299 msecPerMatrixMul, 300 flopsPerMatrixMul, 301 threads.x * threads.y); 302 303 // Copy result from device to host 304 error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 305 306 if (error != cudaSuccess) 307 { 308 printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__); 309 exit(EXIT_FAILURE); 310 } 311 312 printf("Checking computed result for correctness: "); 313 bool correct = true; 314 315 // test relative error by the formula 316 // |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps 317 double eps = 1.e-6 ; // machine zero 318 for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) 319 { 320 double abs_err = fabs(h_C[i] - (dimsA.x * valB)); 321 double dot_length = dimsA.x; 322 double abs_val = fabs(h_C[i]); 323 double rel_err = abs_err/abs_val/dot_length ; 324 if (rel_err > eps) 325 { 326 printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps); 327 correct = false; 328 } 329 } 330 331 printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); 332 333 // Clean up memory 334 free(h_A); 335 free(h_B); 336 free(h_C); 337 cudaFree(d_A); 338 cudaFree(d_B); 339 cudaFree(d_C); 340 341 printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n"); 342 343 cudaDeviceReset(); 344 345 if (correct) 346 { 347 return EXIT_SUCCESS; 348 } 349 else 350 { 351 return EXIT_FAILURE; 352 } 353 } 354 355 356 /** 357 * Program main 358 */ 359 int main(int argc, char **argv) 360 { 361 printf("[Matrix Multiply Using CUDA] - Starting...\n"); 362 363 if (checkCmdLineFlag(argc, (const char **)argv, "help") || 364 checkCmdLineFlag(argc, (const char **)argv, "?")) 365 { 366 printf("Usage -device=n (n >= 0 for deviceID)\n"); 367 printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); 368 printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); 369 printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n"); 370 371 exit(EXIT_SUCCESS); 372 } 373 374 // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line 375 int devID = 0; 376 377 if (checkCmdLineFlag(argc, (const char **)argv, "device")) 378 { 379 devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 380 cudaSetDevice(devID); 381 } 382 383 cudaError_t error; 384 cudaDeviceProp deviceProp; 385 error = cudaGetDevice(&devID); 386 387 if (error != cudaSuccess) 388 { 389 printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__); 390 } 391 392 error = cudaGetDeviceProperties(&deviceProp, devID); 393 394 if (deviceProp.computeMode == cudaComputeModeProhibited) 395 { 396 fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); 397 exit(EXIT_SUCCESS); 398 } 399 400 if (error != cudaSuccess) 401 { 402 printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); 403 } 404 else 405 { 406 printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); 407 } 408 409 // Use a larger block size for Fermi and above 410 int block_size = (deviceProp.major < 2) ? 16 : 32; 411 412 dim3 dimsA(64*block_size, 64*block_size, 1); 413 dim3 dimsB(64*block_size, 64*block_size, 1); 414 // width of Matrix A 415 if (checkCmdLineFlag(argc, (const char **)argv, "wA")) 416 { 417 dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); 418 } 419 420 // height of Matrix A 421 if (checkCmdLineFlag(argc, (const char **)argv, "hA")) 422 { 423 dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); 424 } 425 426 // width of Matrix B 427 if (checkCmdLineFlag(argc, (const char **)argv, "wB")) 428 { 429 dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); 430 } 431 432 // height of Matrix B 433 if (checkCmdLineFlag(argc, (const char **)argv, "hB")) 434 { 435 dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); 436 } 437 438 if (dimsA.x != dimsB.y) 439 { 440 printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", 441 dimsA.x, dimsB.y); 442 exit(EXIT_FAILURE); 443 } 444 445 printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); 446 447 int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); 448 449 exit(matrix_result); 450 }
1 CUDA_PATH ?=/usr/local/cuda-7.0 #Makefile Using for compiling cuda project 2 NVCC :=$(CUDA_PATH)/bin/nvcc -ccbin g++ 3 INCLUDE :=-I/usr/local/cuda/include/\ 4 -I/usr/local/cuda/samples/common/inc\ 5 -I/usr/include/c++\ 6 -I./ 7 8 LIBRARIES :=-L/usr/local/cuda/lib64 -lcudart 9 TARGETS :=$(notdir $(PWD)) 10 OBJECTS :=$(addsuffix .o, $(TARGETS)) 11 12 .SUFFIXES:.o .cu .cpp 13 .cu.o: 14 $(NVCC) -arch=sm_20 $(INCLUDE) -c -o $@ $< $(LIBRARIES) 15 .cpp.o: 16 $(CXX) $(INCLUDE) -c -o $@ $< $(LIBRARIES) 17 18 all: $(TARGETS) 19 20 $(TARGETS): $(OBJECTS) 21 #sudo cp /usr/local/cuda/lib64/libcufft.so.7.0 /usr/lib 22 sudo ln -s libcudart.so.7.0.28 libcudart.so 23 sudo ln -s libcudart.so.7.0.28 libcudart.so.7 24 g++ $(INCLUDE) -o $@ $^ $(LIBRARIES) 25 run: 26 ./$(TARGETS) 27 clean: 28 rm -rf *.o kernel libcudart.so libcudart.so.7 $(TARGETS)
./kernel
P[0] = 2048
P[1] = 2048
P[2] = 2048
P[3] = 2048
P[4] = 2048
P[5] = 2048
P[6] = 2048
P[7] = 2048
P[8] = 2048
P[9] = 2048
P[1048566] = 2048
P[1048567] = 2048
P[1048568] = 2048
P[1048569] = 2048
P[1048570] = 2048
P[1048571] = 2048
P[1048572] = 2048
P[1048573] = 2048
P[1048574] = 2048
P[1048575] = 2048
time = 57.198975 msec