CUDA matrixMulCUBLAS

 

  1 ///////////////////////////////////////////////////////////////////////////
  2 //  matrixMulCUBLAS.cpp
  3 //
  4 // Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  5 //
  6 // Please refer to the NVIDIA end user license agreement (EULA) associated
  7 // with this source code for terms and conditions that govern your use of
  8 // this software. Any use, reproduction, disclosure, or distribution of
  9 // this software and related documentation outside the terms of the EULA
 10 // is strictly prohibited.
 11 // 
 12 ////////////////////////////////////////////////////////////////////////////
 13 
 14 //
 15 // Matrix multiplication: C = A * B.
 16 // Host code.
 17 //
 18 // This sample implements matrix multiplication as described in Chapter 3
 19 // of the programming guide and uses the CUBLAS library to demonstrate
 20 // the best performance.
 21 
 22 // SOME PRECAUTIONS:
 23 // IF WE WANT TO CALCULATE ROW-MAJOR MATRIX MULTIPLY C = A * B,
  24 // WE JUST NEED TO CALL THE CUBLAS API IN REVERSE ORDER: cublasSgemm(B, A)! 
 25 // The reason is explained as follows:
 26 
 27 // CUBLAS library uses column-major storage, but C/C++ use row-major storage.
 28 // When passing the matrix pointer to CUBLAS, the memory layout alters from 
  29 // row-major to column-major, which is equivalent to an implicit transpose. 
 30 
 31 // In the case of row-major C/C++ matrix A, B, and a simple matrix multiplication 
 32 // C = A * B, we can't use the input order like cublasSgemm(A, B)  because of
  33 // implicit transpose. The actual result of cublasSgemm(A, B) is A(T) * B(T). 
 34 // If col(A(T)) != row(B(T)), equal to row(A) != col(B), A(T) and B(T) are not 
 35 // multipliable. Moreover, even if A(T) and B(T) are multipliable, the result C 
 36 // is a column-based cublas matrix, which means C(T) in C/C++, we need extra 
 37 // transpose code to convert it to a row-based C/C++ matrix.
 38 
 39 // To solve the problem, let's consider our desired result C, a row-major matrix. 
  40 // In cublas format, it is C(T) actually (because of the implicit transpose). 
  41 // C = A * B, so C(T) = (A * B) (T) = B(T) * A(T). Cublas matrices B(T) and A(T) 
  42 // happen to be C/C++ matrices B and A (still because of the implicit transpose)! 
 43 // We don't need extra transpose code, we only need alter the input order!
 44 //
 45 // CUBLAS provides high-performance matrix multiplication.
 46 // See also:
 47 // V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
  48 // in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 49 // Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11.
 50 //
 51 
 52 // Utilities and system includes
 53 #include <assert.h>
 54 #include <helper_string.h>  // helper for shared functions common to CUDA SDK samples
 55 
 56 // CUDA runtime
 57 #include <cuda_runtime.h>
 58 #include <cublas_v2.h>
 59 
 60 // CUDA and CUBLAS functions
 61 #include <helper_functions.h>
 62 
 63 #ifndef min
 64 #define min(a,b) ((a < b) ? a : b)
 65 #endif
 66 #ifndef max
 67 #define max(a,b) ((a > b) ? a : b)
 68 #endif
 69 
// Dimensions of the three row-major matrices in C = A * B:
// uiWA/uiHA = width/height of A, uiWB/uiHB = width/height of B,
// uiWC/uiHC = width/height of C.  All sizes are scaled by the optional
// command-line size multiplier (see initializeCUDA).
typedef struct _matrixSize
{
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;
 74 
 75 ////////////////////////////////////////////////////////////////////////////////
 76 //! Compute reference data set matrix multiply on CPU
 77 //! C = A * B
 78 //! @param C          reference data, computed but preallocated
 79 //! @param A          matrix A as provided to device
 80 //! @param B          matrix B as provided to device
 81 //! @param hA         height of matrix A
 82 //! @param wB         width of matrix B
 83 ////////////////////////////////////////////////////////////////////////////////
 84 void
 85 matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 86 {
 87     for (unsigned int i = 0; i < hA; ++i)
 88         for (unsigned int j = 0; j < wB; ++j)
 89         {
 90             double sum = 0;
 91 
 92             for (unsigned int k = 0; k < wA; ++k)
 93             {
 94                 double a = A[i * wA + k];
 95                 double b = B[k * wB + j];
 96                 sum += a * b;
 97             }
 98 
 99             C[i * wB + j] = (float)sum;
100         }
101 }
102 
103 ////////////////////////////////////////////////////////////////////////////////
104 // These are CUDA Helper functions (in addition to helper_cuda.h)
105 
106 void inline checkError(cublasStatus_t status, const char *msg)
107 {
108     if (status != CUBLAS_STATUS_SUCCESS)
109     {
110         printf("%s", msg);
111         exit(EXIT_FAILURE);
112     }
113 }
114 // end of CUDA Helper Functions
115 
// Fill 'data' with 'size' pseudo-random floats drawn from [0, 1].
// Uses rand(), so the sequence is controlled by the caller via srand().
void randomInit(float *data, int size)
{
    int i = 0;

    while (i < size)
    {
        data[i] = rand() / (float)RAND_MAX;
        ++i;
    }
}
122 
// Print element-wise differences between two row-major width x height
// matrices that exceed fListTol (at most iListLength are listed), followed
// by the total number of differing elements.
// @param data1       reference values (labelled CPU in the output)
// @param data2       values under test (labelled GPU in the output)
// @param width       matrix width in elements
// @param height      matrix height in elements
// @param iListLength maximum number of differences to list in detail
// @param fListTol    tolerance above which a difference is reported
void printDiff(float *data1, float *data2, int width, int height, int iListLength, float fListTol)
{
    printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);

    int error_count = 0;

    for (int row = 0; row < height; row++)
    {
        // Only announce rows while we are still listing individual entries.
        if (error_count < iListLength)
        {
            printf("\n  Row %d:\n", row);
        }

        for (int col = 0; col < width; col++)
        {
            int idx = row * width + col;
            float fDiff = fabs(data1[idx] - data2[idx]);

            if (fDiff > fListTol)
            {
                if (error_count < iListLength)
                {
                    printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", col, row, data1[idx], data2[idx], fDiff);
                }

                error_count++;
            }
        }
    }

    printf(" \n  Total Errors = %d\n", error_count);
}
155 
156 void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
157 {
158     // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
159     cudaError_t error;
160     devID = 0;
161 
162     if (checkCmdLineFlag(argc, (const char **)argv, "device"))
163     {
164         devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
165         error = cudaSetDevice(devID);
166 
167         if (error != cudaSuccess)
168         {
169             printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
170             exit(EXIT_FAILURE);
171         }
172     }
173 
174     // get number of SMs on this GPU
175     error = cudaGetDevice(&devID);
176 
177     if (error != cudaSuccess)
178     {
179         printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
180         exit(EXIT_FAILURE);
181     }
182 
183 
184     if (checkCmdLineFlag(argc, (const char **)argv, "sizemult"))
185     {
186         iSizeMultiple = getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
187     }
188 
189     iSizeMultiple = min(iSizeMultiple, 10);
190     iSizeMultiple = max(iSizeMultiple, 1);
191 
192     cudaDeviceProp deviceProp;
193 
194     error = cudaGetDeviceProperties(&deviceProp, devID);
195 
196     if (error != cudaSuccess)
197     {
198         printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
199         exit(EXIT_FAILURE);
200     }
201 
202     printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
203 
204     // use a larger block size for Fermi and above
205     int block_size = (deviceProp.major < 2) ? 16 : 32;
206 
207     matrix_size.uiWA = 2 * block_size * iSizeMultiple;
208     matrix_size.uiHA = 4 * block_size * iSizeMultiple;
209     matrix_size.uiWB = 2 * block_size * iSizeMultiple;
210     matrix_size.uiHB = 4 * block_size * iSizeMultiple;
211     matrix_size.uiWC = 2 * block_size * iSizeMultiple;
212     matrix_size.uiHC = 4 * block_size * iSizeMultiple;
213 
214     printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n",
215            matrix_size.uiWA, matrix_size.uiHA,
216            matrix_size.uiWB, matrix_size.uiHB,
217            matrix_size.uiWC, matrix_size.uiHC);
218 }
219 
220 ////////////////////////////////////////////////////////////////////////////////
221 //! Run a simple test matrix multiply using CUBLAS
222 ////////////////////////////////////////////////////////////////////////////////
223 int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
224 {
225     cudaDeviceProp deviceProp;
226     cudaError_t error;
227 
228     error = cudaGetDeviceProperties(&deviceProp, devID);
229 
230     if (error != cudaSuccess)
231     {
232         printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
233         exit(EXIT_FAILURE);
234     }
235 
236     // use a larger block size for Fermi and above
237     int block_size = (deviceProp.major < 2) ? 16 : 32;
238 
239     // set seed for rand()
240     srand(2006);
241 
242     // allocate host memory for matrices A and B
243     unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
244     unsigned int mem_size_A = sizeof(float) * size_A;
245     float *h_A = (float *)malloc(mem_size_A);
246     unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
247     unsigned int mem_size_B = sizeof(float) * size_B;
248     float *h_B = (float *)malloc(mem_size_B);
249 
250     // set seed for rand()
251     srand(2006);
252 
253     // initialize host memory
254     randomInit(h_A, size_A);
255     randomInit(h_B, size_B);
256 
257     // allocate device memory
258     float *d_A, *d_B, *d_C;
259     unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
260     unsigned int mem_size_C = sizeof(float) * size_C;
261 
262     // allocate host memory for the result
263     float *h_C      = (float *) malloc(mem_size_C);
264     float *h_CUBLAS = (float *) malloc(mem_size_C);
265 
266     error = cudaMalloc((void **) &d_A, mem_size_A);
267 
268     if (error != cudaSuccess)
269     {
270         printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
271         exit(EXIT_FAILURE);
272     }
273 
274     error = cudaMalloc((void **) &d_B, mem_size_B);
275 
276     if (error != cudaSuccess)
277     {
278         printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
279         exit(EXIT_FAILURE);
280     }
281 
282     // copy host memory to device
283     error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
284 
285     if (error != cudaSuccess)
286     {
287         printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__);
288         exit(EXIT_FAILURE);
289     }
290 
291     error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
292 
293     if (error != cudaSuccess)
294     {
295         printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__);
296         exit(EXIT_FAILURE);
297     }
298 
299     error = cudaMalloc((void **) &d_C, mem_size_C);
300 
301     if (error != cudaSuccess)
302     {
303         printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
304         exit(EXIT_FAILURE);
305     }
306 
307     // setup execution parameters
308     dim3 threads(block_size, block_size);
309     dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);
310 
311     // create and start timer
312     printf("Computing result using CUBLAS...");
313 
314     // execute the kernel
315     int nIter = 30;
316 
317     // CUBLAS version 2.0
318     {
319         cublasHandle_t handle;
320 
321         cublasStatus_t ret;
322 
323         ret = cublasCreate(&handle);
324 
325         if (ret != CUBLAS_STATUS_SUCCESS)
326         {
327             printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
328             exit(EXIT_FAILURE);
329         }
330 
331         const float alpha = 1.0f;
332         const float beta  = 0.0f;
333         //Perform warmup operation with cublas
334         ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA);
335 
336         if (ret != CUBLAS_STATUS_SUCCESS)
337         {
338             printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
339             exit(EXIT_FAILURE);
340         }
341 
342         // Allocate CUDA events that we'll use for timing
343         cudaEvent_t start;
344         error = cudaEventCreate(&start);
345 
346         if (error != cudaSuccess)
347         {
348             fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
349             exit(EXIT_FAILURE);
350         }
351 
352         cudaEvent_t stop;
353         error = cudaEventCreate(&stop);
354 
355         if (error != cudaSuccess)
356         {
357             fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
358             exit(EXIT_FAILURE);
359         }
360 
361         // Record the start event
362         error = cudaEventRecord(start, NULL);
363 
364         if (error != cudaSuccess)
365         {
366             fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
367             exit(EXIT_FAILURE);
368         }
369 
370         for (int j = 0; j < nIter; j++)
371         {
372             //note cublas is column primary!
373             //need to transpose the order
374             ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA);
375 
376             if (ret != CUBLAS_STATUS_SUCCESS)
377             {
378                 printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
379                 exit(EXIT_FAILURE);
380             }
381         }
382 
383         printf("done.\n");
384 
385         // Record the stop event
386         error = cudaEventRecord(stop, NULL);
387 
388         if (error != cudaSuccess)
389         {
390             fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
391             exit(EXIT_FAILURE);
392         }
393 
394         // Wait for the stop event to complete
395         error = cudaEventSynchronize(stop);
396 
397         if (error != cudaSuccess)
398         {
399             fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
400             exit(EXIT_FAILURE);
401         }
402 
403         float msecTotal = 0.0f;
404         error = cudaEventElapsedTime(&msecTotal, start, stop);
405 
406         if (error != cudaSuccess)
407         {
408             fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
409             exit(EXIT_FAILURE);
410         }
411 
412         // Compute and print the performance
413         float msecPerMatrixMul = msecTotal / nIter;
414         double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
415         double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
416         printf(
417             "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
418             gigaFlops,
419             msecPerMatrixMul,
420             flopsPerMatrixMul);
421 
422         // copy result from device to host
423         error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
424 
425         if (error != cudaSuccess)
426         {
427             printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__);
428             exit(EXIT_FAILURE);
429         }
430 
431         checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
432     }
433 
434     // compute reference solution
435     printf("Computing result using host CPU...");
436     float *reference = (float *)malloc(mem_size_C);
437     matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
438     printf("done.\n");
439 
440     // check result (CUBLAS)
441     bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
442 
443     if (resCUBLAS != true)
444     {
445         printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
446     }
447 
448     printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");
449 
450     // clean up memory
451     free(h_A);
452     free(h_B);
453     free(h_C);
454     free(reference);
455     cudaFree(d_A);
456     cudaFree(d_B);
457     cudaFree(d_C);
458 
459     cudaDeviceReset();
460 
461     if (resCUBLAS == true)
462     {
463         return EXIT_SUCCESS;    // return value = 1
464     }
465     else
466     {
467         return EXIT_FAILURE;     // return value = 0
468     }
469 }
470 
471 ////////////////////////////////////////////////////////////////////////////////
472 // Program main
473 ////////////////////////////////////////////////////////////////////////////////
474 int main(int argc, char **argv)
475 {
476     printf("[Matrix Multiply CUBLAS] - Starting...\n");
477 
478     int devID = 0, sizeMult = 5;
479     sMatrixSize matrix_size;
480 
481     initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
482 
483     int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
484 
485     exit(matrix_result);
486 }

 

 1 CUDA_PATH ?=/usr/local/cuda-7.0
 2 NVCC      :=$(CUDA_PATH)/bin/nvcc -ccbin g++
 3 INCLUDE   :=-I/usr/local/cuda/include/\
 4            -I/usr/local/cuda/samples/common/inc\
 5         -I/usr/include/c++\
 6         -I./
 7 
 8 LIBRARIES :=-L/usr/local/cuda/lib64 -lcudart -lcublas
 9 TARGETS   :=matrixMulCUBLAS
10 OBJECTS   :=$(addsuffix .o, $(TARGETS))
11 
12 .SUFFIXES:.o .cu .cpp
13 .cu.o: 
14     $(NVCC) -arch=sm_20  $(INCLUDE) -c -o $@  $< $(LIBRARIES)
15 .cpp.o:
16     g++ $(INCLUDE) -c -o $@ $< $(LIBRARYIES)
17 
18 all:$(OBJECTS)
19     #sudo cp /usr/local/cuda/lib64/libcublas.so.7.0 /usr/lib
20     ln -s libcudart.so.7.0  libcudart.so
21     ln -s libcudart.so.7.0  libcudart.so.7
22     ln -s libcublas.so.7.0 libcublas.so
23     ln -s libcublas.so.7.0 libcublas.so.7
24     g++    $(INCLUDE) -o $(TARGETS) $^ $(LIBRARIES)
25 run:
26     ./$(TARGETS)
27 clean:
28     rm -rf *.o libcudart.so libcudart.so.7\
29         libcublas.so libcublas.so.7 $(TARGETS)

./matrixMulCUBLAS
[Matrix Multiply CUBLAS] - Starting...
GPU Device 0: "GeForce GTX 650 Ti" with compute capability 3.0

MatrixA(320,640), MatrixB(320,640), MatrixC(320,640)
Computing result using CUBLAS...done.
Performance= 612.46 GFlop/s, Time= 0.214 msec, Size= 131072000 Ops
Computing result using host CPU...done.
Comparing CUBLAS Matrix Multiply with CPU results: PASS

 

posted @ 2017-03-21 21:41  souwang  阅读(695)  评论(0编辑  收藏  举报