CUDA Matrix Multiply

 

/**
 * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 */

/**
 * Matrix multiplication: C = A * B.
 * Host code.
 *
 * This sample implements matrix multiplication as described in Chapter 3
 * of the programming guide. It has been written for clarity of exposition,
 * to illustrate various CUDA programming principles, not with the goal of
 * providing the most performant generic kernel for matrix multiplication.
 *
 * See also:
 * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra,"
 * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08),
 * Piscataway, NJ: IEEE Press, 2008, Art. 31, pp. 1-11.
 */
 27 
// System includes
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
template <int BLOCK_SIZE> __global__ void
matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd   = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep  = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep  = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)
    {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
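
/*
 * Illustrative note (not part of the original sample): with BLOCK_SIZE = 32,
 * block (bx, by) and thread (tx, ty) compute exactly one output element at
 *     row = by * BLOCK_SIZE + ty,   col = bx * BLOCK_SIZE + tx,
 * since c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx, so the final store
 * C[c + wB * ty + tx] is C[row * wB + col] in row-major layout. The outer
 * loop slides one BLOCK_SIZE-wide tile across row-block `by` of A and one
 * BLOCK_SIZE-tall tile down column-block `bx` of B, so it executes
 * wA / BLOCK_SIZE times (assuming wA is a multiple of BLOCK_SIZE, as it is
 * for the default sizes set in main()).
 */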

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
    {
        data[i] = val;
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C = (float *) malloc(mem_size_C);

    if (h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    cudaError_t error;

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
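
    /*
     * Illustrative note (not part of the original sample): with the defaults
     * chosen in main() on a compute capability >= 2.0 device, block_size = 32
     * and the matrices are 2048 x 2048, so threads = (32, 32) gives 1024
     * threads per block and grid = (2048/32, 2048/32) = (64, 64) blocks. The
     * integer divisions assume the matrix dimensions are multiples of
     * block_size.
     */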

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16)
    {
        matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else
    {
        matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");

    cudaDeviceSynchronize();

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
            matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else
        {
            matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

    // Record the stop event
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);
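
    /*
     * Illustrative note (not part of the original sample): flopsPerMatrixMul
     * counts one multiply and one add per inner-product term, i.e.
     * 2 * wA * hA * wB operations. For the default 2048 x 2048 x 2048 case
     * that is 2 * 2048^3, about 1.72e10 floating-point operations per
     * matrix multiplication.
     */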

    // Copy result from device to host
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // Test relative error by the formula
    //     |<x, y>_cpu - <x, y>_gpu| / <|x|, |y|> < eps
    double eps = 1.e-6;  // machine zero

    for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;

        if (rel_err > eps)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\nNote: For peak performance, please refer to the matrixMulCUBLAS example.\n");

    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}

/**
 * Program main
 */
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
        checkCmdLineFlag(argc, (const char **)argv, "?"))
    {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: The width of Matrix A must equal the height of Matrix B (inner dimensions).\n");

        exit(EXIT_SUCCESS);
    }

    // By default we use device 0; otherwise we override the device ID based on what is provided at the command line
    int devID = 0;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        cudaSetDevice(devID);
    }

    cudaError_t error;
    cudaDeviceProp deviceProp;
    error = cudaGetDevice(&devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDevice returned error code %d, line(%d)\n", error, __LINE__);
    }

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (deviceProp.computeMode == cudaComputeModeProhibited)
    {
        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
        exit(EXIT_SUCCESS);
    }

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
    }
    else
    {
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    }

    // Use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    dim3 dimsA(64 * block_size, 64 * block_size, 1);
    dim3 dimsB(64 * block_size, 64 * block_size, 1);

    // Width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA"))
    {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

    // Height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA"))
    {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

    // Width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB"))
    {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

    // Height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB"))
    {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

    if (dimsA.x != dimsB.y)
    {
        printf("Error: the width of Matrix A must equal the height of Matrix B. (%d != %d)\n",
               dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

    exit(matrix_result);
}
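
The sample verifies the result only against the analytically known answer: because A is filled with 1.0f and B with valB = 0.01f, every element of C should equal dimsA.x * valB. To validate against arbitrary inputs instead, a plain host-side reference multiply can be compared element by element with h_C. The sketch below is illustrative only (matrixMulCPU is not part of the sample); it assumes the same row-major layout the kernel uses.

// Sketch of a general CPU reference multiply, C = A * B, where A is hA x wA,
// B is wA x wB, and all matrices are stored row-major as in the kernel above.
// Hypothetical helper, not part of the original sample.
void matrixMulCPU(float *C, const float *A, const float *B,
                  int hA, int wA, int wB)
{
    for (int i = 0; i < hA; ++i)
    {
        for (int j = 0; j < wB; ++j)
        {
            // Accumulate in double to reduce rounding error in the reference
            double sum = 0.0;

            for (int k = 0; k < wA; ++k)
            {
                sum += (double)A[i * wA + k] * (double)B[k * wB + j];
            }

            C[i * wB + j] = (float)sum;
        }
    }
}

Comparing h_C against the output of such a reference would also catch indexing mistakes that the constant-input check cannot distinguish.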

 

# Makefile used for compiling the CUDA project
CUDA_PATH ?= /usr/local/cuda-7.0
NVCC      := $(CUDA_PATH)/bin/nvcc -ccbin g++
INCLUDE   := -I/usr/local/cuda/include/ \
             -I/usr/local/cuda/samples/common/inc \
             -I/usr/include/c++ \
             -I./

LIBRARIES := -L/usr/local/cuda/lib64 -lcudart
TARGETS   := $(notdir $(PWD))
OBJECTS   := $(addsuffix .o, $(TARGETS))

.SUFFIXES: .o .cu .cpp
.cu.o:
    $(NVCC) -arch=sm_20 $(INCLUDE) -c -o $@ $< $(LIBRARIES)
.cpp.o:
    $(CXX) $(INCLUDE) -c -o $@ $< $(LIBRARIES)

all: $(TARGETS)

$(TARGETS): $(OBJECTS)
    #sudo cp /usr/local/cuda/lib64/libcufft.so.7.0 /usr/lib
    # Link against the CUDA runtime shipped with the toolkit; adjust the
    # version suffix (7.0.28 here) to match your installation.
    sudo ln -sf $(CUDA_PATH)/lib64/libcudart.so.7.0.28 libcudart.so
    sudo ln -sf $(CUDA_PATH)/lib64/libcudart.so.7.0.28 libcudart.so.7
    g++ $(INCLUDE) -o $@ $^ $(LIBRARIES)

run: $(TARGETS)
    ./$(TARGETS)

clean:
    rm -rf *.o kernel libcudart.so libcudart.so.7 $(TARGETS)
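
A note on using this Makefile (assuming the project directory is named kernel, which the ./kernel invocation below suggests, since the binary name comes from $(notdir $(PWD))): make builds <dirname>.cu into a binary of the same name, make run executes it, and make clean removes the objects, the local libcudart symlinks, and the binary. The hard-coded paths and the -arch=sm_20 flag assume a CUDA 7.0 installation under /usr/local/cuda; adjust both for your own setup.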

 

./kernel
P[0] = 2048
P[1] = 2048
P[2] = 2048
P[3] = 2048
P[4] = 2048
P[5] = 2048
P[6] = 2048
P[7] = 2048
P[8] = 2048
P[9] = 2048
P[1048566] = 2048
P[1048567] = 2048
P[1048568] = 2048
P[1048569] = 2048
P[1048570] = 2048
P[1048571] = 2048
P[1048572] = 2048
P[1048573] = 2048
P[1048574] = 2048
P[1048575] = 2048
time = 57.198975 msec

 
