CUDA共享内存的使用示例
CUDA共享内存使用示例如下,参考教材《GPU高性能编程CUDA实战》第54-65页(P54-P65)。
教材下载地址:http://download.csdn.net/download/yizhaoyanbo/10150300。如果没有下载分,可以在评论区留下邮箱,我发给你。
// Dot product of two N-element float vectors on the GPU.
// Each block reduces its threads' partial sums in shared memory and writes
// one partial result per block; the host sums the per-block partials.
// Based on "CUDA by Example" (Sanders & Kandrot), pp. 54-65.
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// NOTE: <device_functions.h> removed — it is deprecated and no longer needed
// (and was dropped as a user-includable header in recent CUDA toolkits).
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

using namespace std;

// Abort with file/line and the CUDA error string when an API call fails.
// Kernel launches themselves return no status; check cudaGetLastError() after.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Fully parenthesized to avoid macro-expansion precedence bugs.
#define imin(a,b) ((a) < (b) ? (a) : (b))

const int N = 33 * 1024;
const int threadsPerBlock = 256;  // must be a power of two for the reduction below
const int blocksPerGrid = imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);

// Partial dot product kernel.
// Expected launch: <<<blocksPerGrid, threadsPerBlock>>>, no dynamic shared mem.
// Precondition: blockDim.x == threadsPerBlock (sizes the static shared array)
// and is a power of two (required by the halving reduction).
// a, b: device arrays of length N; c: device array of length gridDim.x,
// receives one partial sum per block.
__global__ void dot(float *a, float *b, float *c)
{
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread accumulates a strided slice of the
    // element-wise products, so any grid size covers all N elements.
    float temp = 0.0f;
    while (tid < N)
    {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // Publish this thread's partial sum, then barrier before any thread
    // reads another thread's slot.
    cache[cacheIndex] = temp;
    __syncthreads();

    // Tree reduction: halve the active range each pass until cache[0]
    // holds the block's total.
    for (int i = blockDim.x / 2; i != 0; i /= 2)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        // Barrier is outside the divergent branch: all threads reach it.
        __syncthreads();
    }

    // Thread 0 writes the block's partial result.
    if (cacheIndex == 0)
    {
        c[blockIdx.x] = cache[0];
    }
}

int main()  // was `void main()` — ill-formed C++; main must return int
{
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // Host allocations (checked: malloc may fail for large N).
    a = (float*)malloc(N * sizeof(float));
    b = (float*)malloc(N * sizeof(float));
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
    if (a == NULL || b == NULL || partial_c == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Device allocations.
    CUDA_CHECK(cudaMalloc(&dev_a, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&dev_b, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&dev_partial_c, blocksPerGrid * sizeof(float)));

    // Fill inputs: a[i] = i, b[i] = 2i (so dot = 2 * sum of squares).
    // (Removed unused local `float sum = 0;` from the original.)
    for (int i = 0; i < N; i++)
    {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }

    // Upload inputs to the GPU.
    CUDA_CHECK(cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice));

    dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);
    CUDA_CHECK(cudaGetLastError());  // catch bad launch configuration

    // Blocking D2H copy also synchronizes with the kernel.
    CUDA_CHECK(cudaMemcpy(partial_c, dev_partial_c,
                          blocksPerGrid * sizeof(float),
                          cudaMemcpyDeviceToHost));

    // CPU finishes the reduction over the per-block partials.
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++)
    {
        c += partial_c[i];
    }

    // Closed form: sum_{x=0}^{n} x^2 = n(n+1)(2n+1)/6 (fully parenthesized).
    #define sum_squares(x) ((x) * ((x) + 1) * (2 * (x) + 1) / 6)
    printf("does GPU value %.6g = %.6g?\n", c, 2 * sum_squares((float)(N - 1)));

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_partial_c));

    free(a);
    free(b);
    free(partial_c);
    return 0;
}
我的博客即将同步至腾讯云+社区,邀请大家一同入驻。
---------------------------------
业精于勤而荒于嬉 行成于思而毁于随
---------------------------------