爨爨爨好

  博客园  :: 首页  :: 新随笔  :: 联系  :: 订阅  :: 管理

简单的 CUDA 应用模板,白送的 Sample。

 

▶ 源代码

// template_cpu.cpp

// Exported with C linkage; template.cu declares the same symbol as extern "C".
extern "C" void computeGold(float *, const unsigned int);

/**
 * CPU reference ("gold") computation: multiplies each of the first `len`
 * elements of `idata`, in place, by `len` converted to float.
 *
 * @param idata  array of at least `len` floats; overwritten with the result
 * @param len    number of elements to process, and also the scale factor;
 *               when 0 the array is left untouched
 */
void computeGold(float *idata, const unsigned int len)
{
    // Convert once, outside the loop, so every element uses the same factor.
    const float scale = static_cast<float>(len);

    for (unsigned int i = 0; i < len; ++i)
        idata[i] *= scale;
}
// template.cu
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>

// CPU reference implementation, defined in template_cpu.cpp.
extern "C" void computeGold(float *, const unsigned int);
 /**
  * Template kernel: scales each input element by the block width.
  *
  * Each thread copies g_idata[threadIdx.x] into dynamic shared memory,
  * multiplies it by blockDim.x, and writes the result to
  * g_odata[threadIdx.x].
  *
  * Launch requirements:
  *   - Only threadIdx.x is used for indexing, so the kernel is meant for a
  *     single one-dimensional block; additional blocks would read and write
  *     the same elements again.
  *   - Dynamic shared memory (third launch parameter) must be at least
  *     blockDim.x * sizeof(float) bytes.
  *   - g_idata and g_odata must each hold at least blockDim.x floats.
  *
  * @param g_idata  input array, read on the device
  * @param g_odata  output array, written on the device
  */
 __global__ void testKernel(float *g_idata, float *g_odata)
 {
     // Sized at launch time through the dynamic shared-memory argument.
     extern __shared__ float sdata[];
     const unsigned int tid = threadIdx.x;

     // Stage this thread's input element in shared memory.
     sdata[tid] = g_idata[tid];
     __syncthreads();

     // Every thread touches only its own slot sdata[tid]; the barriers are
     // kept from the original template around each shared-memory phase.
     sdata[tid] = static_cast<float>(blockDim.x) * sdata[tid];
     __syncthreads();

     // Publish the result to global memory.
     g_odata[tid] = sdata[tid];
 }
/**
 * Evaluates a CUDA runtime call and terminates the program with a
 * file/line diagnostic if it does not return cudaSuccess.
 */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        const cudaError_t cuda_err_ = (call);                                \
        if (cuda_err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_err_));                          \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/**
 * Host driver for the template sample.
 *
 * Fills a 32-element array with 0..31, runs testKernel on it with one block
 * of 32 threads, computes the same result on the CPU with computeGold, and
 * prints whether the two results match along with the elapsed time.
 *
 * The timed region covers allocation, both copies and the kernel, as in the
 * original listing.
 *
 * @return EXIT_SUCCESS when the GPU result matches the CPU reference,
 *         EXIT_FAILURE otherwise (or on any allocation / CUDA error).
 */
int main()
{
    printf("\n\tStart.\n");

    CUDA_CHECK(cudaSetDevice(0));
    StopWatchInterface *timer = 0;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    const unsigned int num_threads = 32;
    const unsigned int mem_size = sizeof(float) * num_threads;

    // Host buffers: input (later overwritten with the CPU reference) and
    // the result copied back from the device.
    float *h_idata = (float *)malloc(mem_size);
    float *h_odata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_odata == NULL) {
        fprintf(stderr, "Host allocation of %u bytes failed\n", mem_size);
        free(h_idata);
        free(h_odata);
        sdkDeleteTimer(&timer);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_odata = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc((void **)&d_odata, mem_size));

    for (unsigned int i = 0; i < num_threads; ++i)
        h_idata[i] = (float)i;
    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // One block of num_threads threads; mem_size bytes of dynamic shared
    // memory give the kernel one float per thread.
    testKernel<<<dim3(1, 1, 1), dim3(num_threads, 1, 1), mem_size>>>(d_idata, d_odata);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));

    sdkStopTimer(&timer);
    printf("\n\tProcessing time: %f ms\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Overwrite the host input with the CPU reference and compare it with
    // the GPU output using zero tolerance, as in the original listing.
    computeGold(h_idata, num_threads);
    const bool passed = compareData(h_idata, h_odata, num_threads, 0.0f, 0.0f);
    printf("\n\tFinish, return %s.\n", passed ? "Passed" : "Failed");

    free(h_idata);
    free(h_odata);
    CUDA_CHECK(cudaFree(d_idata));
    CUDA_CHECK(cudaFree(d_odata));

    // Wait for a key press before exiting (presumably to keep the console
    // window open when started from an IDE).
    getchar();
    return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}

▶ 输出结果:

    Start.

    Processing time: 101.169357 ms

    Finish, return Passed.

 

▶ 涨姿势:没有

 

posted on 2017-12-09 12:44  爨爨爨好  阅读(274)  评论(0)  编辑  收藏  举报