基于共享内存的位图

 基于共享内存的位图,项目打包下载

 1 /*
 2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 3 *
 4 * NVIDIA Corporation and its licensors retain all intellectual property and
 5 * proprietary rights in and to this software and related documentation.
 6 * Any use, reproduction, disclosure, or distribution of this software
 7 * and related documentation without an express license agreement from
 8 * NVIDIA Corporation is strictly prohibited.
 9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 */
15 #include <GL\glut.h>
16 #include "cuda.h"
17 #include "cuda_runtime.h"
18 #include "device_launch_parameters.h"
19 #include "cuda.h"
20 #include "../common/book.h"
21 #include "../common/cpu_bitmap.h"
22 
23 
24 #define DIM 1024
25 #define PI 3.1415926535897932f
26 
 27 __global__ void kernel(unsigned char *ptr) {  // one thread per pixel; the shared[16][16] tile and the "15 -" indices below require 16 x 16 threads per block
 28     // map from threadIdx/BlockIdx to pixel position
 29     int x = threadIdx.x + blockIdx.x * blockDim.x;
 30     int y = threadIdx.y + blockIdx.y * blockDim.y;
 31     int offset = x + y * blockDim.x * gridDim.x;  // row-major pixel index: blockDim.x * gridDim.x is the number of threads (pixels) in one row
 32     // per-block scratch tile: each thread of the block owns exactly one element
 33     __shared__ float    shared[16][16];
 34 
 35     // now calculate the value at that position: the pattern repeats every `period` pixels along x and y
 36     const float period = 128.0f;
 37     // each (sinf(...) + 1.0f) factor lies in [0, 2], so 255 * product / 4 lies in [0, 255]
 38     shared[threadIdx.x][threadIdx.y] =
 39         255 * (sinf(x*2.0f*PI / period) + 1.0f) *
 40         (sinf(y*2.0f*PI / period) + 1.0f) / 4.0f;
 41 
 42     // removing this syncthreads shows graphically what happens
 43     // when it doesn't exist.  this is an example of why we need it.
 44     __syncthreads();
 45     // below, each thread reads the element written by the diagonally opposite thread of its block, hence the barrier above
 46     ptr[offset * 4 + 0] = 0;  // 4 bytes per pixel; presumably RGBA order - confirm against CPUBitmap
 47     ptr[offset * 4 + 1] = shared[15 - threadIdx.x][15 - threadIdx.y];  // float value is converted to unsigned char on store
 48     ptr[offset * 4 + 2] = 0;
 49     ptr[offset * 4 + 3] = 255;
 50 }
51 
 52 // data block whose address main() passes to the CPUBitmap constructor; NOTE(review): no update routine is visible in this listing
 53 struct DataBlock {
 54     unsigned char   *dev_bitmap;  // main() stores the pointer obtained from cudaMalloc here
 55 };
56 
57 int main(void) {
58     DataBlock   data;
59     CPUBitmap bitmap(DIM, DIM, &data);
60     unsigned char    *dev_bitmap;
61 
62     HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap,
63         bitmap.image_size()));
64     data.dev_bitmap = dev_bitmap;
65 
66     dim3    grids(DIM / 16, DIM / 16);
67     dim3    threads(16, 16);
68     kernel <<<grids, threads >>>(dev_bitmap);
69 
70     HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), dev_bitmap,
71         bitmap.image_size(),
72         cudaMemcpyDeviceToHost));
73 
74     HANDLE_ERROR(cudaFree(dev_bitmap));
75 
76     bitmap.display_and_exit();
77 }

kernel 函数中的 __syncthreads()(原文中加粗标红)在去掉和加上时,得到的效果图是不一样的。

取消时:

加上时:

 

这也说明了加上同步的重要性。

 

抛砖引玉

这个

int offset = x + y * blockDim.x * gridDim.x;

 以及

1 ptr[offset * 4 + 0] = 0;
2 ptr[offset * 4 + 1] = shared[15 - threadIdx.x][15 - threadIdx.y];
3 ptr[offset * 4 + 2] = 0;
4 ptr[offset * 4 + 3] = 255;

如何理解?!

posted @ 2014-09-22 16:35  青竹居士  阅读(360)  评论(0)  编辑  收藏  举报