PCL+CUDA编程（一）

点云操作对运算资源的消耗非常高，而利用GPU并行运算的优势可以缓解这个问题。下面我将跟大家分享利用CUDA处理PCL点云数据的一些经验。

首先举一个简单的例子说明CUDA程序是如何运作的。

我们先写一个简单的C++程序helloworld.cpp

 1 /*
 2  * helloworld.cpp
 3  *
 4  *  Created on: Nov 25, 2016
 5  *      Author: lzp
 6  */
 7 
 8 #include <iostream>
 9 
10 #include <addition.h>
11 
12 
13 int main(int argc, char** argv)
14 {
15     int a=1,b=2,c;
16 
17     if(addition(a,b,&c))
18         std::cout<<"c="<<c<<std::endl;
19     else
20         std::cout<<"Addition failed!"<<std::endl;
21 
22     return 0;
23 }

我们将利用addition()函数将a和b相加，然后由c储存它们的和。
addition()函数在头文件addition.h中声明：

 1 /*
 2  * addition.h
 3  *
 4  *  Created on: Nov 25, 2016
 5  *      Author: lzp
 6  */
 7 
 8 #ifndef INCLUDES_ADDITION_H_
 9 #define INCLUDES_ADDITION_H_
10 
11 /*check if the compiler is of C++*/
12 #ifdef __cplusplus
13 extern "C" bool addition(int a, int b, int *c);
14 
15 #endif
16 
17 
18 
19 #endif /* INCLUDES_ADDITION_H_ */

修饰符extern "C"是CUDA和C++混合编程时必须的。然后我们来看addition()在CUDA上的实现：

 1 #include <addition.h>
 2 __global__ void add(int *a, int *b, int *c)
 3 {
 4     *c=*a+*b;
 5 }
 6 
 7 extern "C" bool addition(int a, int b, int *c)
 8 {
 9     int *d_a, *d_b, *d_c;
10     int size=sizeof(int);
11     
12     cudaMalloc((void **)&d_a, size);
13     cudaMalloc((void **)&d_b, size);
14     cudaMalloc((void **)&d_c, size);
15     
16     cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
17     cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
18     
19     add<<<1,1>>>(d_a, d_b, d_c);
20     
21     cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
22     
23     cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
24     return true;
25 }

其中，带有__global__修饰符的函数称为“核函数”，它负责处理GPU内存里的数据，是并行计算发生的地方。而bool addition(int a, int b, int *c)则负责CPU和GPU之间（也就是Host和Device之间）的数据传输。

最后，编写CMake文件编译。

cmake_minimum_required(VERSION 2.6 FATAL_ERROR)

project(helloworld)

find_package(CUDA REQUIRED)

include_directories(../../includes)

cuda_add_executable (helloworld helloworld.cpp addition.cu)

其中include_directories的参数为.h文件所在的目录。

下面我们用相同的程序结构，写一个最简单的例子，用CUDA对PCL点云中的一个点进行操作。

 1 /*
 2  * pcl_points_gpu.cpp
 3  *
 4  *  Created on: Nov 24, 2016
 5  *      Author: lzp
 6  */
 7 
 8 
 9 
10 #include <gpu_draw_cloud.h>
11 #include <pcl/io/pcd_io.h>
12 
13 int main(int argc, char** argv)
14 {
15     pcl::PointCloud<pcl::PointXYZRGB> cloud;
16     pcl::gpu::DeviceArray<pcl::PointXYZRGB> cloud_device;
17 
18 
19     cloud.width = 1;
20     cloud.height =1;
21     cloud.is_dense=false;
22     cloud.points.resize(cloud.width*cloud.height);
23 
24     std::vector<float> point_val;
25 
26     for(size_t i=0; i<3*cloud.points.size(); ++i)
27     {
28         point_val.push_back(1024*rand()/(RAND_MAX+1.0f));
29     }
30 
31     for (size_t i = 0; i < cloud.points.size(); ++i) {
32         cloud.points[i].x = point_val[3 * i];
33         cloud.points[i].y = point_val[3 * i + 1];
34         cloud.points[i].z = point_val[3 * i + 2];
35     }
36 
37     std::cout<<"cloud.points="<<cloud.points[0]<<std::endl;
38 
39     cloud_device.upload(cloud.points);
40 
41     cloud2GPU(cloud_device);
42 
43     cloud_device.download(cloud.points);
44 
45     std::cout<<"cloud.points="<<cloud.points[0]<<std::endl;
46     return (0);
47 }

这段代码模仿了PCL中写点云的一个例子，生成了一个点，坐标是随机生成的。关键点是pcl::gpu::DeviceArray<pcl::PointXYZRGB>，这是一个可以将点云传输到GPU上的桥梁。它的upload() 和download()方法相当于前面例子中的cudaMemcpy()。详情可参考PCL的源码仓库中/gpu/examples/和/gpu/octree/这两个目录的源码。

接下来是头文件：

 1 /*
 2  * gpu_draw_cloud.h
 3  *
 4  *  Created on: Nov 25, 2016
 5  *      Author: lzp
 6  */
 7 
 8 #ifndef INCLUDES_GPU_DRAW_CLOUD_H_
 9 #define INCLUDES_GPU_DRAW_CLOUD_H_
10 
11 
12 #include <iostream>
13 #include <pcl/point_types.h>
14 #include <pcl/gpu/containers/device_array.h>
15 
16 /*check if the compiler is of C++*/
17 #ifdef __cplusplus
18 
19 
20 /*
21  * Try accessing GPU with pointcloud
22  * */
23 extern "C" bool cloud2GPU(pcl::gpu::DeviceArray<pcl::PointXYZRGB>& cloud_device);
24 
25 
26 #endif
27 
28 
29 #endif /* INCLUDES_GPU_DRAW_CLOUD_H_ */

然后是函数实现体：

 1 #include <gpu_draw_cloud.h>
 2 
 3 
 4 
 5 
 6 __global__ void change_points(pcl::gpu::PtrSz<pcl::PointXYZRGB> cloud_device)
 7 {
 8     cloud_device[0].x+=1;
 9     pcl::PointXYZRGB q=cloud_device.data[0];
10     printf("x=%f, y=%f, z=%f, r=%d, g=%d, b=%d \n", q.x, q.y, q.z, q.r, q.g, q.b);
11 }
12 
13 
14 
15 extern "C" bool 
16 cloud2GPU(pcl::gpu::DeviceArray<pcl::PointXYZRGB>& cloud_device)
17 {
18     change_points<<<1,1>>>(cloud_device);
19     return true;
20 }

在这个例子中，我将CPU和GPU的数据交互放到主函数中了，因此cloud2GPU函数只充当了一个调用核函数的接口。值得注意的是，在核函数的参数中，传入的pcl::gpu::DeviceArray<pcl::PointXYZRGB>隐式转换成pcl::gpu::PtrSz<pcl::PointXYZRGB>了。这两个数据类型是实现C++和CUDA混合编程的关键。

最后附上CMakeLists。

 1 project(pcl_points_gpu)
 2 
 3 find_package(PCL 1.8 REQUIRED)
 4 find_package(CUDA REQUIRED)
 5 INCLUDE(FindCUDA)
 6 
 7 include_directories(../../includes)
 8 
 9 include_directories(${PCL_INCLUDE_DIRS})
10 link_directories(${PCL_LIBRARY_DIRS})
11 add_definitions(${PCL_DEFINITIONS})
12 
13 get_directory_property(dir_defs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
14 set(vtk_flags)
15 
16 foreach(it ${dir_defs})
17     if(it MATCHES "vtk*")
18     list(APPEND vtk_flags ${it})
19     endif()
20 endforeach()
21 
22 foreach(d ${vtk_flags})
23     remove_definitions(-D${d})
24 endforeach()
25 
26 cuda_add_executable (pcl_points_gpu pcl_points_gpu.cpp gpu_draw_cloud.cu)
27 target_link_libraries (pcl_points_gpu ${PCL_LIBRARIES})

留意13-24行，如果没有这几行，nvcc编译时会报出类似这样的错误：

nvcc fatal : A single input file is required for a non-link phase when an outputfile is specified
CMake Error at pcl_points_gpu_generated_gpu_draw_cloud.cu.o.cmake:209 (message):
Error generating
XXXXXXXXXXXXXXXX./pcl_points_gpu_generated_gpu_draw_cloud.cu.o

根据 https://github.com/PointCloudLibrary/pcl/issues/776 的描述，这是VTK的一个bug所致，因此在CMake中添加了这几行脚本。

希望这些例子对刚接触PCL和CUDA的人有帮助。本人也是新手，对很多概念仍然模糊不清，望体谅。

zipeilu

PCL+CUDA编程（一）

公告