把设备函数的实现文件和主程序文件分开编译,主程序通过头文件引用设备函数。分开编译设备代码时需要开启可重定位设备代码(例如 nvcc -rdc=true),否则主程序中的设备代码无法链接到另一个文件里定义的设备函数。
▶ 源代码
// simpleDeviceLibrary.cuh
// Declarations of the device functions defined in simpleDeviceLibrary.cu.
// Include this header from any translation unit whose device code uses them.
#ifndef SIMPLE_DEVICE_LIBRARY_CUH
#define SIMPLE_DEVICE_LIBRARY_CUH

// Returns `number` multiplied by two. Callable from device code only.
__device__ float multiplyByTwo(float number);

// Returns `number` divided by two. Callable from device code only.
__device__ float divideByTwo(float number);

#endif // SIMPLE_DEVICE_LIBRARY_CUH
// simpleDeviceLibrary.cu
// Definitions of the device functions declared in simpleDeviceLibrary.cuh.
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

// Doubles the argument.
__device__ float multiplyByTwo(float number)
{
    const float doubled = 2.0f * number;
    return doubled;
}

// Halves the argument; implemented as a multiplication by 0.5f.
__device__ float divideByTwo(float number)
{
    const float halved = 0.5f * number;
    return halved;
}
// Main program: applies device functions that are defined in a separately
// compiled file (simpleDeviceLibrary.cu) to a vector through function pointers.
// Because the device functions live in another translation unit, the program
// has to be built with relocatable device code (e.g. nvcc -rdc=true).
#include <stdio.h>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "simpleDeviceLibrary.cuh"

using std::cout;
using std::endl;
using std::vector;

// Largest absolute difference accepted between the host input and the
// value that comes back from the device.
#define EPS 1e-5

// Pointer to a device function that maps one float to one float.
typedef float(*deviceFunc)(float);

// Device-side variables initialised with the addresses of the library
// functions. Host code cannot take the address of a __device__ function, so
// the host reads these variables back with cudaMemcpyFromSymbol instead.
__device__ deviceFunc dMultiplyByTwoPtr = multiplyByTwo;
__device__ deviceFunc dDivideByTwoPtr = divideByTwo;

// Applies `f` in place to each of the first `size` elements of `v`.
// Expects a 1D launch with at least `size` threads in total; threads whose
// global index is >= size do nothing. `f` must be a device function pointer.
__global__ void transformVector(float *v, deviceFunc f, unsigned int size)
{
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size)
        v[tid] = (*f)(v[tid]);
}

// Reports a failed CUDA call on stderr. Returns true when `status` is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "\n\tCUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches transformVector over `size` elements with enough blocks to cover
// them, then waits for completion so that launch and execution errors are
// both reported. Returns false if either step fails.
static bool launchTransform(float *dVector, deviceFunc f, unsigned int size)
{
    if (size == 0)
        return true; // a grid of zero blocks is an invalid launch configuration

    const unsigned int kThreadsPerBlock = 256;
    const dim3 dimBlock(kThreadsPerBlock);
    const dim3 dimGrid((size + kThreadsPerBlock - 1) / kThreadsPerBlock); // ceil-div

    transformVector<<<dimGrid, dimBlock>>>(dVector, f, size);
    return cudaSucceeded(cudaGetLastError(), "transformVector launch")
        && cudaSucceeded(cudaDeviceSynchronize(), "transformVector execution");
}

// Copies `hInput` to `dVector`, multiplies by two and then divides by two on
// the device, and copies the result into `hOutput` (same length as hInput).
// Returns false as soon as any CUDA call fails.
static bool roundTripOnDevice(float *dVector, const vector<float> &hInput, vector<float> &hOutput)
{
    if (hInput.empty())
        return true;

    const unsigned int size = static_cast<unsigned int>(hInput.size());
    const size_t bytes = hInput.size() * sizeof(float);

    if (!cudaSucceeded(cudaMemcpy(dVector, &hInput[0], bytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy (host to device)"))
        return false;

    // Read the function pointer value stored in the __device__ variable back
    // to the host so that it can be passed to the kernel as an argument.
    deviceFunc hFunctionPtr = NULL;
    if (!cudaSucceeded(cudaMemcpyFromSymbol(&hFunctionPtr, dMultiplyByTwoPtr, sizeof(deviceFunc)),
                       "cudaMemcpyFromSymbol (dMultiplyByTwoPtr)"))
        return false;
    if (!launchTransform(dVector, hFunctionPtr, size))
        return false;

    if (!cudaSucceeded(cudaMemcpyFromSymbol(&hFunctionPtr, dDivideByTwoPtr, sizeof(deviceFunc)),
                       "cudaMemcpyFromSymbol (dDivideByTwoPtr)"))
        return false;
    if (!launchTransform(dVector, hFunctionPtr, size))
        return false;

    return cudaSucceeded(cudaMemcpy(&hOutput[0], dVector, bytes, cudaMemcpyDeviceToHost),
                         "cudaMemcpy (device to host)");
}

// Fills a vector with random values in [0, 1], multiplies and then divides it
// by two on device 0, and checks that every element comes back within EPS of
// its original value. Returns true on success, false on a mismatch or on any
// CUDA error (which is reported on stderr).
bool test()
{
    if (!cudaSucceeded(cudaSetDevice(0), "cudaSetDevice"))
        return false;

    const unsigned int kVectorSize = 1000;
    vector<float> hVector(kVectorSize);
    for (unsigned int i = 0; i < kVectorSize; ++i)
        hVector[i] = rand() / static_cast<float>(RAND_MAX);

    float *dVector = NULL;
    if (!cudaSucceeded(cudaMalloc(&dVector, kVectorSize * sizeof(float)), "cudaMalloc"))
        return false;

    vector<float> hResultVector(kVectorSize);
    bool result = roundTripOnDevice(dVector, hVector, hResultVector);

    // Release the device buffer on every path, including after a failure.
    if (!cudaSucceeded(cudaFree(dVector), "cudaFree"))
        result = false;
    if (!result)
        return false;

    // Check the result: x * 2 / 2 must reproduce x.
    for (unsigned int i = 0; i < kVectorSize; ++i)
    {
        if (std::fabs(hVector[i] - hResultVector[i]) > EPS)
        {
            printf("\n\tError at %u, gpu[i] = %f, cpu[i] = %f\n", i, hResultVector[i], hVector[i]);
            result = false;
            break;
        }
    }
    return result;
}

// Runs the test, prints whether it passed, and waits for a key press before
// exiting. The command-line arguments are not used.
int main(int argc, char **argv)
{
    printf("\n\tStart\n");
    printf("\n\tFinish, %s\n", test() ? "Passed" : "Failed");

    getchar();
    return 0;
}
▶ 输出结果
未测试
▶ 涨姿势
● 定义在其他 .cu 文件中的设备函数,不能在主机代码中直接取地址。需要先在设备端用 __device__ 变量保存函数指针,再用函数 cudaMemcpyFromSymbol() 把该指针的值从设备端拷贝到主机端,然后作为核函数的参数传入才能使用。
// Signature shared by the device functions that are passed to the kernel.
typedef float(*deviceFunc)(float);

// Host-side variable that receives the function pointer value stored in the
// __device__ variable dMultiplyByTwoPtr.
deviceFunc hFunctionPtr;
cudaMemcpyFromSymbol(&hFunctionPtr, dMultiplyByTwoPtr, sizeof(hFunctionPtr));