▶ 简单地将 __device__ 函数放到另一个 .cu 文件中,利用头文件引用到主体 .cu 中来,编译时共同编译(分离编译设备代码需要为 nvcc 开启可重定位设备代码,即 -rdc=true)。
▶ 源代码,把 C++ 的部分去掉了
// simpleDeviceLibrary.cuh
// Declarations of the device functions provided by the separately compiled
// device library; include this from any .cu file that references them.
#ifndef SIMPLE_DEVICE_LIBRARY_CUH
#define SIMPLE_DEVICE_LIBRARY_CUH

extern "C" {

// Returns number * 2.
__device__ float multiplyByTwo(float number);

// Returns number / 2.
__device__ float divideByTwo(float number);

}

#endif
// simpleDeviceLibrary.cu
// Definitions of the two device functions exported with C linkage.
#include <cuda_runtime.h>

// Doubles the argument.
extern "C" __device__ float multiplyByTwo(float number)
{
    const float doubled = 2.0f * number;
    return doubled;
}

// Halves the argument (multiplies by 0.5 rather than dividing).
extern "C" __device__ float divideByTwo(float number)
{
    const float halved = 0.5f * number;
    return halved;
}
// simpleSeparateCompilation.cu
// Calls device functions that live in a separately compiled .cu file through
// function pointers, and checks that multiply-by-two followed by
// divide-by-two restores the original data.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include "simpleDeviceLibrary.cuh"

#define EPS 1e-5

typedef float(*deviceFunc)(float);

// Device-resident function pointers. Host code cannot take a usable address
// of a __device__ function directly, so the addresses are stored in
// __device__ variables here and read back with cudaMemcpyFromSymbol.
__device__ deviceFunc dMultiplyByTwoPtr = multiplyByTwo;
__device__ deviceFunc dDivideByTwoPtr = divideByTwo;

// Applies f to each of the first `size` elements of v, in place.
// Expects a 1D launch; threads whose global index is >= size do nothing.
//   v    - device pointer to at least `size` floats
//   f    - device function pointer (must be an address valid in device code)
//   size - number of elements to transform
__global__ void transformVector(float *v, deviceFunc f, unsigned int size)
{
    unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size)
        v[tid] = (*f)(v[tid]);
}

// Reports a failed CUDA call on stderr. Returns 1 on success, 0 on failure.
static int cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return 1;
    fprintf(stderr, "\nCUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return 0;
}

// Launches transformVector with enough blocks to cover `size` elements and
// checks for launch-configuration errors. Returns 1 on success, 0 on failure.
static int launchTransform(float *dVector, deviceFunc f, unsigned int size)
{
    const unsigned int threadsPerBlock = 256;
    if (size == 0)
        return 1; // nothing to do; a zero-block launch would be rejected
    const unsigned int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; // ceil-div
    transformVector<<<blocks, threadsPerBlock>>>(dVector, f, size);
    return cudaSucceeded(cudaGetLastError(), "transformVector launch");
}

// Uploads hVector, applies multiplyByTwo then divideByTwo on the device, and
// downloads the result into hResultVector. Returns 1 on success, 0 on failure.
static int roundTripOnDevice(const float *hVector, float *hResultVector, float *dVector, unsigned int size)
{
    const size_t bytes = sizeof(float) * size;
    deviceFunc hFunctionPtr = NULL; // host copy of a device function address, passed as a kernel argument

    if (!cudaSucceeded(cudaMemcpy(dVector, hVector, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)"))
        return 0;

    if (!cudaSucceeded(cudaMemcpyFromSymbol(&hFunctionPtr, dMultiplyByTwoPtr, sizeof(deviceFunc)), "cudaMemcpyFromSymbol (dMultiplyByTwoPtr)"))
        return 0;
    if (!launchTransform(dVector, hFunctionPtr, size))
        return 0;

    if (!cudaSucceeded(cudaMemcpyFromSymbol(&hFunctionPtr, dDivideByTwoPtr, sizeof(deviceFunc)), "cudaMemcpyFromSymbol (dDivideByTwoPtr)"))
        return 0;
    if (!launchTransform(dVector, hFunctionPtr, size))
        return 0;

    // Surface asynchronous kernel execution errors before reading results.
    if (!cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize"))
        return 0;

    return cudaSucceeded(cudaMemcpy(hResultVector, dVector, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
}

// Runs the round-trip test on device 0.
// Returns 1 if every element matches the input within EPS, 0 on any CUDA
// error or data mismatch.
int test()
{
    const unsigned int size = 1000;
    float hVector[size], hResultVector[size];
    float *dVector = NULL;

    if (!cudaSucceeded(cudaSetDevice(0), "cudaSetDevice"))
        return 0;

    for (unsigned int i = 0; i < size; ++i)
    {
        hVector[i] = rand() / (float)RAND_MAX;
        hResultVector[i] = 0.0f;
    }

    if (!cudaSucceeded(cudaMalloc((void **)&dVector, size * sizeof(float)), "cudaMalloc"))
        return 0;

    int ok = roundTripOnDevice(hVector, hResultVector, dVector, size);

    // Always release the device buffer, even when the round trip failed.
    if (!cudaSucceeded(cudaFree(dVector), "cudaFree"))
        ok = 0;
    if (!ok)
        return 0;

    for (int i = 0; i < (int)size; ++i)
    {
        if (fabs(hVector[i] - hResultVector[i]) > EPS)
        {
            printf("\nError at i == %d, hVector[i] == %f, hResultVector[i] == %f", i, hVector[i], hResultVector[i]);
            return 0;
        }
    }
    return 1;
}

// Runs the test, prints Pass/Fail, waits for a key press, and reports the
// outcome through the process exit status.
int main()
{
    printf("\n\tStart.\n");
    const int passed = test();
    printf("\n\tFinish: %s\n", passed ? "Pass" : "Fail");
    getchar();
    return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}
● 输出结果:
Start.
Finish: Pass
▶ 涨姿势
// cuda_runtime_api.h
// __dv(v) expands to "= v", i.e. it supplies a default argument value.
#define __dv(v) \
        = v

// Copies count bytes, starting offset bytes past the location of symbol, into dst.
// offset defaults to 0 and kind defaults to device-to-host.
extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));