▶ 分离编译【留坑,在 Linux 上用命令行试一下】
▶ 源代码:
1 // cppIntegration_gold.cpp 2 #include <vector_types.h> 3 4 extern "C" void computeGold(char *reference, char *idata, const unsigned int len); 5 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len); 6 7 void computeGold(char *reference, char *idata, const unsigned int len) 8 { 9 for (unsigned int i = 0; i < len; ++i) 10 reference[i] = idata[i] - 10; 11 } 12 13 void computeGold2(int2 *reference, int2 *idata, const unsigned int len) 14 { 15 for (unsigned int i = 0; i < len; ++i) 16 { 17 reference[i].x = idata[i].x - idata[i].y; 18 reference[i].y = idata[i].y; 19 } 20 }
// cppIntegration.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif

extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);

// Threads per block used for both kernel launches in runTest.
static const unsigned int kThreadsPerBlock = 256;

// Subtracts 10 from each of the four bytes packed in every 32-bit word.
// Launch layout: 1D grid of 1D blocks, one thread per word; threads whose
// global index is >= num_words do nothing.
// The arithmetic is done on unsigned values and every byte is masked to
// 8 bits, so a byte that wraps around (value < 10) cannot spill into the
// neighbouring bytes of the word.
__global__ void kernel(int *g_data, unsigned int num_words)
{
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_words)
        return;

    const unsigned int word = (unsigned int)g_data[tid];
    unsigned int result = 0;
    for (int shift = 0; shift < 32; shift += 8)
    {
        const unsigned int byte = (word >> shift) & 0xffu;
        result |= ((byte - 10u) & 0xffu) << shift;
    }
    g_data[tid] = (int)result;
}

// For every pair, replaces x with x - y; y is left untouched.
// Launch layout: 1D grid of 1D blocks, one thread per int2 element; threads
// whose global index is >= num_pairs do nothing.
__global__ void kernel2(int2 *g_data, unsigned int num_pairs)
{
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= num_pairs)
        return;

    const int2 data = g_data[tid];
    g_data[tid].x = data.x - data.y;
}

// Number of blocks of kThreadsPerBlock threads needed to cover `count` items.
static unsigned int blocksFor(unsigned int count)
{
    return count / kThreadsPerBlock + ((count % kThreadsPerBlock) != 0 ? 1u : 0u);
}

// Runs both kernels on the given inputs, prints the CPU reference and the GPU
// results, and overwrites `data` / `data_int2` with the GPU results.
//   data      : len bytes, processed four at a time as 32-bit words
//   data_int2 : len int2 elements
//   len       : element count of both arrays; must be a multiple of 4
// Returns true when every GPU result equals the CPU reference (and trivially
// for len == 0), false on mismatch or host allocation failure. CUDA API
// failures terminate the process through the helper_cuda.h checks.
extern "C" bool runTest(char *data, int2 *data_int2, unsigned int len)
{
    assert((len % 4) == 0);  // the char buffer is reinterpreted as 32-bit words
    if (len == 0)
        return true;         // nothing to do; a zero-sized launch would be invalid

    const unsigned int num_words = len / 4;
    // size_t so that sizeof(int2) * len cannot wrap for large len.
    const size_t mem_size = sizeof(char) * (size_t)len;
    const size_t mem_size_int2 = sizeof(int2) * (size_t)len;

    char *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
    checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));
    int2 *d_data_int2 = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
    checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2, cudaMemcpyHostToDevice));

    kernel<<<blocksFor(num_words), kThreadsPerBlock>>>((int *)d_data, num_words);
    getLastCudaError("Kernel execution failed");  // reports launch errors
    kernel2<<<blocksFor(len), kThreadsPerBlock>>>(d_data_int2, len);
    getLastCudaError("Kernel execution failed");
    // Surface errors raised while the kernels were executing.
    checkCudaErrors(cudaDeviceSynchronize());

    // CPU reference, computed from the still-unmodified host inputs.
    char *reference = (char *)malloc(mem_size);
    int2 *reference2 = (int2 *)malloc(mem_size_int2);
    if (reference == NULL || reference2 == NULL)
    {
        fprintf(stderr, "runTest: host allocation failed\n");
        free(reference);
        free(reference2);
        cudaFree(d_data);
        cudaFree(d_data_int2);
        return false;
    }

    computeGold(reference, data, len);
    // The buffer holds exactly len bytes and need not be NUL-terminated, so
    // bound the print with an explicit precision.
    printf("ref char*:%.*s\n", (int)len, reference);
    computeGold2(reference2, data_int2, len);
    printf("ref int2 :");
    for (unsigned int i = 0; i < len; i++)
        printf("%c", reference2[i].x);
    printf("\n");

    // Copy the GPU results back over the host inputs.
    checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2, cudaMemcpyDeviceToHost));
    printf("gpu char*:%.*s\n", (int)len, (char *)data);
    printf("gpu int2 :");
    for (unsigned int i = 0; i < len; i++)
        printf("%c", data_int2[i].x);
    printf("\n");

    // Compare GPU output with the CPU reference.
    bool success = true;
    for (unsigned int i = 0; i < len; i++)
    {
        if (reference[i] != data[i] ||
            reference2[i].x != data_int2[i].x ||
            reference2[i].y != data_int2[i].y)
        {
            success = false;
            break;
        }
    }

    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFree(d_data_int2));
    free(reference);
    free(reference2);
    return success;
}
1 // main.cpp 2 #include <iostream> 3 #include <cstdlib> 4 #include <cuda_runtime.h> 5 #include <vector_types.h> 6 #include <helper_cuda.h> 7 8 extern "C" bool runTest(char *data, int2 *data_int2, unsigned int len); 9 10 int main() 11 { 12 const int len = 16; 13 int2 i2[16]; // cuda 内置的 int2 类型 14 char str[len] = { 82, 111, 118, 118,121, 42, 97, 121, 124, 118, 110, 56, 10, 10, 10, 10}; 15 for (int i = 0; i < len; i++) 16 { 17 i2[i].x = str[i]; 18 i2[i].y = 10; 19 } 20 runTest(str, i2, len); 21 22 getchar(); 23 return 0; 24 }
● 输出结果:
ref char*: Hello World.
ref int2 :Hello World.
gpu char*: Hello World.
gpu int2 :Hello World.
▶ 涨姿势:
● CUDA 内置的 int2 类型,表示一个整数有序对 (x, y)。相关定义如下:
1 #define __cuda_builtin_vector_align8(tag, members) \ 2 struct __device_builtin__ __align__(8) tag \ 3 { \ 4 members \ 5 } 6 7 __cuda_builtin_vector_align8(int2, int x; int y;); 8 9 typedef __device_builtin__ struct int2 int2;
● 断言宏 assert 与错误检查函数 getLastCudaError
1 #define assert(expression) (void) \ 2 ( \ 3 (!!(expression)) || (_wassert(_CRT_WIDE(#expression), _CRT_WIDE(__FILE__), (unsigned)(__LINE__)), 0)\ 4 ) 5 6 #define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__) 7 8 inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) 9 { 10 cudaError_t err = cudaGetLastError(); 11 if (cudaSuccess != err) 12 { 13 fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", 14 file, line, errorMessage, (int)err, cudaGetErrorString(err)); 15 DEVICE_RESET 16 exit(EXIT_FAILURE); 17 } 18 }