▶ 在OpenMP的多线程程序中,各线程分别调用CUDA进行计算。OpenMP的简单示例。
▶ 源代码。已知缺陷:结果检查处连续调用了两次 cudaGetLastError()——第一次调用会清除错误状态,使第二次打印出的错误信息不正确;此外 CUDA API 调用均未做错误检查,内核也缺少越界保护,因此出错时无法正确输出结果。
// OpenMP + CUDA example: several CPU threads share the available GPU(s);
// each thread copies its own segment of a host array to the device, adds a
// constant with a small kernel, and copies the segment back.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>

// Abort with file/line context on any failed CUDA runtime call.
// Kernel launches return no status directly, so launch errors are caught
// separately with cudaGetLastError() after the <<<...>>> expression.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Adds the constant b to each of the n elements of g_a.
// Expected launch: 1-D grid, 1-D blocks; any grid that covers >= n threads
// is valid because of the bounds guard.
__global__ void kernelAddConstant(int *g_a, const int b, const unsigned int n)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)              // guard: the grid may overshoot the segment size
        g_a[idx] += b;
}

int main(int argc, char *argv[])
{
    const int num_gpus = 1;
    unsigned int n = num_gpus * 8192, nbytes = sizeof(int) * n;
    omp_set_num_threads(num_gpus); // default: one CPU thread per GPU (the
                                   // parallel region below overrides this
                                   // with num_threads(8))

    int b = 3;                     // constant added to every element
    int *a = (int *)malloc(nbytes);
    if (a == NULL)
    {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }
    for (unsigned int i = 0; i < n; i++)
        a[i] = i;

#pragma omp parallel num_threads(8) // force 8 CPU threads; % below maps
                                    // several threads onto each GPU
    {
        unsigned int thread_size = omp_get_num_threads();
        unsigned int thread_rank = omp_get_thread_num();

        int gpu_id = -1;
        CUDA_CHECK(cudaSetDevice(thread_rank % num_gpus));
        CUDA_CHECK(cudaGetDevice(&gpu_id));
        printf("CPU thread %d (of %d) uses CUDA device %d\n",
               thread_rank, thread_size, gpu_id);

        // Each thread processes its own contiguous slice of the host array.
        // NOTE: assumes n is divisible by thread_size (8192 / 8 here); the
        // ceil-div grid below keeps the launch valid even if it were not.
        unsigned int elems_per_thread = n / thread_size;
        unsigned int byte_per_kernel = elems_per_thread * sizeof(int);
        int *sub_a = a + thread_rank * elems_per_thread;

        int *d_a = NULL;
        CUDA_CHECK(cudaMalloc((void **)&d_a, byte_per_kernel));
        CUDA_CHECK(cudaMemset(d_a, 0, byte_per_kernel));
        CUDA_CHECK(cudaMemcpy(d_a, sub_a, byte_per_kernel,
                              cudaMemcpyHostToDevice));

        dim3 gpu_threads(128);
        // ceil-div so the grid covers the whole segment; the kernel's bounds
        // guard discards the overshoot.
        dim3 gpu_blocks((elems_per_thread + gpu_threads.x - 1) / gpu_threads.x);
        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b, elems_per_thread);
        CUDA_CHECK(cudaGetLastError()); // catch bad launch configuration

        // cudaMemcpy synchronizes, so it also surfaces kernel runtime errors.
        CUDA_CHECK(cudaMemcpy(sub_a, d_a, byte_per_kernel,
                              cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaFree(d_a));
    }

    // Read the sticky error ONCE: cudaGetLastError() clears the error state,
    // so calling it a second time inside printf would report "no error".
    cudaError_t last_err = cudaGetLastError();
    if (last_err != cudaSuccess)
        printf("%s\n", cudaGetErrorString(last_err));

    // Verify on the host: every element must have been incremented by b.
    for (unsigned int i = 0; i < n; i++)
    {
        if (a[i] != (int)i + b)
        {
            printf("Error at i == %u, a[i] == %d\n", i, a[i]);
            break;
        }
    }
    printf("finish!\n");

    free(a);
    getchar();
    return 0;
}