nvcc编译CUDA程序入门示例
#include <stdio.h>

// Kernel: every launched thread prints one greeting line.
// Launch layout: 1 block of 10 threads (see main below).
__global__ void helloFromGPU(void)
{
    printf("Hello World from GPU!\n");
}

int main(void)
{
    printf("Hello World from CPU!\n");

    // Launch 1 block with 10 threads -> 10 lines of device output.
    helloFromGPU<<<1, 10>>>();

    // Kernel launches are asynchronous and report no error directly:
    // check for launch-configuration errors right away, then synchronize
    // explicitly so device-side printf output is flushed before exit
    // (the original relied on cudaDeviceReset's implicit sync for this).
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Tear down the CUDA context cleanly.
    cudaDeviceReset();
    return 0;
}
其中__global__表示后面的函数交由GPU处理,通常GPU编程包含以下五步
(1)分配GPU内存
(2)将CPU内存中的数据拷贝到GPU内存
(3)调用CUDA的内核函数进行处理
(4)将GPU处理完的数据拷贝给CPU
(5)释放GPU的内存
上述代码中__global__修饰的函数即为内核函数,<<<>>>代表从主机端(host)到设备端(device)的内核调用,
其中的参数10代表调用10个线程。
打开终端,进入cu文件所在的文件夹输入
nvcc -o helloworld helloworld.cu
输出结果应为
Hello World from CPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
Hello World from GPU!
这次做一下矩阵加法运算,代码如下:
#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>

using namespace std;

// Abort with a readable message if any CUDA API call fails.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            cerr << "CUDA error at " << __FILE__ << ":" << __LINE__         \
                 << ": " << cudaGetErrorString(err_) << endl;               \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Launched on a 1-D grid with enough blocks to cover n; the bounds
// check guards the tail threads when n is not a multiple of blockDim.x.
__global__ void add(const int *a, const int *b, int *c, int n)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

// Fill a[0..n) with pseudo-random ints. rand() is unseeded, so the
// sequence is deterministic across runs (fine for a demo).
void random_ints(int *a, int n)
{
    for (int i = 0; i < n; ++i) {
        a[i] = rand();
    }
}

int main()
{
    const int n = 2048 * 2048;
    const int threads_per_block = 512;
    // size_t avoids signed-int overflow of n * sizeof(int) for large n.
    const size_t size = (size_t)n * sizeof(int);

    // (1) Allocate device memory.
    int *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc((void **)&d_a, size));
    CUDA_CHECK(cudaMalloc((void **)&d_b, size));
    CUDA_CHECK(cudaMalloc((void **)&d_c, size));

    // Prepare host data; check malloc so a NULL pointer is never used.
    int *a = (int *)malloc(size);
    int *b = (int *)malloc(size);
    int *c = (int *)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        cerr << "host malloc failed" << endl;
        return EXIT_FAILURE;
    }
    random_ints(a, n);
    random_ints(b, n);

    // (2) Copy the inputs host -> device.
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // (3) Launch the kernel with ceil(n / threads_per_block) blocks.
    const int blocks = (n + threads_per_block - 1) / threads_per_block;
    add<<<blocks, threads_per_block>>>(d_a, d_b, d_c, n);
    // Check for launch errors immediately: a later successful call would
    // clear the sticky error and hide a bad launch configuration.
    CUDA_CHECK(cudaGetLastError());

    // (4) Copy the result device -> host.
    // A blocking cudaMemcpy also synchronizes with the kernel above.
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    // (5) Release device memory.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    // Print the results (all n values, as in the original demo).
    for (int i = 0; i < n; ++i) {
        cout << c[i] << ",";
    }
    cout << endl;
    cout << cudaGetErrorString(cudaGetLastError()) << endl;

    // Reclaim host resources.
    free(a);
    free(b);
    free(c);
    return 0;
}
示例虽小,五脏俱全。
流程梳理如下:
- 准备待处理数据
- 在device上分配存储空间
- 把数据从host拷贝到device
- 执行device运算
- 把结果从device拷贝回host
- 结束后释放device空间
- 回收host资源
更具体的细节后续博客中介绍。