Using CUDA Streams
A CUDA stream represents a queue of GPU operations that execute in the order in which they were added to the stream. A stream can be thought of as one task on the GPU, and different tasks can run in parallel. To use CUDA streams, first choose a device that supports device overlap: a GPU with this capability can execute a kernel while simultaneously copying data between the host and the device.
This overlap capability matters because it can noticeably improve the efficiency of a GPU program. CPU memory is generally much larger than GPU memory, so for large data sets the host buffer cannot be transferred to the GPU in one piece; it must be transferred in chunks. If the GPU can execute a kernel on one chunk while the next chunk is still being transferred, this asynchronous pipelining exploits the device's overlap capability and improves overall performance.
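Whether a device supports this can be queried from its properties. Below is a minimal standalone sketch (not part of the examples that follow) that reads the relevant cudaDeviceProp fields; asyncEngineCount is the modern replacement for the legacy deviceOverlap flag:

#include "cuda_runtime.h"
#include <stdio.h>

int main()
{
    int deviceID;
    cudaDeviceProp prop;
    cudaGetDevice(&deviceID);
    cudaGetDeviceProperties(&prop, deviceID);

    // deviceOverlap is the legacy yes/no flag; asyncEngineCount reports how
    // many copy engines can run concurrently with kernel execution
    printf("deviceOverlap: %d\n", prop.deviceOverlap);
    printf("asyncEngineCount: %d\n", prop.asyncEngineCount);
    return 0;
}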
Without streams:
#include "cuda_runtime.h"
#include <iostream>
#include <stdio.h>
#include <math.h>
#include <device_launch_parameters.h>

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

__global__ void kernel(int *a, int *b, int *c)
{
    int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    // One thread per element across the full data set processed by this launch
    if (threadID < FULL_DATA_SIZE)
    {
        c[threadID] = (a[threadID] + b[threadID]) / 2;
    }
}

int main()
{
    // Start the timer
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // Allocate memory on the GPU
    cudaMalloc((void**)&dev_a, FULL_DATA_SIZE * sizeof(int));
    cudaMalloc((void**)&dev_b, FULL_DATA_SIZE * sizeof(int));
    cudaMalloc((void**)&dev_c, FULL_DATA_SIZE * sizeof(int));

    // Allocate pageable memory on the CPU
    host_a = (int*)malloc(FULL_DATA_SIZE * sizeof(int));
    host_b = (int*)malloc(FULL_DATA_SIZE * sizeof(int));
    host_c = (int*)malloc(FULL_DATA_SIZE * sizeof(int));

    // Fill the host buffers
    for (int i = 0; i < FULL_DATA_SIZE; i++)
    {
        host_a[i] = i;
        host_b[i] = FULL_DATA_SIZE - i;
    }

    // Copy the data from host to device
    cudaMemcpy(dev_a, host_a, FULL_DATA_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, host_b, FULL_DATA_SIZE * sizeof(int), cudaMemcpyHostToDevice);

    kernel<<<FULL_DATA_SIZE / 1024, 1024>>>(dev_a, dev_b, dev_c);

    // Copy the results back to the host
    cudaMemcpy(host_c, dev_c, FULL_DATA_SIZE * sizeof(int), cudaMemcpyDeviceToHost);

    // Stop the timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    std::cout << "Elapsed time: " << elapsedTime << " ms" << std::endl;

    // Print the first 10 results
    for (int i = 0; i < 10; i++)
    {
        std::cout << host_c[i] << std::endl;
    }
    getchar();

    // Memory from malloc must be released with free, not cudaFreeHost
    free(host_a);
    free(host_b);
    free(host_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}
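Note one detail before moving on: the version above allocates pageable host memory with malloc and uses synchronous cudaMemcpy. The streamed version below instead needs page-locked (pinned) host memory, because cudaMemcpyAsync can only transfer truly asynchronously from memory the OS cannot page out; with a pageable buffer the copy is staged and loses its asynchrony. A minimal sketch contrasting the two allocation styles:

#include "cuda_runtime.h"
#include <stdlib.h>

#define N (1024*1024)

int main()
{
    // Pageable allocation: fine for synchronous cudaMemcpy, but
    // cudaMemcpyAsync cannot transfer from it truly asynchronously
    int *pageable = (int*)malloc(N * sizeof(int));
    free(pageable);

    // Page-locked (pinned) allocation: the copy engine can DMA from it
    // while the CPU keeps running, which is what cudaMemcpyAsync needs
    int *pinned;
    cudaHostAlloc((void**)&pinned, N * sizeof(int), cudaHostAllocDefault);
    cudaFreeHost(pinned);
    return 0;
}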
With streams:
#include "cuda_runtime.h"
#include <iostream>
#include <stdio.h>
#include <math.h>
#include <device_launch_parameters.h>

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)

__global__ void kernel(int *a, int *b, int *c)
{
    int threadID = blockIdx.x * blockDim.x + threadIdx.x;
    // One thread per element of the current N-element chunk
    if (threadID < N)
    {
        c[threadID] = (a[threadID] + b[threadID]) / 2;
    }
}

int main()
{
    // Query the device properties
    cudaDeviceProp prop;
    int deviceID;
    cudaGetDevice(&deviceID);
    cudaGetDeviceProperties(&prop, deviceID);

    // Check whether the device supports overlap
    if (!prop.deviceOverlap)
    {
        printf("Device does not support overlap, so no speedup from streams.\n");
        return 0;
    }

    // Start the timer
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Create a CUDA stream
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    int *host_a, *host_b, *host_c;
    int *dev_a, *dev_b, *dev_c;

    // Allocate device memory for a single chunk of N elements
    cudaMalloc((void**)&dev_a, N * sizeof(int));
    cudaMalloc((void**)&dev_b, N * sizeof(int));
    cudaMalloc((void**)&dev_c, N * sizeof(int));

    // Allocate page-locked (pinned) host memory, required by cudaMemcpyAsync
    cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault);

    // Fill the host buffers
    for (int i = 0; i < FULL_DATA_SIZE; i++)
    {
        host_a[i] = i;
        host_b[i] = FULL_DATA_SIZE - i;
    }

    // Process the data in chunks of N elements: copy in, compute, copy out,
    // all enqueued asynchronously on the same stream
    for (int i = 0; i < FULL_DATA_SIZE; i += N)
    {
        cudaMemcpyAsync(dev_a, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync(dev_b, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);
        kernel<<<N / 1024, 1024, 0, stream>>>(dev_a, dev_b, dev_c);
        cudaMemcpyAsync(host_c + i, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream);
    }

    // Wait until all work in the stream has finished
    cudaStreamSynchronize(stream);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    std::cout << "Elapsed time: " << elapsedTime << " ms" << std::endl;

    // Print the first 10 results
    for (int i = 0; i < 10; i++)
    {
        std::cout << host_c[i] << std::endl;
    }
    getchar();

    // Free the stream, events, and memory
    cudaFreeHost(host_a);
    cudaFreeHost(host_b);
    cudaFreeHost(host_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    cudaStreamDestroy(stream);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}
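One caveat: operations enqueued in the same stream still execute in order, so in the single-stream version the copies and the kernel for a given chunk never actually overlap each other; the measured speedup comes largely from the pinned-memory transfers. To overlap the copy of one chunk with the kernel of another, the usual pattern interleaves chunks across two streams. Below is a sketch of that inner loop only, assuming the same kernel, macros, and pinned host buffers as in the example above; the 0/1-suffixed device buffers and stream names are hypothetical:

// Two streams, and one set of N-element device buffers per stream
cudaStream_t stream0, stream1;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);

int *dev_a0, *dev_b0, *dev_c0;
int *dev_a1, *dev_b1, *dev_c1;
cudaMalloc((void**)&dev_a0, N * sizeof(int));
cudaMalloc((void**)&dev_b0, N * sizeof(int));
cudaMalloc((void**)&dev_c0, N * sizeof(int));
cudaMalloc((void**)&dev_a1, N * sizeof(int));
cudaMalloc((void**)&dev_b1, N * sizeof(int));
cudaMalloc((void**)&dev_c1, N * sizeof(int));

// Walk the data two chunks at a time, alternating streams, so the copies
// queued on one stream can overlap the kernel running on the other
for (int i = 0; i < FULL_DATA_SIZE; i += N * 2)
{
    // Enqueue chunk i on stream0
    cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
    kernel<<<N / 1024, 1024, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
    cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);

    // Enqueue chunk i + N on stream1
    cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(dev_b1, host_b + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
    kernel<<<N / 1024, 1024, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
    cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1);
}

// Drain both streams before reading host_c
cudaStreamSynchronize(stream0);
cudaStreamSynchronize(stream1);
cudaStreamDestroy(stream0);
cudaStreamDestroy(stream1);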