| #include <stdio.h> |
| #include <stdlib.h> |
| #include <cuda.h> |
| #include <math.h> |
| #include <cuda_runtime.h> |
| #include <device_launch_parameters.h> |
| |
| typedef float FLOAT; |
| |
| double get_time(); |
| void warm_up(); |
| void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N); |
| __global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N); |
| |
| |
/* Linear index of the current block in a 2D grid: rows follow blockIdx.y, columns blockIdx.x. */
#define get_bid() (blockIdx.y * gridDim.x + blockIdx.x)
/* Global linear thread index; only the x component of the block shape is used. */
#define get_tid() (get_bid() * blockDim.x + threadIdx.x)
| |
| #define WINDOWS 0 |
| |
| #if WINDOWS |
| #include <windows.h> |
/*
 * Returns the current reading of the Windows high-resolution performance
 * counter, converted to seconds. Only differences between two readings are
 * meaningful.
 *
 * The counter frequency is queried once and cached in a function-local static.
 */
double get_time()
{
    static LARGE_INTEGER fre;
    static int init = 0;
    LARGE_INTEGER timer;

    if (init != 1)
    {
        QueryPerformanceFrequency(&fre);
        init = 1;
    }

    /* Read the tick counter itself; reading the frequency here again would
       make every call return frequency / frequency, i.e. a constant 1.0. */
    QueryPerformanceCounter(&timer);
    return (double)timer.QuadPart / (double)fre.QuadPart;
}
| |
| #else |
| |
| #include <sys/time.h> |
| #include <time.h> |
| |
/*
 * Returns the current wall-clock time from gettimeofday() as seconds,
 * with the microsecond field folded in as a fraction.
 */
double get_time()
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec + 1e-6 * (double)now.tv_usec;
}
| |
| #endif |
| |
| |
| |
/*
 * Trivial kernel used only to exercise a kernel launch: each thread adds two
 * local integers and discards the result. It reads and writes no memory
 * visible outside the thread.
 */
__global__ void warmup_knl(void)
{
    int a = 1;
    int b = 1;

    a += b;
}
| |
| |
/*
 * Launches the trivial warm-up kernel a few times so that one-time launch
 * costs are paid before any timed section.
 *
 * The launches are asynchronous; this function does not wait for them.
 * A launch failure is reported on stderr.
 */
void warm_up()
{
    /* The previous loop bound of 0 meant the kernel was never launched. */
    const int launches = 8;
    cudaError_t err;

    for (int i = 0; i < launches; ++i)
    {
        warmup_knl<<<1, 256>>>();
    }

    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "warm_up: kernel launch failed: %s\n", cudaGetErrorString(err));
    }
}
| |
| |
/*
 * CPU reference: accumulates x + y into z element by element,
 * i.e. z[i] = x[i] + y[i] + z[i] for i in [0, N).
 *
 * x, y : input arrays, read only by this function
 * z    : array updated in place
 * N    : number of elements to process; nothing is done when N <= 0
 */
void vec_add_host(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    /* The loop owns its index; the former outer `int i` was unused and shadowed. */
    for (int i = 0; i < N; ++i)
        z[i] = x[i] + y[i] + z[i];
}
| |
/*
 * GPU counterpart of vec_add_host: the thread whose global index is `gid`
 * performs z[gid] = x[gid] + y[gid] + z[gid].
 *
 * The index comes from get_tid(); threads whose index is not below N do
 * nothing, so the launch may supply more threads than elements.
 */
__global__ void vec_add_device(FLOAT* x, FLOAT* y, FLOAT* z, int N)
{
    const int gid = get_tid();

    /* Surplus threads past the end of the arrays exit immediately. */
    if (gid >= N)
        return;

    z[gid] = x[gid] + y[gid] + z[gid];
}
| |
/* Logs a failed CUDA runtime call to stderr; returns 1 when err is cudaSuccess, otherwise 0. */
static int cuda_ok(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    return 0;
}

/*
 * Benchmark driver: runs `iter` rounds of z = x + y + z over N floats on the
 * GPU and on the CPU, then prints both elapsed times and their ratio.
 *
 * Returns 0 on success and -1 when an allocation or a CUDA call fails.
 */
int main()
{
    int N = 20000000;
    /* size_t keeps the byte count from overflowing int if N is raised. */
    size_t nbytes = (size_t)N * sizeof(FLOAT);

    /* Square 2D grid with at least ceil(N / bs) blocks of bs threads each. */
    int bs = 256;
    int s = (int)ceil(sqrt((N + bs - 1.) / bs));
    dim3 grid = dim3(s, s);

    FLOAT* dx = NULL, *hx = NULL;
    FLOAT* dy = NULL, *hy = NULL;
    FLOAT* dz = NULL, *hz = NULL;

    int iter = 30;
    int i;
    double th, td;
    cudaError_t launch_err, sync_err;
    int rc = -1;

    warm_up();

    /* Device buffers. */
    if (!cuda_ok(cudaMalloc((void**)&dx, nbytes), "cudaMalloc dx") ||
        !cuda_ok(cudaMalloc((void**)&dy, nbytes), "cudaMalloc dy") ||
        !cuda_ok(cudaMalloc((void**)&dz, nbytes), "cudaMalloc dz") ||
        dx == NULL || dy == NULL || dz == NULL)
    {
        printf("Couldn't allocate GPU Memory");
        goto cleanup;
    }

    /* Host buffers. */
    hx = (FLOAT*)malloc(nbytes);
    hy = (FLOAT*)malloc(nbytes);
    hz = (FLOAT*)malloc(nbytes);

    if (hx == NULL || hy == NULL || hz == NULL)
    {
        printf("Couldn't allocate CPU Memory");
        /* Stop here: continuing would dereference a NULL buffer below. */
        goto cleanup;
    }

    for (i = 0; i < N; ++i)
    {
        hx[i] = 1;
        hy[i] = 1;
        hz[i] = 1;
    }

    if (!cuda_ok(cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy dx") ||
        !cuda_ok(cudaMemcpy(dy, hy, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy dy") ||
        !cuda_ok(cudaMemcpy(dz, hz, nbytes, cudaMemcpyHostToDevice), "cudaMemcpy dz"))
    {
        goto cleanup;
    }

    warm_up();

    /* Drain all pending work so it is not charged to the timed section. */
    if (!cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize before timing"))
        goto cleanup;

    /* GPU timing. Kernel launches return immediately, so the clock may only
       be stopped after the device has finished every queued kernel. */
    td = get_time();
    for (i = 0; i < iter; ++i)
        vec_add_device<<<grid, bs>>> (dx, dy, dz, N);
    launch_err = cudaGetLastError();
    sync_err = cudaDeviceSynchronize();
    td = get_time() - td;

    if (!cuda_ok(launch_err, "vec_add_device launch") ||
        !cuda_ok(sync_err, "vec_add_device execution"))
    {
        goto cleanup;
    }

    /* CPU timing. */
    th = get_time();
    for (i = 0; i < iter; ++i)
        vec_add_host(hx, hy, hz, N);
    th = get_time() - th;

    printf("GPU time: %.4f, CPU time: %.4f. SppedUp: %g \n", td, th, th/td);

    rc = 0;

cleanup:
    /* Host memory goes back through free(), device memory through cudaFree();
       both accept NULL, so partially completed setups are handled too. */
    free(hx);
    free(hy);
    free(hz);
    (void)cuda_ok(cudaFree(dx), "cudaFree dx");
    (void)cuda_ok(cudaFree(dy), "cudaFree dy");
    (void)cuda_ok(cudaFree(dz), "cudaFree dz");

    return rc;
}
| GPU time: 0.0109, CPU time: 2.6454. SppedUp: 242.811 |
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 10年+ .NET Coder 心语 ── 封装的思维:从隐藏、稳定开始理解其本质意义
· 提示词工程——AI应用必不可少的技术
· 地球OL攻略 —— 某应届生求职总结
· 字符编码:从基础到乱码解决
· SpringCloud带你走进微服务的世界