// Step 1: pinned (page-locked) host memory allocation and release
// Allocate page-locked (pinned) host buffers; pinned memory is required for
// cudaMemcpyAsync to be truly asynchronous and gives higher transfer bandwidth.
CHECK (cudaMallocHost (&h_x,N*sizeof (DTYPE)));
CHECK (cudaMallocHost (&h_y,N*sizeof (DTYPE)));
CHECK (cudaMallocHost (&h_z,N*sizeof (DTYPE)));
// Pinned buffers must be released with cudaFreeHost, not free().
cudaFreeHost (h_x);
cudaFreeHost (h_y);
cudaFreeHost (h_z);
// Element-wise vector add on the GPU using NUM_STREAMS CUDA streams so the
// H2D copies, kernel execution, and D2H copies of different chunks overlap.
//
// h_x, h_y: input vectors of n elements; must be pinned (cudaMallocHost) host
//           buffers for the async copies to actually overlap with compute.
// h_z:      output vector of n elements (pinned host buffer).
// n:        element count; need not be a multiple of NUM_STREAMS.
void vec_add_multiple_streams_overlapped (const DTYPE *h_x, const DTYPE *h_y, DTYPE *h_z, const int n)
{
    DTYPE *d_x, *d_y, *d_z;

    // CPU reference result used to validate the GPU output below.
    // (The original allocated this buffer but never filled it, then compared
    // the GPU result against uninitialized memory.)
    DTYPE *h_ref = (DTYPE *) malloc (n * sizeof (DTYPE));
    for (int i = 0; i < n; i++) h_ref[i] = h_x[i] + h_y[i];

    CHECK (cudaMalloc (&d_x, n * sizeof (DTYPE)));
    CHECK (cudaMalloc (&d_y, n * sizeof (DTYPE)));
    CHECK (cudaMalloc (&d_z, n * sizeof (DTYPE)));

    // was: "cuda_Steam_t" / "sream" typos — this code could not compile.
    cudaStream_t *stream = (cudaStream_t *) malloc (NUM_STREAMS * sizeof (cudaStream_t));
    for (int i = 0; i < NUM_STREAMS; i++) CHECK (cudaStreamCreate (&stream[i]));

    // Chunk size per stream; the last chunk may be shorter when NUM_STREAMS
    // does not divide n evenly (the original silently dropped the tail).
    const int cnt = (n + NUM_STREAMS - 1) / NUM_STREAMS;

    GpuTimer timer;
    timer.Start ();
    for (int i = 0; i < NUM_STREAMS; i++) {
        const int off = i * cnt;
        if (off >= n) break;                                   // n smaller than NUM_STREAMS
        const int len = (off + cnt <= n) ? cnt : (n - off);    // tail chunk
        CHECK (cudaMemcpyAsync (d_x + off, h_x + off, len * sizeof (DTYPE), cudaMemcpyHostToDevice, stream[i]));
        CHECK (cudaMemcpyAsync (d_y + off, h_y + off, len * sizeof (DTYPE), cudaMemcpyHostToDevice, stream[i]));
        const int grid_size = (len - 1) / BLOCK_SIZE + 1;      // ceil-div
        // The 3rd launch parameter is the dynamic shared-memory size; the
        // stream is the 4th. The original passed stream[i] as shared memory,
        // so every kernel ran on the default stream and nothing overlapped.
        vec_add_kernel<<<grid_size, BLOCK_SIZE, 0, stream[i]>>>(d_x + off, d_y + off, d_z + off, len);
        CHECK (cudaGetLastError ());                           // catch bad launch configs
        // was: "cudaMemcpyDeviceToHOst" typo.
        CHECK (cudaMemcpyAsync (h_z + off, d_z + off, len * sizeof (DTYPE), cudaMemcpyDeviceToHost, stream[i]));
    }
    // All streams must finish before we stop the clock or read h_z; the
    // original stopped the timer while the kernels were still in flight.
    CHECK (cudaDeviceSynchronize ());
    timer.Stop ();
    printf ("[vec_add_multiple_streams_overlapped] Time cost: %f ms\n", timer.Elapsed ());

    if (vec_compare (h_ref, h_z, n) == 1) { printf (" PASSED!\n"); } else { printf (" FAILED\n"); }

    free (h_ref);  // malloc'd, so free() — the original called cudaFreeHost on a misspelled name
    for (int i = 0; i < NUM_STREAMS; i++) CHECK (cudaStreamDestroy (stream[i]));
    free (stream);
    CHECK (cudaFree (d_x));
    CHECK (cudaFree (d_y));
    CHECK (cudaFree (d_z));
}
// Step 2: passing a struct containing a pointer (Vector) to a kernel
// Adds vec1 and vec2 element-wise into vec_out on the GPU.
//
// Because Vector contains a host data pointer, each struct is deep-copied:
// the struct itself, then its data array, then the device struct's `data`
// member is patched to point at the device array.
//
// vec1, vec2: host inputs; vec_out: host output whose `data` buffer is
// overwritten with the result. All three are assumed to have the same
// length — TODO confirm against callers.
void gpu_vec_add (Vector *vec1, Vector *vec2, Vector *vec_out)
{
    Vector *d_vec1, *d_vec2, *d_vec_out;
    float *d_data1, *d_data2, *d_data_out;

    // vec1: struct, data array, then patch the struct's embedded pointer.
    // Note: &(d_vec1->data) is plain pointer arithmetic on a device address,
    // which is legal on the host as long as we never dereference it.
    CHECK (cudaMalloc ((void **) &d_vec1, sizeof (Vector)));
    CHECK (cudaMalloc ((void **) &d_data1, vec1->length * sizeof (float)));
    CHECK (cudaMemcpy (d_vec1, vec1, sizeof (Vector), cudaMemcpyHostToDevice));
    CHECK (cudaMemcpy (d_data1, vec1->data, vec1->length * sizeof (float), cudaMemcpyHostToDevice));
    CHECK (cudaMemcpy (&(d_vec1->data), &d_data1, sizeof (float *), cudaMemcpyHostToDevice));

    // vec2: same deep-copy pattern.
    CHECK (cudaMalloc ((void **) &d_vec2, sizeof (Vector)));
    CHECK (cudaMalloc ((void **) &d_data2, vec2->length * sizeof (float)));
    CHECK (cudaMemcpy (d_vec2, vec2, sizeof (Vector), cudaMemcpyHostToDevice));
    CHECK (cudaMemcpy (d_data2, vec2->data, vec2->length * sizeof (float), cudaMemcpyHostToDevice));
    CHECK (cudaMemcpy (&(d_vec2->data), &d_data2, sizeof (float *), cudaMemcpyHostToDevice));

    // vec_out: only the struct needs copying in; the data array is output-only.
    CHECK (cudaMalloc ((void **) &d_vec_out, sizeof (Vector)));
    CHECK (cudaMalloc ((void **) &d_data_out, vec_out->length * sizeof (float)));
    CHECK (cudaMemcpy (d_vec_out, vec_out, sizeof (Vector), cudaMemcpyHostToDevice));
    CHECK (cudaMemcpy (&(d_vec_out->data), &d_data_out, sizeof (float *), cudaMemcpyHostToDevice));

    // Ceil-div: the original `length/BLOCK_SIZE + 1` launched one extra empty
    // block whenever length was a multiple of BLOCK_SIZE.
    const int grid_size = (vec1->length - 1) / BLOCK_SIZE + 1;
    kernel_vec_add<<<grid_size, BLOCK_SIZE>>>(d_vec1, d_vec2, d_vec_out);
    CHECK (cudaGetLastError ());  // catch launch-configuration errors

    // Blocking D2H copy also synchronizes with the default-stream kernel.
    CHECK (cudaMemcpy (vec_out->data, d_data_out, vec_out->length * sizeof (float), cudaMemcpyDeviceToHost));

    // The original leaked every device allocation.
    CHECK (cudaFree (d_data1));
    CHECK (cudaFree (d_data2));
    CHECK (cudaFree (d_data_out));
    CHECK (cudaFree (d_vec1));
    CHECK (cudaFree (d_vec2));
    CHECK (cudaFree (d_vec_out));
}
// Launches kernel_vec_add directly on the given Vector pointers and blocks
// until the kernel has finished. Unlike the deep-copy variant above, no
// explicit host-to-device copies are performed, so the pointers are
// presumably device-accessible (e.g. unified memory) — TODO confirm.
void gpu_vec_add (Vector *vec1, Vector *vec2, Vector *vec_out) {
    const int length = vec_out->length;
    // Ceil-div launch configuration: enough blocks to cover every element.
    const int grid_size = (length - 1) / BLOCK_SIZE + 1;
    kernel_vec_add<<<grid_size, BLOCK_SIZE>>>(vec1, vec2, vec_out);
    // Block the host until the result is ready to read.
    cudaDeviceSynchronize ();
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)