Step 1
// Todo 1
// Allocate host memory for pointers [*h_x, *h_y, *h_z] using cudaMallocHost.
// Page-locked (pinned) memory is required for cudaMemcpyAsync to actually
// overlap with kernel execution in Todo 3 below.
CHECK(cudaMallocHost((void **)&h_x, N * sizeof(DTYPE)));
CHECK(cudaMallocHost((void **)&h_y, N * sizeof(DTYPE)));
CHECK(cudaMallocHost((void **)&h_z, N * sizeof(DTYPE)));
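The CHECK macro wrapping these calls isn't defined in this section; a minimal sketch, assuming it is the usual error-checking wrapper from the lab scaffold:

#include <stdio.h>
#include <stdlib.h>

// Assumed definition; the scaffold's actual macro may differ.
#define CHECK(call)                                              \
do {                                                             \
    const cudaError_t err = (call);                              \
    if (err != cudaSuccess) {                                    \
        fprintf(stderr, "CUDA error: %s at %s:%d\n",             \
                cudaGetErrorString(err), __FILE__, __LINE__);    \
        exit(EXIT_FAILURE);                                      \
    }                                                            \
} while (0)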
// Todo 2
// Free host memory pointers [*h_x, *h_y, *h_z] using cudaFreeHost
CHECK(cudaFreeHost(h_x));
CHECK(cudaFreeHost(h_y));
CHECK(cudaFreeHost(h_z));
// Todo 3
// Use multiple streams to implement the following function, so that the
// memory copies [cudaMemcpyAsync] overlap with the kernel computation.
void vec_add_multiple_streams_overlapped(const DTYPE *h_x, const DTYPE *h_y, DTYPE *h_z, const int n)
{
    DTYPE *d_x, *d_y, *d_z;
    // CPU reference result used by vec_compare below.
    DTYPE *h_z1 = (DTYPE *)malloc(sizeof(DTYPE) * n);
    for (int i = 0; i < n; i++) h_z1[i] = h_x[i] + h_y[i];
    CHECK(cudaMalloc(&d_x, n * sizeof(DTYPE)));
    CHECK(cudaMalloc(&d_y, n * sizeof(DTYPE)));
    CHECK(cudaMalloc(&d_z, n * sizeof(DTYPE)));
    cudaStream_t *stream = (cudaStream_t *)malloc(NUM_STREAMS * sizeof(cudaStream_t));
    for (int i = 0; i < NUM_STREAMS; i++) CHECK(cudaStreamCreate(&stream[i]));
    const int cnt = n / NUM_STREAMS;  // chunk size per stream (assumes NUM_STREAMS divides n)
    const int grid_size = (cnt - 1) / BLOCK_SIZE + 1;
    GpuTimer timer;
    // Time the whole pipeline (H2D copies, kernels, D2H copies); stopping the
    // timer right after the launch loop would only measure launch latency.
    timer.Start();
    for (int i = 0; i < NUM_STREAMS; i++) {
        CHECK(cudaMemcpyAsync(d_x + i * cnt, h_x + i * cnt, cnt * sizeof(DTYPE), cudaMemcpyHostToDevice, stream[i]));
        CHECK(cudaMemcpyAsync(d_y + i * cnt, h_y + i * cnt, cnt * sizeof(DTYPE), cudaMemcpyHostToDevice, stream[i]));
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
        // The stream is the 4th launch parameter; the 3rd is dynamic shared memory size.
        vec_add_kernel<<<grid_size, BLOCK_SIZE, 0, stream[i]>>>(d_x + i * cnt, d_y + i * cnt, d_z + i * cnt, cnt);
    }
    for (int i = 0; i < NUM_STREAMS; i++) {
        CHECK(cudaMemcpyAsync(h_z + i * cnt, d_z + i * cnt, cnt * sizeof(DTYPE), cudaMemcpyDeviceToHost, stream[i]));
    }
    CHECK(cudaDeviceSynchronize());
    timer.Stop();
    printf("[vec_add_multiple_streams_overlapped] Time cost: %f ms\n", timer.Elapsed());
    if (vec_compare(h_z1, h_z, n) == 1) { printf(" PASSED!\n"); } else { printf(" FAILED\n"); }
    for (int i = 0; i < NUM_STREAMS; i++) CHECK(cudaStreamDestroy(stream[i]));
    free(stream);
    free(h_z1);  // allocated with malloc, so plain free (not cudaFreeHost)
    CHECK(cudaFree(d_x));
    CHECK(cudaFree(d_y));
    CHECK(cudaFree(d_z));
}
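The helpers used above, vec_add_kernel, vec_compare, and GpuTimer, come from the lab scaffold and aren't shown in this step. A minimal sketch of the first two, with signatures inferred from the call sites (an assumption, not the scaffold's actual code):

#include <math.h>  // fabs

// One thread per element; the bounds check covers the final partial block.
__global__ void vec_add_kernel(const DTYPE *x, const DTYPE *y, DTYPE *z, const int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) z[idx] = x[idx] + y[idx];
}

// Returns 1 when the arrays match elementwise within a small tolerance.
int vec_compare(const DTYPE *a, const DTYPE *b, const int n)
{
    for (int i = 0; i < n; i++)
        if (fabs((double)a[i] - (double)b[i]) > 1e-6) return 0;
    return 1;
}

GpuTimer is assumed to be the usual cudaEvent_t-based wrapper with Start/Stop/Elapsed methods.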
Step 2
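Both versions below operate on a Vector struct that isn't defined in this section. A layout consistent with how its members are used would be (an assumed definition, not necessarily the scaffold's):

// Assumed layout; the scaffold's actual definition may differ.
typedef struct {
    int length;   // number of float elements
    float *data;  // points to host, device, or managed memory depending on context
} Vector;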
// Version 1: explicit device copies. The struct and its data buffer are
// copied separately, then the device struct's data pointer is patched to
// point at the device buffer.
void gpu_vec_add(Vector *vec1, Vector *vec2, Vector *vec_out)
{
    Vector *d_vec1, *d_vec2, *d_vec_out;
    float *d_data1, *d_data2, *d_data_out;
    cudaMalloc((void **)&d_vec1, sizeof(Vector));
    cudaMalloc((void **)&d_data1, vec1->length * sizeof(float));
    cudaMemcpy(d_vec1, vec1, sizeof(Vector), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data1, vec1->data, vec1->length * sizeof(float), cudaMemcpyHostToDevice);
    // &(d_vec1->data) is pointer arithmetic on d_vec1 (no dereference on the
    // host); it addresses the data member inside the device-side struct.
    cudaMemcpy(&(d_vec1->data), &d_data1, sizeof(float *), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&d_vec2, sizeof(Vector));
    cudaMalloc((void **)&d_data2, vec2->length * sizeof(float));
    cudaMemcpy(d_vec2, vec2, sizeof(Vector), cudaMemcpyHostToDevice);
    cudaMemcpy(d_data2, vec2->data, vec2->length * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(&(d_vec2->data), &d_data2, sizeof(float *), cudaMemcpyHostToDevice);
    cudaMalloc((void **)&d_vec_out, sizeof(Vector));
    cudaMalloc((void **)&d_data_out, vec_out->length * sizeof(float));
    cudaMemcpy(d_vec_out, vec_out, sizeof(Vector), cudaMemcpyHostToDevice);
    cudaMemcpy(&(d_vec_out->data), &d_data_out, sizeof(float *), cudaMemcpyHostToDevice);
    const int grid_size = (vec1->length - 1) / BLOCK_SIZE + 1;
    kernel_vec_add<<<grid_size, BLOCK_SIZE>>>(d_vec1, d_vec2, d_vec_out);
    // cudaMemcpy on the default stream waits for the kernel to finish.
    cudaMemcpy(vec_out->data, d_data_out, vec_out->length * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_data1); cudaFree(d_data2); cudaFree(d_data_out);
    cudaFree(d_vec1);  cudaFree(d_vec2);  cudaFree(d_vec_out);
}
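kernel_vec_add isn't shown either; a sketch consistent with both launch sites in this step (signature inferred, not the scaffold's actual code):

// Assumed kernel: each thread adds one element, reading the length and data
// pointers from the device-visible Vector structs.
__global__ void kernel_vec_add(const Vector *vec1, const Vector *vec2, Vector *vec_out)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < vec_out->length)
        vec_out->data[idx] = vec1->data[idx] + vec2->data[idx];
}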
// Version 2: assumes the vectors were allocated with cudaMallocManaged
// (unified memory), so the same pointers are valid on host and device and
// no explicit copies are needed. This version replaces Version 1 in that
// setting; the two cannot coexist with the same signature.
void gpu_vec_add(Vector *vec1, Vector *vec2, Vector *vec_out)
{
    const int grid_size = (vec_out->length - 1) / BLOCK_SIZE + 1;
    kernel_vec_add<<<grid_size, BLOCK_SIZE>>>(vec1, vec2, vec_out);
    // The launch is asynchronous; synchronize before the host reads vec_out.
    cudaDeviceSynchronize();
}
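Version 2 only works if the Vector structs and their buffers are reachable from the device. A hedged usage sketch, with a hypothetical alloc_managed_vector helper (not part of the scaffold):

// Hypothetical helper: allocate a Vector whose struct and data buffer are
// both managed, so the same pointers are valid on host and device.
Vector *alloc_managed_vector(int length)
{
    Vector *vec = NULL;
    cudaMallocManaged((void **)&vec, sizeof(Vector));
    vec->length = length;
    cudaMallocManaged((void **)&vec->data, length * sizeof(float));
    return vec;
}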