cuda vectorized实现矩阵转置
使用了共享内存和向量化传输,目前为止效果最好的一个实现
/**
 * Transposes a row-major matrix through a shared-memory tile, using float4
 * (128-bit) global loads and stores.
 *
 * `input` is read as Y rows by X columns (element (iy, ix) at iy * X + ix);
 * `output` is written as X rows by Y columns (element (ix, iy) at ix * Y + iy).
 *
 * Launch layout and preconditions:
 *  - 2D block; each thread loads one float4, so a block covers a tile of
 *    blockDim.y rows by 4 * blockDim.x columns of `input`.
 *  - Grid: x = ceil(X / (4 * blockDim.x)), y = ceil(Y / blockDim.y).
 *  - blockDim.y must be a non-zero multiple of 4 (the store phase regroups the
 *    block into rows of blockDim.y / 4 threads).
 *  - 4 * blockDim.x * blockDim.y must not exceed 32 * 4 * 32 floats, the size
 *    of the static shared tile (i.e. at most a 32x32 block).
 *  - X and Y must be multiples of 4 and both pointers 16-byte aligned, because
 *    every global access is a float4.
 *
 * @param input   device pointer to the Y x X source matrix
 * @param output  device pointer to the X x Y destination matrix
 * @param X       number of columns of `input` (rows of `output`)
 * @param Y       number of rows of `input` (columns of `output`)
 */
__global__ void transposeSmemVec(float* input, float* output, const int X, const int Y) {
    __shared__ float smem[32 * 4 * 32];

    // Width of the tile in floats; also the row stride of the tile in smem.
    const unsigned int tile_width = blockDim.x * 4;
    const unsigned int thread_index = threadIdx.y * blockDim.x + threadIdx.x;

    // ---- Load phase: thread (tx, ty) copies input[iy][ix .. ix+3] into the tile.
    const unsigned int ix = 4 * (blockDim.x * blockIdx.x + threadIdx.x);
    const unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;

    if (ix < static_cast<unsigned int>(X) && iy < static_cast<unsigned int>(Y)) {
        // 64-bit offset so that large matrices do not wrap a 32-bit index.
        const size_t ti = static_cast<size_t>(iy) * X + ix;
        *reinterpret_cast<float4*>(&smem[thread_index * 4]) =
            *reinterpret_cast<const float4*>(&input[ti]);
    }

    // The barrier must be reached by every thread of the block, including the
    // ones that had nothing to load on an edge tile, so it lives outside the
    // bounds check.
    __syncthreads();

    // ---- Store phase: the block is regrouped so that blockDim.y / 4 consecutive
    // threads write one row of the transposed tile, one float4 each.
    const unsigned int vecs_per_row = blockDim.y / 4;
    const unsigned int new_tx = thread_index % vecs_per_row;  // float4 slot within the output row
    const unsigned int new_ty = thread_index / vecs_per_row;  // output row within the tile
    const unsigned int new_ix = blockIdx.y * blockDim.y + new_tx * 4;  // output column
    const unsigned int new_iy = blockIdx.x * tile_width + new_ty;      // output row

    // The store is guarded by the *output* coordinates: the thread that loaded
    // an element is generally not the thread that writes it back.
    if (new_ix < static_cast<unsigned int>(Y) && new_iy < static_cast<unsigned int>(X)) {
        // Gather column `new_ty` of tile rows 4*new_tx .. 4*new_tx+3. All four
        // entries were loaded above because Y is a multiple of 4.
        // NOTE(review): with a 32x32 block the row stride is 128 floats, so
        // threads of a warp that share new_ty read the same shared-memory bank;
        // padding the tile would likely help but is not done here.
        const unsigned int first_row = 4 * new_tx;
        const float4 column = make_float4(smem[(first_row + 0) * tile_width + new_ty],
                                          smem[(first_row + 1) * tile_width + new_ty],
                                          smem[(first_row + 2) * tile_width + new_ty],
                                          smem[(first_row + 3) * tile_width + new_ty]);

        const size_t to = static_cast<size_t>(new_iy) * Y + new_ix;
        *reinterpret_cast<float4*>(&output[to]) = column;
    }
}
注意:每个线程一次处理 4 个 float,所以在调用核函数的时候,grid 的 x 维度要缩小为原来的 1/4
主函数调用核函数代码
// Clear the device output and its host copy so that results left over from a
// previous kernel cannot make this run look correct.
checkRuntime(cudaMemset(d_output, 0, sizeof(float) * X * Y));
memset(gpu_ref, 0, sizeof(float) * X * Y);

// Each thread moves one float4, so a block spans 32 * 4 input columns and 32
// input rows; the grid's x extent is therefore a quarter of the scalar version.
// Assumes `block` is 32x32 and that X and Y are multiples of 4 (required by the
// kernel's float4 accesses) -- TODO confirm against the definitions in main.
dim3 grid2((X + 32 * 4 - 1) / (32 * 4), (Y + 32 - 1) / 32);

checkRuntime(cudaEventRecord(start));
transposeSmemVec<<<grid2, block>>>(d_input, d_output, X, Y);
checkRuntime(cudaEventRecord(end));
// A kernel launch returns no status itself; an invalid launch configuration is
// only visible through cudaGetLastError().
checkRuntime(cudaGetLastError());

// cudaMemcpy blocks until the copy is done, so gpu_ref holds the kernel's
// result once it returns.
checkRuntime(cudaMemcpy(gpu_ref, d_output, sizeof(float) * X * Y, cudaMemcpyDeviceToHost));
checkRuntime(cudaEventSynchronize(end));
checkRuntime(cudaEventElapsedTime(&ms, start, end));

// Bytes moved = one read plus one write of the whole matrix. Computed in double
// so that X * Y * 2 cannot overflow a 32-bit int for large matrices.
// bytes / ms / 1e6 == GB/s.
const double bytes_moved = 2.0 * static_cast<double>(X) * static_cast<double>(Y) * sizeof(float);
printf("transpose vectorialize bandwidth = %fGB/s\n", bytes_moved / ms / 1e6);
compareResult(cpu_ref, gpu_ref, X * Y);
block 的边长在主函数最前面定义为 32(即每个 block 为 32×32 个线程)。注意 grid 和 block 的设置形式是 (x, y, z),x 在最前面;而矩阵的表示方法是(行, 列),也就是 (y, x),与 block、grid 的设置顺序正好相反
向量化操作需要注意:发出访存指令的线程与它所搬运的数据之间的对应关系,在读入和写出时是不一样的,所以写出之前要重新计算线程到数据的映射
无情的摸鱼机器