1.1 Code
/*
 * Todo:
 * Implement the kernel function while satisfying the following requirements:
 * 1.1 Utilize shared memory to achieve coalesced memory access to both the input and output matrices.
 */
__global__ void kernel_transpose_per_element_tiled(DTYPE *input, DTYPE *output, int num_rows, int num_cols)
{
    // Tile of the input staged in shared memory; the transpose happens inside the tile,
    // so both the global read and the global write below stay coalesced.
    __shared__ DTYPE Shared[BLOCK_SIZE][BLOCK_SIZE];

    int x = threadIdx.x;
    int y = threadIdx.y;

    // Coalesced read: consecutive threads (consecutive x) load consecutive elements of one input row.
    int col_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int row_idx = blockIdx.y * blockDim.y + threadIdx.y;
    if (row_idx < num_rows && col_idx < num_cols) {
        Shared[x][y] = input[row_idx * num_cols + col_idx];  // stored transposed within the tile
    }
    __syncthreads();

    // Coalesced write: the block coordinates are swapped, so consecutive x again writes
    // consecutive addresses of one row of the output matrix (num_cols x num_rows).
    col_idx = blockIdx.y * blockDim.y + threadIdx.x;
    row_idx = blockIdx.x * blockDim.x + threadIdx.y;
    if (row_idx < num_cols && col_idx < num_rows) {
        output[row_idx * num_rows + col_idx] = Shared[y][x];
    }
}
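
For completeness, the sketch below shows how this kernel could be launched from the host. It is only a minimal illustration: the matrix shape is arbitrary, error checking and data initialization are omitted, and DTYPE = float and BLOCK_SIZE = 32 are assumptions made to keep the snippet self-contained (the actual definitions come from the lab skeleton).

#include <cuda_runtime.h>

#define BLOCK_SIZE 32   // assumption: tile width defined in the lab skeleton
typedef float DTYPE;    // assumption: element type defined in the lab skeleton

// kernel_transpose_per_element_tiled as defined above

int main()
{
    const int num_rows = 1024, num_cols = 2048;   // example shape, not from the lab
    const size_t bytes = (size_t)num_rows * num_cols * sizeof(DTYPE);

    DTYPE *d_in = nullptr, *d_out = nullptr;
    cudaMalloc((void **)&d_in, bytes);
    cudaMalloc((void **)&d_out, bytes);

    // One thread per element: blockIdx.x walks the input columns and blockIdx.y the input rows,
    // matching the index arithmetic in the kernel; the grid is rounded up so the bounds checks
    // handle partial tiles at the matrix edges.
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((num_cols + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE);

    kernel_transpose_per_element_tiled<<<grid, block>>>(d_in, d_out, num_rows, num_cols);
    cudaDeviceSynchronize();

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
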
/*
 * Todo:
 * Implement the kernel function while satisfying the following requirements:
 * 2.1 Utilize shared memory to achieve coalesced memory access to both the input and output matrices.
 * 2.2 Avoid bank conflicts.
 */
__global__ void kernel_transpose_per_element_tiled_no_bank_conflicts(DTYPE *input, DTYPE *output, int num_rows, int num_cols)
{
    // Same tiling scheme as above, but the tile is padded with one extra column so that the
    // strided store Shared[x][y] no longer maps all threads of a warp onto a single bank.
    __shared__ DTYPE Shared[BLOCK_SIZE][BLOCK_SIZE + 1];

    int x = threadIdx.x;
    int y = threadIdx.y;

    // Coalesced read of one input tile, stored transposed into shared memory.
    int col_idx = blockIdx.x * blockDim.x + threadIdx.x;
    int row_idx = blockIdx.y * blockDim.y + threadIdx.y;
    if (row_idx < num_rows && col_idx < num_cols) {
        Shared[x][y] = input[row_idx * num_cols + col_idx];
    }
    __syncthreads();

    // Coalesced write of the transposed tile into the output matrix (num_cols x num_rows).
    col_idx = blockIdx.y * blockDim.y + threadIdx.x;
    row_idx = blockIdx.x * blockDim.x + threadIdx.y;
    if (row_idx < num_cols && col_idx < num_rows) {
        output[row_idx * num_rows + col_idx] = Shared[y][x];
    }
}
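
The only functional difference from the first kernel is the padding column in Shared[BLOCK_SIZE][BLOCK_SIZE + 1]. Shared memory is organized into 32 banks of 4 bytes; with a row width of BLOCK_SIZE = 32 elements (and a 4-byte DTYPE), the store Shared[x][y] makes all 32 threads of a warp access the same bank, a 32-way conflict, while a row width of 33 shifts each row by one bank so the same access pattern hits 32 different banks. The small host-side sketch below only illustrates that arithmetic under those assumptions; it prints the bank index each thread of a warp would hit in both layouts.

#include <cstdio>

#define BLOCK_SIZE 32   // assumption: tile width defined in the lab skeleton

int main()
{
    const int y = 0;    // any fixed y shows the same pattern
    for (int x = 0; x < 32; ++x) {                              // one warp: x = 0..31
        int bank_unpadded = (x * BLOCK_SIZE + y) % 32;          // row width 32: always bank y
        int bank_padded   = (x * (BLOCK_SIZE + 1) + y) % 32;    // row width 33: 32 distinct banks
        printf("x=%2d  unpadded bank=%2d  padded bank=%2d\n", x, bank_unpadded, bank_padded);
    }
    return 0;
}
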
1.2 Execution Results