一个好用的 CUDA 矩阵转置程序
这是我参考 CUDA Samples 中的 transpose 示例自己改编的一个例子:
#include <stdio.h>
#include <stdlib.h>

// Edge length (in elements) of the square tile handled by one thread block.
// Each block is launched with BLOCK_DIM x BLOCK_DIM threads. Note that 5 does
// not divide the 32x32 demo matrix, so the edge tiles are only partially
// filled and the bounds checks in the kernel are exercised.
// NOTE(review): a block of 25 threads is not a multiple of the warp size;
// a value such as 16 would probably be more efficient - confirm by profiling.
#define BLOCK_DIM 5

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if it did not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

// Transpose kernel (see the transpose CUDA Sample for details).
//
// Reads a row-major matrix of `height` rows by `width` columns from `idata`
// and writes its transpose (`width` rows by `height` columns) to `odata`.
//
// Launch layout: blockDim must be (BLOCK_DIM, BLOCK_DIM) and the grid must
// cover the matrix with ceil(width / BLOCK_DIM) x ceil(height / BLOCK_DIM)
// blocks. Uses BLOCK_DIM * (BLOCK_DIM + 1) floats of static shared memory.
// `odata` and `idata` must not refer to the same buffer.
__global__ void d_transpose(float *odata, float *idata, int width, int height)
{
    // The extra column of padding shifts consecutive rows to different
    // shared-memory banks, so the column-wise read below does not conflict.
    __shared__ float block[BLOCK_DIM][BLOCK_DIM + 1];

    const unsigned int w = (unsigned int)width;
    const unsigned int h = (unsigned int)height;

    // Read the matrix tile into shared memory.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;

    if ((xIndex < w) && (yIndex < h))
    {
        // size_t keeps the flat index from wrapping for large matrices.
        const size_t index_in = (size_t)yIndex * w + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    // Every thread of the block reaches this barrier (it is outside the
    // bounds check), so the whole tile is visible before it is read back.
    __syncthreads();

    // Write the transposed matrix tile to global memory: block coordinates
    // are swapped, and the tile is read with swapped thread indices.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;

    if ((xIndex < h) && (yIndex < w))
    {
        const size_t index_out = (size_t)yIndex * h + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}

// Prints `info` on its own line, then the row-major `row` x `col` matrix `a`,
// one matrix row per output line.
void print_arr(const float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}

// Integer division of a by b, rounded up (intended for a >= 0, b > 0).
int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}

/* Transpose a 2D array (see SDK transpose example).
 *
 * d_src  : device pointer to the input, `height` rows by `width` columns.
 * d_dest : device pointer to the output, `width` rows by `height` columns.
 *
 * The kernel is launched on the default stream and is not waited for here.
 * Nothing is launched when either dimension is not positive. A rejected
 * launch configuration terminates the program via CUDA_CHECK.
 */
extern "C" void transpose(float *d_src, float *d_dest, int width, int height)
{
    // A grid with a zero dimension is an invalid launch configuration.
    if (width <= 0 || height <= 0)
    {
        return;
    }

    dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    d_transpose<<<grid, threads>>>(d_dest, d_src, width, height);

    // Kernel launches return no status; launch errors are reported here.
    CUDA_CHECK(cudaGetLastError());
}

// Demo: fills a 32x32 matrix so that every element of row j equals j + 0.1,
// transposes it on the GPU and prints both matrices.
int main()
{
    const int nx = 32;  // number of columns of the source matrix
    const int ny = 32;  // number of rows of the source matrix
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc(&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc(&d_tdata, mem_size));

    for (int j = 0; j < ny; j++)
    {
        for (int i = 0; i < nx; i++)
        {
            h_idata[j * nx + i] = j + 0.1;  // alternative pattern: j*nx + i
        }
    }

    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    transpose(d_idata, d_tdata, nx, ny);

    // Blocking copy: it waits for the kernel and also reports any error
    // raised while the kernel was executing.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    // The source has ny rows of nx values; the transpose has nx rows of ny.
    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_idata);
    free(h_tdata);

    return 0;
}

// Author's note: there is one more program (the next listing). It is rather
// odd and I have not yet figured out how to use it; if anyone knows, I would
// be glad to discuss it.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// if it did not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

// Edge length (in elements) of the square tile processed by one thread block.
const int TILE_DIM = 16;
// Number of thread rows per block; each thread therefore handles
// TILE_DIM / BLOCK_ROWS elements of its tile column.
const int BLOCK_ROWS = 8;

// Prints `info` on its own line, then the row-major `row` x `col` matrix `a`,
// one matrix row per output line.
void print_arr(const float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}

// Naive transpose: odata = transpose(idata), going straight from global
// memory to global memory.
//
// How to use it - the kernel takes no size arguments and performs no bounds
// checks, so the launch configuration carries the whole contract:
//   * the matrix must be square with side N = gridDim.x * TILE_DIM (the same
//     `width` is used as the row stride of both the input and the output);
//   * the grid must be (N / TILE_DIM, N / TILE_DIM), so N must be a multiple
//     of TILE_DIM;
//   * the block must be (TILE_DIM, BLOCK_ROWS): the loop below advances by
//     BLOCK_ROWS, so each thread copies TILE_DIM / BLOCK_ROWS elements.
// Any other configuration reads or writes out of bounds, or leaves part of
// the output unwritten. `odata` and `idata` must not be the same buffer.
__global__ void transposeNaive(float *odata, const float *idata)
{
    const int x = blockIdx.x * TILE_DIM + threadIdx.x;
    const int y = blockIdx.y * TILE_DIM + threadIdx.y;
    const int width = gridDim.x * TILE_DIM;

    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
    {
        odata[x * width + (y + j)] = idata[(y + j) * width + x];
    }
}

// Demo: fills a 32x32 matrix so that every element of row j equals j % 32,
// transposes it on the GPU with transposeNaive and prints both matrices.
int main()
{
    const int nx = 32;  // number of columns
    const int ny = 32;  // number of rows
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    // transposeNaive cannot check its own preconditions, so validate them
    // here before launching.
    if (nx != ny || nx % TILE_DIM != 0 || TILE_DIM % BLOCK_ROWS != 0)
    {
        fprintf(stderr,
                "transposeNaive needs a square matrix whose side is a "
                "multiple of %d (got %d x %d)\n", TILE_DIM, nx, ny);
        return EXIT_FAILURE;
    }

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc(&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc(&d_tdata, mem_size));

    for (int j = 0; j < ny; j++)
    {
        for (int i = 0; i < nx; i++)
        {
            h_idata[j * nx + i] = j % 32;  // alternative pattern: j*nx + i
        }
    }

    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

    CUDA_CHECK(cudaMemset(d_tdata, 0, mem_size));
    transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
    // Kernel launches return no status; launch errors are reported here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: it waits for the kernel and also reports any error
    // raised while the kernel was executing.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_idata);
    free(h_tdata);

    return 0;
}