

#include <stdio.h>
#include <stdlib.h>
// Edge length of the square thread block launched by transpose() and of the
// shared-memory tile that d_transpose stages each block's data in.
#define BLOCK_DIM 5

// Tiled matrix transpose kernel (see the transpose CUDA Sample for details).
//
// Reads a row-major `height` x `width` matrix from `idata` and writes its
// row-major `width` x `height` transpose to `odata`.
//
// Launch layout: 2D blocks of BLOCK_DIM x BLOCK_DIM threads; the grid must
// cover the input, i.e. at least ceil(width / BLOCK_DIM) blocks in x and
// ceil(height / BLOCK_DIM) blocks in y. Partial edge tiles are handled by the
// bounds checks below.
//
// Shared memory: one static BLOCK_DIM x (BLOCK_DIM + 1) float tile per block.
__global__ void d_transpose(float *odata, float *idata, int width, int height)
{
    // The extra column is padding, the usual way to avoid shared-memory bank
    // conflicts when the tile is read back column-wise.
    __shared__ float block[BLOCK_DIM][BLOCK_DIM + 1];

    // Read the matrix tile into shared memory.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;

    if ((xIndex < width) && (yIndex < height))
    {
        // size_t keeps the flat offset from wrapping on large matrices.
        const size_t index_in = (size_t)yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    // Every thread reads an element written by a different thread of the
    // block, so the whole tile must be in place first. The barrier sits
    // outside the bounds checks so that all threads of the block reach it.
    __syncthreads();

    // Write the transposed matrix tile to global memory.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;

    if ((xIndex < height) && (yIndex < width))
    {
        const size_t index_out = (size_t)yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
// Print a row-major `row` x `col` float matrix to stdout.
//
//   a    - matrix elements, element (i, j) at a[i * col + j]
//   row  - number of rows
//   col  - number of columns
//   info - caption printed on its own line before the matrix (may be NULL)
//
// Each matrix row is written on its own output line.
void print_arr(float a[], int row, int col, const char *info)
{
    if (info != NULL) {
        printf("%s\n", info);
    }
    if (a == NULL) {
        return;
    }
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}
// Integer division rounded up: the smallest count of chunks of size `b`
// needed to cover `a` items.
// Precondition: b != 0 (intended for a >= 0 and b > 0).
int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}

// Transpose a 2D array (see SDK transpose example)
// Host wrapper that launches d_transpose on the default stream.
//
//   d_src  - device pointer to the row-major `height` x `width` input
//   d_dest - device pointer to the `width` x `height` output
//   width  - number of columns of the input
//   height - number of rows of the input
//
// The launch is asynchronous; a failed launch is reported on stderr and the
// error is left in place for the caller to query.
extern "C"
void transpose(float *d_src, float *d_dest, int width, int height)
{
    // An empty matrix would produce a zero-sized grid, which is an invalid
    // launch configuration; there is nothing to do in that case anyway.
    if (width <= 0 || height <= 0) {
        return;
    }

    dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    d_transpose<<< grid, threads >>>(d_dest, d_src, width, height);

    // Peek (rather than Get) so the error state is not cleared for the caller.
    cudaError_t status = cudaPeekAtLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "d_transpose launch failed: %s\n",
                cudaGetErrorString(status));
    }
}

// Abort with a diagnostic when a CUDA runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(status_));                           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

// Demo driver: fills an ny x nx matrix on the host, transposes it on the
// device with transpose(), and prints both the input and the result.
int main()
{
    const int nx = 32;  // columns of the input matrix
    const int ny = 32;  // rows of the input matrix
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc(&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc(&d_tdata, mem_size));

    // Every element of row j holds j + 0.1, so a correct transpose shows the
    // same value running down each column.
    for (int j = 0; j < ny; j++) {
        for (int i = 0; i < nx; i++) {
            h_idata[j * nx + i] = (float)(j + 0.1);
        }
    }
    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    transpose(d_idata, d_tdata, nx, ny);
    CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

    // The blocking copy also waits for the kernel and reports execution errors.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    // The input has ny rows of nx columns; the transpose has nx rows of ny.
    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_tdata);
    free(h_idata);
    return 0;
}

#include <stdio.h>
#include <cuda.h>
// Side length, in elements, of the square tile each thread block transposes.
const int TILE_DIM = 16;
// Number of thread rows per block; transposeNaive steps through its tile in
// strides of BLOCK_ROWS, so each thread copies TILE_DIM / BLOCK_ROWS elements.
const int BLOCK_ROWS = 8;
// Print a row-major `row` x `col` float matrix to stdout.
//
//   a    - matrix elements, element (i, j) at a[i * col + j]
//   row  - number of rows
//   col  - number of columns
//   info - caption printed on its own line before the matrix (may be NULL)
//
// Each matrix row is written on its own output line.
void print_arr(float a[], int row, int col, const char *info)
{
    if (info != NULL) {
        printf("%s\n", info);
    }
    if (a == NULL) {
        return;
    }
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}
// Naive matrix transpose: every thread copies elements straight from `idata`
// to their transposed position in `odata`, without staging in shared memory.
//
// Launch layout: blocks of TILE_DIM x BLOCK_ROWS threads, one block per
// TILE_DIM x TILE_DIM tile. Each thread copies TILE_DIM / BLOCK_ROWS elements
// of its tile column.
//
// Preconditions: the kernel performs no bounds checks and derives the row
// stride of both matrices from gridDim.x, so the matrix must be square with a
// side of exactly gridDim.x * TILE_DIM elements, and gridDim.y must equal
// gridDim.x.
__global__ void transposeNaive(float *odata, const float *idata)
{
    const int x = blockIdx.x * TILE_DIM + threadIdx.x;
    const int y = blockIdx.y * TILE_DIM + threadIdx.y;
    const int width = gridDim.x * TILE_DIM;

    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        odata[x * width + (y + j)] = idata[(y + j) * width + x];
    }
}
// Abort with a diagnostic when a CUDA runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(status_));                           \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

// Demo driver: fills an ny x nx matrix on the host, transposes it on the
// device with transposeNaive, and prints both the input and the result.
int main()
{
    const int nx = 32;  // columns of the input matrix
    const int ny = 32;  // rows of the input matrix
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    // transposeNaive has no bounds checks and assumes a square matrix made
    // of whole tiles, so refuse any configuration that violates that.
    if (nx != ny || nx % TILE_DIM != 0 || TILE_DIM % BLOCK_ROWS != 0) {
        fprintf(stderr, "matrix must be square with a side that is a multiple "
                        "of TILE_DIM, and TILE_DIM a multiple of BLOCK_ROWS\n");
        return EXIT_FAILURE;
    }

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc(&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc(&d_tdata, mem_size));

    // Every element of row j holds j % 32, so a correct transpose shows the
    // same value running down each column.
    for (int j = 0; j < ny; j++) {
        for (int i = 0; i < nx; i++) {
            h_idata[j * nx + i] = (float)(j % 32);
        }
    }
    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
    CUDA_CHECK(cudaMemset(d_tdata, 0, mem_size));
    transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
    CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

    // The blocking copy also waits for the kernel and reports execution errors.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    // The input has ny rows of nx columns; the transpose has nx rows of ny.
    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_tdata);
    free(h_idata);
    return 0;
}

// Blog page footer: posted 2017-05-25 17:02 by 开往春天的拖拉机 (392 views, 0 comments; edit / bookmark / report links).