▶ 使用函数 cudaMallocPitch() 和配套的函数 cudaMemcpy2D() 来使用二维数组。C 中二维数组内存分配是转化为一维数组,连贯紧凑,每次访问数组中的元素都必须从数组首元素开始遍历;而 cuda 中这样分配的二维数组内存保证了数组每一行首元素的地址值都按照 256 或 512 的倍数对齐,提高访问效率,但使得每行末尾元素与下一行首元素地址可能不连贯,使用指针寻址时要注意考虑尾部。
1 // cuda_rumtime_api.h 2 extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t widthByte, size_t height); 3 4 extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
● cudaMAllocPitch() 传入存储器指针 **devPtr,偏移值的指针 *pitch,数组行字节数 widthByte,数组行数 height。函数返回后指针指向分配的内存(每行地址对齐到 AlignByte 字节,为 256B 或 512B),偏移值指针指向的值为该行实际字节数(= sizeof(datatype) * width + alignByte - 1) / alignByte)。
● cudaMemcpy2D() 传入目标存储器的指针 *dst,目标存储器行字节数 dpitch,源存储器指针 *src,源存储器行字节数 spitch,数组行字节数 width,数组行数 height,拷贝方向 kind。这里要求存储器行字节数不小于数组行字节数,多出来的部分就是每行尾部空白部分。
● 整个测试代码。
1 #include <stdio.h> 2 #include <malloc.h> 3 #include <cuda_runtime_api.h> 4 #include "device_launch_parameters.h" 5 6 __global__ void myKernel(float* devPtr, int height, int width, int pitch) 7 { 8 int row, col; 9 float *rowHead; 10 11 for (row = 0; row < height; row++) 12 { 13 rowHead = (float*)((char*)devPtr + row * pitch); 14 15 for (col = 0; col < width; col++) 16 { 17 printf("\t%f", rowHead[col]);// 逐个打印并自增 1 18 rowHead[col]++; 19 } 20 printf("\n"); 21 } 22 } 23 24 int main() 25 { 26 size_t width = 6; 27 size_t height = 5; 28 float *h_data, *d_data; 29 size_t pitch; 30 31 h_data = (float *)malloc(sizeof(float)*width*height); 32 for (int i = 0; i < width*height; i++) 33 h_data[i] = (float)i; 34 35 printf("\n\tAlloc memory."); 36 cudaMallocPitch((void **)&d_data, &pitch, sizeof(float)*width, height); 37 printf("\n\tPitch = %d B\n", pitch); 38 39 printf("\n\tCopy to Device.\n"); 40 cudaMemcpy2D(d_data, pitch, h_data, sizeof(float)*width, sizeof(float)*width, height, cudaMemcpyHostToDevice); 41 42 myKernel << <1, 1 >> > (d_data, height, width, pitch); 43 cudaDeviceSynchronize(); 44 45 printf("\n\tCopy back to Host.\n"); 46 cudaMemcpy2D(h_data, sizeof(float)*width, d_data, pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); 47 48 for (int i = 0; i < width*height; i++) 49 { 50 printf("\t%f", h_data[i]); 51 if ((i + 1) % width == 0) 52 printf("\n"); 53 } 54 55 free(h_data); 56 cudaFree(d_data); 57 58 getchar(); 59 return 0; 60 }
● 输出结果:
Alloc memory. Pitch = 512 B Copy to Device. 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.000000 13.000000 14.000000 15.000000 16.000000 17.000000 18.000000 19.000000 20.000000 21.000000 22.000000 23.000000 24.000000 25.000000 26.000000 27.000000 28.000000 29.000000 Copy back to Host. 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.000000 13.000000 14.000000 15.000000 16.000000 17.000000 18.000000 19.000000 20.000000 21.000000 22.000000 23.000000 24.000000 25.000000 26.000000 27.000000 28.000000 29.000000 30.000000
▶ 使用函数 cudaMalloc3D() 和配套的函数 cudaMemcpy3D() 来使用三维数组。因为涉及的参数较多,需要定义一些用来传参的结构,形式上和二维数组的使用有较大差距,不好看。
● 涉及的相关代码
1 // driver_types.h 2 struct cudaArray; // cuda 数组 3 typedef struct cudaArray * cudaArray_t;// cuda 指针 4 5 struct __device_builtin__ cudaPitchedPtr 6 { 7 void *ptr; // 实际数组指针(用完后要用 cudaFree() 释放掉) 8 size_t pitch; // 数组行字节数 9 size_t xsize; // 数组列数 10 size_t ysize; // 数组行数 11 }; 12 13 struct __device_builtin__ cudaExtent 14 { 15 size_t width; // 数组行字节数 16 size_t height; // 数组行数 17 size_t depth; // 数组层数 18 }; 19 20 struct __device_builtin__ cudaPos 21 { 22 size_t x; 23 size_t y; 24 size_t z; 25 }; 26 27 struct __device_builtin__ cudaMemcpy3DParms 28 { 29 cudaArray_t srcArray; // 原数组指针 30 struct cudaPos srcPos; // 原数组偏移 31 struct cudaPitchedPtr srcPtr; // ?Pitched source memory address 32 33 cudaArray_t dstArray; // 目标数组指针 34 struct cudaPos dstPos; // 目标数组偏移 35 struct cudaPitchedPtr dstPtr; // ?Pitched destination memory address 36 37 struct cudaExtent extent; // 数组实际尺寸(去掉对齐用的空白部分) 38 enum cudaMemcpyKind kind; // 拷贝类型 39 }; 40 41 // driver_functions.h 42 static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) 43 { // 简单生成 cudaPitchedPtr 结构的方法 44 struct cudaPitchedPtr s; 45 46 s.ptr = d; 47 s.pitch = p; 48 s.xsize = xsz; 49 s.ysize = ysz; 50 51 return s; 52 } 53 54 static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) 55 { // 简单的生成 cudaPos 结构的方法 56 struct cudaPos p; 57 58 p.x = x; 59 p.y = y; 60 p.z = z; 61 62 return p; 63 } 64 65 static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) 66 { // 简单的生成 cudaExtent 结构的方法 67 struct cudaExtent e; 68 69 e.width = w; 70 e.height = h; 71 e.depth = d; 72 73 return e; 74 } 75 76 // cuda_runtime_api.h 77 extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent); 78 79 extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
● 完整的测试程序
1 #include <stdio.h> 2 #include <malloc.h> 3 #include <cuda_runtime_api.h> 4 #include "device_launch_parameters.h" 5 #include <driver_functions.h> 6 7 __global__ void myKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent) 8 { 9 float * devPtr = (float *)devPitchedPtr.ptr; 10 float *sliceHead, *rowHead; 11 // 可以定义为 char * 作面、行迁移的时候直接加减字节数,取行内元素的时候再换回 float * 12 13 for (int z = 0; z < extent.depth; z++) 14 { 15 sliceHead = (float *)((char *)devPtr + z * devPitchedPtr.pitch * extent.height); 16 for (int y = 0; y < extent.height; y++) 17 { 18 rowHead = (float*)((char *)sliceHead + y * devPitchedPtr.pitch); 19 for (int x = 0; x < extent.width / sizeof(float); x++)// extent 存储的是行有效字节数,要除以元素大小 20 { 21 printf("\t%f",rowHead[x]);// 逐个打印并自增 1 22 rowHead[x]++; 23 } 24 printf("\n"); 25 } 26 printf("\n"); 27 } 28 } 29 30 int main() 31 { 32 size_t width = 2; 33 size_t height = 3; 34 size_t depth = 4; 35 float *h_data; 36 37 cudaPitchedPtr d_data; 38 cudaExtent extent; 39 cudaMemcpy3DParms cpyParm; 40 41 h_data = (float *)malloc(sizeof(float) * width * height * depth); 42 for (int i = 0; i < width * height * depth; i++) 43 h_data[i] = (float)i; 44 45 printf("\n\tAlloc memory."); 46 extent = make_cudaExtent(sizeof(float) * width, height, depth); 47 cudaMalloc3D(&d_data, extent); 48 49 printf("\n\tCopy to Device.\n"); 50 cpyParm = {0}; 51 cpyParm.srcPtr = make_cudaPitchedPtr((void*)h_data, sizeof(float) * width, width, height); 52 cpyParm.dstPtr = d_data; 53 cpyParm.extent = extent; 54 cpyParm.kind = cudaMemcpyHostToDevice; 55 cudaMemcpy3D(&cpyParm); 56 57 myKernel << <1, 1 >> > (d_data, extent); 58 cudaDeviceSynchronize(); 59 60 printf("\n\tCopy back to Host.\n"); 61 cpyParm = { 0 }; 62 cpyParm.srcPtr = d_data; 63 cpyParm.dstPtr = make_cudaPitchedPtr((void*)h_data, sizeof(float) * width, width, height); 64 cpyParm.extent = extent; 65 cpyParm.kind = cudaMemcpyDeviceToHost; 66 cudaMemcpy3D(&cpyParm); 67 68 for (int i = 0; i < width*height*depth; i++) 69 { 70 printf("\t%f", h_data[i]); 71 if ((i + 1) % width == 0) 72 printf("\n"); 73 if ((i + 1) % (width*height) == 0) 74 printf("\n"); 75 } 76 77 free(h_data); 78 cudaFree(d_data.ptr); 79 getchar(); 80 return 0; 81 }
● 输出结果:
Alloc memory. Copy to Device. 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.000000 13.000000 14.000000 15.000000 16.000000 17.000000 18.000000 19.000000 20.000000 21.000000 22.000000 23.000000 Copy back to Host. 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.000000 13.000000 14.000000 15.000000 16.000000 17.000000 18.000000 19.000000 20.000000 21.000000 22.000000 23.000000 24.000000