▶ Use CUDA's built-in packed half-precision type (__half2, two fp16 values stored in one 32-bit word) and its intrinsic functions (implemented with inline PTX assembly) to compute the dot product of two vectors.
▶ Source code
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_fp16.h"
#include "helper_cuda.h"

// Tree reduction of the 128-element shared array v; __forceinline__ forces inlining
__forceinline__ __device__ void reduceInShared(half2 * const v)
{
    if (threadIdx.x < 64)
        v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
    __syncthreads();
    for (int i = 32; i > 0; i /= 2)
    {
        if (threadIdx.x < 32)
            v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + i]);
        __syncthreads();
    }
}

// Multiply arrays a and b element-wise and reduce the products; h_result points to
// the per-block partial results (device memory), size is the number of half2 elements
__global__ void scalarProductKernel(half2 const * const a, half2 const * const b, float * const h_result, size_t const size)
{
    __shared__ half2 shArray[128];
    const int stride = gridDim.x * blockDim.x;

    shArray[threadIdx.x] = __float2half2_rn(0.f); // float to half2, i.e. initialize to 0

    half2 value = __float2half2_rn(0.f);
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < size; i += stride)
        value = __hfma2(a[i], b[i], value);       // half-precision fused multiply-add: value = a[i] * b[i] + value
    shArray[threadIdx.x] = value;
    __syncthreads();

    reduceInShared(shArray); // reduce to the block's partial dot product; inlining lets the shared-memory pointer be passed in

    if (threadIdx.x == 0)    // thread 0 writes the block result
    {
        half2 result = shArray[0];
        h_result[blockIdx.x] = (float)(__low2float(result) + __high2float(result));
    }
}

void generateInput(half2 * a, size_t size) // fill the array with random half2 values
{
    for (size_t i = 0; i < size; ++i)
    {
        unsigned temp = rand();
        temp &= 0x83FF83FF; // keep the sign and mantissa bits of both 16-bit halves
        temp |= 0x3C003C00; // force both 5-bit exponent fields to 01111 (unbiased 0)
        a[i] = *(half2*)&temp;
    }
}

int main(int argc, char *argv[])
{
    srand(time(NULL));
    const int blocks = 128, threads = 128;
    size_t size = blocks * threads * 16;

    int devID = 0;
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, devID);
    if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3))
    {
        printf("required GPU with compute SM 5.3 or higher.\n");
        return EXIT_WAIVED;
    }

    half2 *h_vec[2], *d_vec[2];
    float *h_result, *d_result;
    for (int i = 0; i < 2; ++i)
    {
        cudaMallocHost((void**)&h_vec[i], size * sizeof(*h_vec[i]));
        cudaMalloc((void**)&d_vec[i], size * sizeof(*d_vec[i]));
    }
    cudaMallocHost((void**)&h_result, blocks * sizeof(*h_result));
    cudaMalloc((void**)&d_result, blocks * sizeof(*d_result));
    for (int i = 0; i < 2; ++i)
    {
        generateInput(h_vec[i], size);
        cudaMemcpy(d_vec[i], h_vec[i], size * sizeof(*h_vec[i]), cudaMemcpyHostToDevice);
    }
    scalarProductKernel<<<blocks, threads>>>(d_vec[0], d_vec[1], d_result, size);
    cudaMemcpy(h_result, d_result, blocks * sizeof(*h_result), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    float result = 0;
    for (int i = 0; i < blocks; ++i)
        result += h_result[i];
    printf("Result: %f \n", result);

    for (int i = 0; i < 2; ++i)
    {
        cudaFree(d_vec[i]);
        cudaFreeHost(h_vec[i]);
    }
    cudaFree(d_result);
    cudaFreeHost(h_result);
    getchar();
    return EXIT_SUCCESS;
}
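The two masks in generateInput are the non-obvious part: `&= 0x83FF83FF` keeps only the sign and mantissa bits of each 16-bit half, and `|= 0x3C003C00` forces each 5-bit exponent field to 01111 (unbiased exponent 0), so every generated lane has magnitude in [1, 2) with a random sign. A small host-side sketch (hypothetical helper, decoding the bits by hand rather than via cuda_fp16.h) shows how one masked pattern decodes:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Hypothetical decoder for a normal (exponent field not 0 and not 31) half pattern:
// 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
static float halfBitsToFloat(unsigned short h)
{
    int sign = (h >> 15) & 0x1;
    int exp  = (h >> 10) & 0x1F;
    int man  =  h        & 0x3FF;
    float val = (1.0f + man / 1024.0f) * powf(2.0f, (float)(exp - 15));
    return sign ? -val : val;
}

int main(void)
{
    unsigned temp = rand();
    temp &= 0x83FF83FF;   // keep sign + mantissa of both 16-bit halves
    temp |= 0x3C003C00;   // force both exponent fields to 01111 (unbiased 0)
    printf("low = %f, high = %f\n",
           halfBitsToFloat((unsigned short)(temp & 0xFFFF)),
           halfBitsToFloat((unsigned short)(temp >> 16)));
    return 0;
}

Because the exponent is forced to 0, the decoded value is simply ±(1 + mantissa/1024), which keeps the per-element products small and the final sum well inside float range.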
● Output
GPU Device 0: "GeForce GTX 1070" with compute capability 6.1
Result: 853856.000000
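To sanity-check the printed result, a host-side reference (not part of the sample; the helper name is made up, and it assumes a cuda_fp16.h version whose __half2float is host-callable) can accumulate the same dot product in single precision:

// Hypothetical host-side reference: reinterpret each half2 as two halves and
// accumulate the dot product in float on the CPU.
float cpuReference(const half2 *a, const half2 *b, size_t size)
{
    const __half *ha = reinterpret_cast<const __half*>(a);
    const __half *hb = reinterpret_cast<const __half*>(b);
    float acc = 0.f;
    for (size_t i = 0; i < 2 * size; ++i)   // 2 * size individual half values
        acc += __half2float(ha[i]) * __half2float(hb[i]);
    return acc;
}

Calling cpuReference(h_vec[0], h_vec[1], size) after generateInput should land close to the GPU result; the two differ slightly because the kernel accumulates in half precision while the reference accumulates in float.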
▶ Takeaways
● CUDA's half-precision types are thin wrappers over integer storage: __half packs an unsigned short aligned to 2 bytes, and __half2 packs an unsigned int (two fp16 lanes) aligned to 4 bytes
typedef struct __align__(2) { unsigned short x; } __half;

typedef struct __align__(4) { unsigned int x; } __half2;

#ifndef CUDA_NO_HALF
typedef __half half;
typedef __half2 half2;
#endif
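A minimal device-side sketch of how these wrappers behave (the kernel name is made up for illustration; it assumes the conversion intrinsics from cuda_fp16.h): two floats are packed into one half2, and both lanes are read back from the single 32-bit word.

#include <cstdio>
#include "cuda_fp16.h"

// Hypothetical demo kernel: pack two floats into one half2 and read both lanes back.
__global__ void half2LayoutDemo()
{
    half2 v = __floats2half2_rn(1.0f, -2.0f);          // low lane = 1.0, high lane = -2.0
    unsigned bits = *reinterpret_cast<unsigned*>(&v);  // the underlying 32-bit word
    printf("sizeof(half2) = %u bytes, raw bits = 0x%08x\n",
           (unsigned)sizeof(half2), bits);
    printf("low = %f, high = %f\n", __low2float(v), __high2float(v));
}

int main()
{
    half2LayoutDemo<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}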
● About __inline__ and __forceinline__
Reference: Stack Overflow. https://stackoverflow.com/questions/19897803/forceinline-effect-at-cuda-c-device-functions
Like __forceinline in C/C++, it overrides the compiler's own inlining heuristics and forces the function to be inlined. If the function is called only once the optimization has little effect, but when it is called many times (e.g. an inlined function inside a loop) it gives a noticeable speedup. It is generally not applied to recursive functions.
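A minimal sketch (hypothetical kernel and helper names, not from the sample): a tiny __forceinline__ device helper called on every loop iteration, where forced inlining removes the per-call overhead.

// Hypothetical helper: forced inline, so each loop iteration below pays no call overhead.
__forceinline__ __device__ float fmaAccumulate(float x, float y, float acc)
{
    return x * y + acc;
}

// Hypothetical kernel: the helper is invoked once per loop iteration.
__global__ void dotKernel(const float *a, const float *b, float *out, int n)
{
    float acc = 0.f;
    for (int i = threadIdx.x + blockDim.x * blockIdx.x; i < n; i += blockDim.x * gridDim.x)
        acc = fmaAccumulate(a[i], b[i], acc);
    atomicAdd(out, acc);   // crude reduction, just for the illustration
}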
● About __CUDACC__ and __CUDA_ARCH__
■ Reference: Stack Overflow [https://stackoverflow.com/questions/8796369/cuda-and-nvcc-using-the-preprocessor-to-choose-between-float-or-double]
■ __CUDACC__ is defined whenever the code is compiled with nvcc.
■ __CUDA_ARCH__ is undefined when compiling host code (whether or not nvcc is used); when compiling device code it is defined, and its value is the compute capability given on the compile command line (e.g. 530 for -arch=sm_53).
■ Sample code (indented for readability):
#ifdef __CUDACC__
    #warning using nvcc

    template <typename T> // an ordinary kernel template
    __global__ void add(T *x, T *y, T *z)
    {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        z[idx] = x[idx] + y[idx];
    }

    #ifdef __CUDA_ARCH__
        #warning device code trajectory
        #if __CUDA_ARCH__ > 120
            #warning compiling with datatype double
            template void add<double>(double *, double *, double *);
        #else
            #warning compiling with datatype float
            template void add<float>(float *, float *, float *);
        #endif
    #else
        #warning nvcc host code trajectory
    #endif
#else
    #warning non-nvcc code trajectory
#endif
■ Compilation and output
$ ln -s cudaarch.cu cudaarch.cc
$ gcc -c cudaarch.cc -o cudaarch.o
cudaarch.cc:26:2: warning: #warning non-nvcc code trajectory
$ nvcc -arch=sm_11 -Xptxas="-v" -c cudaarch.cu -o cudaarch.cu.o
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:14:2: warning: #warning device code trajectory
cudaarch.cu:19:2: warning: #warning compiling with datatype float
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:23:2: warning: #warning nvcc host code trajectory
ptxas info : Compiling entry function '_Z3addIfEvPT_S1_S1_' for 'sm_11'
ptxas info : Used 4 registers, 12+16 bytes smem
$ nvcc -arch=sm_20 -Xptxas="-v" -c cudaarch.cu -o cudaarch.cu.o
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:14:2: warning: #warning device code trajectory
cudaarch.cu:16:2: warning: #warning compiling with datatype double
cudaarch.cu:3:2: warning: #warning using nvcc
cudaarch.cu:23:2: warning: #warning nvcc host code trajectory
ptxas info : Compiling entry function '_Z3addIdEvPT_S1_S1_' for 'sm_20'
ptxas info : Used 8 registers, 44 bytes cmem[0]
● The intrinsics used (implemented with inline PTX assembly in cuda_fp16.h)
// declares the functions as host + device code
#define __CUDA_FP16_DECL__ __host__ __device__

// convert a float to a half2 (both lanes get the same rounded value)
__CUDA_FP16_DECL__ __half2 __float2half2_rn(const float f)
{
    __half2 val;
    asm("{.reg .f16 low;\n"
        " cvt.rn.f16.f32 low, %1;\n"
        " mov.b32 %0, {low,low};}\n" : "=r"(val.x) : "f"(f));
    return val;
}

// per-lane half2 addition: a + b
#define BINARY_OP_HALF2_MACRO(name) \
do \
{ \
    __half2 val; \
    asm("{"#name".f16x2 %0,%1,%2;\n}" : "=r"(val.x) : "r"(a.x), "r"(b.x)); \
    return val; \
} \
while(0);

__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
{
    BINARY_OP_HALF2_MACRO(add);
}

// per-lane half2 fused multiply-add: a * b + c
#define TERNARY_OP_HALF2_MACRO(name) \
do \
{ \
    __half2 val; \
    asm("{"#name".f16x2 %0,%1,%2,%3;\n}" : "=r"(val.x) : "r"(a.x), "r"(b.x), "r"(c.x)); \
    return val; \
} \
while(0);

__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
{
    TERNARY_OP_HALF2_MACRO(fma.rn);
}

// convert the low half (low 16 bits) of a half2 to float
__CUDA_FP16_DECL__ float __low2float(const __half2 l)
{
    float val;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high},%1;\n"
        " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(l.x));
    return val;
}

// convert the high half (high 16 bits) of a half2 to float
__CUDA_FP16_DECL__ float __high2float(const __half2 l)
{
    float val;
    asm("{.reg .f16 low,high;\n"
        " mov.b32 {low,high},%1;\n"
        " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(l.x));
    return val;
}
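A small device-side sketch (hypothetical kernel name, assuming compute capability 5.3 or higher for the half2 arithmetic) showing the per-lane semantics the kernel above relies on: __hadd2 adds and __hfma2 computes a * b + c independently in the low and high lanes.

#include <cstdio>
#include "cuda_fp16.h"

// Hypothetical demo kernel: exercise __hadd2 and __hfma2 lane by lane.
__global__ void half2MathDemo()
{
    half2 a = __floats2half2_rn(1.5f, -0.5f);  // low lane = 1.5, high lane = -0.5
    half2 b = __floats2half2_rn(2.0f,  4.0f);
    half2 c = __float2half2_rn(1.0f);          // both lanes = 1.0

    half2 sum = __hadd2(a, b);                 // {1.5 + 2.0, -0.5 + 4.0}
    half2 fma = __hfma2(a, b, c);              // {1.5 * 2.0 + 1.0, -0.5 * 4.0 + 1.0}

    printf("sum: low = %f, high = %f\n", __low2float(sum), __high2float(sum));
    printf("fma: low = %f, high = %f\n", __low2float(fma), __high2float(fma));
}

int main()
{
    half2MathDemo<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}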