HIPCC编译初探
目前信创产业如火如荼,工作需要开始接触国产海光DCU。DCU与AMD同源,采用了基于GCN的Vega架构设计,后续有机会详细学习相关硬件内容。
现针对DCU适配的DCU Tool Kit环境,对hipcc进行了一次简单的测试,研究其编译器优化的效果。
测试代码段:
__global__ void kernel_test(const int *a, const int *b, int *c, unsigned n) { unsigned gid = hipThreadIdx_x; // <- coordinate index function aaa = 10; //#pragma unroll 1 for (int idx = 0; idx < 10; ++idx) { c[gid] = a[gid] + b[gid]; } }
测试环境:
指令简介:
编译器for循环展开:
1.使用参数-O0,未优化的代码很长(整段函数指令集有291行),仅截取其中一段,可以看到使用s_branch进行跳转循环,其中还调用了s_waitcnt vmcnt(0) lgkmcnt(0),较为耗时。
hip_test.cpp: 6 { 0x00002aaebc214600 <+0>: s_mov_b32 s32, 0x1c00 0x00002aaebc214608 <+8>: s_mov_b32 s33, 0 0x00002aaebc21460c <+12>: s_add_u32 flat_scratch_lo, s6, s9 0x00002aaebc214610 <+16>: s_addc_u32 flat_scratch_hi, s7, 0 0x00002aaebc214614 <+20>: s_add_u32 s0, s0, s9 0x00002aaebc214618 <+24>: s_addc_u32 s1, s1, 0 0x00002aaebc21461c <+28>: s_load_dwordx2 s[6:7], s[4:5], 0x0 0x00002aaebc214624 <+36>: s_load_dwordx2 s[8:9], s[4:5], 0x8 0x00002aaebc21462c <+44>: s_load_dwordx2 s[10:11], s[4:5], 0x10 0x00002aaebc214634 <+52>: s_load_dword s4, s[4:5], 0x18 0x00002aaebc21463c <+60>: s_mov_b32 s5, -1 0x00002aaebc214640 <+64>: v_mov_b32_e32 v3, 8 0x00002aaebc214644 <+68>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc21464c <+76>: s_mov_b64 s[14:15], 0 0x00002aaebc214650 <+80>: s_mov_b32 s16, s15 0x00002aaebc214654 <+84>: s_getreg_b32 s17, hwreg(HW_REG_SH_MEM_BASES, 0, 16) 0x00002aaebc214658 <+88>: s_lshl_b32 s17, s17, 16 0x00002aaebc21465c <+92>: v_mov_b32_e32 v4, s16 0x00002aaebc214660 <+96>: v_mov_b32_e32 v5, s17 0x00002aaebc214664 <+100>: v_cndmask_b32_e64 v4, v4, v5, s[12:13] 0x00002aaebc21466c <+108>: s_mov_b32 s18, s14 0x00002aaebc214670 <+112>: v_mov_b32_e32 v5, s18 0x00002aaebc214674 <+116>: v_cndmask_b32_e64 v3, v5, v3, s[12:13] 0x00002aaebc21467c <+124>: v_mov_b32_e32 v5, v3 0x00002aaebc214680 <+128>: v_mov_b32_e32 v6, v4 0x00002aaebc214684 <+132>: v_mov_b32_e32 v3, 16 0x00002aaebc214688 <+136>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc214690 <+144>: v_mov_b32_e32 v4, s16 0x00002aaebc214694 <+148>: v_mov_b32_e32 v7, s17 0x00002aaebc214698 <+152>: v_cndmask_b32_e64 v4, v4, v7, s[12:13] 0x00002aaebc2146a0 <+160>: v_mov_b32_e32 v7, s18 0x00002aaebc2146a4 <+164>: v_cndmask_b32_e64 v3, v7, v3, s[12:13] 0x00002aaebc2146ac <+172>: v_mov_b32_e32 v7, v3 0x00002aaebc2146b0 <+176>: v_mov_b32_e32 v8, v4 0x00002aaebc2146b4 <+180>: v_mov_b32_e32 v3, 24 0x00002aaebc2146b8 <+184>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc2146c0 <+192>: v_mov_b32_e32 v4, s16 0x00002aaebc2146c4 <+196>: v_mov_b32_e32 v9, s17 0x00002aaebc2146c8 <+200>: v_cndmask_b32_e64 v4, v4, v9, s[12:13] 0x00002aaebc2146d0 <+208>: v_mov_b32_e32 v9, s18 0x00002aaebc2146d4 <+212>: v_cndmask_b32_e64 v3, v9, v3, s[12:13] 0x00002aaebc2146dc <+220>: v_mov_b32_e32 v9, v3 0x00002aaebc2146e0 <+224>: v_mov_b32_e32 v10, v4 0x00002aaebc2146e4 <+228>: v_mov_b32_e32 v3, 32 0x00002aaebc2146e8 <+232>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc2146f0 <+240>: v_mov_b32_e32 v4, s16 0x00002aaebc2146f4 <+244>: v_mov_b32_e32 v11, s17 0x00002aaebc2146f8 <+248>: v_cndmask_b32_e64 v4, v4, v11, s[12:13] 0x00002aaebc214700 <+256>: v_mov_b32_e32 v11, s18 0x00002aaebc214704 <+260>: v_cndmask_b32_e64 v3, v11, v3, s[12:13] 0x00002aaebc21470c <+268>: v_mov_b32_e32 v11, v3 0x00002aaebc214710 <+272>: v_mov_b32_e32 v12, v4 0x00002aaebc214714 <+276>: v_mov_b32_e32 v3, 40 0x00002aaebc214718 <+280>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc214720 <+288>: v_mov_b32_e32 v4, s16 0x00002aaebc214724 <+292>: v_mov_b32_e32 v13, s17 0x00002aaebc214728 <+296>: v_cndmask_b32_e64 v4, v4, v13, s[12:13] 0x00002aaebc214730 <+304>: v_mov_b32_e32 v13, s18 0x00002aaebc214734 <+308>: v_cndmask_b32_e64 v3, v13, v3, s[12:13] 0x00002aaebc21473c <+316>: v_mov_b32_e32 v13, v3 0x00002aaebc214740 <+320>: v_mov_b32_e32 v14, v4 0x00002aaebc214744 <+324>: v_mov_b32_e32 v3, 48 0x00002aaebc214748 <+328>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc214750 <+336>: v_mov_b32_e32 v4, s16 0x00002aaebc214754 <+340>: v_mov_b32_e32 v15, s17 0x00002aaebc214758 <+344>: v_cndmask_b32_e64 v4, v4, v15, s[12:13] 0x00002aaebc214760 <+352>: v_mov_b32_e32 v15, s18 0x00002aaebc214764 <+356>: v_cndmask_b32_e64 v3, v15, v3, s[12:13] 0x00002aaebc21476c <+364>: v_mov_b32_e32 v15, v3 0x00002aaebc214770 <+368>: v_mov_b32_e32 v16, v4 0x00002aaebc214774 <+372>: v_mov_b32_e32 v3, 56 0x00002aaebc214778 <+376>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc214780 <+384>: v_mov_b32_e32 v4, s16 0x00002aaebc214784 <+388>: v_mov_b32_e32 v17, s17 0x00002aaebc214788 <+392>: v_cndmask_b32_e64 v4, v4, v17, s[12:13] 0x00002aaebc214790 <+400>: v_mov_b32_e32 v17, s18 0x00002aaebc214794 <+404>: v_cndmask_b32_e64 v3, v17, v3, s[12:13] 0x00002aaebc21479c <+412>: v_mov_b32_e32 v17, v3 0x00002aaebc2147a0 <+416>: v_mov_b32_e32 v18, v4 0x00002aaebc2147a4 <+420>: v_mov_b32_e32 v3, 60 0x00002aaebc2147a8 <+424>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc2147b0 <+432>: v_mov_b32_e32 v4, s16 0x00002aaebc2147b4 <+436>: v_mov_b32_e32 v19, s17 0x00002aaebc2147b8 <+440>: v_cndmask_b32_e64 v4, v4, v19, s[12:13] 0x00002aaebc2147c0 <+448>: v_mov_b32_e32 v19, s18 0x00002aaebc2147c4 <+452>: v_cndmask_b32_e64 v3, v19, v3, s[12:13] 0x00002aaebc2147cc <+460>: v_mov_b32_e32 v19, v3 0x00002aaebc2147d0 <+464>: v_mov_b32_e32 v20, v4 0x00002aaebc2147d4 <+468>: v_mov_b32_e32 v3, 64 0x00002aaebc2147d8 <+472>: v_cmp_ne_u32_e64 s[12:13], v3, s5 0x00002aaebc2147e0 <+480>: v_mov_b32_e32 v4, s16 0x00002aaebc2147e4 <+484>: v_mov_b32_e32 v21, s17 0x00002aaebc2147e8 <+488>: v_cndmask_b32_e64 v4, v4, v21, s[12:13] 0x00002aaebc2147f0 <+496>: v_mov_b32_e32 v21, s18 0x00002aaebc2147f4 <+500>: v_cndmask_b32_e64 v3, v21, v3, s[12:13] 0x00002aaebc2147fc <+508>: v_mov_b32_e32 v21, v3 0x00002aaebc214800 <+512>: v_mov_b32_e32 v22, v4 0x00002aaebc214804 <+516>: v_mov_b32_e32 v3, v5 0x00002aaebc214808 <+520>: v_mov_b32_e32 v4, v6 0x00002aaebc21480c <+524>: s_waitcnt lgkmcnt(0) 0x00002aaebc214810 <+528>: v_mov_b32_e32 v24, s7 0x00002aaebc214814 <+532>: v_mov_b32_e32 v23, s6 0x00002aaebc214818 <+536>: flat_store_dwordx2 v[3:4], v[23:24] 0x00002aaebc214820 <+544>: flat_load_dwordx2 v[3:4], v[5:6] 0x00002aaebc214828 <+552>: v_mov_b32_e32 v5, v7 0x00002aaebc21482c <+556>: v_mov_b32_e32 v6, v8 0x00002aaebc214830 <+560>: v_mov_b32_e32 v24, s9 0x00002aaebc214834 <+564>: v_mov_b32_e32 v23, s8 0x00002aaebc214838 <+568>: flat_store_dwordx2 v[5:6], v[23:24] 0x00002aaebc214840 <+576>: flat_load_dwordx2 v[5:6], v[7:8] 0x00002aaebc214848 <+584>: v_mov_b32_e32 v7, v9 0x00002aaebc21484c <+588>: v_mov_b32_e32 v8, v10 0x00002aaebc214850 <+592>: v_mov_b32_e32 v24, s11 0x00002aaebc214854 <+596>: v_mov_b32_e32 v23, s10 0x00002aaebc214858 <+600>: flat_store_dwordx2 v[7:8], v[23:24] 0x00002aaebc214860 <+608>: flat_load_dwordx2 v[7:8], v[9:10] 0x00002aaebc214868 <+616>: v_mov_b32_e32 v9, v11 0x00002aaebc21486c <+620>: v_mov_b32_e32 v10, v12 0x00002aaebc214870 <+624>: s_waitcnt vmcnt(0) lgkmcnt(0) 0x00002aaebc214874 <+628>: flat_store_dwordx2 v[9:10], v[3:4] 0x00002aaebc21487c <+636>: v_mov_b32_e32 v3, v13 0x00002aaebc214880 <+640>: v_mov_b32_e32 v4, v14 0x00002aaebc214884 <+644>: flat_store_dwordx2 v[3:4], v[5:6] 0x00002aaebc21488c <+652>: v_mov_b32_e32 v3, v15 0x00002aaebc214890 <+656>: v_mov_b32_e32 v4, v16 0x00002aaebc214894 <+660>: flat_store_dwordx2 v[3:4], v[7:8] 0x00002aaebc21489c <+668>: v_mov_b32_e32 v3, s4 0x00002aaebc2148a0 <+672>: flat_store_dword v[17:18], v3 7 unsigned gid = hipThreadIdx_x; // <- coordinate index function => 0x00002aaebc2148a8 <+680>: s_getpc_b64 s[4:5] 0x00002aaebc2148ac <+684>: s_add_u32 s4, s4, 0xfffff794 0x00002aaebc2148b4 <+692>: s_addc_u32 s5, s5, -1 0x00002aaebc2148bc <+700>: s_mov_b64 s[10:11], s[2:3] 0x00002aaebc2148c0 <+704>: s_mov_b64 s[8:9], s[0:1] 0x00002aaebc2148c4 <+708>: s_mov_b32 s6, 20 0x00002aaebc2148c8 <+712>: v_lshlrev_b32_e64 v2, s6, v2 0x00002aaebc2148d0 <+720>: s_mov_b32 s6, 10 0x00002aaebc2148d4 <+724>: v_lshlrev_b32_e64 v1, s6, v1 0x00002aaebc2148dc <+732>: v_or3_b32 v1, v0, v1, v2 0x00002aaebc2148e4 <+740>: v_mov_b32_e32 v0, 0 0x00002aaebc2148e8 <+744>: s_mov_b64 s[0:1], s[8:9] 0x00002aaebc2148ec <+748>: s_mov_b64 s[2:3], s[10:11] 0x00002aaebc2148f0 <+752>: buffer_store_dword v0, off, s[0:3], s33 offset:68 0x00002aaebc2148f8 <+760>: v_writelane_b32 v25, s14, 0 0x00002aaebc214900 <+768>: v_writelane_b32 v25, s15, 1 0x00002aaebc214908 <+776>: buffer_store_dword v11, off, s[0:3], s33 offset:72 0x00002aaebc214910 <+784>: buffer_store_dword v12, off, s[0:3], s33 offset:76 0x00002aaebc214918 <+792>: buffer_store_dword v13, off, s[0:3], s33 offset:80 0x00002aaebc214920 <+800>: buffer_store_dword v14, off, s[0:3], s33 offset:84 0x00002aaebc214928 <+808>: buffer_store_dword v15, off, s[0:3], s33 offset:88 0x00002aaebc214930 <+816>: buffer_store_dword v16, off, s[0:3], s33 offset:92 0x00002aaebc214938 <+824>: buffer_store_dword v19, off, s[0:3], s33 offset:96 0x00002aaebc214940 <+832>: buffer_store_dword v20, off, s[0:3], s33 offset:100 0x00002aaebc214948 <+840>: buffer_store_dword v21, off, s[0:3], s33 offset:104 0x00002aaebc214950 <+848>: buffer_store_dword v22, off, s[0:3], s33 offset:108 0x00002aaebc214958 <+856>: s_swappc_b64 s[30:31], s[4:5] 0x00002aaebc21495c <+860>: v_mov_b32_e32 v2, v0 0x00002aaebc214960 <+864>: v_mov_b32_e32 v3, v1 0x00002aaebc214964 <+868>: buffer_load_dword v0, off, s[0:3], s33 offset:96 0x00002aaebc21496c <+876>: buffer_load_dword v1, off, s[0:3], s33 offset:100 0x00002aaebc214974 <+884>: s_waitcnt vmcnt(0) 0x00002aaebc214978 <+888>: flat_store_dword v[0:1], v2 0x00002aaebc214980 <+896>: buffer_load_dword v0, off, s[0:3], s33 offset:104 0x00002aaebc214988 <+904>: buffer_load_dword v1, off, s[0:3], s33 offset:108 8 //#pragma unroll 1 9 for (int idx = 0; idx < 10; ++idx) 0x00002aaebc214990 <+912>: buffer_load_dword v2, off, s[0:3], s33 offset:68 0x00002aaebc214998 <+920>: s_waitcnt vmcnt(0) 0x00002aaebc21499c <+924>: flat_store_dword v[0:1], v2 0x00002aaebc2149a4 <+932>: v_readlane_b32 s6, v25, 0 0x00002aaebc2149ac <+940>: v_readlane_b32 s7, v25, 1 0x00002aaebc2149b4 <+948>: v_writelane_b32 v25, s4, 2 0x00002aaebc2149bc <+956>: v_writelane_b32 v25, s5, 3 0x00002aaebc2149c4 <+964>: v_writelane_b32 v25, s6, 4 0x00002aaebc2149cc <+972>: v_writelane_b32 v25, s7, 5 0x00002aaebc2149d4 <+980>: v_readlane_b32 s4, v25, 4 0x00002aaebc2149dc <+988>: v_readlane_b32 s5, v25, 5 0x00002aaebc2149e4 <+996>: v_readlane_b32 s6, v25, 2 0x00002aaebc2149ec <+1004>: v_readlane_b32 s7, v25, 3 0x00002aaebc2149f4 <+1012>: buffer_load_dword v0, off, s[0:3], s33 offset:104 0x00002aaebc2149fc <+1020>: buffer_load_dword v1, off, s[0:3], s33 offset:108 0x00002aaebc214a04 <+1028>: s_waitcnt vmcnt(0) 0x00002aaebc214a08 <+1032>: flat_load_dword v0, v[0:1] 0x00002aaebc214a10 <+1040>: s_mov_b32 s8, 10 0x00002aaebc214a14 <+1044>: s_waitcnt vmcnt(0) lgkmcnt(0) 0x00002aaebc214a18 <+1048>: v_cmp_lt_i32_e64 s[8:9], v0, s8 0x00002aaebc214a20 <+1056>: s_mov_b64 s[10:11], -1 0x00002aaebc214a24 <+1060>: s_or_b64 s[6:7], s[6:7], exec 0x00002aaebc214a28 <+1064>: s_mov_b64 s[10:11], s[6:7] 0x00002aaebc214a2c <+1068>: s_mov_b64 s[12:13], exec 0x00002aaebc214a30 <+1072>: s_and_b64 s[8:9], s[12:13], s[8:9] 0x00002aaebc214a34 <+1076>: v_writelane_b32 v25, s4, 6 0x00002aaebc214a3c <+1084>: v_writelane_b32 v25, s5, 7 0x00002aaebc214a44 <+1092>: v_writelane_b32 v25, s6, 8 0x00002aaebc214a4c <+1100>: v_writelane_b32 v25, s7, 9 0x00002aaebc214a54 <+1108>: v_writelane_b32 v25, s10, 10 0x00002aaebc214a5c <+1116>: v_writelane_b32 v25, s11, 11 0x00002aaebc214a64 <+1124>: v_writelane_b32 v25, s12, 12 0x00002aaebc214a6c <+1132>: v_writelane_b32 v25, s13, 13 0x00002aaebc214a74 <+1140>: s_mov_b64 exec, s[8:9] 0x00002aaebc214a78 <+1144>: s_cbranch_execz 73 # 0x2aaebc214ba0 <_Z11kernel_testPKiS0_Pij+1440> 10 { 11 c[gid] = a[gid] + b[gid]; 0x00002aaebc214a7c <+1148>: buffer_load_dword v0, off, s[0:3], s33 offset:72 0x00002aaebc214a84 <+1156>: buffer_load_dword v1, off, s[0:3], s33 offset:76 0x00002aaebc214a8c <+1164>: s_waitcnt vmcnt(0) 0x00002aaebc214a90 <+1168>: flat_load_dwordx2 v[0:1], v[0:1] 0x00002aaebc214a98 <+1176>: buffer_load_dword v2, off, s[0:3], s33 offset:96 0x00002aaebc214aa0 <+1184>: buffer_load_dword v3, off, s[0:3], s33 offset:100 0x00002aaebc214aa8 <+1192>: s_waitcnt vmcnt(0) 0x00002aaebc214aac <+1196>: flat_load_dword v2, v[2:3] 0x00002aaebc214ab4 <+1204>: s_mov_b32 s4, 0 0x00002aaebc214ab8 <+1208>: v_mov_b32_e32 v3, 0 0x00002aaebc214abc <+1212>: s_waitcnt vmcnt(0) lgkmcnt(0) 0x00002aaebc214ac0 <+1216>: v_mov_b32_e32 v4, v2 0x00002aaebc214ac4 <+1220>: v_mov_b32_e32 v5, v3 0x00002aaebc214ac8 <+1224>: s_mov_b32 s4, 2 0x00002aaebc214acc <+1228>: v_lshlrev_b64 v[2:3], s4, v[4:5] 0x00002aaebc214ad4 <+1236>: v_mov_b32_e32 v4, v0 0x00002aaebc214ad8 <+1240>: v_mov_b32_e32 v5, v2 0x00002aaebc214adc <+1244>: v_mov_b32_e32 v0, v3 0x00002aaebc214ae0 <+1248>: v_add_co_u32_e64 v4, s[4:5], v4, v5 0x00002aaebc214ae8 <+1256>: v_addc_co_u32_e64 v0, s[4:5], v1, v0, s[4:5] 0x00002aaebc214af0 <+1264>: v_mov_b32_e32 v5, v0 0x00002aaebc214af4 <+1268>: flat_load_dword v0, v[4:5] 0x00002aaebc214afc <+1276>: buffer_load_dword v4, off, s[0:3], s33 offset:80 0x00002aaebc214b04 <+1284>: buffer_load_dword v5, off, s[0:3], s33 offset:84 0x00002aaebc214b0c <+1292>: s_waitcnt vmcnt(0) 0x00002aaebc214b10 <+1296>: flat_load_dwordx2 v[4:5], v[4:5] 0x00002aaebc214b18 <+1304>: s_waitcnt vmcnt(0) lgkmcnt(0) 0x00002aaebc214b1c <+1308>: v_mov_b32_e32 v1, v4 0x00002aaebc214b20 <+1312>: v_mov_b32_e32 v6, v2 0x00002aaebc214b24 <+1316>: v_mov_b32_e32 v4, v3 0x00002aaebc214b28 <+1320>: v_add_co_u32_e64 v1, s[4:5], v1, v6 0x00002aaebc214b30 <+1328>: v_addc_co_u32_e64 v4, s[4:5], v5, v4, s[4:5] 0x00002aaebc214b38 <+1336>: v_mov_b32_e32 v5, v1 0x00002aaebc214b3c <+1340>: v_mov_b32_e32 v6, v4 0x00002aaebc214b40 <+1344>: flat_load_dword v1, v[5:6] 0x00002aaebc214b48 <+1352>: s_waitcnt vmcnt(0) lgkmcnt(0) 0x00002aaebc214b4c <+1356>: v_add_u32_e64 v0, v0, v1 0x00002aaebc214b54 <+1364>: buffer_load_dword v4, off, s[0:3], s33 offset:88 0x00002aaebc214b5c <+1372>: buffer_load_dword v5, off, s[0:3], s33 offset:92 0x00002aaebc214b64 <+1380>: s_waitcnt vmcnt(0) 0x00002aaebc214b68 <+1384>: flat_load_dwordx2 v[4:5], v[4:5] 0x00002aaebc214b70 <+1392>: s_waitcnt vmcnt(0) lgkmcnt(0) 0x00002aaebc214b74 <+1396>: v_mov_b32_e32 v1, v4 0x00002aaebc214b78 <+1400>: v_mov_b32_e32 v6, v2 0x00002aaebc214b7c <+1404>: v_add_co_u32_e64 v1, s[4:5], v1, v6 0x00002aaebc214b84 <+1412>: v_addc_co_u32_e64 v2, s[4:5], v5, v3, s[4:5] 0x00002aaebc214b8c <+1420>: v_mov_b32_e32 v3, v1 0x00002aaebc214b90 <+1424>: v_mov_b32_e32 v4, v2 0x00002aaebc214b94 <+1428>: flat_store_dword v[3:4], v0 0x00002aaebc214b9c <+1436>: s_branch 33 # 0x2aaebc214c24 <_Z11kernel_testPKiS0_Pij+1572> 0x00002aaebc214ba0 <+1440>: v_readlane_b32 s4, v25, 12 0x00002aaebc214ba8 <+1448>: v_readlane_b32 s5, v25, 13 0x00002aaebc214bb0 <+1456>: s_or_b64 exec, exec, s[4:5] 0x00002aaebc214bb4 <+1460>: v_readlane_b32 s6, v25, 10 0x00002aaebc214bbc <+1468>: v_readlane_b32 s7, v25, 11 0x00002aaebc214bc4 <+1476>: s_mov_b64 s[8:9], s[6:7] 0x00002aaebc214bc8 <+1480>: s_and_b64 s[8:9], exec, s[8:9] 0x00002aaebc214bcc <+1484>: v_readlane_b32 s10, v25, 6 0x00002aaebc214bd4 <+1492>: v_readlane_b32 s11, v25, 7 0x00002aaebc214bdc <+1500>: s_or_b64 s[8:9], s[8:9], s[10:11] 0x00002aaebc214be0 <+1504>: s_mov_b64 s[12:13], s[8:9] 0x00002aaebc214be4 <+1508>: s_mov_b64 s[14:15], s[8:9] 0x00002aaebc214be8 <+1512>: v_writelane_b32 v25, s6, 2 0x00002aaebc214bf0 <+1520>: v_writelane_b32 v25, s7, 3 0x00002aaebc214bf8 <+1528>: v_writelane_b32 v25, s12, 4 0x00002aaebc214c00 <+1536>: v_writelane_b32 v25, s13, 5 0x00002aaebc214c08 <+1544>: v_writelane_b32 v25, s14, 14 0x00002aaebc214c10 <+1552>: v_writelane_b32 v25, s15, 15 0x00002aaebc214c18 <+1560>: s_andn2_b64 exec, exec, s[8:9] 0x00002aaebc214c1c <+1564>: s_cbranch_execnz 65389 # 0x2aaebc2149d4 <_Z11kernel_testPKiS0_Pij+980> 0x00002aaebc214c20 <+1568>: s_branch 29 # 0x2aaebc214c98 <_Z11kernel_testPKiS0_Pij+1688> 9 for (int idx = 0; idx < 10; ++idx) 0x00002aaebc214c24 <+1572>: buffer_load_dword v0, off, s[0:3], s33 offset:104 0x00002aaebc214c2c <+1580>: buffer_load_dword v1, off, s[0:3], s33 offset:108 0x00002aaebc214c34 <+1588>: s_waitcnt vmcnt(0) 0x00002aaebc214c38 <+1592>: flat_load_dword v0, v[0:1] 0x00002aaebc214c40 <+1600>: s_mov_b32 s4, 1 0x00002aaebc214c44 <+1604>: s_waitcnt vmcnt(0) lgkmcnt(0) 0x00002aaebc214c48 <+1608>: v_add_u32_e64 v0, v0, s4 0x00002aaebc214c50 <+1616>: buffer_load_dword v1, off, s[0:3], s33 offset:104 0x00002aaebc214c58 <+1624>: buffer_load_dword v2, off, s[0:3], s33 offset:108 0x00002aaebc214c60 <+1632>: s_waitcnt vmcnt(0) 0x00002aaebc214c64 <+1636>: flat_store_dword v[1:2], v0 0x00002aaebc214c6c <+1644>: s_mov_b64 s[4:5], 0 0x00002aaebc214c70 <+1648>: v_readlane_b32 s4, v25, 8 0x00002aaebc214c78 <+1656>: v_readlane_b32 s5, v25, 9 0x00002aaebc214c80 <+1664>: s_andn2_b64 s[6:7], s[4:5], exec 0x00002aaebc214c84 <+1668>: v_writelane_b32 v25, s6, 10 0x00002aaebc214c8c <+1676>: v_writelane_b32 v25, s7, 11 0x00002aaebc214c94 <+1684>: s_branch 65474 # 0x2aaebc214ba0 <_Z11kernel_testPKiS0_Pij+1440> 0x00002aaebc214c98 <+1688>: v_readlane_b32 s4, v25, 14 0x00002aaebc214ca0 <+1696>: v_readlane_b32 s5, v25, 15 0x00002aaebc214ca8 <+1704>: s_or_b64 exec, exec, s[4:5] 12 } 13 } 0x00002aaebc214cac <+1708>: s_endpgm
2.使用参数-O1等级编译,指令行数为23行,for循环根据条件进行跳转。
5 __global__ void kernel_test(const int *a, const int *b, int *c, unsigned n) 0x00002aaebc20d000 <+0>: s_load_dwordx4 s[0:3], s[4:5], 0x0 0x00002aaebc20d008 <+8>: s_load_dwordx4 s[4:7], s[4:5], 0x10 0x00002aaebc20d010 <+16>: v_lshlrev_b32_e32 v4, 2, v0 0x00002aaebc20d014 <+20>: s_waitcnt lgkmcnt(0) 0x00002aaebc20d018 <+24>: v_mov_b32_e32 v1, s1 0x00002aaebc20d01c <+28>: v_add_co_u32_e32 v0, vcc, s0, v4 0x00002aaebc20d020 <+32>: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 0x00002aaebc20d024 <+36>: v_mov_b32_e32 v3, s3 0x00002aaebc20d028 <+40>: v_add_co_u32_e32 v2, vcc, s2, v4 0x00002aaebc20d02c <+44>: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 0x00002aaebc20d030 <+48>: v_mov_b32_e32 v5, s5 0x00002aaebc20d034 <+52>: v_add_co_u32_e32 v4, vcc, s4, v4 0x00002aaebc20d038 <+56>: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 0x00002aaebc20d03c <+60>: s_mov_b32 s0, 10 10 { 11 c[gid] = a[gid] + b[gid]; 0x00002aaebc20d040 <+64>: global_load_dword v6, v[0:1], off 0x00002aaebc20d048 <+72>: global_load_dword v7, v[2:3], off 9 for (int idx = 0; idx < 10; ++idx) => 0x00002aaebc20d050 <+80>: s_add_i32 s0, s0, -1 10 { 11 c[gid] = a[gid] + b[gid]; 0x00002aaebc20d054 <+84>: s_waitcnt vmcnt(0) 0x00002aaebc20d058 <+88>: v_add_u32_e32 v6, v7, v6 9 for (int idx = 0; idx < 10; ++idx) 0x00002aaebc20d05c <+92>: s_cmp_eq_u32 s0, 0 10 { 11 c[gid] = a[gid] + b[gid]; 0x00002aaebc20d060 <+96>: global_store_dword v[4:5], v6, off 0x00002aaebc20d068 <+104>: s_cbranch_scc0 65525 # 0x2aaebc20d040 <_Z11kernel_testPKiS0_Pij+64> 12 } 13 } 0x00002aaebc20d06c <+108>: s_endpgm End of assembler dump.
3.使用参数-O2或-O3优化编译,指令行数为64行,并且for循环展开为10次下图红框中的操作(加载、等待加载完成、加操作、保存结果)
5 __global__ void kernel_test(const int *a, const int *b, int *c, unsigned n) 0x00002aaebc20d000 <+0>: s_load_dwordx4 s[0:3], s[4:5], 0x0 0x00002aaebc20d008 <+8>: s_load_dwordx4 s[4:7], s[4:5], 0x10 0x00002aaebc20d010 <+16>: v_lshlrev_b32_e32 v4, 2, v0 0x00002aaebc20d014 <+20>: s_waitcnt lgkmcnt(0) 0x00002aaebc20d018 <+24>: v_mov_b32_e32 v1, s1 0x00002aaebc20d01c <+28>: v_add_co_u32_e32 v0, vcc, s0, v4 0x00002aaebc20d020 <+32>: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 0x00002aaebc20d024 <+36>: v_mov_b32_e32 v3, s3 0x00002aaebc20d028 <+40>: v_add_co_u32_e32 v2, vcc, s2, v4 0x00002aaebc20d02c <+44>: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 6 { 7 unsigned gid = hipThreadIdx_x; // <- coordinate index function 8 //#pragma unroll 1 9 for (int idx = 0; idx < 10; ++idx) 10 { 11 c[gid] = a[gid] + b[gid]; => 0x00002aaebc20d030 <+48>: global_load_dword v6, v[0:1], off 0x00002aaebc20d038 <+56>: global_load_dword v7, v[2:3], off 0x00002aaebc20d040 <+64>: v_mov_b32_e32 v5, s5 0x00002aaebc20d044 <+68>: v_add_co_u32_e32 v4, vcc, s4, v4 0x00002aaebc20d048 <+72>: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 0x00002aaebc20d04c <+76>: s_waitcnt vmcnt(0) 0x00002aaebc20d050 <+80>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d054 <+84>: global_store_dword v[4:5], v6, off 0x00002aaebc20d05c <+92>: global_load_dword v6, v[0:1], off 0x00002aaebc20d064 <+100>: global_load_dword v7, v[2:3], off 0x00002aaebc20d06c <+108>: s_waitcnt vmcnt(0) 0x00002aaebc20d070 <+112>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d074 <+116>: global_store_dword v[4:5], v6, off 0x00002aaebc20d07c <+124>: global_load_dword v6, v[0:1], off 0x00002aaebc20d084 <+132>: global_load_dword v7, v[2:3], off 0x00002aaebc20d08c <+140>: s_waitcnt vmcnt(0) 0x00002aaebc20d090 <+144>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d094 <+148>: global_store_dword v[4:5], v6, off 0x00002aaebc20d09c <+156>: global_load_dword v6, v[0:1], off 0x00002aaebc20d0a4 <+164>: global_load_dword v7, v[2:3], off 0x00002aaebc20d0ac <+172>: s_waitcnt vmcnt(0) 0x00002aaebc20d0b0 <+176>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d0b4 <+180>: global_store_dword v[4:5], v6, off 0x00002aaebc20d0bc <+188>: global_load_dword v6, v[0:1], off 0x00002aaebc20d0c4 <+196>: global_load_dword v7, v[2:3], off 0x00002aaebc20d0cc <+204>: s_waitcnt vmcnt(0) 0x00002aaebc20d0d0 <+208>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d0d4 <+212>: global_store_dword v[4:5], v6, off 0x00002aaebc20d0dc <+220>: global_load_dword v6, v[0:1], off 0x00002aaebc20d0e4 <+228>: global_load_dword v7, v[2:3], off 0x00002aaebc20d0ec <+236>: s_waitcnt vmcnt(0) 0x00002aaebc20d0f0 <+240>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d0f4 <+244>: global_store_dword v[4:5], v6, off 0x00002aaebc20d0fc <+252>: global_load_dword v6, v[0:1], off 0x00002aaebc20d104 <+260>: global_load_dword v7, v[2:3], off 0x00002aaebc20d10c <+268>: s_waitcnt vmcnt(0) 0x00002aaebc20d110 <+272>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d114 <+276>: global_store_dword v[4:5], v6, off 0x00002aaebc20d11c <+284>: global_load_dword v6, v[0:1], off 0x00002aaebc20d124 <+292>: global_load_dword v7, v[2:3], off 0x00002aaebc20d12c <+300>: s_waitcnt vmcnt(0) 0x00002aaebc20d130 <+304>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d134 <+308>: global_store_dword v[4:5], v6, off 0x00002aaebc20d13c <+316>: global_load_dword v6, v[0:1], off 0x00002aaebc20d144 <+324>: global_load_dword v7, v[2:3], off 0x00002aaebc20d14c <+332>: s_waitcnt vmcnt(0) 0x00002aaebc20d150 <+336>: v_add_u32_e32 v6, v7, v6 0x00002aaebc20d154 <+340>: global_store_dword v[4:5], v6, off 0x00002aaebc20d15c <+348>: global_load_dword v0, v[0:1], off 0x00002aaebc20d164 <+356>: global_load_dword v1, v[2:3], off 0x00002aaebc20d16c <+364>: s_waitcnt vmcnt(0) 0x00002aaebc20d170 <+368>: v_add_u32_e32 v0, v1, v0 0x00002aaebc20d174 <+372>: global_store_dword v[4:5], v0, off 12 } 13 } 0x00002aaebc20d17c <+380>: s_endpgm
结论:此测试使用的环境版本(dtk-21.04)中hipcc在-O2、-O3的编译优化等级下,会展开固定循环次数的循环;-O1编译等级下会做优化,但不展开for循环,代码体量最小;-O0不做任何优化。