HIPCC编译初探

目前信创产业如火如荼,工作需要开始接触国产海光DCU。DCU与AMD同源,采用了基于GCN的Vega架构设计,后续有机会详细学习相关硬件内容。

现针对DCU适配的DCU Tool Kit环境,对hipcc进行了一次简单的测试,研究其编译器优化的效果。

测试结论

 

测试代码段:

__global__ void kernel_test(const int *a, const int *b, int *c, unsigned n)
{
    unsigned gid = hipThreadIdx_x; // <- coordinate index function
    aaa = 10;
//#pragma unroll 1
    for (int idx = 0; idx < 10; ++idx)
    {
        c[gid] = a[gid] + b[gid];
    }
}

 

测试环境:

 

指令简介:

 

 

 

编译器for循环展开:

1.使用参数-O0,未优化的代码很长(整段函数指令集有291行),仅截取其中一段,可以看到使用s_branch进行跳转循环,其中还调用了s_waitcnt vmcnt(0) lgkmcnt(0),较为耗时。

hip_test.cpp:
6       {
   0x00002aaebc214600 <+0>:     s_mov_b32 s32, 0x1c00
   0x00002aaebc214608 <+8>:     s_mov_b32 s33, 0
   0x00002aaebc21460c <+12>:    s_add_u32 flat_scratch_lo, s6, s9
   0x00002aaebc214610 <+16>:    s_addc_u32 flat_scratch_hi, s7, 0
   0x00002aaebc214614 <+20>:    s_add_u32 s0, s0, s9
   0x00002aaebc214618 <+24>:    s_addc_u32 s1, s1, 0
   0x00002aaebc21461c <+28>:    s_load_dwordx2 s[6:7], s[4:5], 0x0
   0x00002aaebc214624 <+36>:    s_load_dwordx2 s[8:9], s[4:5], 0x8
   0x00002aaebc21462c <+44>:    s_load_dwordx2 s[10:11], s[4:5], 0x10
   0x00002aaebc214634 <+52>:    s_load_dword s4, s[4:5], 0x18
   0x00002aaebc21463c <+60>:    s_mov_b32 s5, -1
   0x00002aaebc214640 <+64>:    v_mov_b32_e32 v3, 8
   0x00002aaebc214644 <+68>:    v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc21464c <+76>:    s_mov_b64 s[14:15], 0
   0x00002aaebc214650 <+80>:    s_mov_b32 s16, s15
   0x00002aaebc214654 <+84>:    s_getreg_b32 s17, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
   0x00002aaebc214658 <+88>:    s_lshl_b32 s17, s17, 16
   0x00002aaebc21465c <+92>:    v_mov_b32_e32 v4, s16
   0x00002aaebc214660 <+96>:    v_mov_b32_e32 v5, s17
   0x00002aaebc214664 <+100>:   v_cndmask_b32_e64 v4, v4, v5, s[12:13]
   0x00002aaebc21466c <+108>:   s_mov_b32 s18, s14
   0x00002aaebc214670 <+112>:   v_mov_b32_e32 v5, s18
   0x00002aaebc214674 <+116>:   v_cndmask_b32_e64 v3, v5, v3, s[12:13]
   0x00002aaebc21467c <+124>:   v_mov_b32_e32 v5, v3
   0x00002aaebc214680 <+128>:   v_mov_b32_e32 v6, v4
   0x00002aaebc214684 <+132>:   v_mov_b32_e32 v3, 16
   0x00002aaebc214688 <+136>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc214690 <+144>:   v_mov_b32_e32 v4, s16
   0x00002aaebc214694 <+148>:   v_mov_b32_e32 v7, s17
   0x00002aaebc214698 <+152>:   v_cndmask_b32_e64 v4, v4, v7, s[12:13]
   0x00002aaebc2146a0 <+160>:   v_mov_b32_e32 v7, s18
   0x00002aaebc2146a4 <+164>:   v_cndmask_b32_e64 v3, v7, v3, s[12:13]
   0x00002aaebc2146ac <+172>:   v_mov_b32_e32 v7, v3
   0x00002aaebc2146b0 <+176>:   v_mov_b32_e32 v8, v4
   0x00002aaebc2146b4 <+180>:   v_mov_b32_e32 v3, 24
   0x00002aaebc2146b8 <+184>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc2146c0 <+192>:   v_mov_b32_e32 v4, s16
   0x00002aaebc2146c4 <+196>:   v_mov_b32_e32 v9, s17
   0x00002aaebc2146c8 <+200>:   v_cndmask_b32_e64 v4, v4, v9, s[12:13]
   0x00002aaebc2146d0 <+208>:   v_mov_b32_e32 v9, s18
   0x00002aaebc2146d4 <+212>:   v_cndmask_b32_e64 v3, v9, v3, s[12:13]
   0x00002aaebc2146dc <+220>:   v_mov_b32_e32 v9, v3
   0x00002aaebc2146e0 <+224>:   v_mov_b32_e32 v10, v4
   0x00002aaebc2146e4 <+228>:   v_mov_b32_e32 v3, 32
   0x00002aaebc2146e8 <+232>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc2146f0 <+240>:   v_mov_b32_e32 v4, s16
   0x00002aaebc2146f4 <+244>:   v_mov_b32_e32 v11, s17
   0x00002aaebc2146f8 <+248>:   v_cndmask_b32_e64 v4, v4, v11, s[12:13]
   0x00002aaebc214700 <+256>:   v_mov_b32_e32 v11, s18
   0x00002aaebc214704 <+260>:   v_cndmask_b32_e64 v3, v11, v3, s[12:13]
   0x00002aaebc21470c <+268>:   v_mov_b32_e32 v11, v3
   0x00002aaebc214710 <+272>:   v_mov_b32_e32 v12, v4
   0x00002aaebc214714 <+276>:   v_mov_b32_e32 v3, 40
   0x00002aaebc214718 <+280>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc214720 <+288>:   v_mov_b32_e32 v4, s16
   0x00002aaebc214724 <+292>:   v_mov_b32_e32 v13, s17
   0x00002aaebc214728 <+296>:   v_cndmask_b32_e64 v4, v4, v13, s[12:13]
   0x00002aaebc214730 <+304>:   v_mov_b32_e32 v13, s18
   0x00002aaebc214734 <+308>:   v_cndmask_b32_e64 v3, v13, v3, s[12:13]
   0x00002aaebc21473c <+316>:   v_mov_b32_e32 v13, v3
   0x00002aaebc214740 <+320>:   v_mov_b32_e32 v14, v4
   0x00002aaebc214744 <+324>:   v_mov_b32_e32 v3, 48
   0x00002aaebc214748 <+328>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc214750 <+336>:   v_mov_b32_e32 v4, s16
   0x00002aaebc214754 <+340>:   v_mov_b32_e32 v15, s17
   0x00002aaebc214758 <+344>:   v_cndmask_b32_e64 v4, v4, v15, s[12:13]
   0x00002aaebc214760 <+352>:   v_mov_b32_e32 v15, s18
   0x00002aaebc214764 <+356>:   v_cndmask_b32_e64 v3, v15, v3, s[12:13]
   0x00002aaebc21476c <+364>:   v_mov_b32_e32 v15, v3
   0x00002aaebc214770 <+368>:   v_mov_b32_e32 v16, v4
   0x00002aaebc214774 <+372>:   v_mov_b32_e32 v3, 56
   0x00002aaebc214778 <+376>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc214780 <+384>:   v_mov_b32_e32 v4, s16
   0x00002aaebc214784 <+388>:   v_mov_b32_e32 v17, s17
   0x00002aaebc214788 <+392>:   v_cndmask_b32_e64 v4, v4, v17, s[12:13]
   0x00002aaebc214790 <+400>:   v_mov_b32_e32 v17, s18
   0x00002aaebc214794 <+404>:   v_cndmask_b32_e64 v3, v17, v3, s[12:13]
   0x00002aaebc21479c <+412>:   v_mov_b32_e32 v17, v3
   0x00002aaebc2147a0 <+416>:   v_mov_b32_e32 v18, v4
   0x00002aaebc2147a4 <+420>:   v_mov_b32_e32 v3, 60
   0x00002aaebc2147a8 <+424>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc2147b0 <+432>:   v_mov_b32_e32 v4, s16
   0x00002aaebc2147b4 <+436>:   v_mov_b32_e32 v19, s17
   0x00002aaebc2147b8 <+440>:   v_cndmask_b32_e64 v4, v4, v19, s[12:13]
   0x00002aaebc2147c0 <+448>:   v_mov_b32_e32 v19, s18
   0x00002aaebc2147c4 <+452>:   v_cndmask_b32_e64 v3, v19, v3, s[12:13]
   0x00002aaebc2147cc <+460>:   v_mov_b32_e32 v19, v3
   0x00002aaebc2147d0 <+464>:   v_mov_b32_e32 v20, v4
   0x00002aaebc2147d4 <+468>:   v_mov_b32_e32 v3, 64
   0x00002aaebc2147d8 <+472>:   v_cmp_ne_u32_e64 s[12:13], v3, s5
   0x00002aaebc2147e0 <+480>:   v_mov_b32_e32 v4, s16
   0x00002aaebc2147e4 <+484>:   v_mov_b32_e32 v21, s17
   0x00002aaebc2147e8 <+488>:   v_cndmask_b32_e64 v4, v4, v21, s[12:13]
   0x00002aaebc2147f0 <+496>:   v_mov_b32_e32 v21, s18
   0x00002aaebc2147f4 <+500>:   v_cndmask_b32_e64 v3, v21, v3, s[12:13]
   0x00002aaebc2147fc <+508>:   v_mov_b32_e32 v21, v3
   0x00002aaebc214800 <+512>:   v_mov_b32_e32 v22, v4
   0x00002aaebc214804 <+516>:   v_mov_b32_e32 v3, v5
   0x00002aaebc214808 <+520>:   v_mov_b32_e32 v4, v6
   0x00002aaebc21480c <+524>:   s_waitcnt lgkmcnt(0)
   0x00002aaebc214810 <+528>:   v_mov_b32_e32 v24, s7
   0x00002aaebc214814 <+532>:   v_mov_b32_e32 v23, s6
   0x00002aaebc214818 <+536>:   flat_store_dwordx2 v[3:4], v[23:24]
   0x00002aaebc214820 <+544>:   flat_load_dwordx2 v[3:4], v[5:6]
   0x00002aaebc214828 <+552>:   v_mov_b32_e32 v5, v7
   0x00002aaebc21482c <+556>:   v_mov_b32_e32 v6, v8
   0x00002aaebc214830 <+560>:   v_mov_b32_e32 v24, s9
   0x00002aaebc214834 <+564>:   v_mov_b32_e32 v23, s8
   0x00002aaebc214838 <+568>:   flat_store_dwordx2 v[5:6], v[23:24]
   0x00002aaebc214840 <+576>:   flat_load_dwordx2 v[5:6], v[7:8]
   0x00002aaebc214848 <+584>:   v_mov_b32_e32 v7, v9
   0x00002aaebc21484c <+588>:   v_mov_b32_e32 v8, v10
   0x00002aaebc214850 <+592>:   v_mov_b32_e32 v24, s11
   0x00002aaebc214854 <+596>:   v_mov_b32_e32 v23, s10
   0x00002aaebc214858 <+600>:   flat_store_dwordx2 v[7:8], v[23:24]
   0x00002aaebc214860 <+608>:   flat_load_dwordx2 v[7:8], v[9:10]
   0x00002aaebc214868 <+616>:   v_mov_b32_e32 v9, v11
   0x00002aaebc21486c <+620>:   v_mov_b32_e32 v10, v12
   0x00002aaebc214870 <+624>:   s_waitcnt vmcnt(0) lgkmcnt(0)
   0x00002aaebc214874 <+628>:   flat_store_dwordx2 v[9:10], v[3:4]
   0x00002aaebc21487c <+636>:   v_mov_b32_e32 v3, v13
   0x00002aaebc214880 <+640>:   v_mov_b32_e32 v4, v14
   0x00002aaebc214884 <+644>:   flat_store_dwordx2 v[3:4], v[5:6]
   0x00002aaebc21488c <+652>:   v_mov_b32_e32 v3, v15
   0x00002aaebc214890 <+656>:   v_mov_b32_e32 v4, v16
   0x00002aaebc214894 <+660>:   flat_store_dwordx2 v[3:4], v[7:8]
   0x00002aaebc21489c <+668>:   v_mov_b32_e32 v3, s4
   0x00002aaebc2148a0 <+672>:   flat_store_dword v[17:18], v3

7           unsigned gid = hipThreadIdx_x; // <- coordinate index function
=> 0x00002aaebc2148a8 <+680>:   s_getpc_b64 s[4:5]
   0x00002aaebc2148ac <+684>:   s_add_u32 s4, s4, 0xfffff794
   0x00002aaebc2148b4 <+692>:   s_addc_u32 s5, s5, -1
   0x00002aaebc2148bc <+700>:   s_mov_b64 s[10:11], s[2:3]
   0x00002aaebc2148c0 <+704>:   s_mov_b64 s[8:9], s[0:1]
   0x00002aaebc2148c4 <+708>:   s_mov_b32 s6, 20
   0x00002aaebc2148c8 <+712>:   v_lshlrev_b32_e64 v2, s6, v2
   0x00002aaebc2148d0 <+720>:   s_mov_b32 s6, 10
   0x00002aaebc2148d4 <+724>:   v_lshlrev_b32_e64 v1, s6, v1
   0x00002aaebc2148dc <+732>:   v_or3_b32 v1, v0, v1, v2
   0x00002aaebc2148e4 <+740>:   v_mov_b32_e32 v0, 0
   0x00002aaebc2148e8 <+744>:   s_mov_b64 s[0:1], s[8:9]
   0x00002aaebc2148ec <+748>:   s_mov_b64 s[2:3], s[10:11]
   0x00002aaebc2148f0 <+752>:   buffer_store_dword v0, off, s[0:3], s33 offset:68
   0x00002aaebc2148f8 <+760>:   v_writelane_b32 v25, s14, 0
   0x00002aaebc214900 <+768>:   v_writelane_b32 v25, s15, 1
   0x00002aaebc214908 <+776>:   buffer_store_dword v11, off, s[0:3], s33 offset:72
   0x00002aaebc214910 <+784>:   buffer_store_dword v12, off, s[0:3], s33 offset:76
   0x00002aaebc214918 <+792>:   buffer_store_dword v13, off, s[0:3], s33 offset:80
   0x00002aaebc214920 <+800>:   buffer_store_dword v14, off, s[0:3], s33 offset:84
   0x00002aaebc214928 <+808>:   buffer_store_dword v15, off, s[0:3], s33 offset:88
   0x00002aaebc214930 <+816>:   buffer_store_dword v16, off, s[0:3], s33 offset:92
   0x00002aaebc214938 <+824>:   buffer_store_dword v19, off, s[0:3], s33 offset:96
   0x00002aaebc214940 <+832>:   buffer_store_dword v20, off, s[0:3], s33 offset:100
   0x00002aaebc214948 <+840>:   buffer_store_dword v21, off, s[0:3], s33 offset:104
   0x00002aaebc214950 <+848>:   buffer_store_dword v22, off, s[0:3], s33 offset:108
   0x00002aaebc214958 <+856>:   s_swappc_b64 s[30:31], s[4:5]
   0x00002aaebc21495c <+860>:   v_mov_b32_e32 v2, v0
   0x00002aaebc214960 <+864>:   v_mov_b32_e32 v3, v1
   0x00002aaebc214964 <+868>:   buffer_load_dword v0, off, s[0:3], s33 offset:96
   0x00002aaebc21496c <+876>:   buffer_load_dword v1, off, s[0:3], s33 offset:100
   0x00002aaebc214974 <+884>:   s_waitcnt vmcnt(0)
   0x00002aaebc214978 <+888>:   flat_store_dword v[0:1], v2
   0x00002aaebc214980 <+896>:   buffer_load_dword v0, off, s[0:3], s33 offset:104
   0x00002aaebc214988 <+904>:   buffer_load_dword v1, off, s[0:3], s33 offset:108

8       //#pragma unroll 1
9           for (int idx = 0; idx < 10; ++idx)
   0x00002aaebc214990 <+912>:   buffer_load_dword v2, off, s[0:3], s33 offset:68
   0x00002aaebc214998 <+920>:   s_waitcnt vmcnt(0)
   0x00002aaebc21499c <+924>:   flat_store_dword v[0:1], v2
   0x00002aaebc2149a4 <+932>:   v_readlane_b32 s6, v25, 0
   0x00002aaebc2149ac <+940>:   v_readlane_b32 s7, v25, 1
   0x00002aaebc2149b4 <+948>:   v_writelane_b32 v25, s4, 2
   0x00002aaebc2149bc <+956>:   v_writelane_b32 v25, s5, 3
   0x00002aaebc2149c4 <+964>:   v_writelane_b32 v25, s6, 4
   0x00002aaebc2149cc <+972>:   v_writelane_b32 v25, s7, 5
   0x00002aaebc2149d4 <+980>:   v_readlane_b32 s4, v25, 4
   0x00002aaebc2149dc <+988>:   v_readlane_b32 s5, v25, 5
   0x00002aaebc2149e4 <+996>:   v_readlane_b32 s6, v25, 2
   0x00002aaebc2149ec <+1004>:  v_readlane_b32 s7, v25, 3
   0x00002aaebc2149f4 <+1012>:  buffer_load_dword v0, off, s[0:3], s33 offset:104
   0x00002aaebc2149fc <+1020>:  buffer_load_dword v1, off, s[0:3], s33 offset:108
   0x00002aaebc214a04 <+1028>:  s_waitcnt vmcnt(0)
   0x00002aaebc214a08 <+1032>:  flat_load_dword v0, v[0:1]
   0x00002aaebc214a10 <+1040>:  s_mov_b32 s8, 10
   0x00002aaebc214a14 <+1044>:  s_waitcnt vmcnt(0) lgkmcnt(0)
   0x00002aaebc214a18 <+1048>:  v_cmp_lt_i32_e64 s[8:9], v0, s8
   0x00002aaebc214a20 <+1056>:  s_mov_b64 s[10:11], -1
   0x00002aaebc214a24 <+1060>:  s_or_b64 s[6:7], s[6:7], exec
   0x00002aaebc214a28 <+1064>:  s_mov_b64 s[10:11], s[6:7]
   0x00002aaebc214a2c <+1068>:  s_mov_b64 s[12:13], exec
   0x00002aaebc214a30 <+1072>:  s_and_b64 s[8:9], s[12:13], s[8:9]
   0x00002aaebc214a34 <+1076>:  v_writelane_b32 v25, s4, 6
   0x00002aaebc214a3c <+1084>:  v_writelane_b32 v25, s5, 7
   0x00002aaebc214a44 <+1092>:  v_writelane_b32 v25, s6, 8
   0x00002aaebc214a4c <+1100>:  v_writelane_b32 v25, s7, 9
   0x00002aaebc214a54 <+1108>:  v_writelane_b32 v25, s10, 10
   0x00002aaebc214a5c <+1116>:  v_writelane_b32 v25, s11, 11
   0x00002aaebc214a64 <+1124>:  v_writelane_b32 v25, s12, 12
   0x00002aaebc214a6c <+1132>:  v_writelane_b32 v25, s13, 13
   0x00002aaebc214a74 <+1140>:  s_mov_b64 exec, s[8:9]
   0x00002aaebc214a78 <+1144>:  s_cbranch_execz 73  # 0x2aaebc214ba0 <_Z11kernel_testPKiS0_Pij+1440>

10          {
11              c[gid] = a[gid] + b[gid];
   0x00002aaebc214a7c <+1148>:  buffer_load_dword v0, off, s[0:3], s33 offset:72
   0x00002aaebc214a84 <+1156>:  buffer_load_dword v1, off, s[0:3], s33 offset:76
   0x00002aaebc214a8c <+1164>:  s_waitcnt vmcnt(0)
   0x00002aaebc214a90 <+1168>:  flat_load_dwordx2 v[0:1], v[0:1]
   0x00002aaebc214a98 <+1176>:  buffer_load_dword v2, off, s[0:3], s33 offset:96
   0x00002aaebc214aa0 <+1184>:  buffer_load_dword v3, off, s[0:3], s33 offset:100
   0x00002aaebc214aa8 <+1192>:  s_waitcnt vmcnt(0)
   0x00002aaebc214aac <+1196>:  flat_load_dword v2, v[2:3]
   0x00002aaebc214ab4 <+1204>:  s_mov_b32 s4, 0
   0x00002aaebc214ab8 <+1208>:  v_mov_b32_e32 v3, 0
   0x00002aaebc214abc <+1212>:  s_waitcnt vmcnt(0) lgkmcnt(0)
   0x00002aaebc214ac0 <+1216>:  v_mov_b32_e32 v4, v2
   0x00002aaebc214ac4 <+1220>:  v_mov_b32_e32 v5, v3
   0x00002aaebc214ac8 <+1224>:  s_mov_b32 s4, 2
   0x00002aaebc214acc <+1228>:  v_lshlrev_b64 v[2:3], s4, v[4:5]
   0x00002aaebc214ad4 <+1236>:  v_mov_b32_e32 v4, v0
   0x00002aaebc214ad8 <+1240>:  v_mov_b32_e32 v5, v2
   0x00002aaebc214adc <+1244>:  v_mov_b32_e32 v0, v3
   0x00002aaebc214ae0 <+1248>:  v_add_co_u32_e64 v4, s[4:5], v4, v5
   0x00002aaebc214ae8 <+1256>:  v_addc_co_u32_e64 v0, s[4:5], v1, v0, s[4:5]
   0x00002aaebc214af0 <+1264>:  v_mov_b32_e32 v5, v0
   0x00002aaebc214af4 <+1268>:  flat_load_dword v0, v[4:5]
   0x00002aaebc214afc <+1276>:  buffer_load_dword v4, off, s[0:3], s33 offset:80
   0x00002aaebc214b04 <+1284>:  buffer_load_dword v5, off, s[0:3], s33 offset:84
   0x00002aaebc214b0c <+1292>:  s_waitcnt vmcnt(0)
   0x00002aaebc214b10 <+1296>:  flat_load_dwordx2 v[4:5], v[4:5]
   0x00002aaebc214b18 <+1304>:  s_waitcnt vmcnt(0) lgkmcnt(0)
   0x00002aaebc214b1c <+1308>:  v_mov_b32_e32 v1, v4
   0x00002aaebc214b20 <+1312>:  v_mov_b32_e32 v6, v2
   0x00002aaebc214b24 <+1316>:  v_mov_b32_e32 v4, v3
   0x00002aaebc214b28 <+1320>:  v_add_co_u32_e64 v1, s[4:5], v1, v6
   0x00002aaebc214b30 <+1328>:  v_addc_co_u32_e64 v4, s[4:5], v5, v4, s[4:5]
   0x00002aaebc214b38 <+1336>:  v_mov_b32_e32 v5, v1
   0x00002aaebc214b3c <+1340>:  v_mov_b32_e32 v6, v4
   0x00002aaebc214b40 <+1344>:  flat_load_dword v1, v[5:6]
   0x00002aaebc214b48 <+1352>:  s_waitcnt vmcnt(0) lgkmcnt(0)
   0x00002aaebc214b4c <+1356>:  v_add_u32_e64 v0, v0, v1
   0x00002aaebc214b54 <+1364>:  buffer_load_dword v4, off, s[0:3], s33 offset:88
   0x00002aaebc214b5c <+1372>:  buffer_load_dword v5, off, s[0:3], s33 offset:92
   0x00002aaebc214b64 <+1380>:  s_waitcnt vmcnt(0)
   0x00002aaebc214b68 <+1384>:  flat_load_dwordx2 v[4:5], v[4:5]
   0x00002aaebc214b70 <+1392>:  s_waitcnt vmcnt(0) lgkmcnt(0)
   0x00002aaebc214b74 <+1396>:  v_mov_b32_e32 v1, v4
   0x00002aaebc214b78 <+1400>:  v_mov_b32_e32 v6, v2
   0x00002aaebc214b7c <+1404>:  v_add_co_u32_e64 v1, s[4:5], v1, v6
   0x00002aaebc214b84 <+1412>:  v_addc_co_u32_e64 v2, s[4:5], v5, v3, s[4:5]
   0x00002aaebc214b8c <+1420>:  v_mov_b32_e32 v3, v1
   0x00002aaebc214b90 <+1424>:  v_mov_b32_e32 v4, v2
   0x00002aaebc214b94 <+1428>:  flat_store_dword v[3:4], v0
   0x00002aaebc214b9c <+1436>:  s_branch 33  # 0x2aaebc214c24 <_Z11kernel_testPKiS0_Pij+1572>
   0x00002aaebc214ba0 <+1440>:  v_readlane_b32 s4, v25, 12
   0x00002aaebc214ba8 <+1448>:  v_readlane_b32 s5, v25, 13
   0x00002aaebc214bb0 <+1456>:  s_or_b64 exec, exec, s[4:5]
   0x00002aaebc214bb4 <+1460>:  v_readlane_b32 s6, v25, 10
   0x00002aaebc214bbc <+1468>:  v_readlane_b32 s7, v25, 11
   0x00002aaebc214bc4 <+1476>:  s_mov_b64 s[8:9], s[6:7]
   0x00002aaebc214bc8 <+1480>:  s_and_b64 s[8:9], exec, s[8:9]
   0x00002aaebc214bcc <+1484>:  v_readlane_b32 s10, v25, 6
   0x00002aaebc214bd4 <+1492>:  v_readlane_b32 s11, v25, 7
   0x00002aaebc214bdc <+1500>:  s_or_b64 s[8:9], s[8:9], s[10:11]
   0x00002aaebc214be0 <+1504>:  s_mov_b64 s[12:13], s[8:9]
   0x00002aaebc214be4 <+1508>:  s_mov_b64 s[14:15], s[8:9]
   0x00002aaebc214be8 <+1512>:  v_writelane_b32 v25, s6, 2
   0x00002aaebc214bf0 <+1520>:  v_writelane_b32 v25, s7, 3
   0x00002aaebc214bf8 <+1528>:  v_writelane_b32 v25, s12, 4
   0x00002aaebc214c00 <+1536>:  v_writelane_b32 v25, s13, 5
   0x00002aaebc214c08 <+1544>:  v_writelane_b32 v25, s14, 14
   0x00002aaebc214c10 <+1552>:  v_writelane_b32 v25, s15, 15
   0x00002aaebc214c18 <+1560>:  s_andn2_b64 exec, exec, s[8:9]
   0x00002aaebc214c1c <+1564>:  s_cbranch_execnz 65389  # 0x2aaebc2149d4 <_Z11kernel_testPKiS0_Pij+980>
   0x00002aaebc214c20 <+1568>:  s_branch 29  # 0x2aaebc214c98 <_Z11kernel_testPKiS0_Pij+1688>

9           for (int idx = 0; idx < 10; ++idx)
   0x00002aaebc214c24 <+1572>:  buffer_load_dword v0, off, s[0:3], s33 offset:104
   0x00002aaebc214c2c <+1580>:  buffer_load_dword v1, off, s[0:3], s33 offset:108
   0x00002aaebc214c34 <+1588>:  s_waitcnt vmcnt(0)
   0x00002aaebc214c38 <+1592>:  flat_load_dword v0, v[0:1]
   0x00002aaebc214c40 <+1600>:  s_mov_b32 s4, 1
   0x00002aaebc214c44 <+1604>:  s_waitcnt vmcnt(0) lgkmcnt(0)
   0x00002aaebc214c48 <+1608>:  v_add_u32_e64 v0, v0, s4
   0x00002aaebc214c50 <+1616>:  buffer_load_dword v1, off, s[0:3], s33 offset:104
   0x00002aaebc214c58 <+1624>:  buffer_load_dword v2, off, s[0:3], s33 offset:108
   0x00002aaebc214c60 <+1632>:  s_waitcnt vmcnt(0)
   0x00002aaebc214c64 <+1636>:  flat_store_dword v[1:2], v0
   0x00002aaebc214c6c <+1644>:  s_mov_b64 s[4:5], 0
   0x00002aaebc214c70 <+1648>:  v_readlane_b32 s4, v25, 8
   0x00002aaebc214c78 <+1656>:  v_readlane_b32 s5, v25, 9
   0x00002aaebc214c80 <+1664>:  s_andn2_b64 s[6:7], s[4:5], exec
   0x00002aaebc214c84 <+1668>:  v_writelane_b32 v25, s6, 10
   0x00002aaebc214c8c <+1676>:  v_writelane_b32 v25, s7, 11
   0x00002aaebc214c94 <+1684>:  s_branch 65474  # 0x2aaebc214ba0 <_Z11kernel_testPKiS0_Pij+1440>
   0x00002aaebc214c98 <+1688>:  v_readlane_b32 s4, v25, 14
   0x00002aaebc214ca0 <+1696>:  v_readlane_b32 s5, v25, 15
   0x00002aaebc214ca8 <+1704>:  s_or_b64 exec, exec, s[4:5]

12          }
13      }
   0x00002aaebc214cac <+1708>:  s_endpgm

 

2.使用参数-O1等级编译,指令行数为23行,for循环根据条件进行跳转。

5       __global__ void kernel_test(const int *a, const int *b, int *c, unsigned n)
   0x00002aaebc20d000 <+0>:     s_load_dwordx4 s[0:3], s[4:5], 0x0
   0x00002aaebc20d008 <+8>:     s_load_dwordx4 s[4:7], s[4:5], 0x10
   0x00002aaebc20d010 <+16>:    v_lshlrev_b32_e32 v4, 2, v0
   0x00002aaebc20d014 <+20>:    s_waitcnt lgkmcnt(0)
   0x00002aaebc20d018 <+24>:    v_mov_b32_e32 v1, s1
   0x00002aaebc20d01c <+28>:    v_add_co_u32_e32 v0, vcc, s0, v4
   0x00002aaebc20d020 <+32>:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
   0x00002aaebc20d024 <+36>:    v_mov_b32_e32 v3, s3
   0x00002aaebc20d028 <+40>:    v_add_co_u32_e32 v2, vcc, s2, v4
   0x00002aaebc20d02c <+44>:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
   0x00002aaebc20d030 <+48>:    v_mov_b32_e32 v5, s5
   0x00002aaebc20d034 <+52>:    v_add_co_u32_e32 v4, vcc, s4, v4
   0x00002aaebc20d038 <+56>:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
   0x00002aaebc20d03c <+60>:    s_mov_b32 s0, 10

10          {
11              c[gid] = a[gid] + b[gid];
   0x00002aaebc20d040 <+64>:    global_load_dword v6, v[0:1], off
   0x00002aaebc20d048 <+72>:    global_load_dword v7, v[2:3], off

9           for (int idx = 0; idx < 10; ++idx)
=> 0x00002aaebc20d050 <+80>:    s_add_i32 s0, s0, -1

10          {
11              c[gid] = a[gid] + b[gid];
   0x00002aaebc20d054 <+84>:    s_waitcnt vmcnt(0)
   0x00002aaebc20d058 <+88>:    v_add_u32_e32 v6, v7, v6

9           for (int idx = 0; idx < 10; ++idx)
   0x00002aaebc20d05c <+92>:    s_cmp_eq_u32 s0, 0

10          {
11              c[gid] = a[gid] + b[gid];
   0x00002aaebc20d060 <+96>:    global_store_dword v[4:5], v6, off
   0x00002aaebc20d068 <+104>:   s_cbranch_scc0 65525  # 0x2aaebc20d040 <_Z11kernel_testPKiS0_Pij+64>

12          }
13      }
   0x00002aaebc20d06c <+108>:   s_endpgm
End of assembler dump.

 

3.使用参数-O2或-O3优化编译,指令行数为64行,并且for循环展开为10次下图红框中的操作(加载、等待加载完成、加操作、保存结果)

5       __global__ void kernel_test(const int *a, const int *b, int *c, unsigned n)
   0x00002aaebc20d000 <+0>:     s_load_dwordx4 s[0:3], s[4:5], 0x0
   0x00002aaebc20d008 <+8>:     s_load_dwordx4 s[4:7], s[4:5], 0x10
   0x00002aaebc20d010 <+16>:    v_lshlrev_b32_e32 v4, 2, v0
   0x00002aaebc20d014 <+20>:    s_waitcnt lgkmcnt(0)
   0x00002aaebc20d018 <+24>:    v_mov_b32_e32 v1, s1
   0x00002aaebc20d01c <+28>:    v_add_co_u32_e32 v0, vcc, s0, v4
   0x00002aaebc20d020 <+32>:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
   0x00002aaebc20d024 <+36>:    v_mov_b32_e32 v3, s3
   0x00002aaebc20d028 <+40>:    v_add_co_u32_e32 v2, vcc, s2, v4
   0x00002aaebc20d02c <+44>:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc

6       {
7           unsigned gid = hipThreadIdx_x; // <- coordinate index function
8       //#pragma unroll 1
9           for (int idx = 0; idx < 10; ++idx)
10          {
11              c[gid] = a[gid] + b[gid];
=> 0x00002aaebc20d030 <+48>:    global_load_dword v6, v[0:1], off
   0x00002aaebc20d038 <+56>:    global_load_dword v7, v[2:3], off
   0x00002aaebc20d040 <+64>:    v_mov_b32_e32 v5, s5
   0x00002aaebc20d044 <+68>:    v_add_co_u32_e32 v4, vcc, s4, v4
   0x00002aaebc20d048 <+72>:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
   0x00002aaebc20d04c <+76>:    s_waitcnt vmcnt(0)
   0x00002aaebc20d050 <+80>:    v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d054 <+84>:    global_store_dword v[4:5], v6, off
   0x00002aaebc20d05c <+92>:    global_load_dword v6, v[0:1], off
   0x00002aaebc20d064 <+100>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d06c <+108>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d070 <+112>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d074 <+116>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d07c <+124>:   global_load_dword v6, v[0:1], off
   0x00002aaebc20d084 <+132>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d08c <+140>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d090 <+144>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d094 <+148>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d09c <+156>:   global_load_dword v6, v[0:1], off
   0x00002aaebc20d0a4 <+164>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d0ac <+172>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d0b0 <+176>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d0b4 <+180>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d0bc <+188>:   global_load_dword v6, v[0:1], off
   0x00002aaebc20d0c4 <+196>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d0cc <+204>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d0d0 <+208>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d0d4 <+212>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d0dc <+220>:   global_load_dword v6, v[0:1], off
   0x00002aaebc20d0e4 <+228>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d0ec <+236>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d0f0 <+240>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d0f4 <+244>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d0fc <+252>:   global_load_dword v6, v[0:1], off
   0x00002aaebc20d104 <+260>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d10c <+268>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d110 <+272>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d114 <+276>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d11c <+284>:   global_load_dword v6, v[0:1], off
   0x00002aaebc20d124 <+292>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d12c <+300>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d130 <+304>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d134 <+308>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d13c <+316>:   global_load_dword v6, v[0:1], off
   0x00002aaebc20d144 <+324>:   global_load_dword v7, v[2:3], off
   0x00002aaebc20d14c <+332>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d150 <+336>:   v_add_u32_e32 v6, v7, v6
   0x00002aaebc20d154 <+340>:   global_store_dword v[4:5], v6, off
   0x00002aaebc20d15c <+348>:   global_load_dword v0, v[0:1], off
   0x00002aaebc20d164 <+356>:   global_load_dword v1, v[2:3], off
   0x00002aaebc20d16c <+364>:   s_waitcnt vmcnt(0)
   0x00002aaebc20d170 <+368>:   v_add_u32_e32 v0, v1, v0
   0x00002aaebc20d174 <+372>:   global_store_dword v[4:5], v0, off

12          }
13      }
   0x00002aaebc20d17c <+380>:   s_endpgm

 

 

结论:此测试使用的环境版本(dtk-21.04)中hipcc在-O2、-O3的编译优化等级下,会展开固定循环次数的循环;-O1编译等级下会做优化,但不展开for循环,代码体量最小;-O0不做任何优化。

posted on 2021-11-09 10:00  陈狗蛋儿  阅读(1661)  评论(0编辑  收藏  举报

导航