AscendC学习

算子工程中sim和cpu模式都运行在cpu上,两者有什么区别?
猜测sim的等效是在npu指令层面,cpu的等效仅仅在AsecendC的层面?
表现在sim可以完全等效得跑cce代码算子,而cpu只能跑AscendC的算子。

当cce代码保存为.cpp后缀的文件,采用ccec编译,需要指定-x cce,即指定编译语言。

ccec -x cce --cce-aicore-arch=dav-c220-vec -c add.cce编译出来的结果为x86格式的elf文件,但npu的指令放在了elf中的一个段。至于是怎么包含的?ccec启动bisheng, 添加参数"-fcce-include-aibinary", "/tmp/add_custom-e50633/add_custom-dav-c220-vec.o"将事先编译的npu object文件包含进来。

atc转换过程中,tbe调用build构建出的算子为纯粹npu格式的elf文件。

一个简单问题:bisheng是怎么区分编译npu的object还是x86+npu代码段的?

看到bisheng的一些关键参数:x86的嵌入+npu代码段时,--triple x86_64-unknown-linux-gnu -fcce-fatobj-compile, 编译npu算子的命令参数为--triple hiipu64-hisilicon-cce -fcce-is-aicore

一个调用ccec生成纯粹npu object编译命令如下,

ccec -cc1 -triple hiipu64-hisilicon-cce -fcce-is-aicore "-resource-dir" "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5" -include __clang_cce_runtime_wrapper.h -o add2.o -x cce add.cce "-internal-isystem"  "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11"  "-internal-isystem"  "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11"  "-internal-isystem"  "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/backward"  "-internal-isystem"  "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5/include"  "-internal-isystem"  "/usr/local/include"  "-internal-isystem"  "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../x86_64-linux-gnu/include"  "-internal-externc-isystem"  "/usr/include/x86_64-linux-gnu"  "-internal-externc-isystem"  "/include"  "-internal-externc-isystem"  "/usr/include" -emit-obj -target-cpu dav-c220-vec

命令中几个参数可以关注一下:

  1. 包含的头文件__clang_cce_runtime_wrapper.h,这让你可以使用__aicore等宏,各种vector和cube的指令。
  2. -cc1表示像clang前端传递参数并只运行ast到llvm-IR的步骤,最后加的-emit-obj表示生成object文件,默认只会到ir自然退出。

上述命令只会编译cpp中加了__aicore__的函数。aicore上的函数要求返回为void。假设cce中定义了一个函数fun, 编译结果elf中会有一个fun的函数和一个fun__的对象,后者fun__表示的是kernelArgSize,即函数如参数Byte数。

'+all' is not a recognized feature for this target (ignoring feature)
_Z8myadd_doPhS_S_S_:
 000000f0:  ff c3 00 d1                 sub     sp, sp, #0x30
 000000f4:  fd 7b 02 a9                 stp     x29, x30, [sp, #0x20]
 000000f8:  fd 83 00 91                 add     x29, sp, #0x20
 000000fc:  a0 83 1f f8                 stur    x0, [x29, #-0x8]
 00000100:  e1 0b 00 f9                 str     x1, [sp, #0x10]
 00000104:  e2 07 00 f9                 str     x2, [sp, #0x8]
 00000108:  e3 03 00 f9                 str     x3, [sp]
 0000010c:  e2 03 40 f9                 ldr     x2, [sp]
 00000110:  00 01 80 52                 mov     w0, #0x8
 00000114:  e1 03 1f aa                 mov     x1, xzr
 00000118:  00 00 00 94                 bl      #0x0 // NotExistType : _Z21__cce_rtConfigureCalljPvS_ + 0
 0000011c:  08 00 00 71                 subs    w8, w0, #0x0
 00000120:  e8 07 9f 1a                 cset    w8, ne
 00000124:  e8 00 00 37                 tbnz    w8, #0x0, #0x1c
 00000128:  01 00 00 14                 b       #0x4
 0000012c:  a0 83 5f f8                 ldur    x0, [x29, #-0x8]
 00000130:  e1 0b 40 f9                 ldr     x1, [sp, #0x10]
 00000134:  e2 07 40 f9                 ldr     x2, [sp, #0x8]
 00000138:  00 00 00 94                 bl      #0x0 // NotExistType : myadd + 0
 0000013c:  01 00 00 14                 b       #0x4
 00000140:  fd 7b 42 a9                 ldp     x29, x30, [sp, #0x20]
 00000144:  ff c3 00 91                 add     sp, sp, #0x30
 00000148:  c0 03 5f d6                 ret

打开cce代码看,

extern "C" __global__ __aicore__ void myadd(GM_ADDR x, GM_ADDR y, GM_ADDR z) 
{
    ...
}

#ifndef __CCE_KT_TEST__
void myadd_do(uint8_t* x, uint8_t* y, uint8_t* z, uint8_t* stream) {
  myadd<<<8, nullptr, stream>>>((half*)x, (half*)y, (float*)z);
}
#endif

通过ccec -x cce -c add.cce -o add.o --cce-aicore-arch=dav-c220-vec编译出add.o的aarch64 object文件,并将npu的指令内嵌到某个代码段,

一些在aarch64 CPU架构上的函数的反汇编如下:

// myadd_do函数,内部使用<<<>>>方式调用算子,三箭头的调用步骤是,将blockDim, l2ctrl, stream传给_Z21__cce_rtConfigureCalljPvS_函数(即__cce_rtConfigureCall(unsigned int, void*, void*))完成算子rt的配置,若配置结果不为零,直接跳出。否则将x,y,z三个参数传递给myadd的函数
_Z8myadd_doPhS_S_S_:
 000000f0:  ff c3 00 d1                 sub     sp, sp, #0x30
 000000f4:  fd 7b 02 a9                 stp     x29, x30, [sp, #0x20]
 000000f8:  fd 83 00 91                 add     x29, sp, #0x20
 000000fc:  a0 83 1f f8                 stur    x0, [x29, #-0x8]
 00000100:  e1 0b 00 f9                 str     x1, [sp, #0x10]
 00000104:  e2 07 00 f9                 str     x2, [sp, #0x8]
 00000108:  e3 03 00 f9                 str     x3, [sp]
 0000010c:  e2 03 40 f9                 ldr     x2, [sp]
 00000110:  00 01 80 52                 mov     w0, #0x8
 00000114:  e1 03 1f aa                 mov     x1, xzr
 00000118:  00 00 00 94                 bl      #0x0 // NotExistType : _Z21__cce_rtConfigureCalljPvS_ + 0
 0000011c:  08 00 00 71                 subs    w8, w0, #0x0
 00000120:  e8 07 9f 1a                 cset    w8, ne
 00000124:  e8 00 00 37                 tbnz    w8, #0x0, #0x1c
 00000128:  01 00 00 14                 b       #0x4
 0000012c:  a0 83 5f f8                 ldur    x0, [x29, #-0x8]
 00000130:  e1 0b 40 f9                 ldr     x1, [sp, #0x10]
 00000134:  e2 07 40 f9                 ldr     x2, [sp, #0x8]
 00000138:  00 00 00 94                 bl      #0x0 // NotExistType : myadd + 0
 0000013c:  01 00 00 14                 b       #0x4
 00000140:  fd 7b 42 a9                 ldp     x29, x30, [sp, #0x20]
 00000144:  ff c3 00 91                 add     sp, sp, #0x30
 00000148:  c0 03 5f d6                 ret


// 传入blockDim, l2ctrl, stream三个参数,调用rtConfigureCall
_Z21__cce_rtConfigureCalljPvS_:
 00000000:  ff c3 00 d1                 sub     sp, sp, #0x30
 00000004:  fd 7b 02 a9                 stp     x29, x30, [sp, #0x20]
 00000008:  fd 83 00 91                 add     x29, sp, #0x20
 0000000c:  a0 c3 1f b8                 stur    w0, [x29, #-0x4]
 00000010:  e1 0b 00 f9                 str     x1, [sp, #0x10]
 00000014:  e2 07 00 f9                 str     x2, [sp, #0x8]
 00000018:  a0 c3 5f b8                 ldur    w0, [x29, #-0x4]
 0000001c:  e1 03 1f 2a                 mov     w1, wzr
 00000020:  00 00 00 94                 bl      #0x0 // NotExistType : __cce_getOrSetBlockNum + 0 //这里表示设置blockNum=8
 00000024:  a0 c3 5f b8                 ldur    w0, [x29, #-0x4]
 00000028:  e1 0b 40 f9                 ldr     x1, [sp, #0x10]
 0000002c:  e2 07 40 f9                 ldr     x2, [sp, #0x8]
 00000030:  00 00 00 94                 bl      #0x0 // NotExistType : rtConfigureCall + 0
 00000034:  fd 7b 42 a9                 ldp     x29, x30, [sp, #0x20]
 00000038:  ff c3 00 91                 add     sp, sp, #0x30
 0000003c:  c0 03 5f d6                 ret

myadd函数应该是elf中离调用npu指令最近的地方了,单独拎出来看,

//myadd输入为GM上三个地址,x,y,z,三个参数,调用了三次rtSetupArgument(在libruntime.so或者libruntime_camodel.so中),并检验设置情况,
//然后获取设置的blockNum, 最后将myadd的函数地址、myadd算子名字,算子名长度,blockNum四个参数给到`__cce_rtLaunch`启动执行。
//这里猜测rtLaunch中递归启动了blockDim个算子。rtLaunch实现也在runtime so中
myadd:
 00000000:  ff c3 00 d1                 sub     sp, sp, #0x30
 00000004:  fd 7b 02 a9                 stp     x29, x30, [sp, #0x20]
 00000008:  fd 83 00 91                 add     x29, sp, #0x20
 0000000c:  e8 03 00 aa                 mov     x8, x0
 00000010:  a0 23 00 d1                 sub     x0, x29, #0x8
 00000014:  a8 83 1f f8                 stur    x8, [x29, #-0x8]
 00000018:  e1 0b 00 f9                 str     x1, [sp, #0x10]
 0000001c:  e2 07 00 f9                 str     x2, [sp, #0x8]
 00000020:  01 01 80 d2                 mov     x1, #0x8
 00000024:  e2 03 1f aa                 mov     x2, xzr
 00000028:  00 00 00 94                 bl      #0x0 // NotExistType : rtSetupArgument + 0
 0000002c:  08 00 00 71                 subs    w8, w0, #0x0
 00000030:  e8 07 9f 1a                 cset    w8, ne
 00000034:  a8 03 00 37                 tbnz    w8, #0x0, #0x74
 00000038:  01 00 00 14                 b       #0x4
 0000003c:  e0 43 00 91                 add     x0, sp, #0x10
 00000040:  02 01 80 d2                 mov     x2, #0x8
 00000044:  e1 03 02 aa                 mov     x1, x2
 00000048:  00 00 00 94                 bl      #0x0 // NotExistType : rtSetupArgument + 0
 0000004c:  08 00 00 71                 subs    w8, w0, #0x0
 00000050:  e8 07 9f 1a                 cset    w8, ne
 00000054:  a8 02 00 37                 tbnz    w8, #0x0, #0x54
 00000058:  01 00 00 14                 b       #0x4
 0000005c:  e0 23 00 91                 add     x0, sp, #0x8
 00000060:  01 01 80 d2                 mov     x1, #0x8
 00000064:  02 02 80 d2                 mov     x2, #0x10
 00000068:  00 00 00 94                 bl      #0x0 // NotExistType : rtSetupArgument + 0
 0000006c:  08 00 00 71                 subs    w8, w0, #0x0
 00000070:  e8 07 9f 1a                 cset    w8, ne
 00000074:  a8 01 00 37                 tbnz    w8, #0x0, #0x34
 00000078:  01 00 00 14                 b       #0x4
 0000007c:  e0 03 1f 2a                 mov     w0, wzr
 00000080:  21 00 80 52                 mov     w1, #0x1
 00000084:  00 00 00 94                 bl      #0x0 // NotExistType : __cce_getOrSetBlockNum + 0 //表示get blockNum
 00000088:  e3 03 00 2a                 mov     w3, w0
 0000008c:  00 00 00 90                 adrp    x0, #0 // NotExistType : myadd + 0
 00000090:  00 00 00 91                 add     x0, x0, #0x0 // NotExistType : myadd + 0
 00000094:  01 00 00 90                 adrp    x1, #0 // NotExistType :  .rodata.str1.1 + 0
 00000098:  21 00 00 91                 add     x1, x1, #0x0 // NotExistType :  .rodata.str1.1 + 0
 0000009c:  a2 00 80 d2                 mov     x2, #0x5
 000000a0:  00 00 00 94                 bl      #0x0 // NotExistType : __cce_rtLaunch + 0
 000000a4:  01 00 00 14                 b       #0x4
 000000a8:  fd 7b 42 a9                 ldp     x29, x30, [sp, #0x20]
 000000ac:  ff c3 00 91                 add     sp, sp, #0x30
 000000b0:  c0 03 5f d6                 ret

其中__cce_getOrSetBlockNum如下所示,type=0表示set,否则为get,不同thread获得的值不同。

inline __attribute__((alway_inline)) int __cce_getOrSetBlockNum(int value,
                                                                int type) {
  static thread_local int local = 0;
  if (type == 0)
    local = value;
  return local;
}

在cann包的__clang_cce_link.stub文件中找到__cce_rtLaunch的实现,分带profiler和不带profiler的版本,不带profiler基本就是直接调rtLaunch。

#ifdef HAS_PROFILER
__attribute__ ((visibility("hidden"))) __attribute__((weak)) void __cce_rtLaunch(
                                 void *stubFunc,
                                 char *kernelName,
                                 unsigned long int  length,
                                 unsigned int bolckNum)
  {
    unsigned long int  beginTime = 0;
    unsigned long int endTime = 0;
    unsigned long int opName = 0;
    unsigned int threadId = 0;
    MsprofRegisterCallback(8, ProfCtrlHandle);      // 8 - CCE defined in msprof headerfile slog.h
    if (__MsprofFlagL0 || __MsprofFlagL1) {
      beginTime = MsprofSysCycleTime();
    }
    rtLaunch(stubFunc);
    if (__MsprofFlagL0 || __MsprofFlagL1) {
      endTime = MsprofSysCycleTime();
      opName = MsprofGetHashId(kernelName, length);
      threadId = (unsigned int)(syscall(SYS_gettid));
      MsprofApi info;
      info.magicNumber = 0x5a5a;      //MSPROF_REPORT_DATA_MAGIC_NUM
      info.level = 10000;             //MSPROF_REPORT_NODE_LEVEL
      info.type = 5;                  //MSPROF_REPORT_NODE_LAUNCH_TYPE
      info.threadId = threadId;
      info.reserve = 0;
      info.beginTime = beginTime;
      info.endTime = endTime;
      info.itemId = opName;
      MsprofReportApi(0, &info);
    }

    if (__MsprofFlagL1) {
      MsprofCompactInfo nodeBasicInfo;
      nodeBasicInfo.magicNumber = 0x5a5a;      //MSPROF_REPORT_DATA_MAGIC_NUM
      nodeBasicInfo.level = 10000;             //MSPROF_REPORT_NODE_LEVEL
      nodeBasicInfo.type = 0;                  //MSPROF_REPORT_NODE_BASIC_INFO_TYPE
      nodeBasicInfo.threadId = threadId;
      nodeBasicInfo.timeStamp = endTime;
      nodeBasicInfo.data.nodeBasicInfo.opName = opName;
      nodeBasicInfo.data.nodeBasicInfo.taskType = 0; //MSPROF_GE_TASK_TYPE_AI_CORE
      nodeBasicInfo.data.nodeBasicInfo.opType = opName;
      nodeBasicInfo.data.nodeBasicInfo.blockDim = bolckNum;
      MsprofReportCompactInfo(0, &nodeBasicInfo, sizeof(MsprofCompactInfo));
    }
  }
#else
  __attribute__ ((visibility("hidden"))) __attribute__((weak)) void __cce_rtLaunch(
                                 void *stubFunc,
                                 char *kernelName,
                                 unsigned long int  length,
                                 unsigned int bolckNum)
  {
    (void)kernelName;
    (void)length;
    (void)bolckNum;
    rtLaunch(stubFunc);
  }
#endif

另外在add.o中有一个函数cceModuleCtor,应该是elf加载初期需要进行初始化的工作。猜测这个函数内完成npu指令的部署。反汇编如下,

// 调用 rtLinkedDevBinaryRegister(__aicore_rel_rec, rtRegisterGlobals, 0)
cceModuleCtor:
 000001a4:  fe 0f 1f f8                 str     x30, [sp, #-0x10]!
 000001a8:  00 00 00 90                 adrp    x0, #0 // NotExistType : __aicore_rel_rec + 0
 000001ac:  00 00 00 91                 add     x0, x0, #0x0 // NotExistType : __aicore_rel_rec + 0
 000001b0:  01 00 00 90                 adrp    x1, #0 // NotExistType :  rtRegisterGlobals + 0
 000001b4:  21 00 00 91                 add     x1, x1, #0x0 // NotExistType : rtRegisterGlobals  + 0
 000001b8:  e2 03 1f 2a                 mov     w2, wzr //0
 000001bc:  00 00 00 94                 bl      #0x0 // NotExistType : rtLinkedDevBinaryRegister + 0
 000001c0:  fe 07 41 f8                 ldr     x30, [sp], #0x10
 000001c4:  c0 03 5f d6                 ret

其中__aicore_rel_rec中放的应该就是npu指令集的地址,rtLinkedDevBinaryRegister函数实现在__clang_cce_link.stub也有。

Relocation section '.rela__aicore_rel_rec' at offset 0x1158 contains 1 entry:
    Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
0000000000000008  0000000e00000101 R_AARCH64_ABS64        0000000000000000 __aicore_rel_binary + 0

sections信息
  [13] __aicore_rel_binary PROGBITS        0000000000000000 0002a0 0006b1 00   A  0   0  8
  [14] __aicore_rel_rec  PROGBITS        0000000000000000 000958 000018 00  WA  0   0  8
  [15] .rela__aicore_rel_rec RELA            0000000000000000 001158 000018 18   I 23  14  8
  __attribute__ ((visibility("hidden"))) void rtLinkedDevBinaryRegister(
                                 const __fatBinC_Wrapper_t *prelinked_fatbinc,
                                 void (*callback_fp)(void *),
                                 int DevBinaryMagicNum)
  {
    bool IsAICore = false;
    bool IsAICpu = false;

    // record the return value when registering binary.
    // Return value of 0 represents that register binary successfully.
    int RegisterAICoreRet = 0;
    int RegisterAICpuRet = 0;
    const int RT_ERROR_NONE = 0;

    static bool IsAICoreFirstRegister = true;
    static bool IsAICpuFirstRegister = true;
  // AICore has two types of cores, one is general, and another one
    // is specific for vector's compute.
    if (DevBinaryMagicNum == AICORE_MAGIC_NUM ||
        DevBinaryMagicNum == AICORE_MAGIC_NUM_VEC ||
        DevBinaryMagicNum == AICORE_MAGIC_NUM_CUBE) {
      IsAICore = true;
    } else if (DevBinaryMagicNum == AICPU_MAGIC_NUM) {
      IsAICpu = true;
    } else {
      unsigned int MagicNumber = prelinked_fatbinc->magic;
      // AICore has two types of cores, one is general, and another one
      // is specific for vector's compute.
      if (MagicNumber == AICORE_MAGIC_NUM ||
          MagicNumber == AICORE_MAGIC_NUM_VEC ||
          MagicNumber == AICORE_MAGIC_NUM_CUBE) {
        IsAICore = true;
      } else if (MagicNumber == AICPU_MAGIC_NUM) {
        IsAICpu = true;
      }
    }

     // when this function is called, it must have kernels in the binary,
     // so we don't need to check whether the callback_fp is not null which
     // is guaranteed by the compiler.

     // if the binary is registered firstly, then register it, or never
     // register the binary later.
#ifdef CCE_FATBINSECTION
     // CCE_FATBINSECTION macro is defined in the AICore binary C file,
     // it denotes a valid C file for AICore be present
     if (IsAICoreFirstRegister && IsAICore) {
       IsAICoreFirstRegister = false;
       RegisterAICoreRet = rtDevBinaryRegister((void*)&__fatDeviceTextAICore, &CCEFatCubinHandleAICore);

#ifdef LOAD_BIN_TO_FAST_MEM
       // Runtime will report error if binary registers unsuccessfully.
       if (RegisterAICoreRet == RT_ERROR_NONE) {
         rtBinaryRegisterToFastMemory(CCEFatCubinHandleAICore);
       }
#endif

       atexit(rtUnregisterBinaryUtilAICore);
     }
#endif

#ifdef CCE_FATBINSECTIONAICPU
     if (IsAICpuFirstRegister && IsAICpu) {
       IsAICpuFirstRegister = false;
       RegisterAICpuRet = rtDevBinaryRegister((void*)&__fatDeviceTextAICpu, &CCEFatCubinHandleAICpu);
       atexit(rtUnregisterBinaryUtilAICpu);
     }
#endif
     // we should register kernel functions every time this function is called
     // the callback_fp checking can be removed
     // Runtime will report error if binary registers unsuccessfully.
     if (IsAICore && callback_fp != nullptr &&
         RegisterAICoreRet == RT_ERROR_NONE)
       (*callback_fp)(CCEFatCubinHandleAICore);
     if (IsAICpu && callback_fp != nullptr &&
         RegisterAICpuRet == RT_ERROR_NONE)
       (*callback_fp)(CCEFatCubinHandleAICpu);
  }


sim相比npu模式,编译的时候额外编译了__clang_cce_link.stub,并将编译的object用于最后的链接

"/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/bin/ccec", "-c", "-x", "c++", "", "-DCCEFATBINFILE=\"/tmp/cce_embed_binary-890a23.c\"", "-I.", "-I/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5/include/cce_stub", "-o", "/tmp/cce_all_link-bcd3b7.o", "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5/include/cce_stub/__clang_cce_link.stub", "-fPIC"
posted @ 2024-06-23 23:41  zwlwf  阅读(7)  评论(0编辑  收藏  举报