AscendC学习
算子工程中sim和cpu模式都运行在cpu上,两者有什么区别?
猜测sim的等效是在npu指令层面,cpu的等效仅仅在AsecendC的层面?
表现在sim可以完全等效得跑cce代码算子,而cpu只能跑AscendC的算子。
当cce代码保存为.cpp后缀的文件,采用ccec编译,需要指定-x cce
,即指定编译语言。
ccec -x cce --cce-aicore-arch=dav-c220-vec -c add.cce
编译出来的结果为x86格式的elf文件,但npu的指令放在了elf中的一个段。至于是怎么包含的?ccec
启动bisheng, 添加参数"-fcce-include-aibinary", "/tmp/add_custom-e50633/add_custom-dav-c220-vec.o"
将事先编译的npu object文件包含进来。
atc转换过程中,tbe调用build构建出的算子为纯粹npu格式的elf文件。
一个简单问题:bisheng是怎么区分编译npu的object还是x86+npu代码段的?
看到bisheng的一些关键参数:x86的嵌入+npu代码段时,--triple x86_64-unknown-linux-gnu -fcce-fatobj-compile
, 编译npu算子的命令参数为--triple hiipu64-hisilicon-cce -fcce-is-aicore
。
一个调用ccec生成纯粹npu object编译命令如下,
ccec -cc1 -triple hiipu64-hisilicon-cce -fcce-is-aicore "-resource-dir" "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5" -include __clang_cce_runtime_wrapper.h -o add2.o -x cce add.cce "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/x86_64-linux-gnu/c++/11" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/backward" "-internal-isystem" "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5/include" "-internal-isystem" "/usr/local/include" "-internal-isystem" "/usr/lib/gcc/x86_64-linux-gnu/11/../../../../x86_64-linux-gnu/include" "-internal-externc-isystem" "/usr/include/x86_64-linux-gnu" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" -emit-obj -target-cpu dav-c220-vec
命令中几个参数可以关注一下:
- 包含的头文件
__clang_cce_runtime_wrapper.h
,这让你可以使用__aicore
等宏,各种vector和cube的指令。 -cc1
表示像clang前端传递参数并只运行ast到llvm-IR的步骤,最后加的-emit-obj
表示生成object文件,默认只会到ir自然退出。
上述命令只会编译cpp中加了__aicore__
的函数。aicore上的函数要求返回为void。假设cce中定义了一个函数fun
, 编译结果elf中会有一个fun
的函数和一个fun__
的对象,后者fun__
表示的是kernelArgSize,即函数如参数Byte数。
'+all' is not a recognized feature for this target (ignoring feature)
_Z8myadd_doPhS_S_S_:
000000f0: ff c3 00 d1 sub sp, sp, #0x30
000000f4: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
000000f8: fd 83 00 91 add x29, sp, #0x20
000000fc: a0 83 1f f8 stur x0, [x29, #-0x8]
00000100: e1 0b 00 f9 str x1, [sp, #0x10]
00000104: e2 07 00 f9 str x2, [sp, #0x8]
00000108: e3 03 00 f9 str x3, [sp]
0000010c: e2 03 40 f9 ldr x2, [sp]
00000110: 00 01 80 52 mov w0, #0x8
00000114: e1 03 1f aa mov x1, xzr
00000118: 00 00 00 94 bl #0x0 // NotExistType : _Z21__cce_rtConfigureCalljPvS_ + 0
0000011c: 08 00 00 71 subs w8, w0, #0x0
00000120: e8 07 9f 1a cset w8, ne
00000124: e8 00 00 37 tbnz w8, #0x0, #0x1c
00000128: 01 00 00 14 b #0x4
0000012c: a0 83 5f f8 ldur x0, [x29, #-0x8]
00000130: e1 0b 40 f9 ldr x1, [sp, #0x10]
00000134: e2 07 40 f9 ldr x2, [sp, #0x8]
00000138: 00 00 00 94 bl #0x0 // NotExistType : myadd + 0
0000013c: 01 00 00 14 b #0x4
00000140: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
00000144: ff c3 00 91 add sp, sp, #0x30
00000148: c0 03 5f d6 ret
打开cce代码看,
extern "C" __global__ __aicore__ void myadd(GM_ADDR x, GM_ADDR y, GM_ADDR z)
{
...
}
#ifndef __CCE_KT_TEST__
void myadd_do(uint8_t* x, uint8_t* y, uint8_t* z, uint8_t* stream) {
myadd<<<8, nullptr, stream>>>((half*)x, (half*)y, (float*)z);
}
#endif
通过ccec -x cce -c add.cce -o add.o --cce-aicore-arch=dav-c220-vec
编译出add.o的aarch64 object文件,并将npu的指令内嵌到某个代码段,
一些在aarch64 CPU架构上的函数的反汇编如下:
// myadd_do函数,内部使用<<<>>>方式调用算子,三箭头的调用步骤是,将blockDim, l2ctrl, stream传给_Z21__cce_rtConfigureCalljPvS_函数(即__cce_rtConfigureCall(unsigned int, void*, void*))完成算子rt的配置,若配置结果不为零,直接跳出。否则将x,y,z三个参数传递给myadd的函数
_Z8myadd_doPhS_S_S_:
000000f0: ff c3 00 d1 sub sp, sp, #0x30
000000f4: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
000000f8: fd 83 00 91 add x29, sp, #0x20
000000fc: a0 83 1f f8 stur x0, [x29, #-0x8]
00000100: e1 0b 00 f9 str x1, [sp, #0x10]
00000104: e2 07 00 f9 str x2, [sp, #0x8]
00000108: e3 03 00 f9 str x3, [sp]
0000010c: e2 03 40 f9 ldr x2, [sp]
00000110: 00 01 80 52 mov w0, #0x8
00000114: e1 03 1f aa mov x1, xzr
00000118: 00 00 00 94 bl #0x0 // NotExistType : _Z21__cce_rtConfigureCalljPvS_ + 0
0000011c: 08 00 00 71 subs w8, w0, #0x0
00000120: e8 07 9f 1a cset w8, ne
00000124: e8 00 00 37 tbnz w8, #0x0, #0x1c
00000128: 01 00 00 14 b #0x4
0000012c: a0 83 5f f8 ldur x0, [x29, #-0x8]
00000130: e1 0b 40 f9 ldr x1, [sp, #0x10]
00000134: e2 07 40 f9 ldr x2, [sp, #0x8]
00000138: 00 00 00 94 bl #0x0 // NotExistType : myadd + 0
0000013c: 01 00 00 14 b #0x4
00000140: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
00000144: ff c3 00 91 add sp, sp, #0x30
00000148: c0 03 5f d6 ret
// 传入blockDim, l2ctrl, stream三个参数,调用rtConfigureCall
_Z21__cce_rtConfigureCalljPvS_:
00000000: ff c3 00 d1 sub sp, sp, #0x30
00000004: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
00000008: fd 83 00 91 add x29, sp, #0x20
0000000c: a0 c3 1f b8 stur w0, [x29, #-0x4]
00000010: e1 0b 00 f9 str x1, [sp, #0x10]
00000014: e2 07 00 f9 str x2, [sp, #0x8]
00000018: a0 c3 5f b8 ldur w0, [x29, #-0x4]
0000001c: e1 03 1f 2a mov w1, wzr
00000020: 00 00 00 94 bl #0x0 // NotExistType : __cce_getOrSetBlockNum + 0 //这里表示设置blockNum=8
00000024: a0 c3 5f b8 ldur w0, [x29, #-0x4]
00000028: e1 0b 40 f9 ldr x1, [sp, #0x10]
0000002c: e2 07 40 f9 ldr x2, [sp, #0x8]
00000030: 00 00 00 94 bl #0x0 // NotExistType : rtConfigureCall + 0
00000034: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
00000038: ff c3 00 91 add sp, sp, #0x30
0000003c: c0 03 5f d6 ret
myadd函数应该是elf中离调用npu指令最近的地方了,单独拎出来看,
//myadd输入为GM上三个地址,x,y,z,三个参数,调用了三次rtSetupArgument(在libruntime.so或者libruntime_camodel.so中),并检验设置情况,
//然后获取设置的blockNum, 最后将myadd的函数地址、myadd算子名字,算子名长度,blockNum四个参数给到`__cce_rtLaunch`启动执行。
//这里猜测rtLaunch中递归启动了blockDim个算子。rtLaunch实现也在runtime so中
myadd:
00000000: ff c3 00 d1 sub sp, sp, #0x30
00000004: fd 7b 02 a9 stp x29, x30, [sp, #0x20]
00000008: fd 83 00 91 add x29, sp, #0x20
0000000c: e8 03 00 aa mov x8, x0
00000010: a0 23 00 d1 sub x0, x29, #0x8
00000014: a8 83 1f f8 stur x8, [x29, #-0x8]
00000018: e1 0b 00 f9 str x1, [sp, #0x10]
0000001c: e2 07 00 f9 str x2, [sp, #0x8]
00000020: 01 01 80 d2 mov x1, #0x8
00000024: e2 03 1f aa mov x2, xzr
00000028: 00 00 00 94 bl #0x0 // NotExistType : rtSetupArgument + 0
0000002c: 08 00 00 71 subs w8, w0, #0x0
00000030: e8 07 9f 1a cset w8, ne
00000034: a8 03 00 37 tbnz w8, #0x0, #0x74
00000038: 01 00 00 14 b #0x4
0000003c: e0 43 00 91 add x0, sp, #0x10
00000040: 02 01 80 d2 mov x2, #0x8
00000044: e1 03 02 aa mov x1, x2
00000048: 00 00 00 94 bl #0x0 // NotExistType : rtSetupArgument + 0
0000004c: 08 00 00 71 subs w8, w0, #0x0
00000050: e8 07 9f 1a cset w8, ne
00000054: a8 02 00 37 tbnz w8, #0x0, #0x54
00000058: 01 00 00 14 b #0x4
0000005c: e0 23 00 91 add x0, sp, #0x8
00000060: 01 01 80 d2 mov x1, #0x8
00000064: 02 02 80 d2 mov x2, #0x10
00000068: 00 00 00 94 bl #0x0 // NotExistType : rtSetupArgument + 0
0000006c: 08 00 00 71 subs w8, w0, #0x0
00000070: e8 07 9f 1a cset w8, ne
00000074: a8 01 00 37 tbnz w8, #0x0, #0x34
00000078: 01 00 00 14 b #0x4
0000007c: e0 03 1f 2a mov w0, wzr
00000080: 21 00 80 52 mov w1, #0x1
00000084: 00 00 00 94 bl #0x0 // NotExistType : __cce_getOrSetBlockNum + 0 //表示get blockNum
00000088: e3 03 00 2a mov w3, w0
0000008c: 00 00 00 90 adrp x0, #0 // NotExistType : myadd + 0
00000090: 00 00 00 91 add x0, x0, #0x0 // NotExistType : myadd + 0
00000094: 01 00 00 90 adrp x1, #0 // NotExistType : .rodata.str1.1 + 0
00000098: 21 00 00 91 add x1, x1, #0x0 // NotExistType : .rodata.str1.1 + 0
0000009c: a2 00 80 d2 mov x2, #0x5
000000a0: 00 00 00 94 bl #0x0 // NotExistType : __cce_rtLaunch + 0
000000a4: 01 00 00 14 b #0x4
000000a8: fd 7b 42 a9 ldp x29, x30, [sp, #0x20]
000000ac: ff c3 00 91 add sp, sp, #0x30
000000b0: c0 03 5f d6 ret
其中__cce_getOrSetBlockNum
如下所示,type=0表示set,否则为get,不同thread获得的值不同。
inline __attribute__((alway_inline)) int __cce_getOrSetBlockNum(int value,
int type) {
static thread_local int local = 0;
if (type == 0)
local = value;
return local;
}
在cann包的__clang_cce_link.stub
文件中找到__cce_rtLaunch
的实现,分带profiler和不带profiler的版本,不带profiler基本就是直接调rtLaunch。
#ifdef HAS_PROFILER
__attribute__ ((visibility("hidden"))) __attribute__((weak)) void __cce_rtLaunch(
void *stubFunc,
char *kernelName,
unsigned long int length,
unsigned int bolckNum)
{
unsigned long int beginTime = 0;
unsigned long int endTime = 0;
unsigned long int opName = 0;
unsigned int threadId = 0;
MsprofRegisterCallback(8, ProfCtrlHandle); // 8 - CCE defined in msprof headerfile slog.h
if (__MsprofFlagL0 || __MsprofFlagL1) {
beginTime = MsprofSysCycleTime();
}
rtLaunch(stubFunc);
if (__MsprofFlagL0 || __MsprofFlagL1) {
endTime = MsprofSysCycleTime();
opName = MsprofGetHashId(kernelName, length);
threadId = (unsigned int)(syscall(SYS_gettid));
MsprofApi info;
info.magicNumber = 0x5a5a; //MSPROF_REPORT_DATA_MAGIC_NUM
info.level = 10000; //MSPROF_REPORT_NODE_LEVEL
info.type = 5; //MSPROF_REPORT_NODE_LAUNCH_TYPE
info.threadId = threadId;
info.reserve = 0;
info.beginTime = beginTime;
info.endTime = endTime;
info.itemId = opName;
MsprofReportApi(0, &info);
}
if (__MsprofFlagL1) {
MsprofCompactInfo nodeBasicInfo;
nodeBasicInfo.magicNumber = 0x5a5a; //MSPROF_REPORT_DATA_MAGIC_NUM
nodeBasicInfo.level = 10000; //MSPROF_REPORT_NODE_LEVEL
nodeBasicInfo.type = 0; //MSPROF_REPORT_NODE_BASIC_INFO_TYPE
nodeBasicInfo.threadId = threadId;
nodeBasicInfo.timeStamp = endTime;
nodeBasicInfo.data.nodeBasicInfo.opName = opName;
nodeBasicInfo.data.nodeBasicInfo.taskType = 0; //MSPROF_GE_TASK_TYPE_AI_CORE
nodeBasicInfo.data.nodeBasicInfo.opType = opName;
nodeBasicInfo.data.nodeBasicInfo.blockDim = bolckNum;
MsprofReportCompactInfo(0, &nodeBasicInfo, sizeof(MsprofCompactInfo));
}
}
#else
__attribute__ ((visibility("hidden"))) __attribute__((weak)) void __cce_rtLaunch(
void *stubFunc,
char *kernelName,
unsigned long int length,
unsigned int bolckNum)
{
(void)kernelName;
(void)length;
(void)bolckNum;
rtLaunch(stubFunc);
}
#endif
另外在add.o中有一个函数cceModuleCtor,应该是elf加载初期需要进行初始化的工作。猜测这个函数内完成npu指令的部署。反汇编如下,
// 调用 rtLinkedDevBinaryRegister(__aicore_rel_rec, rtRegisterGlobals, 0)
cceModuleCtor:
000001a4: fe 0f 1f f8 str x30, [sp, #-0x10]!
000001a8: 00 00 00 90 adrp x0, #0 // NotExistType : __aicore_rel_rec + 0
000001ac: 00 00 00 91 add x0, x0, #0x0 // NotExistType : __aicore_rel_rec + 0
000001b0: 01 00 00 90 adrp x1, #0 // NotExistType : rtRegisterGlobals + 0
000001b4: 21 00 00 91 add x1, x1, #0x0 // NotExistType : rtRegisterGlobals + 0
000001b8: e2 03 1f 2a mov w2, wzr //0
000001bc: 00 00 00 94 bl #0x0 // NotExistType : rtLinkedDevBinaryRegister + 0
000001c0: fe 07 41 f8 ldr x30, [sp], #0x10
000001c4: c0 03 5f d6 ret
其中__aicore_rel_rec
中放的应该就是npu指令集的地址,rtLinkedDevBinaryRegister
函数实现在__clang_cce_link.stub
也有。
Relocation section '.rela__aicore_rel_rec' at offset 0x1158 contains 1 entry:
Offset Info Type Symbol's Value Symbol's Name + Addend
0000000000000008 0000000e00000101 R_AARCH64_ABS64 0000000000000000 __aicore_rel_binary + 0
sections信息
[13] __aicore_rel_binary PROGBITS 0000000000000000 0002a0 0006b1 00 A 0 0 8
[14] __aicore_rel_rec PROGBITS 0000000000000000 000958 000018 00 WA 0 0 8
[15] .rela__aicore_rel_rec RELA 0000000000000000 001158 000018 18 I 23 14 8
__attribute__ ((visibility("hidden"))) void rtLinkedDevBinaryRegister(
const __fatBinC_Wrapper_t *prelinked_fatbinc,
void (*callback_fp)(void *),
int DevBinaryMagicNum)
{
bool IsAICore = false;
bool IsAICpu = false;
// record the return value when registering binary.
// Return value of 0 represents that register binary successfully.
int RegisterAICoreRet = 0;
int RegisterAICpuRet = 0;
const int RT_ERROR_NONE = 0;
static bool IsAICoreFirstRegister = true;
static bool IsAICpuFirstRegister = true;
// AICore has two types of cores, one is general, and another one
// is specific for vector's compute.
if (DevBinaryMagicNum == AICORE_MAGIC_NUM ||
DevBinaryMagicNum == AICORE_MAGIC_NUM_VEC ||
DevBinaryMagicNum == AICORE_MAGIC_NUM_CUBE) {
IsAICore = true;
} else if (DevBinaryMagicNum == AICPU_MAGIC_NUM) {
IsAICpu = true;
} else {
unsigned int MagicNumber = prelinked_fatbinc->magic;
// AICore has two types of cores, one is general, and another one
// is specific for vector's compute.
if (MagicNumber == AICORE_MAGIC_NUM ||
MagicNumber == AICORE_MAGIC_NUM_VEC ||
MagicNumber == AICORE_MAGIC_NUM_CUBE) {
IsAICore = true;
} else if (MagicNumber == AICPU_MAGIC_NUM) {
IsAICpu = true;
}
}
// when this function is called, it must have kernels in the binary,
// so we don't need to check whether the callback_fp is not null which
// is guaranteed by the compiler.
// if the binary is registered firstly, then register it, or never
// register the binary later.
#ifdef CCE_FATBINSECTION
// CCE_FATBINSECTION macro is defined in the AICore binary C file,
// it denotes a valid C file for AICore be present
if (IsAICoreFirstRegister && IsAICore) {
IsAICoreFirstRegister = false;
RegisterAICoreRet = rtDevBinaryRegister((void*)&__fatDeviceTextAICore, &CCEFatCubinHandleAICore);
#ifdef LOAD_BIN_TO_FAST_MEM
// Runtime will report error if binary registers unsuccessfully.
if (RegisterAICoreRet == RT_ERROR_NONE) {
rtBinaryRegisterToFastMemory(CCEFatCubinHandleAICore);
}
#endif
atexit(rtUnregisterBinaryUtilAICore);
}
#endif
#ifdef CCE_FATBINSECTIONAICPU
if (IsAICpuFirstRegister && IsAICpu) {
IsAICpuFirstRegister = false;
RegisterAICpuRet = rtDevBinaryRegister((void*)&__fatDeviceTextAICpu, &CCEFatCubinHandleAICpu);
atexit(rtUnregisterBinaryUtilAICpu);
}
#endif
// we should register kernel functions every time this function is called
// the callback_fp checking can be removed
// Runtime will report error if binary registers unsuccessfully.
if (IsAICore && callback_fp != nullptr &&
RegisterAICoreRet == RT_ERROR_NONE)
(*callback_fp)(CCEFatCubinHandleAICore);
if (IsAICpu && callback_fp != nullptr &&
RegisterAICpuRet == RT_ERROR_NONE)
(*callback_fp)(CCEFatCubinHandleAICpu);
}
sim相比npu模式,编译的时候额外编译了__clang_cce_link.stub
,并将编译的object用于最后的链接
"/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/bin/ccec", "-c", "-x", "c++", "", "-DCCEFATBINFILE=\"/tmp/cce_embed_binary-890a23.c\"", "-I.", "-I/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5/include/cce_stub", "-o", "/tmp/cce_all_link-bcd3b7.o", "/home/zwl/Ascend/ascend-toolkit/8.0.RC2.alpha002/x86_64-linux/ccec_compiler/lib/clang/15.0.5/include/cce_stub/__clang_cce_link.stub", "-fPIC"