cuobjdump -sass 1 > 1.s

#if defined(__CUDACC__)
template <typename T>
__device__ inline void *memcpy(T *d, const T *s, size_t n) {
for (size_t i = 0; i < n / sizeof (T); i++) {
d[i] = s[i];
}
return d;
}
#endif




#ifndef AA
#include "include/cuda_runtime_api.h"
#endif

#if 01
#define __global__
enum {
cudaMemcpyHostToHost = 0, /**< Host -> Host */
cudaMemcpyHostToDevice = 1, /**< Host -> Device */
cudaMemcpyDeviceToHost = 2, /**< Device -> Host */
cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */
cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};

typedef int cudaError_t;

#include "stdlib.h"
#include "memory.h"
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, int ) {
memcpy(dst, src, count);
return 0;
}

cudaError_t cudaMalloc(void **devPtr, size_t size) {
*devPtr = malloc(size);
return 0;
}

cudaError_t cudaFree(void *devPtr) {
free(devPtr);
return 0;
}

cudaError_t cudaStreamQuery(struct cudaStream* stream) {
return 0;
}
cudaError_t cudaDeviceSynchronize() {
return 0;
}

cudaError_t cudaGetLastError() {
return 0;
}

const char* cudaGetErrorString(cudaError_t error) {
return "ok";
}

struct Dim3 {
unsigned x, y, z;
};
Dim3 threadIdx, blockIdx, blockDim, gridDim;

#endif

 

 


Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

code for sm_52
Function : _Z2fbPv
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001fc400fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ MOV32I R0, 0x3 ; /* 0x010000000037f000 */
/*0018*/ MOV32I R4, 0xfffffffc ; /* 0x010fffffffc7f004 */
/* 0x001fc000fe4007f1 */
/*0028*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/*0030*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/*0038*/ { MOV32I R5, 0xfffffffe ; /* 0x010fffffffe7f005 */
/* 0x001fc400fe0007f1 */
/*0048*/ STG.E [R2+0x4], R0 }
/* 0xeedc200000470200 */
/*0050*/ { MOV32I R7, 0x8 ; /* 0x010000000087f007 */
/*0058*/ STG.E [R2], R4 }
/* 0xeedc200000070204 */
/* 0x001fbc00fe2007f1 */
/*0068*/ STG.E [R2+0x8], R5 ; /* 0xeedc200000870205 */
/*0070*/ STG.E [R2+0xc], R7 ; /* 0xeedc200000c70207 */
/*0078*/ NOP ; /* 0x50b0000000070f00 */
/* 0x001ffc00fc8007ef */
/*0088*/ NOP ; /* 0x50b0000000070f00 */
/*0090*/ NOP ; /* 0x50b0000000070f00 */
/*0098*/ EXIT ; /* 0xe30000000007000f */
/* 0x001f8000fc0007ff */
/*00a8*/ BRA 0xa0 ; /* 0xe2400fffff07000f */
/*00b0*/ NOP; /* 0x50b0000000070f00 */
/*00b8*/ NOP; /* 0x50b0000000070f00 */
..........



Fatbin ptx code:
================
arch = sm_52
code version = [7,7]
host = linux
compile_size = 64bit
compressed







__global__ void fb(void *da) {
int32_t *a = (int32_t *)da;
uint32_t k = 0;
k-=1;
uint64_t c = k;
k-=2;
c = c * k;

a[1] = (uint32_t)c;
a[0] = (uint32_t)(c>>32);
int32_t k1 = -3;
a[2] = k1>>1;
a[3] = sizeof (void *);
}




__global__ void fb(void *da) {
int32_t *a = (int32_t *)da;
uint32_t k = a[0];
k-=1;
uint64_t c = k;
k-=2;
c = c * k;

a[1] = (uint32_t)c;
a[0] = (uint32_t)(c>>32);
int32_t k1 = a[2];
a[2] = k1>>1;
a[3] = sizeof (void *);
}



__global__ void fb(void *da) {
int32_t *a = (int32_t *)da;
uint32_t k = a[0];
k-=1;
uint32_t g = k-2;
uint64_t c;
asm("mul.wide.u32 %0, %1, %2;" : "=l"(c) : "r"(k), "r"(g) );
// a[1] = (uint32_t)c;
// a[0] = (uint32_t)(c>>32);
*(uint64_t *)a = c;
uint32_t k1 = a[2];
a[2] = k1>>1;
a[3] = sizeof (void *);
}


__global__ void fb(void *da) {
int32_t *a = (int32_t *)da;
uint32_t k = a[0];
k-=1;
uint32_t g = k-2;
uint32_t c1, c0;
asm("mul.hi.u32 %0, %1, %2;" : "=r"(c1) : "r"(k), "r"(g) );
asm("mul.lo.u32 %0, %1, %2;" : "=r"(c0) : "r"(k), "r"(g) );

a[1] = c1;
a[0] = c0;
uint32_t k1 = a[2];
a[2] = k1>>1;
a[3] = sizeof (void *);
}



https://forums.developer.nvidia.com/t/long-integer-multiplication-mul-wide-u64-and-mul-wide-u128/51520





int main() {
int r = cudaSuccess;
int a[N];
for (int i = 0; i< N; i++) {
a[i] = i;
}
a[2]=-3;
void *da = 0, *db = 0;
r=cudaMalloc(&da, sizeof (int)*N);
r=cudaMemcpy(da, a, sizeof a, cudaMemcpyHostToDevice);
//fa<<<1,32>>>(da);
fb<<<1,32>>>(da);
r=cudaMemcpy(a, da, sizeof a, cudaMemcpyDeviceToHost);
r=cudaFree(da);
r=cudaDeviceSynchronize();
for (int i = 0; i< N; i++) {
printf("%d\t", a[i]);
}
printf("\n");
return 0;
}






__device__ void f2(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
uint32_t alo = a, ahi = a >> 32, blo = b, bhi = b >> 32, r0, r1, r2, r3;
asm("mul.lo.u32 %0, %1, %2;" : "=r"(r0) : "r"(alo), "r"(bhi));
asm("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r1) : "r"(blo), "r"(ahi), "r"(r0));
asm("madc.hi.u32 %0, %1, %2, 0;" : "=r"(r0) : "r"(alo), "r"(bhi));
asm("mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(r2) : "r"(blo), "r"(ahi), "r"(r0));
asm("madc.hi.u32 %0, %1, %2, 0;" : "=r"(r3) : "r"(ahi), "r"(bhi));
asm("mad.hi.cc.u32 %0, %1, %2, %0;" : "+r"(r1) : "r"(alo), "r"(blo));
asm("madc.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r2) : "r"(ahi), "r"(bhi));
asm("addc.u32 %0, %0, 0;" : "+r"(r3) : );
asm("mul.lo.u32 %0, %1, %2;" : "=r"(r0) : "r"(alo), "r"(blo));
*h = (uint64_t)r3 << 32 | r2;
*l = (uint64_t)r1 << 32 | r0;
}


__global__ void fa(void *da) {
uint64_t *a = (uint64_t *) da;
f2(&a[0], &a[1], a[2], a[3]);
}


static __device__ void f1(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
uint32_t alo = a, ahi = a >> 32, blo = b, bhi = b >> 32, r0, r1 = 0 , r2 = 0 , r3 = 0 ;
asm (
"mul.lo.u32 %0, %4, %6;"
"mul.hi.u32 %1, %4, %6;"
"mad.lo.cc.u32 %1, %4, %7, %1;"
"madc.hi.u32 %2, %4, %7, 0;"
"mad.lo.cc.u32 %1, %5, %6, %1;"
"madc.hi.cc.u32 %2, %5, %6, %2;"
"madc.hi.u32 %3, %5, %7, 0;"
"mad.lo.cc.u32 %2, %5, %7, %2;"
"addc.u32 %3, %3, 0;"
: "=r"(r0), "+r"(r1), "+r"(r2), "+r"(r3)
: "r"(alo), "r"(ahi), "r"(blo), "r"(bhi));
*h = (uint64_t)r3 << 32 | r2;
*l = (uint64_t)r1 << 32 | r0;
}


static __device__ void f11(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
uint32_t alo = a, ahi = a >> 32, blo = b, bhi = b >> 32, r0, r1, r2, r3;
asm("mul.lo.u32 %0, %1, %2;" : "=r"(r0) : "r"(alo), "r"(blo));
asm("mul.hi.u32 %0, %1, %2;" : "=r"(r1) : "r"(alo), "r"(blo));
asm("mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r1) : "r"(alo), "r"(bhi));
asm("madc.hi.u32 %0, %1, %2, 0;" : "=r"(r2) : "r"(alo), "r"(bhi));
asm("mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r1) : "r"(blo), "r"(ahi));
asm("madc.hi.u32 %0, %1, %2, %0;" : "+r"(r2) : "r"(blo), "r"(ahi));
asm("madc.hi.u32 %0, %1, %2, 0;" : "=r"(r3) : "r"(ahi), "r"(bhi));
asm("mad.lo.cc.u32 %0, %1, %2, %0;" : "+r"(r2) : "r"(ahi), "r"(bhi));
asm("addc.u32 %0, %0, 0;" : "+r"(r3) : );
*h = (uint64_t)r3 << 32 | r2;
*l = (uint64_t)r1 << 32 | r0;
}


__global__ void fb(void *da) {
uint64_t *a = (uint64_t *) da;
f1(&a[0], &a[1], a[2], a[3]);
}

static __device__ void f(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
asm (
".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;"
"mov.b64 {alo,ahi}, %2;"
"mov.b64 {blo,bhi}, %3;"
"mul.lo.u32 r0, alo, blo;"
"mul.hi.u32 r1, alo, blo;"
"mad.lo.cc.u32 r1, alo, bhi, r1;"
"madc.hi.u32 r2, alo, bhi, 0;"
"mad.lo.cc.u32 r1, ahi, blo, r1;"
"madc.hi.cc.u32 r2, ahi, blo, r2;"
"madc.hi.u32 r3, ahi, bhi, 0;"
"mad.lo.cc.u32 r2, ahi, bhi, r2;"
"addc.u32 r3, r3, 0;"
"mov.b64 %0, {r0,r1};"
"mov.b64 %1, {r2,r3};"
: "=l"(*l), "=l"(*h)
: "l"(a), "l"(b));
}


__global__ void fbb(void *da) {
printf("tid:%d\t bid:%d\t blockdim:%d\t griddim:%d\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x);
}

cudaError_t e = cudaGetLastError();
if (e) {
printf("GPU: %d (%s)\n", e, cudaGetErrorString(e));
return e;
}



#include "unistd.h"
int main(int argc, char *argv[])
{
cudaError_t r;
unsigned bs = strtoul(argv[1], 0, 10);
unsigned ts = strtoul(argv[2], 0, 10);
unsigned n = bs * ts;
T *a = new T[n];
void *da = 0;
r=cudaMalloc(&da, sizeof(T)*n);
//r=cudaMemcpy(da, a, sizeof(T)*n, cudaMemcpyHostToDevice);
f<<<bs,ts>>>(da);
cudaError_t e = cudaGetLastError();
if (e) {
printf("GPU: %d (%s)\n", e, cudaGetErrorString(e));
return e;
}

r = cudaErrorNotReady;
while (r) {
sleep(1);
r = cudaStreamQuery(0);
}
//r=cudaDeviceSynchronize();
r=cudaMemcpy(a, da, sizeof(T)*n, cudaMemcpyDeviceToHost);
r=cudaFree(da);

unsigned g[] = {0, 1, n>>4, n>>5, n>>9, n/4, n/3, n/2, n-2, n-1};
for (unsigned i = 0; i < sizeof(g)/sizeof(*g); i++){
for (unsigned j = 0; j < sizeof(T); j++){
printf("%d, ", a[g[i]][j]);
}
printf("\n");
}
delete []a;
return 0;
}
posted @   zJanly  阅读(46)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
点击右上角即可分享
微信分享提示