cuobjdump -sass 1 > 1.s

#if defined(__CUDACC__)
template <typename T>
__device__ inline void *memcpy(T *d, const T *s, size_t n) {
    for (size_t i = 0; i < n / sizeof (T); i++) {
        d[i] = s[i];
    }
    return d;
}
#endif



#ifndef AA
#include "include/cuda_runtime_api.h"
#endif

#if 01
#define __global__
enum {
    cudaMemcpyHostToHost          =   0,      /**< Host   -> Host */
    cudaMemcpyHostToDevice        =   1,      /**< Host   -> Device */
    cudaMemcpyDeviceToHost        =   2,      /**< Device -> Host */
    cudaMemcpyDeviceToDevice      =   3,      /**< Device -> Device */
    cudaMemcpyDefault             =   4       /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
};

typedef int cudaError_t;

#include "stdlib.h"
#include "memory.h"
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, int ) {
    memcpy(dst, src, count);
    return 0;
}

cudaError_t cudaMalloc(void **devPtr, size_t size) {
    *devPtr = malloc(size);
    return 0;
}

cudaError_t cudaFree(void *devPtr) {
    free(devPtr);
    return 0;
}

cudaError_t cudaStreamQuery(struct cudaStream* stream) {
    return 0;
}
cudaError_t cudaDeviceSynchronize() {
    return 0;
}

cudaError_t cudaGetLastError() {
    return 0;
}

const char* cudaGetErrorString(cudaError_t error) {
    return "ok";
}

struct Dim3 {
    unsigned x, y, z;
};
Dim3 threadIdx, blockIdx, blockDim, gridDim;

#endif


Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

   code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
host = linux
compile_size = 64bit

   code for sm_52
      Function : _Z2fbPv
   .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                               /* 0x001fc400fe2007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;      /* 0x4c98078000870001 */
        /*0010*/                   MOV32I R0, 0x3 ;            /* 0x010000000037f000 */
        /*0018*/                   MOV32I R4, 0xfffffffc ;     /* 0x010fffffffc7f004 */
                                                               /* 0x001fc000fe4007f1 */
        /*0028*/                   MOV R2, c[0x0][0x140] ;     /* 0x4c98078005070002 */
        /*0030*/                   MOV R3, c[0x0][0x144] ;     /* 0x4c98078005170003 */
        /*0038*/         {         MOV32I R5, 0xfffffffe ;     /* 0x010fffffffe7f005 */
                                                               /* 0x001fc400fe0007f1 */
        /*0048*/                   STG.E [R2+0x4], R0         }
                                                               /* 0xeedc200000470200 */
        /*0050*/         {         MOV32I R7, 0x8 ;            /* 0x010000000087f007 */
        /*0058*/                   STG.E [R2], R4         }
                                                               /* 0xeedc200000070204 */
                                                               /* 0x001fbc00fe2007f1 */
        /*0068*/                   STG.E [R2+0x8], R5 ;        /* 0xeedc200000870205 */
        /*0070*/                   STG.E [R2+0xc], R7 ;        /* 0xeedc200000c70207 */
        /*0078*/                   NOP ;                       /* 0x50b0000000070f00 */
                                                               /* 0x001ffc00fc8007ef */
        /*0088*/                   NOP ;                       /* 0x50b0000000070f00 */
        /*0090*/                   NOP ;                       /* 0x50b0000000070f00 */
        /*0098*/                   EXIT ;                      /* 0xe30000000007000f */
                                                               /* 0x001f8000fc0007ff */
        /*00a8*/                   BRA 0xa0 ;                  /* 0xe2400fffff07000f */
        /*00b0*/                   NOP;                        /* 0x50b0000000070f00 */
        /*00b8*/                   NOP;                        /* 0x50b0000000070f00 */
      ..........



Fatbin ptx code:
================
arch = sm_52
code version = [7,7]
host = linux
compile_size = 64bit
compressed

__global__ void fb(void *da) {
    int32_t *a = (int32_t *)da;
    uint32_t k = 0;
    k-=1;
    uint64_t c = k;
    k-=2;
    c = c * k;

    a[1] = (uint32_t)c;
    a[0] = (uint32_t)(c>>32);
    int32_t k1 = -3;
    a[2] = k1>>1;
    a[3] = sizeof (void *);
}

__global__ void fb(void *da) {
    int32_t *a = (int32_t *)da;
    uint32_t k = a[0];
    k-=1;
    uint64_t c = k;
    k-=2;
    c = c * k;

    a[1] = (uint32_t)c;
    a[0] = (uint32_t)(c>>32);
    int32_t k1 = a[2];
    a[2] = k1>>1;
    a[3] = sizeof (void *);
}

__global__ void fb(void *da) {
    int32_t *a = (int32_t *)da;
    uint32_t k = a[0];
    k-=1;
    uint32_t g = k-2;
    uint64_t c;
    asm("mul.wide.u32 %0, %1, %2;" : "=l"(c) : "r"(k), "r"(g) );
//    a[1] = (uint32_t)c;
//    a[0] = (uint32_t)(c>>32);
    *(uint64_t *)a = c;
    uint32_t k1 = a[2];
    a[2] = k1>>1;
    a[3] = sizeof (void *);
}

__global__ void fb(void *da) {
    int32_t *a = (int32_t *)da;
    uint32_t k = a[0];
    k-=1;
    uint32_t g = k-2;
    uint32_t c1, c0;
    asm("mul.hi.u32 %0, %1, %2;" : "=r"(c1) : "r"(k), "r"(g) );
    asm("mul.lo.u32 %0, %1, %2;" : "=r"(c0) : "r"(k), "r"(g) );

    a[1] = c1;
    a[0] = c0;
    uint32_t k1 = a[2];
    a[2] = k1>>1;
    a[3] = sizeof (void *);
}




https://forums.developer.nvidia.com/t/long-integer-multiplication-mul-wide-u64-and-mul-wide-u128/51520

int main() {
    int r = cudaSuccess;
    int a[N];
    for (int i = 0; i< N; i++) {
        a[i] = i;
    }
    a[2]=-3;
    void *da = 0, *db = 0;
    r=cudaMalloc(&da, sizeof (int)*N);
    r=cudaMemcpy(da, a, sizeof a, cudaMemcpyHostToDevice);
    //fa<<<1,32>>>(da);
    fb<<<1,32>>>(da);
    r=cudaMemcpy(a, da, sizeof a, cudaMemcpyDeviceToHost);
    r=cudaFree(da);
    r=cudaDeviceSynchronize();
    for (int i = 0; i< N; i++) {
        printf("%d\t", a[i]);
    }
    printf("\n");
    return 0;
}


__device__ void f2(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
    uint32_t alo = a, ahi = a >> 32, blo = b, bhi = b >> 32, r0, r1, r2, r3;
    asm("mul.lo.u32      %0, %1, %2;" : "=r"(r0) : "r"(alo), "r"(bhi));
    asm("mad.lo.cc.u32   %0, %1, %2, %3;" : "=r"(r1) : "r"(blo), "r"(ahi), "r"(r0));
    asm("madc.hi.u32     %0, %1, %2, 0;" : "=r"(r0) : "r"(alo), "r"(bhi));
    asm("mad.hi.cc.u32   %0, %1, %2, %3;" : "=r"(r2) : "r"(blo), "r"(ahi), "r"(r0));
    asm("madc.hi.u32     %0, %1, %2, 0;" : "=r"(r3) : "r"(ahi), "r"(bhi));
    asm("mad.hi.cc.u32   %0, %1, %2, %0;" : "+r"(r1) : "r"(alo), "r"(blo));
    asm("madc.lo.cc.u32  %0, %1, %2, %0;" : "+r"(r2) : "r"(ahi), "r"(bhi));
    asm("addc.u32        %0, %0, 0;" : "+r"(r3) : );
    asm("mul.lo.u32      %0, %1, %2;" : "=r"(r0) : "r"(alo), "r"(blo));
    *h = (uint64_t)r3 << 32 | r2;
    *l = (uint64_t)r1 << 32 | r0;
}


__global__ void fa(void *da) {
    uint64_t *a = (uint64_t *) da;
    f2(&a[0], &a[1], a[2], a[3]);
}


static __device__ void f1(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
    uint32_t alo = a, ahi = a >> 32, blo = b, bhi = b >> 32, r0, r1 = 0 , r2 = 0 , r3 = 0 ;
    asm (
    "mul.lo.u32      %0, %4, %6;"
    "mul.hi.u32      %1, %4, %6;"
    "mad.lo.cc.u32   %1, %4, %7, %1;"
    "madc.hi.u32     %2, %4, %7, 0;"
    "mad.lo.cc.u32   %1, %5, %6, %1;"
    "madc.hi.cc.u32  %2, %5, %6, %2;"
    "madc.hi.u32     %3, %5, %7, 0;"
    "mad.lo.cc.u32   %2, %5, %7, %2;"
    "addc.u32        %3, %3, 0;"
    : "=r"(r0), "+r"(r1), "+r"(r2), "+r"(r3)
    : "r"(alo), "r"(ahi), "r"(blo), "r"(bhi));
    *h = (uint64_t)r3 << 32 | r2;
    *l = (uint64_t)r1 << 32 | r0;
}

static __device__ void f11(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
    uint32_t alo = a, ahi = a >> 32, blo = b, bhi = b >> 32, r0, r1, r2, r3;
    asm("mul.lo.u32      %0, %1, %2;" : "=r"(r0) : "r"(alo), "r"(blo));
    asm("mul.hi.u32      %0, %1, %2;" : "=r"(r1) : "r"(alo), "r"(blo));
    asm("mad.lo.cc.u32   %0, %1, %2, %0;" : "+r"(r1) : "r"(alo), "r"(bhi));
    asm("madc.hi.u32     %0, %1, %2, 0;" : "=r"(r2) : "r"(alo), "r"(bhi));
    asm("mad.lo.cc.u32   %0, %1, %2, %0;" : "+r"(r1) : "r"(blo), "r"(ahi));
    asm("madc.hi.u32     %0, %1, %2, %0;" : "+r"(r2) : "r"(blo), "r"(ahi));
    asm("madc.hi.u32     %0, %1, %2, 0;" : "=r"(r3) : "r"(ahi), "r"(bhi));
    asm("mad.lo.cc.u32   %0, %1, %2, %0;" : "+r"(r2) : "r"(ahi), "r"(bhi));
    asm("addc.u32        %0, %0, 0;" : "+r"(r3) : );
    *h = (uint64_t)r3 << 32 | r2;
    *l = (uint64_t)r1 << 32 | r0;
}


__global__ void fb(void *da) {
    uint64_t *a = (uint64_t *) da;
    f1(&a[0], &a[1], a[2], a[3]);
}

static __device__ void f(uint64_t *h, uint64_t *l, uint64_t a, uint64_t b){
    asm (
         ".reg .u32 r0, r1, r2, r3, alo, ahi, blo, bhi;"
         "mov.b64         {alo,ahi}, %2;"
         "mov.b64         {blo,bhi}, %3;"
         "mul.lo.u32      r0, alo, blo;"
         "mul.hi.u32      r1, alo, blo;"
         "mad.lo.cc.u32   r1, alo, bhi, r1;"
         "madc.hi.u32     r2, alo, bhi, 0;"
         "mad.lo.cc.u32   r1, ahi, blo, r1;"
         "madc.hi.cc.u32  r2, ahi, blo, r2;"
         "madc.hi.u32     r3, ahi, bhi, 0;"
         "mad.lo.cc.u32   r2, ahi, bhi, r2;"
         "addc.u32        r3, r3, 0;"
         "mov.b64         %0, {r0,r1};"
         "mov.b64         %1, {r2,r3};"
    : "=l"(*l), "=l"(*h)
    : "l"(a), "l"(b));
}

__global__ void fbb(void *da) {
    printf("tid:%d\t bid:%d\t blockdim:%d\t griddim:%d\n", threadIdx.x, blockIdx.x, blockDim.x, gridDim.x);
}

cudaError_t e = cudaGetLastError();
if (e) {
    printf("GPU: %d (%s)\n", e, cudaGetErrorString(e));
    return e;
}


#include "unistd.h"
int main(int argc, char *argv[])
{
    cudaError_t r;
    unsigned bs = strtoul(argv[1], 0, 10);
    unsigned ts = strtoul(argv[2], 0, 10);
    unsigned n = bs * ts;
    T *a = new T[n];
    void *da = 0;
    r=cudaMalloc(&da, sizeof(T)*n);
    //r=cudaMemcpy(da, a, sizeof(T)*n, cudaMemcpyHostToDevice);
    f<<<bs,ts>>>(da);
    cudaError_t e = cudaGetLastError();
    if (e) {
        printf("GPU: %d (%s)\n", e, cudaGetErrorString(e));
        return e;
    }

    r = cudaErrorNotReady;
    while (r) {
        sleep(1);
        r = cudaStreamQuery(0);
    }
    //r=cudaDeviceSynchronize();
    r=cudaMemcpy(a, da, sizeof(T)*n, cudaMemcpyDeviceToHost);
    r=cudaFree(da);

    unsigned g[] = {0, 1, n>>4, n>>5, n>>9, n/4, n/3, n/2, n-2, n-1};
    for (unsigned i = 0; i < sizeof(g)/sizeof(*g); i++){
        for (unsigned j = 0; j < sizeof(T); j++){
            printf("%d, ", a[g[i]][j]);
        }
        printf("\n");
    }
    delete []a;
    return 0;
}

posted @ 2022-06-28 16:02 zJanly 阅读(59) 评论(0) 收藏举报

刷新页面返回顶部

zJanly

cuobjdump -sass 1 > 1.s

公告