Analyse VecAdd.cu

source code, VecAdd.cu

VecAdd.cu
#include <iostream>
#define MAXN 128

__global__
void VecAdd( float* A, float* B, float* C ) {
int i = threadIdx.x;
for( ; i < MAXN; i += 128 ) {
C[i]
= A[i] + B[i];
}
}

int main( int argc, char **argv ) {
float h_A[MAXN], h_B[MAXN], h_C[MAXN];
float *d_A, *d_B, *d_C;
int i;
for( i=0; i < MAXN; i++ ) {
h_A[i]
= i;
h_B[i]
= 1;
h_C[i]
= 0;
}

int size = MAXN * sizeof( float );
cudaMalloc( (
void**)&d_A, size );
cudaMalloc( (
void**)&d_B, size );
cudaMalloc( (
void**)&d_C, size );

cudaMemcpy( d_A, h_A, size, cudaMemcpyHostToDevice );
cudaMemcpy( d_B, h_B, size, cudaMemcpyHostToDevice );

VecAdd
<<< 1, 128 >>>( d_A, d_B, d_C );

cudaMemcpy( h_C, d_C, size, cudaMemcpyDeviceToHost );

for( i = 0; i < MAXN; i++ ) {
std::cout
<< h_C[i] << " ";
}

std::cout
<< std::endl;

return 0;
}

Using the command to get all the intermediate files.

nvcc -keep VecAdd.cu


PTX code, VecAdd.ptx

VecAdd.ptx
.version 1.4
.target sm_10, map_f64_to_f32
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 3.2 built on 2010-11-03

//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00007c2a_00000000-9_VecAdd.cpp3.i (/tmp/ccBI#.VBTmkR)
//-----------------------------------------------------------

//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_10, Endian:little, Pointer Size:32
// -O3 (Optimization level)
// -g0 (Debug level)-
// -m2 (Report advisories)
//-----------------------------------------------------------

.file
1 "<command-line>"
.file
2 "/tmp/tmpxft_00007c2a_00000000-8_VecAdd.cudafe2.gpu"
.file
3 "/usr/lib/gcc/i486-linux-gnu/4.2.4/include/stddef.h"
.file
4 "/usr/local/cuda/bin/../include/crt/device_runtime.h"
.file
5 "/usr/local/cuda/bin/../include/host_defines.h"
.file
6 "/usr/local/cuda/bin/../include/builtin_types.h"
.file
7 "/usr/local/cuda/bin/../include/device_types.h"
.file
8 "/usr/local/cuda/bin/../include/driver_types.h"
.file
9 "/usr/local/cuda/bin/../include/surface_types.h"
.file
10 "/usr/local/cuda/bin/../include/texture_types.h"
.file
11 "/usr/local/cuda/bin/../include/vector_types.h"
.file
12 "/usr/local/cuda/bin/../include/device_launch_parameters.h"
.file
13 "/usr/local/cuda/bin/../include/crt/storage_class.h"
.file
14 "/usr/include/bits/types.h"
.file
15 "/usr/include/time.h"
.file
16 "/usr/local/cuda/bin/../include/texture_fetch_functions.h"
.file
17 "/usr/local/cuda/bin/../include/common_functions.h"
.file
18 "/usr/local/cuda/bin/../include/math_functions.h"
.file
19 "/usr/local/cuda/bin/../include/math_constants.h"
.file
20 "/usr/local/cuda/bin/../include/device_functions.h"
.file
21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h"
.file
22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h"
.file
23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h"
.file
24 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h"
.file
25 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h"
.file
26 "/usr/local/cuda/bin/../include/surface_functions.h"
.file
27 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h"
.file
28 "VecAdd.cu"


.entry _Z6VecAddPfS_S_ (
.param .u32 __cudaparm__Z6VecAddPfS_S__A,
.param .u32 __cudaparm__Z6VecAddPfS_S__B,
.param .u32 __cudaparm__Z6VecAddPfS_S__C)
{
.reg .u32
%r<20>;
.reg .f32
%f<5>;
.reg .pred
%p<4>;
.loc
28 4 0
$LDWbegin__Z6VecAddPfS_S_:
.loc
28 5 0
cvt.s32.u16
%r1, %tid.x; // r1 = i , covert u16 to s32
mov.u32 %r2, 127; // r2 = 127
setp.gt.s32 %p1, %r1, %r2; // p1 = i > 127
@%p1 bra $Lt_0_1282; // jmp p1
mov.s32 %r3, 255; // r3 = 255
sub.s32 %r4, %r3, %r1; // r4 = 255 - i
shr.s32 %r5, %r4, 31; // r5 = r4 >>> 31
mov.s32 %r6, 127; // r6 = 127
and.b32 %r7, %r5, %r6; // r7 = ((255 - i) >>> 31) & 127
add.s32 %r8, %r7, %r4; // r8 = 255 - i + (((255 - i) >>> 31) & 127)
shr.s32 %r9, %r8, 7; // r9 = r8 >>> 7
mul24.lo.u32 %r10, %r1, 4; // offset for &A[i] because sizeof(float)=4
ld.param.u32 %r11, [__cudaparm__Z6VecAddPfS_S__A]; // A[0]
add.u32 %r12, %r10, %r11; // addr for &A[i] because sizeof(float)=4
add.u32 %r13, %r11, 508; // 512-4, addr for last byte of 128*4 bytes, shows the MAXN here
ld.param.u32 %r14, [__cudaparm__Z6VecAddPfS_S__B]; // B[0]
add.u32 %r15, %r14, %r10; // addr for &B[i] because sizeof(float)=4
ld.param.u32 %r16, [__cudaparm__Z6VecAddPfS_S__C]; // C[0]
add.u32 %r17, %r16, %r10; // addr for &C[i] because sizeof(float)=4
mov.s32 %r18, %r9; // r18 = r9
$Lt_0_1794:
//<loop> Loop body line 5, nesting depth: 1, estimated iterations: unknown
.loc 28 7 0
ld.
global.f32 %f1, [%r12+0];
ld.
global.f32 %f2, [%r15+0];
add.f32
%f3, %f1, %f2; //C[i] = A[i] +B[i]
st.global.f32 [%r17+0], %f3;
add.u32
%r17, %r17, 512;
add.u32
%r15, %r15, 512;
add.u32
%r12, %r12, 512;
setp.le.u32
%p2, %r12, %r13;
@
%p2 bra $Lt_0_1794;
$Lt_0_1282:
.loc
28 9 0
exit;
$LDWend__Z6VecAddPfS_S_:
}
// _Z6VecAddPfS_S_
posted @ 2011-04-07 18:17  soulnearby  阅读(242)  评论(0编辑  收藏  举报