NVCC DOCS [cudatoolkit3.2]

Reference: "The CUDA Compiler Driver NVCC" (last modified 08-23-2010).

Some useful options:

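-c        Compile each input file into an object file (no linking).

-ptx      Compile each .cu/.gpu input file into a device-only .ptx file.

-v        Verbose: list the compilation commands generated by the driver while still executing them.

-keep     Keep all intermediate files generated during the internal compilation steps.

-clean    Instead of compiling, delete all non-temporary files that the same nvcc command would otherwise generate.

-arch     Name of the class of GPU architectures (normally a virtual architecture such as compute_10) for which the CUDA input files are compiled.

-code     Name(s) of the GPU(s) (e.g. sm_10) to generate and embed code for.

-gencode  Generalization of the -arch/-code combination; may be repeated to generate code for several architectures.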

Using this command: nvcc VecAdd.cu -keep -v

Verbose output:
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/usr/local/cuda/bin
#$ _THERE_=/usr/local/cuda/bin
#$ _TARGET_SIZE_=
#$ TOP=/usr/local/cuda/bin/..
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/usr/local/cuda/bin/../extools/lib::/usr/local/cuda/lib:/usr/local/cuda/computeprof/bin
#$ PATH=/usr/local/cuda/bin/../open64/bin:/usr/local/cuda/bin:/nextop/tool/edatools/debussy/bin:/nextop/tool/synopsys/mg/linux/ctg/bin:/usr/bin:/bin:/nextop/tool/synopsys/vera/vera_vA-2007.12_linux/bin:/nextop/tool/synopsys/vcs/bin:/nextop/tool/synopsys/vcs/linux/bin:/nextop/tool/cds/specman/tools/bin:/nextop/tool/cds/ius/tools/bin:/opt/sge/bin/lx24-x86:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/nextop/tool/mgc/modeltech/linux:/nextop/tool/edatools/syn200509/bin:/nextop/tool/altera8.0/quartus/bin:/home/tian/ShortCut:/home/tian/MyShell:/usr/local/cuda/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../include" "-I/usr/local/cuda/bin/../include/cudart"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../lib" -lcudart
#$ CUDAFE_FLAGS=
#$ OPENCC_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -D__CUDA_ARCH__=100 -E -x c++ -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_11_ATOMIC_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-I/usr/local/cuda/bin/../include" "-I/usr/local/cuda/bin/../include/cudart" -I. -D__CUDACC__ -C -include "cuda_runtime.h" -m32 -malign-double -o "VecAdd.cpp1.ii" "VecAdd.cu"
#$ cudafe --m32 --gnu_version=40204 -tused --no_remove_unneeded_entities --gen_c_file_name "VecAdd.cudafe1.c" --stub_file_name "VecAdd.cudafe1.stub.c" --gen_device_file_name "VecAdd.cudafe1.gpu" --include_file_name "VecAdd.fatbin.c" "VecAdd.cpp1.ii"
#$ gcc -D__CUDA_ARCH__=100 -E -x c -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_11_ATOMIC_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-I/usr/local/cuda/bin/../include" "-I/usr/local/cuda/bin/../include/cudart" -I. -D__CUDACC__ -C -D__CUDA_FTZ -m32 -malign-double -o "VecAdd.cpp2.i" "VecAdd.cudafe1.gpu"
#$ cudafe --m32 --gnu_version=40204 --c --gen_c_file_name "VecAdd.cudafe2.c" --stub_file_name "VecAdd.cudafe2.stub.c" --gen_device_file_name "VecAdd.cudafe2.gpu" --include_file_name "VecAdd.fatbin.c" "VecAdd.cpp2.i"
#$ gcc -D__CUDA_ARCH__=100 -E -x c -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_11_ATOMIC_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-I/usr/local/cuda/bin/../include" "-I/usr/local/cuda/bin/../include/cudart" -I. -D__CUDABE__ -D__CUDA_FTZ -m32 -malign-double -o "VecAdd.cpp3.i" "VecAdd.cudafe2.gpu"
#$ filehash -s " " "VecAdd.cpp3.i" > "VecAdd.hash"
#$ gcc -E -x c++ "-I/usr/local/cuda/bin/../include" "-I/usr/local/cuda/bin/../include/cudart" -I. -D__CUDACC__ -C -include "cuda_runtime.h" -m32 -malign-double -o "VecAdd.cpp4.ii" "VecAdd.cu"
#$ cudafe++ --m32 --gnu_version=40204 --parse_templates --gen_c_file_name "VecAdd.cudafe1.cpp" --stub_file_name "VecAdd.cudafe1.stub.c" "VecAdd.cpp4.ii"
#$ nvopencc -TARG:compute_10 -m32 -CG:ftz=1 -CG:prec_div=0 -CG:prec_sqrt=0 "VecAdd" "VecAdd.cpp3.i" -o "VecAdd.ptx"
#$ ptxas -arch=sm_10 -m32 "VecAdd.ptx" -o "VecAdd.sm_10.cubin"
#$ fatbin --key="e3ca5f8157275de1" --source-name="VecAdd.cu" --usage-mode=" " --embedded-fatbin="VecAdd.fatbin.c" "--image=profile=compute_10,file=VecAdd.ptx" "--image=profile=sm_10,file=VecAdd.sm_10.cubin"
#$ gcc -D__CUDA_ARCH__=100 -E -x c++ -DCUDA_NO_SM_13_DOUBLE_INTRINSICS -DCUDA_NO_SM_12_ATOMIC_INTRINSICS -DCUDA_NO_SM_11_ATOMIC_INTRINSICS -DCUDA_FLOAT_MATH_FUNCTIONS "-I/usr/local/cuda/bin/../include" "-I/usr/local/cuda/bin/../include/cudart" -I. -D__CUDA_FTZ -m32 -malign-double -o "VecAdd.cu.cpp" "VecAdd.cudafe1.cpp"
#$ gcc -c -x c++ "-I/usr/local/cuda/bin/../include" "-I/usr/local/cuda/bin/../include/cudart" -I. -fpreprocessed -m32 -malign-double -o "VecAdd.o" "VecAdd.cu.cpp"
#$ g++ -m32 -malign-double -o "a.out" -Wl,--start-group "VecAdd.o" "-L/usr/local/cuda/bin/../lib" -lcudart -Wl,--end-group

Compilation flow

In short, CUDA compilation works as follows: the input program is separated by the CUDA front end (cudafe) into C/C++ host code and the .gpu device code. Depending on the value(s) of the -code option to nvcc, this device code is further translated by the CUDA compilers/assemblers into CUDA binary (cubin) and/or into intermediate ptx code. This code is merged into a device code descriptor which is included by the previously separated host code. This descriptor will be inspected by the CUDA runtime system whenever the device code is invoked (‘called’) by the host program, in order to obtain an appropriate load image for the current GPU.

posted @ 2011-04-07 18:17  soulnearby  阅读(336)  评论(0编辑  收藏  举报