# Numba benchmark: linear interpolation (lerp) ufunc, parallel-CPU vs. CUDA.
#
# Setup (from the original notes):
#   pip install numba
#   export NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice
#   export NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so
#   export NUMBAPRO_CUDA_DRIVER=/usr/lib/x86_64-linux-gnu/libcuda.so
# NOTE(review): the NUMBAPRO_* variables come from the retired NumbaPro
# product; recent Numba releases use NUMBA_CUDA_DRIVER / CUDA_HOME instead —
# confirm against the installed Numba version.

import time

import numpy as np
import numba
from numba import cuda


def lerp(A, B, factor):
    """Linear interpolation between A and B: factor=0 -> A, factor=1 -> B."""
    return (1.0 - factor) * A + factor * B


# One signature list shared by all three compiled targets.
_SIGNATURES = [
    'float32(float32, float32, float32)',
    'float64(float64, float64, float64)',
]

lerp_ufunc_p = numba.vectorize(_SIGNATURES, target="parallel")(lerp)
lerp_ufunc_cuda = numba.vectorize(_SIGNATURES, target="cuda")(lerp)
lerp_ufunc_cpu = numba.vectorize(_SIGNATURES, target="cpu")(lerp)


def _bench(ufunc, a, b, factor, iterations=10000):
    """Return the mean wall-clock seconds per call of ufunc(a, b, factor).

    time.clock() (used originally) was removed in Python 3.8;
    time.perf_counter() is the recommended high-resolution benchmark timer.
    """
    t0 = time.perf_counter()
    for _ in range(iterations):
        ufunc(a, b, factor)
    return (time.perf_counter() - t0) / iterations


if __name__ == "__main__":
    length = 5000000
    # Step of 2 gives length/2 = 2,500,000 elements in each array.
    B = np.arange(length, 0.0, -2.0)
    A = np.arange(0.0, length, 2.0)
    # Copy inputs to the GPU once, outside the timed region.
    A_cuda = cuda.to_device(A)
    B_cuda = cuda.to_device(B)
    factor = 0.5  # original defined this but passed the literal 0.5; now used

    # parallel CPU target (original recorded result: ~0.0164714971 s/call)
    print(_bench(lerp_ufunc_p, A, B, factor))
    # CUDA target on device-resident arrays (original: ~0.0014463519 s/call)
    print(_bench(lerp_ufunc_cuda, A_cuda, B_cuda, factor))