Some findings while learning PyCUDA: CUDA functions must be initialized after CUDA memory has been initialized, otherwise an error is raised
Reference:
https://www.cnblogs.com/devilmaycry812839668/p/15348610.html
I have recently been reading the WarpDrive code. Its GPU-side code is wired up through the pycuda library, which makes it easy to call CUDA code from a Python environment. While using it, however, I noticed one thing: the CUDA function initialization must come after the CUDA memory initialization, otherwise an error is raised.
Code (the version that runs correctly):
```python
import numpy as np

from warp_drive.managers.data_manager import CUDADataManager
from warp_drive.managers.function_manager import (
    CUDAFunctionManager,
    CUDALogController,
    CUDASampler,
    CUDAEnvironmentReset,
)
from warp_drive.utils.data_feed import DataFeed

source_code = """
// A function to demonstrate how to manipulate data on the GPU.
// This function increments each entry of the random data array we pushed to the GPU before.
// Each index corresponding to (env_id, agent_id) in the array is incremented by "agent_id + env_id".
// Everything inside the if() loop runs in parallel for each agent and environment.
//
extern "C"{
    __global__ void cuda_increment(
            float* data,
            int num_agents
    )
    {
        int env_id = blockIdx.x;
        int agent_id = threadIdx.x;
        if (agent_id < num_agents){
            int array_index = env_id * num_agents + agent_id;
            int increment = env_id + agent_id;
            data[array_index] += increment;
        }
    }
}
"""

from timeit import Timer


def push_random_data_and_increment_timer(
    num_runs=1,
    num_envs=2,
    num_agents=3,
    source_code=None,
):
    assert source_code is not None

    def push_random_data(num_agents, num_envs):
        # Initialize the CUDA data manager
        cuda_data_manager = CUDADataManager(
            num_agents=num_agents,
            num_envs=num_envs,
            episode_length=100,
        )
        # Create random data
        random_data = np.random.rand(num_envs, num_agents)
        # Push data from host to device
        data_feed = DataFeed()
        data_feed.add_data(
            name="random_data",
            data=random_data,
        )
        data_feed.add_data(
            name="num_agents",
            data=num_agents,
        )
        cuda_data_manager.push_data_to_device(data_feed)
        return cuda_data_manager

    # Initialize the CUDA function manager
    def cuda_func_init():
        cuda_function_manager = CUDAFunctionManager(
            num_agents=num_agents,  # cuda_data_manager.meta_info("n_agents"),
            num_envs=num_envs,      # cuda_data_manager.meta_info("n_envs")
        )
        # Load source code and initialize function
        cuda_function_manager.load_cuda_from_source_code(
            source_code,
            default_functions_included=False,
        )
        cuda_function_manager.initialize_functions(["cuda_increment"])
        increment_function = cuda_function_manager._get_function("cuda_increment")
        return cuda_function_manager, increment_function

    def increment_data(cuda_data_manager, cuda_function_manager, increment_function):
        increment_function(
            cuda_data_manager.device_data("random_data"),
            cuda_data_manager.device_data("num_agents"),
            block=cuda_function_manager.block,
            grid=cuda_function_manager.grid,
        )

    # set variable
    # cuda_data_manager = push_random_data(num_agents, num_envs)

    # cuda function init
    # cuda_function_manager, increment_function = cuda_func_init()

    # cuda function run
    # increment_data(cuda_data_manager, cuda_function_manager, increment_function)

    data_push_time = Timer(
        lambda: push_random_data(num_agents, num_envs)
    ).timeit(number=num_runs)

    cuda_data_manager = push_random_data(num_agents, num_envs)    # memory first
    cuda_function_manager, increment_function = cuda_func_init()  # functions second

    program_run_time = Timer(
        lambda: increment_data(
            cuda_data_manager, cuda_function_manager, increment_function
        )
    ).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": data_push_time,
        "code run time": program_run_time,
    }


num_runs = 1000
times = {}

for scenario in [
    (1, 1),
    (1, 100),
    (1, 1000),
    (100, 1000),
    (1000, 1000),
]:
    num_envs, num_agents = scenario
    times.update(
        {
            f"envs={num_envs}, agents={num_agents}": push_random_data_and_increment_timer(
                num_runs, num_envs, num_agents, source_code
            )
        }
    )

print(f"Times for {num_runs} function calls")
print("*" * 40)
for key, value in times.items():
    print(
        f"{key:30}: mean data push times: {value['data push times']:10.5}s,\t"
        f" mean increment times: {value['code run time']:10.5}s"
    )

'''
print(cuda_data_manager._meta_info)
print(cuda_data_manager._host_data)
print(cuda_data_manager._device_data_pointer)
print(cuda_data_manager._scalar_data_list)
print(cuda_data_manager._reset_data_list)
print(cuda_data_manager._log_data_list)
print(cuda_data_manager._device_data_via_torch)
print(cuda_data_manager._shared_constants)
print(cuda_data_manager._shape)
print(cuda_data_manager._dtype)

print(tensor_on_device)

time.sleep(300)
'''
```
Code that raises the error:
```python
# ... identical to the working script above, except that the timing section
# inside push_random_data_and_increment_timer now initializes the CUDA
# functions BEFORE any CUDA memory has been allocated:

    # data_push_time = Timer(lambda: push_random_data(num_agents, num_envs)).timeit(number=num_runs)

    cuda_function_manager, increment_function = cuda_func_init()    ###
    cuda_data_manager = push_random_data(num_agents, num_envs)      ###

    program_run_time = Timer(
        lambda: increment_data(
            cuda_data_manager, cuda_function_manager, increment_function
        )
    ).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": 0,  # data_push_time,
        "code run time": program_run_time,
    }

# ... the rest (the scenario loop and the result printing) is unchanged ...
```
Error message:
```
Traceback (most recent call last):
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 145, in <module>
    source_code
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 116, in push_random_data_and_increment_timer
    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)
  File "/home/xxxxxx/anaconda3/envs/warp_drive/lib/python3.7/timeit.py", line 177, in timeit
    timing = self.inner(it, self.timer)
  File "<timeit-src>", line 6, in inner
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 116, in <lambda>
    program_run_time = Timer(lambda: increment_data(cuda_data_manager, cuda_function_manager, increment_function)).timeit(number=num_runs)
  File "/home/xxxxxx/warp-drive/devil_make/tutorial-1-warp_drive_basics.py", line 97, in increment_data
    grid=cuda_function_manager.grid
  File "/home/xxxxxx/anaconda3/envs/warp_drive/lib/python3.7/site-packages/pycuda/driver.py", line 480, in function_call
    func._set_block_shape(*block)
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle
```
From this we can see that, when using pycuda, initializing the CUDA functions without first initializing CUDA memory fails with the following error:
```
pycuda._driver.LogicError: cuFuncSetBlockShape failed: invalid resource handle
```
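For reference, here is a minimal pure-pycuda sketch (my own illustration, not WarpDrive code) that follows the same safe ordering: the CUDA context and device memory are set up first, and only then is the kernel compiled and launched. The kernel body mirrors the tutorial's `cuda_increment`.

```python
import numpy as np
import pycuda.autoinit            # creates and activates a CUDA context up front
import pycuda.driver as drv
from pycuda.compiler import SourceModule

num_envs, num_agents = 2, 3

# 1) Memory first: allocate device memory and copy the host data over.
random_data = np.random.rand(num_envs, num_agents).astype(np.float32)
data_gpu = drv.mem_alloc(random_data.nbytes)
drv.memcpy_htod(data_gpu, random_data)

# 2) Only then compile the kernel and fetch the function handle.
mod = SourceModule("""
__global__ void cuda_increment(float *data, int num_agents)
{
    int env_id = blockIdx.x;
    int agent_id = threadIdx.x;
    if (agent_id < num_agents) {
        data[env_id * num_agents + agent_id] += env_id + agent_id;
    }
}
""")
increment = mod.get_function("cuda_increment")

# 3) Launch: one block per environment, one thread per agent.
increment(data_gpu, np.int32(num_agents),
          block=(num_agents, 1, 1), grid=(num_envs, 1))

result = np.empty_like(random_data)
drv.memcpy_dtoh(result, data_gpu)
print(result - random_data)       # entry (env_id, agent_id) equals env_id + agent_id
```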
If CUDA memory is initialized before the CUDA function initialization, the error does not occur.

Code (only the timing section is shown; the rest is identical to the first script):
```python
# ... identical to the working script above, except that the data-push timing
# line is commented out; CUDA memory is still initialized BEFORE the functions:

    # data_push_time = Timer(lambda: push_random_data(num_agents, num_envs)).timeit(number=num_runs)

    cuda_data_manager = push_random_data(num_agents, num_envs)      ###
    cuda_function_manager, increment_function = cuda_func_init()    ###

    program_run_time = Timer(
        lambda: increment_data(
            cuda_data_manager, cuda_function_manager, increment_function
        )
    ).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": '0',  # data_push_time,
        "code run time": program_run_time,
    }

# ... the rest is unchanged ...
```
The curious part is that *any* amount of CUDA memory initialized before the CUDA function initialization prevents the error; the size of the allocation makes no difference. A plausible explanation (my reading of the error, not something I have confirmed in WarpDrive's code): "invalid resource handle" typically means the kernel handle does not belong to the CUDA context that is current at launch time, and it is the first data push that actually creates and activates the proper context. Once the context exists, the function manager compiles its module into it, so only the existence of the context matters, not the amount of memory allocated.
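A small pure-pycuda sketch of that hypothesis (again my own illustration, untested against WarpDrive itself): with an explicitly created context, the kernel compiles and launches fine, and a 4-byte dummy buffer is as good as a large one.

```python
import numpy as np
import pycuda.driver as drv
from pycuda.compiler import SourceModule

drv.init()
ctx = drv.Device(0).make_context()   # explicit context: the role the tiny
                                     # dummy data push appears to play above
try:
    mod = SourceModule("""
    __global__ void bump(float *x) { x[threadIdx.x] += 1.0f; }
    """)
    bump = mod.get_function("bump")

    buf = drv.mem_alloc(4)                        # 4 bytes: allocation size is irrelevant
    drv.memcpy_htod(buf, np.float32([0.0]))
    bump(buf, block=(1, 1, 1), grid=(1, 1))       # launches fine in this context

    out = np.empty(1, dtype=np.float32)
    drv.memcpy_dtoh(out, buf)
    print(out)                                    # [1.]
finally:
    ctx.pop()                                     # release the context
```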
As in the following code:
```python
# ... identical to the working script above, except for the timing section:
# a throwaway (1, 1) data push initializes CUDA memory first, and only then
# are the CUDA functions initialized:

    # data_push_time = Timer(lambda: push_random_data(num_agents, num_envs)).timeit(number=num_runs)

    push_random_data(1, 1)

    cuda_function_manager, increment_function = cuda_func_init()    ###
    cuda_data_manager = push_random_data(num_agents, num_envs)      ###

    program_run_time = Timer(
        lambda: increment_data(
            cuda_data_manager, cuda_function_manager, increment_function
        )
    ).timeit(number=num_runs)

    print(cuda_data_manager.pull_data_from_device('random_data'))

    return {
        "data push times": '0',  # data_push_time,
        "code run time": program_run_time,
    }

# ... the rest is unchanged ...
```
The key lines:
```python
    push_random_data(1, 1)

    cuda_function_manager, increment_function = cuda_func_init()    ###
    cuda_data_manager = push_random_data(num_agents, num_envs)      ###

    program_run_time = Timer(
        lambda: increment_data(
            cuda_data_manager, cuda_function_manager, increment_function
        )
    ).timeit(number=num_runs)
```
The line below is the CUDA memory allocation. Even this tiny allocation is enough for the subsequent CUDA function initialization to run correctly, whereas with no CUDA memory allocation at all before the function initialization, the error above is raised.

Initializing CUDA memory (the memory allocation step):

```python
push_random_data(1, 1)
```
Initializing the CUDA functions:

```python
cuda_function_manager, increment_function = cuda_func_init()
```
Executing the CUDA function:

```python
increment_data(cuda_data_manager, cuda_function_manager, increment_function)
```
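Finally, to make the correct ordering hard to get wrong, the two initialization steps can be wrapped in one helper. This is only a sketch: it assumes `push_random_data` and `cuda_func_init` have been lifted to module scope (in the scripts above they are nested inside the timing function and close over `num_envs` / `num_agents`).

```python
# A sketch, assuming push_random_data and cuda_func_init are module-level
# helpers that read the same num_envs / num_agents as below.
num_envs, num_agents = 2, 3

def build_cuda_runtime():
    # Memory first: this appears to be what creates/activates the CUDA context.
    cuda_data_manager = push_random_data(num_agents, num_envs)
    # Only then compile and initialize the CUDA functions.
    cuda_function_manager, increment_function = cuda_func_init()
    return cuda_data_manager, cuda_function_manager, increment_function

cuda_data_manager, cuda_function_manager, increment_function = build_cuda_runtime()
increment_data(cuda_data_manager, cuda_function_manager, increment_function)
```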