利用pynvml实现获取gpu显卡相关信息
介绍
pynvml提供了gpu管理和状态监控的python接口,对NVML库的进一步封装实现,关于NVML的介绍可以参考:http://developer.nvidia.com/nvidia-management-library-nvml
安装
pip方式安装:
pip install nvidia-ml-py
根据python版本指定2/3:
pip install nvidia-ml-py2 # python2
pip install nvidia-ml-py3 # python3
源码安装:
#下载链接:http://pypi.python.org/pypi/nvidia-ml-py/
sudo python setup.py install
使用
-
初始化
>>> import pynvml >>> pynvml.nvmlInit() # 初始化 >>> ... # 函数调用 >>> pynvml.nvmlShutdown() # 最后要关闭管理工具
-
获取gpu个数
>>> gpucount = pynvml.nvmlDeviceGetCount() >>> gpucount 4 >>>
-
获取gpu句柄id
handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) # 根据GPU id获取显卡句柄,id为0,1,2,3...
-
获取驱动版本号
>>> # get gpu driver version >>> version = pynvml.nvmlSystemGetDriverVersion() >>> values.append("GPU_device_driver_version:" + version.decode()) >>> version b'440.64.00'
-
获取显卡名称、型号
>>> pynvml.nvmlDeviceGetName(handle) b'Tesla V100-SXM2-32GB' >>>
-
获取gpu显存信息
>>> meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) >>> meminfo.used 65536 # 显存使用大小(字节, byte) >>> meminfo.total 34089730048 # 显卡总的显存大小(字节, byte) >>> meminfo.free 34089664512 # 显卡剩余显存大小(字节, byte)
-
获取gpu温度、风扇、电源
>>> print("Temperature is %d C" % nvmlDeviceGetTemperature(handle,0)) >>> print("Fan speed is", nvmlDeviceGetFanSpeed(handle)) # 有的环境可能无法获取 >>> print("Power status", nvmlDeviceGetPowerState(handle)) >>> Temperature is 34 C >>> Fan speed is 0 >>> Power status 8
-
获取gpu利用率
>>> utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) >>> print(utilization.gpu) # gpu利用率 >>> values.append("GPU " + gpu_id + " " + name + " GPU-Util:" + str(utilization.gpu))
-
获取电源功耗和总功率
>>> powerusage = pynvml.nvmlDeviceGetPowerUsage(handle) >>> print(powerusage / 1000) # 当前功耗, 原始单位是mW(毫瓦) >>> values.append("GPU " + gpu_id + " " + name + " Pwr_Usage:" + str(powerusage / 1000)) # 当前gpu power capacity # pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
使用案例
如下代码获取服务器节点上gpu相关的所有信息:
def get_sensor_values():
    """
    Collect GPU sensor readings via pynvml (NVML).

    Builds a list of human-readable strings: the driver version first, then
    for every GPU found: memory total / used / free (via common.bytes2human),
    free rate in percent, GPU utilization in percent, and power usage in watts.

    :return: list of str, e.g.
        ['GPU_device_driver_version:440.64.00',
         'GPU 0 Tesla V100-SXM2-32GB mem total:31.7G', ...]
    """
    values = list()
    pynvml.nvmlInit()
    try:
        # Depending on the nvidia-ml-py release, NVML string queries return
        # either bytes or str; normalize to str so .decode() never crashes.
        version = pynvml.nvmlSystemGetDriverVersion()
        if isinstance(version, bytes):
            version = version.decode()
        values.append("GPU_device_driver_version:" + version)
        gpucount = pynvml.nvmlDeviceGetCount()  # number of GPUs on this node
        for gpu_id in range(gpucount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode()
            # NVML reports memory sizes in bytes.
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            gpu_id = str(gpu_id)
            prefix = "GPU " + gpu_id + " " + name + " "
            values.append(prefix + "mem total:" + str(common.bytes2human(meminfo.total)))
            values.append(prefix + "mem used:" + str(common.bytes2human(meminfo.used)))
            values.append(prefix + "mem free:" + str(common.bytes2human(meminfo.free)))
            values.append(prefix + "mem free_rate:" + str(int((meminfo.free / meminfo.total) * 100)))
            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
            values.append(prefix + "GPU-Util:" + str(utilization.gpu))
            # nvmlDeviceGetPowerUsage returns milliwatts; convert to watts.
            powerusage = pynvml.nvmlDeviceGetPowerUsage(handle)
            values.append(prefix + "Pwr_Usage:" + str(powerusage / 1000))
            # GPU power capacity could be read via:
            # pynvml.nvmlDeviceGetEnforcedPowerLimit(handle)
            # Temperature is available via nvmlDeviceGetTemperature(handle, 0);
            # it is currently collected through ipmi sdr elsewhere, so skipped here.
    finally:
        # Always release NVML, even if a query above raises (the init/shutdown
        # pairing is required by the NVML API).
        pynvml.nvmlShutdown()
    return values
values打印结果:
[
'GPU_device_driver_version:440.64.00',
'GPU 0 Tesla V100-SXM2-32GB mem total:31.7G',
'GPU 0 Tesla V100-SXM2-32GB mem used:64.0K',
'GPU 0 Tesla V100-SXM2-32GB mem free:31.7G',
'GPU 0 Tesla V100-SXM2-32GB mem free_rate:99',
'GPU 0 Tesla V100-SXM2-32GB GPU-Util:0',
'GPU 0 Tesla V100-SXM2-32GB Pwr_Usage:42.55',
'GPU 1 Tesla V100-SXM2-32GB mem total:31.7G',
'GPU 1 Tesla V100-SXM2-32GB mem used:64.0K',
'GPU 1 Tesla V100-SXM2-32GB mem free:31.7G',
'GPU 1 Tesla V100-SXM2-32GB mem free_rate:99',
'GPU 1 Tesla V100-SXM2-32GB GPU-Util:0',
'GPU 1 Tesla V100-SXM2-32GB Pwr_Usage:46.97',
'GPU 2 Tesla V100-SXM2-32GB mem total:31.7G',
'GPU 2 Tesla V100-SXM2-32GB mem used:64.0K',
'GPU 2 Tesla V100-SXM2-32GB mem free:31.7G',
'GPU 2 Tesla V100-SXM2-32GB mem free_rate:99',
'GPU 2 Tesla V100-SXM2-32GB GPU-Util:0',
'GPU 2 Tesla V100-SXM2-32GB Pwr_Usage:46.452',
'GPU 3 Tesla V100-SXM2-32GB mem total:31.7G',
'GPU 3 Tesla V100-SXM2-32GB mem used:64.0K',
'GPU 3 Tesla V100-SXM2-32GB mem free:31.7G',
'GPU 3 Tesla V100-SXM2-32GB mem free_rate:99',
'GPU 3 Tesla V100-SXM2-32GB GPU-Util:0',
'GPU 3 Tesla V100-SXM2-32GB Pwr_Usage:43.066'
]
专注搬砖,擅长搬砖砸自己的脚~~~
Email:
ltwbuaa@163.com
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧
· 【自荐】一款简洁、开源的在线白板工具 Drawnix
· 园子的第一款AI主题卫衣上架——"HELLO! HOW CAN I ASSIST YOU TODAY
· Docker 太简单,K8s 太复杂?w7panel 让容器管理更轻松!