欧拉系统安装GPU驱动

安装NVIDIA Driver

进入英伟达官网下载页面

image-20241125154226389

按照以上方式选择即可得到 535.113.01 及以上版本的驱动,可以实现多卡推理;小于这个版本会导致多卡训练以及推理报错
虽然最新版本为550.54.15,但是535版本更加稳定,并且pytorch目前只支持到12.1,而在CUDA Toolkit选择栏中没有这个版本,所以选择12.2最稳妥

# 所有535版本
https://www.nvidia.com/en-us/drivers/results/

# 535.183.06
https://www.nvidia.com/en-us/drivers/details/229295/

image-20241125154605442

下载到服务器,在安装驱动前先安装一些依赖

# Download the NVIDIA 535.183.06 driver runfile
wget https://cn.download.nvidia.com/tesla/535.183.06/NVIDIA-Linux-x86_64-535.183.06.run

# Install build dependencies required by the driver installer
sudo yum install epel-release
sudo yum update
sudo yum install pkgconfig libglvnd-devel

# Toolchain plus kernel headers/devel — the $(uname -r) suffix ensures the
# headers match the RUNNING kernel, otherwise the kernel module build fails
yum -y install gcc gcc-c++ make
yum -y install kernel-headers-$(uname -r)
yum -y install kernel-devel-$(uname -r)

检查 nouveau driver 有没有被加载(无输出,表明nouveau驱动已禁用)

# No output means nouveau is not loaded (i.e. already disabled)
lsmod |grep nouveau

在 /usr/lib/modprobe.d/dist-blacklist.conf 中添加两行内容:
blacklist nouveau
options nouveau modeset=0

# Back up the current initramfs image before rebuilding it
mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak

# Build a new initramfs so the nouveau blacklist takes effect at boot
dracut /boot/initramfs-$(uname -r).img $(uname -r)

# Reboot the OS for the change to apply
reboot

# After reboot, confirm nouveau is no longer loaded (expect no output)
lsmod | grep nouveau

参考文档: GPU云主机安装GPU驱动(Linux)

# Make the installer executable, then run it silently (-s) and skip the
# X server check (--no-x-check) since this is a headless server
chmod +x NVIDIA-Linux-x86_64-535.183.06.run
sh ./NVIDIA-Linux-x86_64-535.183.06.run -s --no-x-check

# Enable GPU persistence mode (keeps the driver initialized between jobs),
# then verify the installation with nvidia-smi
sudo nvidia-persistenced --persistence-mode
nvidia-smi

image-20241125113454456

安装CUDA

进入CUDA官网下载页面 在这里插入图片描述 在服务器中输入uname -a查看服务器系统

然后选择对应的版本

image-20241125114043778

执行wget下载到GPU服务器,并安装

# Download the CUDA 12.2.2 runfile (bundles driver 535.104.05) and launch the
# interactive installer — deselect "Driver" in the menu, since driver
# 535.183.06 was already installed above
wget https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run
sudo sh cuda_12.2.2_535.104.05_linux.run

运行后在出现的页面中执行以下操作

1.输入accept
2. - [×] Driver 取消×

按回车取消选择,然后按↓移动到 Install,再按回车开始安装

image-20241125114606070

安装完成后显示

===========
= Summary =
===========

Driver:   Not Selected
Toolkit:  Installed in /usr/local/cuda-12.2/

Please make sure that
 -   PATH includes /usr/local/cuda-12.2/bin
 -   LD_LIBRARY_PATH includes /usr/local/cuda-12.2/lib64, or, add /usr/local/cuda-12.2/lib64 to /etc/ld.so.conf and run ldconfig as root

To uninstall the CUDA Toolkit, run cuda-uninstaller in /usr/local/cuda-12.2/bin
***WARNING: Incomplete installation! This installation did not install the CUDA Driver. A driver of version at least 535.00 is required for CUDA 12.2 functionality to work.
To install the driver using this installer, run the following command, replacing <CudaInstaller> with the name of this run file:
    sudo <CudaInstaller>.run --silent --driver

Logfile is /var/log/cuda-installer.log

根据上面的提示信息设置路径

# Add the CUDA 12.2 toolchain to the current shell environment.
# The ${VAR:+:${VAR}} form appends the old value only when it is non-empty,
# avoiding a dangling ":" in PATH/LD_LIBRARY_PATH.
# NOTE(review): these exports are session-only — append them to ~/.bashrc or
# /etc/profile.d/cuda.sh to make them persistent across logins.
export PATH=/usr/local/cuda-12.2/bin${PATH:+:${PATH}}
export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

输入命令nvcc -V进行检验

[root@a36-pgxt-dg-171 ~]#  nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0

安装nvidia-container-toolkit

离线安装docker

cd /nfs_data/02-isntall-k8s/01-kubeadm-v1.28.13/01-docker-26.1.4
[root@gpu-3-3 /nfs_data/02-isntall-k8s/01-kubeadm-v1.28.13/01-docker-26.1.4]#  ll
total 116940
-rw-r--r-- 1 root root 37045876 Sep  1 00:10 containerd.io-1.6.33-3.1.el7.x86_64.rpm
-rw-r--r-- 1 root root    37941 Sep  1 00:10 container-selinux-2.138-4.oe2203sp1.noarch.rpm
-rw-r--r-- 1 root root 14280912 Sep  1 00:10 docker-buildx-plugin-0.14.1-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 28596976 Sep  1 00:10 docker-ce-26.1.4-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 15445372 Sep  1 00:10 docker-ce-cli-26.1.4-1.el7.x86_64.rpm
-rw-r--r-- 1 root root  9840840 Sep  1 00:10 docker-ce-rootless-extras-26.1.4-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 14058416 Sep  1 00:10 docker-compose-plugin-2.27.1-1.el7.x86_64.rpm
-rw-r--r-- 1 root root   117849 Sep  1 00:10 fuse3-3.10.5-5.oe2203sp1.x86_64.rpm
-rw-r--r-- 1 root root    13969 Sep  1 00:10 fuse3-help-3.10.5-5.oe2203sp1.x86_64.rpm
-rw-r--r-- 1 root root    63985 Sep  1 00:10 fuse-overlayfs-1.9-1.oe2203sp1.x86_64.rpm
-rw-r--r-- 1 root root    98801 Sep  1 00:10 libcgroup-0.42.2-3.oe2203sp1.x86_64.rpm
-rw-r--r-- 1 root root    72625 Sep  1 00:10 libslirp-4.7.0-2.oe2203sp1.x86_64.rpm
-rw-r--r-- 1 root root    48429 Sep  1 00:10 slirp4netns-1.2.0-1.oe2203sp1.x86_64.rpm
[root@gpu-3-3 /nfs_data/02-isntall-k8s/01-kubeadm-v1.28.13/01-docker-26.1.4]#  yum localinstall *.rpm -y

参考链接: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html

安装nvidia-container-toolkit

# GitHub 上的 rpm 包
https://github.com/NVIDIA/libnvidia-container/tree/gh-pages/stable/centos8/x86_64

# 需要安装下面这些包,请到 GitHub 下载对应版本
[root@b93-hhht-pgxt-115 ~]# rpm -qa|grep nvidia
nvidia-container-toolkit-1.13.5-1.x86_64
nvidia-container-toolkit-base-1.13.5-1.x86_64
libnvidia-container1-1.13.5-1.x86_64
libnvidia-container-tools-1.13.5-1.x86_64
nvidia-docker2-2.13.0-1.noarch


cd /nfs_data/alg/01-GPU-NVIDIA-linux/nvidia-container-toolkit
# Manually install the RPM packages. If installing the .rpm files by hand,
# make sure the versions match, then run the commands below.
# Order matters: libraries first, then tools, then the toolkit, and
# nvidia-docker2 last, because each depends on the previous ones.
sudo rpm -ivh libnvidia-container1-1.13.5-1.x86_64.rpm
sudo rpm -ivh libnvidia-container-tools-1.13.5-1.x86_64.rpm
sudo rpm -ivh nvidia-container-toolkit-base-1.13.5-1.x86_64.rpm
sudo rpm -ivh nvidia-container-toolkit-1.13.5-1.x86_64.rpm
sudo rpm -ivh nvidia-docker2-2.13.0-1.noarch.rpm

# 验证nvidia
[root@gpu-3-3 /data_ssd1]#  nvidia-container-cli --version
cli-version: 1.13.5
lib-version: 1.13.5
build date: 2023-07-18T11:38+00:00
build revision: 66607bd046341f7aad7de80a9f022f122d1f2fce
build compiler: gcc 8.5.0 20210514 (Red Hat 8.5.0-20)
build platform: x86_64
build flags: -D_GNU_SOURCE -D_FORTIFY_SOURCE=2 -DNDEBUG -std=gnu11 -O2 -g -fdata-sections -ffunction-sections -fplan9-extensions -fstack-protector -fno-strict-aliasing -fvisibility=hidden -Wall -Wextra -Wcast-align -Wpointer-arith -Wmissing-prototypes -Wnonnull -Wwrite-strings -Wlogical-op -Wformat=2 -Wmissing-format-attribute -Winit-self -Wshadow -Wstrict-prototypes -Wunreachable-code -Wconversion -Wsign-conversion -Wno-unknown-warning-option -Wno-format-extra-args -Wno-gnu-alignof-expression -Wl,-zrelro -Wl,-znow -Wl,-zdefs -Wl,--gc-sections
[root@gpu-3-3 /data_ssd1]#  nvidia-docker --version
Docker version 26.1.4, build 5650f9b
[root@gpu-3-3 /data_ssd1]#  docker info | grep nvidia
 Runtimes: nvidia runc io.containerd.runc.v2
[root@gpu-3-3 /data_ssd1]#  


# Configure the Docker daemon with the nvidia runtime.
# The heredoc delimiter is quoted ('EOF') so nothing inside the JSON is
# shell-expanded; mkdir -p guards against a missing /etc/docker directory
# on a fresh install.
mkdir -p /etc/docker
cat <<'EOF' > /etc/docker/daemon.json
{
    "registry-mirrors": [
        "https://kfwkfulq.mirror.aliyuncs.com",
        "https://2lqq34jg.mirror.aliyuncs.com",
        "https://pee6w651.mirror.aliyuncs.com",
        "https://registry.docker-cn.com",
        "http://hub-mirror.c.163.com"
    ],
    "exec-opts": ["native.cgroupdriver=cgroupfs"],
    "data-root": "/data_ssd1/docker_data",
    "log-driver": "json-file",
    "log-opts": {"max-size":"500m", "max-file":"3"},
    "insecure-registries": ["10.19.29.2:7890"],

    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
EOF

# Enable Docker at boot and start it now
systemctl enable --now docker

# Smoke-test GPU access from a container: nvidia-smi should list the GPUs.
# NOTE(review): the old "cuda:10.2-base" tag has been removed from public
# registries and does not match the CUDA 12.2 toolkit installed above —
# use a 12.2.x base image instead.
docker run --rm --gpus all nvcr.io/nvidia/cuda:12.2.2-base-ubuntu22.04 nvidia-smi

image-20241125153256332

posted @ 2024-12-26 14:34  broadviews  阅读(9)  评论(0编辑  收藏  举报