Multi-node, multi-GPU distributed fine-tuning of ChatGLM2-6B on a container cluster (DeepSpeed + LLaMA-Factory + NCCL)

Environment

Two physical machines (187.135 and 187.136), each with two P4 GPUs. Both hosts have Docker 20.10.0 installed, along with the NVIDIA driver (driver version 470.223.02, CUDA version 11.4).
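
A quick sanity check on each host before building the cluster (standard commands; output omitted here):

# Confirm the driver/CUDA versions and the Docker version on both hosts
nvidia-smi
docker version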

Build the container cluster (Docker Swarm: node 187.136 as the manager, node 187.135 as the worker)

[root@host-136 ~]# docker swarm init --advertise-addr 192.168.187.136
Swarm initialized: current node (deabb74bpqy0pi7rub9xeo9c1) is now a manager.

To add a worker to this swarm, run the following command:

    docker swarm join --token SWMTKN-1-4s0stnrvi3y4554q05e8qqgko15ilmr3rctfsttehggk4zoasc-3sbbaro2y9ktxr0a1e6k5psiz 192.168.187.136:2377

To add a manager to this swarm, run 'docker swarm join-token manager' and follow the instructions.
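
On the worker node (187.135), run the join command printed above so it enters the swarm:

[root@host-135 ~]# docker swarm join --token SWMTKN-1-4s0stnrvi3y4554q05e8qqgko15ilmr3rctfsttehggk4zoasc-3sbbaro2y9ktxr0a1e6k5psiz 192.168.187.136:2377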

[root@host-136 ~]# docker node ls
ID                            HOSTNAME   STATUS    AVAILABILITY   MANAGER STATUS   ENGINE VERSION
ibh3it1mghvs47n9v3no5etmn     host-135   Ready     Active                          20.10.0
deabb74bpqy0pi7rub9xeo9c1 *   host-136   Ready     Active         Leader           20.10.0

Create a shared overlay network

# Create the network
[root@host-136 ~]# docker network create --driver=overlay --attachable test-net
skhbpczg0bbxod5tba7aqm88h
[root@host-136 ~]# docker network ls
NETWORK ID     NAME              DRIVER    SCOPE
b6b330d657ff   bridge            bridge    local
bff04fb77add   docker_gwbridge   bridge    local
76793b0128ee   host              host      local
i0ah3bt50dcz   ingress           overlay   swarm
1bb001aeb475   none              null      local
skhbpczg0bbx   test-net          overlay   swarm

# Sync the network to the worker node
Start a container on the worker that explicitly joins test-net; Docker will then sync the corresponding overlay network to that node automatically.
[root@host-135 ~]# docker run -dit --name nginx-syw --network test-net nginx:1.22
9c2856c60012ea5c88877a027e1e8f4431d334e9eba7508d76195b458e67a331
[root@host-135 ~]# docker network ls
NETWORK ID     NAME              DRIVER    SCOPE
ab0d44c632cb   bridge            bridge    local
fe4401855c6d   docker_gwbridge   bridge    local
344f2764f0c6   host              host      local
i0ah3bt50dcz   ingress           overlay   swarm
73f36a5af721   none              null      local
skhbpczg0bbx   test-net          overlay   swarm

Build the container that runs the model (identical steps on the manager and the worker)

[root@host-136 deepseed]# tree -L 1
.
├── code
├── docker-compose.yml
├── Dockerfile
└── downloadmodel.py


# Clone LLaMA-Factory into the code directory (mounted into the container at /home/user/code)
git clone https://github.com/hiyouga/LLaMA-Factory code/LLaMA-Factory

[root@host-136 deepseed]# cat Dockerfile 
FROM nvidia/cuda:11.7.1-devel-ubuntu20.04

# Install system packages
RUN apt-get update && apt-get install -y git build-essential zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libsqlite3-dev libreadline-dev libffi-dev liblzma-dev libbz2-dev curl wget net-tools iputils-ping pdsh

# Build and install Python 3.10
WORKDIR /home/user

RUN wget https://www.python.org/ftp/python/3.10.6/Python-3.10.6.tgz && \
  tar -zvxf Python-3.10.6.tgz && cd Python-3.10.6 && \
  ./configure --enable-optimizations && make -j 4 && make install

[root@host-136 deepseed]# cat docker-compose.yml 
version: "3"
services:
  llm:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: llm
    tty: true
    restart: always
    ulimits:
      memlock: -1
      stack: 67108864
    shm_size: 40G
    deploy:
      resources:
        reservations:
          devices:
            - capabilities: [gpu]
    volumes:
      - ./code:/home/user/code:cached
    networks:
      - test-net

networks:
  test-net:
    external: true

# Bring up the container
docker-compose up -d

# Check that the container is running
[root@host-136 deepseed]# docker ps |grep llm
847ddde85555   deepseed_llm                                     "/opt/nvidia/nvidia_…"   2 days ago     Up 25 hours             llm

[root@host-135 deepseed]# docker ps |grep llm
a17e384a43cf   deepseed_llm                                          "/opt/nvidia/nvidia_…"   47 hours ago   Up 47 hours             llm
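
It is also worth confirming that each container can see both GPUs. This assumes the NVIDIA Container Toolkit is installed on the hosts, which the compose file's gpu capability reservation relies on:

# Run on each host; both P4 GPUs should be listed
docker exec -it llm nvidia-smi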

Verify that the two containers can reach each other

root@847ddde85555:/home/user/code# ifconfig 
eth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1450
        inet 10.0.1.4  netmask 255.255.255.0  broadcast 10.0.1.255
        ether 02:42:0a:00:01:04  txqueuelen 0  (Ethernet)
        RX packets 818092  bytes 58597646 (58.5 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 202087  bytes 12531129076 (12.5 GB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

eth1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 172.18.0.3  netmask 255.255.0.0  broadcast 172.18.255.255
        ether 02:42:ac:12:00:03  txqueuelen 0  (Ethernet)
        RX packets 275546  bytes 453236309 (453.2 MB)
        RX errors 0  dropped 3038  overruns 0  frame 0
        TX packets 203019  bytes 11385340 (11.3 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 22820  bytes 3897088 (3.8 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 22820  bytes 3897088 (3.8 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

root@847ddde85555:/home/user/code# ping 10.0.1.6
PING 10.0.1.6 (10.0.1.6) 56(84) bytes of data.
64 bytes from 10.0.1.6: icmp_seq=1 ttl=64 time=0.348 ms
64 bytes from 10.0.1.6: icmp_seq=2 ttl=64 time=0.233 ms
64 bytes from 10.0.1.6: icmp_seq=3 ttl=64 time=0.187 ms
64 bytes from 10.0.1.6: icmp_seq=4 ttl=64 time=0.215 ms
64 bytes from 10.0.1.6: icmp_seq=5 ttl=64 time=0.195 ms


root@a17e384a43cf:/home/user# ifconfig 
eth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1450
        inet 10.0.1.6  netmask 255.255.255.0  broadcast 10.0.1.255
        ether 02:42:0a:00:01:06  txqueuelen 0  (Ethernet)
        RX packets 930659  bytes 12603974816 (12.6 GB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 982235  bytes 95824458 (95.8 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

eth1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 172.18.0.4  netmask 255.255.0.0  broadcast 172.18.255.255
        ether 02:42:ac:12:00:04  txqueuelen 0  (Ethernet)
        RX packets 3490223  bytes 5026263215 (5.0 GB)
        RX errors 0  dropped 5669  overruns 0  frame 0
        TX packets 2630239  bytes 149491283 (149.4 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 7063  bytes 2081977 (2.0 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 7063  bytes 2081977 (2.0 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

root@a17e384a43cf:/home/user# ping 10.0.1.4
PING 10.0.1.4 (10.0.1.4) 56(84) bytes of data.
64 bytes from 10.0.1.4: icmp_seq=1 ttl=64 time=0.270 ms
64 bytes from 10.0.1.4: icmp_seq=2 ttl=64 time=0.161 ms
64 bytes from 10.0.1.4: icmp_seq=3 ttl=64 time=0.248 ms

Configure passwordless SSH between the containers

# Install and start the SSH server
apt-get install openssh-server -y

/etc/init.d/ssh start

ssh-keygen -t rsa
Append the manager container's ~/.ssh/id_rsa.pub to ~/.ssh/authorized_keys on both the manager and the worker, then do the same with the worker container's ~/.ssh/id_rsa.pub, so that each container can SSH into the other (and into itself) without a password.
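
The DeepSpeed hostfile used later refers to the containers by the names manager and worker, so those names must resolve inside both containers. A minimal sketch, assuming the overlay IPs shown above (10.0.1.4 = manager container, 10.0.1.6 = worker container):

# Run inside both containers
echo "10.0.1.4 manager" >> /etc/hosts
echo "10.0.1.6 worker" >> /etc/hosts

# Passwordless SSH should now work in both directions
ssh manager hostname
ssh worker hostname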

Configure the NCCL network inside the containers

A few NCCL settings have to be adjusted, otherwise training hangs at startup. First, find out which NIC the container uses, i.e. the interface that carries the node's IP address (eth0 here, bound to the overlay address):

root@847ddde85555:/home/user/code/LLaMA-Factory# ifconfig
eth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1450
        inet 10.0.1.4  netmask 255.255.255.0  broadcast 10.0.1.255
        ether 02:42:0a:00:01:04  txqueuelen 0  (Ethernet)
        RX packets 814794  bytes 57932933 (57.9 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 198802  bytes 12530810523 (12.5 GB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

eth1: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet 172.18.0.3  netmask 255.255.0.0  broadcast 172.18.255.255
        ether 02:42:ac:12:00:03  txqueuelen 0  (Ethernet)
        RX packets 272926  bytes 452619218 (452.6 MB)
        RX errors 0  dropped 540  overruns 0  frame 0
        TX packets 202802  bytes 11371054 (11.3 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

lo: flags=73<UP,LOOPBACK,RUNNING>  mtu 65536
        inet 127.0.0.1  netmask 255.0.0.0
        loop  txqueuelen 1000  (Local Loopback)
        RX packets 11592  bytes 1467359 (1.4 MB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 11592  bytes 1467359 (1.4 MB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0


Then add the following to /etc/profile (and run source /etc/profile, or log in again, for it to take effect):

# Pin NCCL's communication NIC to match your machine
export NCCL_SOCKET_IFNAME=eth0
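
If training still hangs, two more standard NCCL variables can help diagnose or work around transport problems; they are optional additions, not part of the original setup:

# Print NCCL's interface and transport selection at startup
export NCCL_DEBUG=INFO
# Force plain TCP sockets; there is no InfiniBand on this overlay network
export NCCL_IB_DISABLE=1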

Prepare the chatglm2-6b model, the LLaMA-Factory framework, and the runtime dependencies inside the container

root@847ddde85555:/home/user/code# tree -L 1
.
|-- LLaMA-Factory # training framework
|-- chatglm2-6b  # open-source base model
|-- downloadmodel.py
`-- requirements.txt # Python dependencies
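
The contents of downloadmodel.py are not shown here; one common way to fetch the weights is a git-lfs clone from Hugging Face (a sketch, assuming git-lfs is installed in the container):

# Download the chatglm2-6b weights into /home/user/code/chatglm2-6b
cd /home/user/code
git lfs install
git clone https://huggingface.co/THUDM/chatglm2-6b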

root@847ddde85555:/home/user/code# cat requirements.txt 
torch==2.0.1
transformers==4.33.1
datasets==2.14.6
deepspeed==0.12.3
accelerate==0.24.1
peft==0.6.0
trl==0.7.2
gradio==3.38.0
scipy==1.11.3
sentencepiece==0.1.99
protobuf==4.25.0
tiktoken==0.5.1
jieba==0.42.1
rouge-chinese==1.0.3
nltk==3.8.1
uvicorn==0.24.0
pydantic==1.10.11
fastapi==0.95.1
sse-starlette==1.6.5
matplotlib==3.8.1
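
Install the dependencies inside each container; the manager and the worker need identical environments:

cd /home/user/code
pip3 install -r requirements.txt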

DeepSpeed launch files and configuration

root@847ddde85555:/home/user/code/LLaMA-Factory# tree -L 1 
.
|-- LICENSE
|-- Makefile
|-- README.md
|-- README_zh.md
|-- assets
|-- data
|-- ds_config.json
|-- evaluation
|-- examples
|-- hostfile
|-- pyproject.toml
|-- requirements.txt
|-- run.sh
|-- scripts
|-- setup.py
`-- src


root@847ddde85555:/home/user/code/LLaMA-Factory# cat hostfile 
manager slots=2  # number of GPUs on the node
worker slots=2
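
When run.sh is launched on the manager, DeepSpeed SSHes into every host listed here and starts one training process per slot (four in total), which is why the passwordless SSH and name resolution set up earlier are required.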

root@847ddde85555:/home/user/code/LLaMA-Factory# cat ds_config.json 
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "initial_scale_power": 16,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "zero_optimization": {
    "stage": 3,
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "reduce_scatter": true,
    "reduce_bucket_size": 5e8,
    "overlap_comm": false,
    "contiguous_gradients": true
  },
  "offload_param": {
     "device": "cpu",
     "pin_memory": true,
     "max_in_cpu": 1e9
  }
}
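
The "auto" values are not resolved by DeepSpeed itself; the Hugging Face Trainer integration fills them in at launch time from the matching command-line arguments in run.sh below (per-device batch size, gradient accumulation steps, fp16, and so on). Note that offload_param belongs inside the zero_optimization block for ZeRO-3.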

# Launch script for distributed training
root@847ddde85555:/home/user/code/LLaMA-Factory# cat run.sh 
deepspeed --hostfile hostfile src/train_bash.py \
    --deepspeed ds_config.json \
    --stage sft \
    --model_name_or_path /home/user/code/chatglm2-6b \
    --do_train \
    --dataset alpaca_gpt4_zh \
    --template chatglm2 \
    --finetuning_type lora \
    --lora_target query_key_value \
    --output_dir chatglm2_sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 10000 \
    --learning_rate 5e-5 \
    --num_train_epochs 0.25 \
    --plot_loss True \
    --fp16 True \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0.05
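
Run the script from the manager container inside /home/user/code/LLaMA-Factory (bash run.sh). DeepSpeed reads the hostfile, launches two processes on each node, and writes the LoRA adapter and training logs to the chatglm2_sft output directory. The original script also passed a second --lora_target q_proj,v_proj at the end; ChatGLM2 uses a fused query_key_value projection and has no q_proj/v_proj modules, so only the query_key_value target is kept here.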

Results

(screenshot of the training run omitted)

posted @ 2024-03-14 15:56  威威后花园