Speeding up PyTorch training (single machine, multi-GPU)

Method 1: nn.DataParallel

# main.py
import torch
import torch.nn as nn
import torch.optim as optim

gpus = [0, 1, 2, 3]                                   # which GPUs to use
device = torch.device('cuda:{}'.format(gpus[0]))
torch.cuda.set_device(device)

train_dataset = ...
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=...)

model = ...
# output_device is the card that gathers the outputs and reduces the gradients;
# it generally needs a bit more memory than the others.
model = nn.DataParallel(model.to(device), device_ids=gpus, output_device=gpus[0])
optimizer = optim.SGD(model.parameters())

for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)       # move the data to the GPU
        target = target.cuda(non_blocking=True)       # move the labels to the GPU
        ...
        output = model(images)
        loss = criterion(output, target)
        ...
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# To train, simply run: python main.py
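For reference, a minimal runnable version of the template above. The toy dataset and model (random tensors fed to a single nn.Linear classifier) and every hyperparameter value here are illustrative assumptions, not part of the original post; adjust gpus to match the cards actually present on the machine.

# dp_example.py -- minimal sketch, runnable with `python dp_example.py`
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

gpus = [0, 1, 2, 3]                                   # must match the GPUs on this machine
device = torch.device('cuda:{}'.format(gpus[0]))

# Hypothetical toy data: 1024 samples, 32 features, 10 classes.
train_dataset = TensorDataset(torch.randn(1024, 32), torch.randint(0, 10, (1024,)))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

model = nn.Linear(32, 10)
model = nn.DataParallel(model.to(device), device_ids=gpus, output_device=gpus[0])

optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

for epoch in range(5):
    for images, target in train_loader:
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        output = model(images)                        # DataParallel scatters the batch across the GPUs
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()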

 

Method 2: accelerate with torch.distributed (DistributedDataParallel)
# main.py
import torch
import argparse
import torch.optim as optim
import torch.distributed as dist

# Get the local rank (GPU index) of the current process; the launcher passes it in.
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int,
                    help='node rank for distributed training')
args = parser.parse_args()

# Set up the communication backend between the GPUs and bind this process to its GPU.
dist.init_process_group(backend='nccl')
torch.cuda.set_device(args.local_rank)

# Use DistributedSampler to partition the dataset across the processes.
train_dataset = ...
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

# Wrap the model with DistributedDataParallel, which all-reduces the gradients
# computed on the different GPUs and keeps the replicas in sync.
model = ...
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank])
optimizer = optim.SGD(model.parameters())

for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        ...
        output = model(images)
        loss = criterion(output, target)
        ...
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Launch: CUDA_VISIBLE_DEVICES selects which GPUs are used; --nproc_per_node is the
# number of processes to start (one process per GPU).
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py
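A side note not in the original post: on newer PyTorch releases (1.10+), torch.distributed.launch is deprecated in favor of torchrun, which passes the local rank through the LOCAL_RANK environment variable instead of a --local_rank argument. A minimal sketch of the adjusted setup, assuming the rest of the script stays the same as above:

# Sketch only: replaces the argparse/--local_rank setup when launching with torchrun.
import os
import torch
import torch.distributed as dist

local_rank = int(os.environ['LOCAL_RANK'])   # set by torchrun for each spawned process
dist.init_process_group(backend='nccl')
torch.cuda.set_device(local_rank)

# Launch: CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 main.py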
# Using torch.multiprocessing instead of the launcher

# main.py
import torch
import argparse
import torch.optim as optim
import torch.distributed as dist
import torch.multiprocessing as mp


def main_worker(local_rank, nprocs, args):
    dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:23456',
                            world_size=args.nprocs, rank=local_rank)
    torch.cuda.set_device(local_rank)

    train_dataset = ...
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

    model = ...
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    optimizer = optim.SGD(model.parameters())

    for epoch in range(100):
        for batch_idx, (images, target) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            ...
            output = model(images)
            loss = criterion(output, target)
            ...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.nprocs = torch.cuda.device_count()
    # Spawn one process per GPU; each worker receives its local_rank as the first argument.
    mp.spawn(main_worker, nprocs=args.nprocs, args=(args.nprocs, args))

# Launch: python main.py
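One detail both DistributedSampler-based templates above leave out (it is not in the original post): the sampler only reshuffles the data differently from epoch to epoch if its epoch counter is updated. A small supplement to the training loop, assuming the train_sampler and train_loader defined above:

for epoch in range(100):
    # Without this call every epoch sees the same shuffling order in each process.
    train_sampler.set_epoch(epoch)
    for batch_idx, (images, target) in enumerate(train_loader):
        ...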

 

Method 3: accelerate with Apex (mixed-precision training)

# main.py
import torch
import argparse
import torch.optim as optim
import torch.distributed as dist

from apex import amp
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', default=-1, type=int,
                    help='node rank for distributed training')
args = parser.parse_args()

dist.init_process_group(backend='nccl')
torch.cuda.set_device(args.local_rank)

train_dataset = ...
train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=..., sampler=train_sampler)

model = ...          # the model should already be on the GPU (e.g. model.cuda()) before amp.initialize
optimizer = optim.SGD(model.parameters())

# The optimizer must exist before amp.initialize wraps the model and optimizer for mixed precision.
model, optimizer = amp.initialize(model, optimizer)
# apex's DistributedDataParallel picks up the device set by torch.cuda.set_device,
# so it does not take a device_ids argument.
model = DistributedDataParallel(model)

for epoch in range(100):
   for batch_idx, (images, target) in enumerate(train_loader):
      images = images.cuda(non_blocking=True)
      target = target.cuda(non_blocking=True)
      ...
      output = model(images)
      loss = criterion(output, target)
      optimizer.zero_grad()
      # Scale the loss so fp16 gradients do not underflow, then backpropagate.
      with amp.scale_loss(loss, optimizer) as scaled_loss:
         scaled_loss.backward()
      optimizer.step()

# Launch: CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 main.py
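As a follow-up: amp.initialize also accepts an opt_level argument ('O0' through 'O3') that selects the mixed-precision mode, e.g. amp.initialize(model, optimizer, opt_level='O1'). Apex's amp functionality has since been folded into PyTorch itself; below is a hedged sketch of the same mixed-precision loop using the built-in torch.cuda.amp (available since PyTorch 1.6) as a drop-in for the training loop in the apex template above. This is an alternative technique, not what the original apex example uses.

# Alternative sketch (not in the original post): native torch.cuda.amp in place of apex.
# Assumes the model, optimizer, criterion and train_loader defined in the template above.
scaler = torch.cuda.amp.GradScaler()

for epoch in range(100):
    for batch_idx, (images, target) in enumerate(train_loader):
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():        # run the forward pass in mixed precision
            output = model(images)
            loss = criterion(output, target)
        scaler.scale(loss).backward()          # scale the loss to avoid fp16 underflow
        scaler.step(optimizer)
        scaler.update()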

  
