6-3使用GPU训练模型
深度学习的训练过程常常非常耗时,一个模型训练几个小时是家常便饭,训练几天也是常有的事情,有时候甚至要训练几十天。
训练过程的耗时主要来自于两个部分,一部分来自数据准备,另一部分来自参数迭代。
当数据准备过程还是模型训练时间的主要瓶颈时,我们可以使用更多进程来准备数据。
当参数迭代过程成为训练时间的主要瓶颈时,我们通常的方法是应用GPU来进行加速。
import torch
import torchkeras
import torchmetrics
# Report the installed version of every library this tutorial relies on.
for _lib in (torch, torchkeras, torchmetrics):
    print(f"{_lib.__name__}.__version__ = ", _lib.__version__)
"""
torch.__version__ = 2.3.1+cu121
torchkeras.__version__ = 3.9.6
torchmetrics.__version__ = 1.4.1
"""
Pytorch中使用GPU加速模型非常简单,只要将模型和数据移动到GPU上。核心代码只有以下几行。
# 定义模型
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device) # 移动模型到cuda

# 训练模型
features = features.to(device) # 移动数据到cuda
labels = labels.to(device) # 或者 labels = labels.cuda() if torch.cuda.is_available() else labels
如果要使用多个GPU训练模型,也非常简单。只需要将模型设置为数据并行风格模型。
则模型移动到GPU上之后,会在每一个GPU上拷贝一个副本,并把数据平分到各个GPU上进行训练。核心代码如下。

# 定义模型
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model) # 包装为并行风格模型

# 训练模型
features = features.to(device) # 移动数据到cuda
labels = labels.to(device) # 或者 labels = labels.cuda() if torch.cuda.is_available() else labels
1.GPU相关操作汇总
import torch
from torch import nn
# Query CUDA availability and the number of visible GPUs.
if_cuda, gpu_count = torch.cuda.is_available(), torch.cuda.device_count()
print("if_cuda=", if_cuda)
print("gpu_count=", gpu_count)
"""
if_cuda= True
gpu_count= 1
"""
# Move a tensor between the CPU and the GPU.
# Use the first GPU when present and fall back to the CPU otherwise, so the
# snippet also runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tensor = torch.rand((100, 100))
tensor_gpu = tensor.to(device)  # on a CUDA machine this is the same as tensor.cuda()
print(tensor_gpu.device)
print(tensor_gpu.is_cuda)
tensor_cpu = tensor_gpu.to("cpu")  # or: tensor_cpu = tensor_gpu.cpu()
print(tensor_cpu.device)
"""
cuda:0
True
cpu
"""
# Move every parameter tensor of a module onto the target device.
# Fall back to the CPU when CUDA is unavailable so the snippet still runs.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = nn.Linear(2, 1)
print(next(net.parameters()).is_cuda)
# Module.to() moves the parameters in place, so re-assigning
# (net = net.to(device)) is not required.
net.to(device)
print(next(net.parameters()).is_cuda)
print(next(net.parameters()).device)
"""
False
True
cuda:0
"""
# Build a model that supports data parallelism across multiple GPUs.
linear = nn.Linear(2, 1)
print(next(linear.parameters()).device)
model = nn.DataParallel(linear)
print(model.device_ids)
print(next(model.module.parameters()).device)
# When saving, store the parameters of the wrapped module (model.module)
# so the checkpoint can be loaded into a plain, un-wrapped module.
torch.save(model.module.state_dict(), "model_parameter.pt")
linear = nn.Linear(2, 1)
# map_location="cpu": the checkpoint may hold CUDA tensors while the fresh
# module lives on the CPU (and the loading machine may have no GPU at all).
# weights_only=True: a state_dict contains only tensors, so restrict
# unpickling to tensor data instead of arbitrary Python objects.
linear.load_state_dict(
    torch.load("model_parameter.pt", map_location="cpu", weights_only=True)
)
"""
cpu
[0]
cuda:0
<All keys matched successfully>
"""
2.矩阵乘法范例
下面分别使用CPU和GPU作一个矩阵乘法,并比较其计算效率。
import time
import torch
from torch import nn
# CPU baseline: time a (10000 x 200) @ (200 x 10000) matrix product.
a, b = torch.rand((10000, 200)), torch.rand((200, 10000))
started = time.time()
c = a @ b
elapsed = time.time() - started
print(elapsed)
for operand in (a, b):
    print(operand.device)
"""
0.26279258728027344
cpu
cpu
"""
# GPU version of the same matrix product.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
a = torch.rand((10000, 200), device=device)  # a tensor can be created directly on the device
b = torch.rand((200, 10000))
b = b.to(device)  # or: b = b.cuda() if torch.cuda.is_available() else b
if device.type == "cuda":
    # CUDA kernels are launched asynchronously and the first call pays
    # one-time initialisation costs, so run one untimed warm-up product and
    # drain the queue before starting the clock.
    torch.matmul(a, b)
    torch.cuda.synchronize(device)
tic = time.time()
c = torch.matmul(a, b)
if device.type == "cuda":
    # Wait for the kernel to actually finish; without this the timer would
    # only measure how long it took to enqueue the work.
    torch.cuda.synchronize(device)
toc = time.time()
print(toc - tic)
print(a.device)
print(b.device)
# NOTE: the output below was recorded with the original, un-synchronised
# timing (no warm-up); expect a much smaller elapsed time from this version.
"""
0.5037369728088379
cuda:0
cuda:0
"""
3.线性回归范例
下面对比使用CPU和GPU训练一个线性回归模型的效率
# Synthetic regression data: Y = X @ w0^T + b0 + Gaussian noise.
n = 10_000_000  # number of samples
w0 = torch.tensor([[2.0, -3.0]])
b0 = torch.tensor([[10.0]])
X = torch.rand([n, 2]) * 10 - 5.0
noise = torch.normal(0.0, 2.0, size=[n, 1])
Y = torch.matmul(X, w0.t()) + b0 + noise
# 定义模型
class LinearRegression(nn.Module):
    """Linear model ``y = x @ w.T + b`` with learnable ``w`` and ``b``.

    The parameter shapes are copied from the module-level tensors ``w0`` and
    ``b0``, which must be defined before the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        # Weight starts from a standard-normal draw, bias from zeros; both
        # take their shape (and dtype/device) from the reference tensors.
        self.w = nn.Parameter(torch.randn_like(w0))
        self.b = nn.Parameter(torch.zeros_like(b0))

    def forward(self, x):
        """Return the prediction ``x @ w.T + b`` for the input batch ``x``."""
        return x @ self.w.t() + self.b
linear = LinearRegression()

# Train the model
optimizer = torch.optim.Adam(linear.parameters(), lr=0.1)
loss_fn = nn.MSELoss()


def train(epoches):
    """Run ``epoches`` full-batch optimisation steps and print the time used.

    Operates on the module-level ``linear``, ``optimizer``, ``loss_fn``,
    ``X`` and ``Y``. The loss is printed every 50 epochs, and the elapsed
    wall-clock time once the loop has finished.
    """
    tic = time.time()
    for epoch in range(epoches):
        optimizer.zero_grad()
        Y_pred = linear(X)
        loss = loss_fn(Y_pred, Y)
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print({"epoch": epoch, "loss": loss.item()})
    toc = time.time()
    print("time used:", toc - tic)


train(500)
"""
{'epoch': 0, 'loss': 211.0252227783203}
{'epoch': 50, 'loss': 33.406837463378906}
{'epoch': 100, 'loss': 9.043604850769043}
{'epoch': 150, 'loss': 4.492393970489502}
{'epoch': 200, 'loss': 4.024799346923828}
{'epoch': 250, 'loss': 4.001006603240967}
{'epoch': 300, 'loss': 4.000483512878418}
{'epoch': 350, 'loss': 4.0004801750183105}
{'epoch': 400, 'loss': 4.0004801750183105}
{'epoch': 450, 'loss': 4.0004801750183105}
time used: 48.405426263809204
"""
# GPU run
# Prepare the data. The sample count matches the CPU run (10,000,000) so
# that the two "time used" figures measure the same amount of work.
n = 10_000_000  # number of samples
X = 10 * torch.rand([n, 2]) - 5.0  # torch.rand samples from a uniform distribution
w0 = torch.tensor([[2.0, -3.0]])
b0 = torch.tensor([[10.0]])
# @ is matrix multiplication; the torch.normal term adds Gaussian noise.
Y = X @ w0.t() + b0 + torch.normal(0.0, 2.0, size=[n, 1])
# Move the data to the GPU, falling back to the CPU when CUDA is unavailable
# (the same device-selection idiom the model uses).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("torch.cuda.is_available() = ", torch.cuda.is_available())
X = X.to(device)
Y = Y.to(device)
print("X.device:", X.device)
print("Y.device:", Y.device)
# 定义模型
class LinearRegression(nn.Module):
    """Linear model ``y = x @ w.T + b`` with learnable ``w`` and ``b``.

    The parameter shapes are copied from the module-level tensors ``w0`` and
    ``b0``, which must be defined before the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        # Weight starts from a standard-normal draw, bias from zeros; both
        # take their shape (and dtype/device) from the reference tensors.
        self.w = nn.Parameter(torch.randn_like(w0))
        self.b = nn.Parameter(torch.zeros_like(b0))

    # Forward pass
    def forward(self, x):
        """Return the prediction ``x @ w.T + b`` for the input batch ``x``."""
        return x @ self.w.t() + self.b
# Pick the first GPU when one is present, otherwise stay on the CPU, and
# place a freshly built model there.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
linear = LinearRegression().to(device)
# Confirm where the parameters ended up.
first_param = next(linear.parameters())
print("if on cuda:", first_param.is_cuda)
# Train the model
optimizer = torch.optim.Adam(linear.parameters(), lr=0.1)
loss_fn = nn.MSELoss()


def train(epoches):
    """Run ``epoches`` full-batch optimisation steps and print the time used.

    Operates on the module-level ``linear``, ``optimizer``, ``loss_fn``,
    ``X`` and ``Y``. The loss is printed every 50 epochs, and the elapsed
    wall-clock time once all queued work has finished.
    """
    tic = time.time()
    for epoch in range(epoches):
        optimizer.zero_grad()
        Y_pred = linear(X)
        loss = loss_fn(Y_pred, Y)
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print({"epoch": epoch, "loss": loss.item()})
    if X.is_cuda:
        # CUDA work is queued asynchronously; the steps issued after the last
        # loss.item() may still be running. Wait for them so the reported
        # time covers all of the training work.
        torch.cuda.synchronize()
    toc = time.time()
    print("time used:", toc - tic)


train(500)
# NOTE: the output below was recorded from the original run, which stopped
# the timer without synchronising; loss values and timings will differ.
"""
torch.cuda.is_available() = True
X.device: cuda:0
Y.device: cuda:0
if on cuda: True
{'epoch': 0, 'loss': 186.33334350585938}
{'epoch': 50, 'loss': 33.154579162597656}
{'epoch': 100, 'loss': 9.04254150390625}
{'epoch': 150, 'loss': 4.492138862609863}
{'epoch': 200, 'loss': 4.024778366088867}
{'epoch': 250, 'loss': 4.00100564956665}
{'epoch': 300, 'loss': 4.00048303604126}
{'epoch': 350, 'loss': 4.0004801750183105}
{'epoch': 400, 'loss': 4.0004801750183105}
{'epoch': 450, 'loss': 4.0004801750183105}
time used: 1.3654239177703857
"""
4.图片分类范例
import torch
from torch import nn
import torchvision
from torchvision import transforms
# MNIST train / validation datasets read from dataset/mnist/ (download=False,
# so the files must already be there), wrapped in batched loaders. Only the
# training loader shuffles.
transform = transforms.Compose([transforms.ToTensor()])
ds_train, ds_val = (
    torchvision.datasets.MNIST(root="dataset/mnist/", train=is_train, download=False, transform=transform)
    for is_train in (True, False)
)
dl_train, dl_val = (
    torch.utils.data.DataLoader(split, batch_size=128, shuffle=do_shuffle, num_workers=2)
    for split, do_shuffle in ((ds_train, True), (ds_val, False))
)
for dataset in (ds_train, ds_val):
    print(len(dataset))
"""
60000
10000
"""
def create_net():
    """Build the small convolutional classifier used in this example.

    Returns:
        nn.Sequential: layers registered under the names ``conv1``, ``pool1``,
        ``conv2``, ``pool2``, ``dropout``, ``adaptive_pool``, ``flatten``,
        ``linear1``, ``relu`` and ``linear2``. It takes single-channel image
        batches and outputs 10 values per image.
    """
    layers = [
        ("conv1", nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3)),
        ("pool1", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("conv2", nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5)),
        ("pool2", nn.MaxPool2d(kernel_size=2, stride=2)),
        ("dropout", nn.Dropout2d(p=0.1)),
        # Collapse each of the 64 feature maps to a single value so the
        # linear head always sees 64 features.
        ("adaptive_pool", nn.AdaptiveMaxPool2d((1, 1))),
        ("flatten", nn.Flatten()),
        ("linear1", nn.Linear(64, 32)),
        ("relu", nn.ReLU()),
        ("linear2", nn.Linear(32, 10)),
    ]
    net = nn.Sequential()
    for layer_name, layer in layers:
        net.add_module(layer_name, layer)
    return net


net = create_net()
print(net)
"""
Sequential(
(conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
(pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
(pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(dropout): Dropout2d(p=0.1, inplace=False)
(adaptive_pool): AdaptiveMaxPool2d(output_size=(1, 1))
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear1): Linear(in_features=64, out_features=32, bias=True)
(relu): ReLU()
(linear2): Linear(in_features=32, out_features=10, bias=True)
)
"""
# 使用CPU进行训练
import os,sys,time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
import torch
from torch import nn
from copy import deepcopy
from torchmetrics import Accuracy
#注:多分类使用torchmetrics中的评估指标,二分类使用torchkeras.metrics中的评估指标
def printlog(info):
    """Print ``info`` below a timestamped separator line.

    The separator is a blank line followed by 80 ``=`` characters and the
    current local time (``YYYY-MM-DD HH:MM:SS``). ``info`` is converted with
    ``str()`` and followed by a blank line.
    """
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % nowtime)
    print(str(info) + "\n")
# Model, objective, optimiser and metric for the CPU run.
net = create_net()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=0.01)
metrics_dict = dict(acc=Accuracy(task="multiclass", num_classes=10))

# Number of epochs and where the best weights are checkpointed.
epochs, ckpt_path = 3, "checkpoint.pt"

# Early-stopping settings: the history key to watch, whether larger or
# smaller is better, and how many epochs without a new best are tolerated.
monitor, mode, patience = "val_acc", "max", 1

# Per-epoch log; the training loop appends to the lists stored here.
history = dict()
# Train / validate / early-stop loop (CPU). Uses the module-level net,
# loss_fn, optimizer, metrics_dict, dl_train, dl_val and the early-stopping
# settings (monitor, mode, patience), and records every epoch in `history`.
for epoch in range(1, epochs + 1):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    # 1,train -------------------------------------------------
    net.train()

    total_loss, step = 0, 0

    loop = tqdm(enumerate(dl_train), total=len(dl_train), file=sys.stdout)
    # Work on a copy so the metric state of this phase never leaks into the
    # shared metrics_dict.
    train_metrics_dict = deepcopy(metrics_dict)

    for i, batch in loop:
        features, labels = batch

        # forward
        preds = net(features)
        loss = loss_fn(preds, labels)

        # backward
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # metrics
        step_metrics = {"train_" + name: metric_fn(preds, labels).item()
                        for name, metric_fn in train_metrics_dict.items()}
        step_log = dict({"train_loss": loss.item()}, **step_metrics)

        total_loss += loss.item()
        step += 1

        if i != len(dl_train) - 1:
            loop.set_postfix(**step_log)
        else:
            # Last batch of the epoch: report the mean loss and the metric
            # values returned by compute(), then reset the metrics.
            epoch_loss = total_loss / step
            epoch_metrics = {"train_" + name: metric_fn.compute().item()
                             for name, metric_fn in train_metrics_dict.items()}
            epoch_log = dict({"train_loss": epoch_loss}, **epoch_metrics)
            loop.set_postfix(**epoch_log)

            for name, metric_fn in train_metrics_dict.items():
                metric_fn.reset()

    for name, metric in epoch_log.items():
        history.setdefault(name, []).append(metric)

    # 2,validate -------------------------------------------------
    net.eval()

    total_loss, step = 0, 0
    loop = tqdm(enumerate(dl_val), total=len(dl_val), file=sys.stdout)

    val_metrics_dict = deepcopy(metrics_dict)

    with torch.no_grad():
        for i, batch in loop:
            features, labels = batch

            # forward
            preds = net(features)
            loss = loss_fn(preds, labels)

            # metrics
            step_metrics = {"val_" + name: metric_fn(preds, labels).item()
                            for name, metric_fn in val_metrics_dict.items()}
            step_log = dict({"val_loss": loss.item()}, **step_metrics)

            total_loss += loss.item()
            step += 1

            if i != len(dl_val) - 1:
                loop.set_postfix(**step_log)
            else:
                epoch_loss = (total_loss / step)
                epoch_metrics = {"val_" + name: metric_fn.compute().item()
                                 for name, metric_fn in val_metrics_dict.items()}
                epoch_log = dict({"val_loss": epoch_loss}, **epoch_metrics)
                loop.set_postfix(**epoch_log)

                for name, metric_fn in val_metrics_dict.items():
                    metric_fn.reset()

    epoch_log["epoch"] = epoch
    for name, metric in epoch_log.items():
        history.setdefault(name, []).append(metric)

    # 3,early-stopping -------------------------------------------------
    arr_scores = history[monitor]
    best_score_idx = np.argmax(arr_scores) if mode == "max" else np.argmin(arr_scores)
    # The newest epoch is the best so far: checkpoint its weights.
    if best_score_idx == len(arr_scores) - 1:
        torch.save(net.state_dict(), ckpt_path)
        print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
              arr_scores[best_score_idx]))
    # Stop once `patience` epochs have passed since the best one.
    if len(arr_scores) - best_score_idx > patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor, patience))
        break

# Restore the best checkpoint. weights_only=True restricts unpickling to
# tensor data, which is all a state_dict contains.
net.load_state_dict(torch.load(ckpt_path, weights_only=True))

dfhistory = pd.DataFrame(history)
"""
================================================================================2024-08-04 16:57:44
Epoch 1 / 3
100%|█████████████████████████████████████████████| 469/469 [00:44<00:00, 10.42it/s, train_acc=0.903, train_loss=0.295]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 24.28it/s, val_acc=0.975, val_loss=0.0781]
<<<<<< reach best val_acc : 0.9751999974250793 >>>>>>
================================================================================2024-08-04 16:58:32
Epoch 2 / 3
100%|█████████████████████████████████████████████| 469/469 [00:44<00:00, 10.54it/s, train_acc=0.967, train_loss=0.108]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 25.28it/s, val_acc=0.976, val_loss=0.0732]
<<<<<< reach best val_acc : 0.9757999777793884 >>>>>>
================================================================================2024-08-04 16:59:20
Epoch 3 / 3
100%|█████████████████████████████████████████████| 469/469 [00:42<00:00, 11.03it/s, train_acc=0.972, train_loss=0.094]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 25.24it/s, val_acc=0.981, val_loss=0.0605]
<<<<<< reach best val_acc : 0.9811999797821045 >>>>>>
"""
# 使用GPU进行训练
import os,sys,time
import numpy as np
import pandas as pd
import datetime
from tqdm import tqdm
import torch
from torch import nn
from copy import deepcopy
from torchmetrics import Accuracy
#注:多分类使用torchmetrics中的评估指标,二分类使用torchkeras.metrics中的评估指标
def printlog(info):
    """Print ``info`` below a timestamped separator line.

    The separator is a blank line followed by 80 ``=`` characters and the
    current local time (``YYYY-MM-DD HH:MM:SS``). ``info`` is converted with
    ``str()`` and followed by a blank line.
    """
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n" + "==========" * 8 + "%s" % nowtime)
    print(str(info) + "\n")
net = create_net()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
metrics_dict = {"acc": Accuracy(task='multiclass', num_classes=10)}

# ===================== Move the model to the GPU =====================
# The loss and the metric objects are moved as well, so everything that
# touches the predictions lives on the same device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
loss_fn.to(device)
for fn in metrics_dict.values():
    fn.to(device)
# =====================================================================

epochs = 5
ckpt_path = 'checkpoint.pt'

# Early-stopping settings
monitor = "val_acc"
patience = 1
mode = "max"

history = {}
# Train / validate / early-stop loop (GPU). Same flow as a CPU loop, except
# that every batch is moved to `device` before it is used. Uses the
# module-level net, loss_fn, optimizer, metrics_dict, dl_train, dl_val and
# the early-stopping settings, and records every epoch in `history`.
for epoch in range(1, epochs + 1):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    # 1,train -------------------------------------------------
    net.train()

    total_loss, step = 0, 0

    loop = tqdm(enumerate(dl_train), total=len(dl_train), file=sys.stdout)
    # Work on a copy so the metric state of this phase never leaks into the
    # shared metrics_dict.
    train_metrics_dict = deepcopy(metrics_dict)

    for i, batch in loop:
        features, labels = batch

        # ================= Move the batch to the GPU =================
        features = features.to(device)
        labels = labels.to(device)
        # =============================================================

        # forward
        preds = net(features)
        loss = loss_fn(preds, labels)

        # backward
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # metrics
        step_metrics = {"train_" + name: metric_fn(preds, labels).item()
                        for name, metric_fn in train_metrics_dict.items()}
        step_log = dict({"train_loss": loss.item()}, **step_metrics)

        total_loss += loss.item()
        step += 1

        if i != len(dl_train) - 1:
            loop.set_postfix(**step_log)
        else:
            # Last batch of the epoch: report the mean loss and the metric
            # values returned by compute(), then reset the metrics.
            epoch_loss = total_loss / step
            epoch_metrics = {"train_" + name: metric_fn.compute().item()
                             for name, metric_fn in train_metrics_dict.items()}
            epoch_log = dict({"train_loss": epoch_loss}, **epoch_metrics)
            loop.set_postfix(**epoch_log)

            for name, metric_fn in train_metrics_dict.items():
                metric_fn.reset()

    for name, metric in epoch_log.items():
        history.setdefault(name, []).append(metric)

    # 2,validate -------------------------------------------------
    net.eval()

    total_loss, step = 0, 0
    loop = tqdm(enumerate(dl_val), total=len(dl_val), file=sys.stdout)

    val_metrics_dict = deepcopy(metrics_dict)

    with torch.no_grad():
        for i, batch in loop:
            features, labels = batch

            # =============== Move the batch to the GPU ===============
            features = features.to(device)
            labels = labels.to(device)
            # =========================================================

            # forward
            preds = net(features)
            loss = loss_fn(preds, labels)

            # metrics
            step_metrics = {"val_" + name: metric_fn(preds, labels).item()
                            for name, metric_fn in val_metrics_dict.items()}
            step_log = dict({"val_loss": loss.item()}, **step_metrics)

            total_loss += loss.item()
            step += 1

            if i != len(dl_val) - 1:
                loop.set_postfix(**step_log)
            else:
                epoch_loss = (total_loss / step)
                epoch_metrics = {"val_" + name: metric_fn.compute().item()
                                 for name, metric_fn in val_metrics_dict.items()}
                epoch_log = dict({"val_loss": epoch_loss}, **epoch_metrics)
                loop.set_postfix(**epoch_log)

                for name, metric_fn in val_metrics_dict.items():
                    metric_fn.reset()

    epoch_log["epoch"] = epoch
    for name, metric in epoch_log.items():
        history.setdefault(name, []).append(metric)

    # 3,early-stopping -------------------------------------------------
    arr_scores = history[monitor]
    best_score_idx = np.argmax(arr_scores) if mode == "max" else np.argmin(arr_scores)
    # The newest epoch is the best so far: checkpoint its weights.
    if best_score_idx == len(arr_scores) - 1:
        torch.save(net.state_dict(), ckpt_path)
        print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
              arr_scores[best_score_idx]))
    # Stop once `patience` epochs have passed since the best one.
    if len(arr_scores) - best_score_idx > patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor, patience))
        break

# Restore the best checkpoint onto the device the model lives on.
# weights_only=True restricts unpickling to tensor data, which is all a
# state_dict contains.
net.load_state_dict(torch.load(ckpt_path, map_location=device, weights_only=True))

dfhistory = pd.DataFrame(history)
"""
================================================================================2024-08-04 17:03:49
Epoch 1 / 5
100%|█████████████████████████████████████████████| 469/469 [00:07<00:00, 63.35it/s, train_acc=0.886, train_loss=0.347]
100%|███████████████████████████████████████████████████| 79/79 [00:02<00:00, 31.36it/s, val_acc=0.952, val_loss=0.153]
<<<<<< reach best val_acc : 0.9517999887466431 >>>>>>
================================================================================2024-08-04 17:03:59
Epoch 2 / 5
100%|█████████████████████████████████████████████| 469/469 [00:05<00:00, 91.89it/s, train_acc=0.965, train_loss=0.116]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 22.98it/s, val_acc=0.975, val_loss=0.0832]
<<<<<< reach best val_acc : 0.9751999974250793 >>>>>>
================================================================================2024-08-04 17:04:07
Epoch 3 / 5
100%|████████████████████████████████████████████| 469/469 [00:04<00:00, 94.34it/s, train_acc=0.973, train_loss=0.0886]
100%|███████████████████████████████████████████████████| 79/79 [00:02<00:00, 32.07it/s, val_acc=0.98, val_loss=0.0649]
<<<<<< reach best val_acc : 0.9800000190734863 >>>>>>
================================================================================2024-08-04 17:04:15
Epoch 4 / 5
100%|████████████████████████████████████████████| 469/469 [00:05<00:00, 91.59it/s, train_acc=0.975, train_loss=0.0861]
100%|██████████████████████████████████████████████████| 79/79 [00:02<00:00, 30.38it/s, val_acc=0.979, val_loss=0.0748]
<<<<<< val_acc without improvement in 1 epoch, early stopping >>>>>>
"""
5.torchkeras.KerasModel中使用GPU
从上面的例子可以看到,在pytorch中使用GPU并不复杂,但对于经常炼丹的同学来说,模型和数据老是移来移去还是蛮麻烦的。
一不小心就会忘了移动某些数据或者某些module,导致报错。
torchkeras.KerasModel在设计的时候就考虑到了这一点,如果环境中存在可用的GPU,会自动使用GPU,反之则使用CPU。
通过引入accelerate的一些基础功能,torchkeras.KerasModel以非常优雅的方式在GPU和CPU之间切换。
详细实现可以参考torchkeras.KerasModel的源码。
import accelerate

# Show which device accelerate selects in this environment.
accelerator = accelerate.Accelerator()
print(accelerator.device)
"""
cuda
"""
from torchkeras import KerasModel
from torchmetrics import Accuracy

# Build the network and hand it to KerasModel together with its loss,
# metric and optimiser, then train with early stopping on val_acc.
net = create_net()
criterion = nn.CrossEntropyLoss()
metrics = {"acc": Accuracy(task="multiclass", num_classes=10)}
adam = torch.optim.Adam(net.parameters(), lr=0.01)
model = KerasModel(net, loss_fn=criterion, metrics_dict=metrics, optimizer=adam)
model.fit(
    train_data=dl_train,
    val_data=dl_val,
    epochs=10,
    patience=3,
    monitor='val_acc',
    mode='max',
)
作者:lotuslaw
出处:https://www.cnblogs.com/lotuslaw/p/18342000
版权:本作品采用「署名-非商业性使用-相同方式共享 4.0 国际」许可协议进行许可。
标签:
Pytorch
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧