pipeline
import torch
import torch.nn as nn
from tqdm import tqdm

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=4, shuffle=False, pin_memory=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
criterion = nn.CrossEntropyLoss(ignore_index=11)   # label 11 (e.g. a void class) is ignored

losses = []
for epoch in range(num_epochs):
    loop = tqdm(trainloader)
    model.train()
    for data, target in loop:
        prediction = model(data)
        loss = criterion(prediction, target)
        losses.append(loss.item())
        loop.set_description('Epoch {}/{}'.format(epoch + 1, num_epochs))
        loop.set_postfix(loss=loss.item(), lr=scheduler.get_last_lr()[0])
        optimizer.zero_grad()                       # clear old gradients before backward
        loss.backward()
        optimizer.step()
    scheduler.step()                                # StepLR: step once per epoch, after optimizer.step()
    avg_loss = sum(losses) / len(losses)
    torch.save(model.state_dict(), savename)
    torch.save(optimizer.state_dict(), savename2)
model.eval()
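The valloader above is never consumed; a minimal validation sketch (assuming the same model, criterion, and device placement as the training loop) could follow model.eval():
with torch.no_grad():                               # no gradients needed during evaluation
    val_losses = []
    for data, target in valloader:
        prediction = model(data)
        val_losses.append(criterion(prediction, target).item())
    print('val loss: {:.4f}'.format(sum(val_losses) / len(val_losses)))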
optimizer
# per-parameter-group learning rates: the classifier uses 1e-3, everything else the default 1e-2
optimizer = torch.optim.SGD([{'params': model.base.parameters()},
                             {'params': model.classifier.parameters(), 'lr': 1e-3}],
                            lr=1e-2, momentum=0.9)
# manually adjust lr: polynomial ("poly") decay per iteration
lr = base_lr * ((1 - float(iter) / max_iter) ** power)
optimizer.param_groups[0]['lr'] = lr
# optimizer.param_groups is a list with one dict per parameter group (length 2 here)
optimizer.param_groups[0].keys()   # for the AdamW optimizer above
# ['amsgrad', 'params', 'lr', 'betas', 'weight_decay', 'eps']
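A small sketch of applying the poly decay above to every parameter group each iteration (base_lr, max_iter and power are assumed values):
base_lr, power, max_iter = 1e-2, 0.9, 10000         # assumed hyper-parameters
for it in range(max_iter):
    lr = base_lr * ((1 - float(it) / max_iter) ** power)
    for group in optimizer.param_groups:            # update every group, not only group 0
        group['lr'] = lr
    # ... forward / backward / optimizer.step() ...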
loss
torch.nn.KLDivLoss
the log here is the natural logarithm (ln)
https://pytorch.org/docs/stable/nn.html?highlight=torch%20nn%20kldivloss#torch.nn.KLDivLoss
import torch.nn.functional as F

a = torch.Tensor(list(range(1, 19))).reshape([2, 3, 3])   # shape: (b, c, d)
t = a * 10
af = F.softmax(a, dim=-1)
tf = F.softmax(t, dim=-1)
# KL(target || input), computed by hand
KLdiv = torch.sum(tf * (torch.log(tf) - torch.log(af)))          # 2.4429
torch.nn.KLDivLoss(reduction='sum')(torch.log(af), tf)           # 2.4429
torch.nn.KLDivLoss(reduction='mean')(torch.log(af), tf)          # 0.1357 = sum / num_elements
torch.nn.KLDivLoss(reduction='batchmean')(torch.log(af), tf)     # 1.2215 = sum / batch_size (sum within a batch, mean across batches)
torch.nn.KLDivLoss(reduction='none')(torch.log(af), tf)          # shape (2, 3, 3), element-wise
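In practice it is more numerically stable to produce the log-probabilities with F.log_softmax instead of log(softmax(...)); a small sketch reusing a and tf from above:
log_af = F.log_softmax(a, dim=-1)                    # same values as torch.log(F.softmax(a, dim=-1)), but stabler
nn.KLDivLoss(reduction='batchmean')(log_af, tf)      # 1.2215, matches the result above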
save / load model
https://zhuanlan.zhihu.com/p/38056115
# save the entire network (architecture + weights)
torch.save(net, PATH)
model = torch.load(PATH)
# save only the parameters (state_dict): faster and takes less space
torch.save(net.state_dict(), PATH)
model.load_state_dict(torch.load(PATH))
torch.save({
    'epoch': epochID + 1,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'best_loss': lossMIN,
    'alpha': loss.alpha,
    'gamma': loss.gamma
}, 'PATH.pth.tar')
def load_checkpoint(model, optimizer, checkpoint_PATH):
    if checkpoint_PATH is not None:
        model_CKPT = torch.load(checkpoint_PATH)
        print('loading checkpoint!')
        model.load_state_dict(model_CKPT['state_dict'])
        optimizer.load_state_dict(model_CKPT['optimizer'])
    return model, optimizer
When loading, filter out params that do not exist in the current model
def load_checkpoint(model, checkpoint, optimizer, loadOptimizer):
    if checkpoint != 'None':
        print("loading checkpoint...")
        model_dict = model.state_dict()
        modelCheckpoint = torch.load(checkpoint)
        pretrained_dict = modelCheckpoint['state_dict']
        # filter: keep only keys that also exist in the current model
        new_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict.keys()}
        model_dict.update(new_dict)
        model.load_state_dict(model_dict)
        # report how many parameters were actually updated
        print('Total : {}, update: {}'.format(len(pretrained_dict), len(new_dict)))
        print("loaded finished!")
        # set loadOptimizer=False if the optimizer state should not be restored
        if loadOptimizer:
            optimizer.load_state_dict(modelCheckpoint['optimizer'])
            print('loaded! optimizer')
        else:
            print('not loaded optimizer')
    else:
        print('No checkpoint is included')
    return model, optimizer
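A hypothetical call that resumes from the checkpoint dict saved earlier (the path and the epoch handling are assumptions):
checkpoint_path = 'PATH.pth.tar'                     # assumed path, matching the save above
model, optimizer = load_checkpoint(model, checkpoint_path, optimizer, loadOptimizer=True)
start_epoch = torch.load(checkpoint_path)['epoch']   # continue the epoch counter from the checkpoint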
Freezing parameters
https://blog.csdn.net/lingzhou33/article/details/88977700
# freeze everything directly
for p in net.parameters():
    p.requires_grad = False
class RESNET_attention(nn.Module):
    def __init__(self, model, pretrained):
        super(RESNET_attention, self).__init__()
        self.resnet = model(pretrained)
        # freeze every parameter created before this loop (the backbone);
        # layers defined after it remain trainable
        for p in self.parameters():
            p.requires_grad = False
        self.f = nn.Conv2d(2048, 512, 1)
        self.g = nn.Conv2d(2048, 512, 1)
        self.h = nn.Conv2d(2048, 2048, 1)
        self.softmax = nn.Softmax(-1)
        self.gamma = nn.Parameter(torch.FloatTensor([0.0]))
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.resnet.fc = nn.Linear(2048, 10)
# freeze every parameter whose name contains word1
word1 = 'seg'
for name, p in decode_net.named_parameters():
    # a = [m.start() for m in re.finditer(word1, name)]
    # if a:  # regex alternative: if the match list is non-empty, freeze it
    if word1 in name:
        p.requires_grad = False
    else:
        p.requires_grad = True
    # if p.requires_grad: print(name)

# the optimizer should only receive parameters that still require gradients
optimizer = optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=1e-5)
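A quick sanity check (a small sketch, not from the original note) to confirm what actually ended up frozen:
trainable = sum(p.numel() for p in decode_net.parameters() if p.requires_grad)
frozen = sum(p.numel() for p in decode_net.parameters() if not p.requires_grad)
print('trainable params: {}, frozen params: {}'.format(trainable, frozen))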
Fix / group parameters by their id
base_params_id = (list(map(id, net.conv1.parameters())) +
                  list(map(id, net.bn1.parameters())) +
                  list(map(id, net.layer1.parameters())) +
                  list(map(id, net.layer2.parameters())) +
                  list(map(id, net.layer3.parameters())) +
                  list(map(id, net.layer4.parameters())))
# new layers: everything whose id is not in the backbone list
new_params = filter(lambda p: id(p) not in base_params_id and p.requires_grad,
                    netG.parameters())
base_params = filter(lambda p: id(p) in base_params_id,
                     netG.parameters())
# backbone gets a small lr (1e-4), new layers use opt.lr
optimizerG = optim.SGD([{'params': base_params, 'lr': 1e-4},
                        {'params': new_params}], lr=opt.lr, momentum=0.9, weight_decay=0.0005)
Counting parameters
https://github.com/Lyken17/pytorch-OpCounter computes MACs and params
params = list(model.parameters())
k = 0
for i in params:
    l = 1
    # print("layer shape: " + str(list(i.size())))
    for j in i.size():
        l *= j
    # print("layer param count: " + str(l))
    k = k + l
print("total number of parameters: " + str(k))
Debug
RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one
Fix:
# pass find_unused_parameters=True so DDP tolerates parameters that did not contribute to the loss
model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
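For context, a minimal sketch of where that line sits in a typical DDP setup (the rank/device handling below is an assumption, not part of the original note):
import os
import torch
import torch.distributed as dist

dist.init_process_group(backend='nccl')               # usually launched via torchrun / torch.distributed.launch
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
model = model.cuda(local_rank)
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[local_rank], find_unused_parameters=True)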