02 Pruning Criteria && Methods
2.1 Pruning Criteria
2.1.1 Magnitude-based pruning criteria
As discussed in the previous section, weights can be pruned by the magnitude of their absolute values, or groups of weights can be ranked and pruned by their L1/L2 norm.
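For instance, a minimal sketch of both variants on a single linear layer might look like the following (the layer size and the 50% ratio are hypothetical, chosen only for illustration):

import torch
import torch.nn as nn

# Hypothetical example: prune a single linear layer by weight magnitude
layer = nn.Linear(8, 4)

# Element-wise: zero out roughly the 50% of weights with the smallest absolute value
w = layer.weight.data
threshold = w.abs().flatten().kthvalue(w.numel() // 2).values
layer.weight.data = torch.where(w.abs() > threshold, w, torch.zeros_like(w))

# Row-wise (structured): rank output neurons by the L1 norm of their weight rows
l1_per_row = layer.weight.data.abs().sum(dim=1)  # one L1 norm per output neuron
print(l1_per_row.argsort())                       # the lowest-norm rows are pruning candidates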
2.1.2 Gradient-magnitude-based pruning
With the previous criterion we prune by magnitude, cutting away the small-valued weights; similarly, a weight that barely changes during training intuitively seems unimportant. But that reasoning is not quite right: viewed from the gradient side, such a weight may simply have started close to a suitable value, so its updates are small, and a small change does not mean the weight is unimportant. We therefore combine weight magnitude and gradient magnitude when deciding what to prune; the simplest way is to use their product.
The code below prunes by the product of gradient and weight magnitude; you can adapt the criterion to your own needs.
import numpy as np
import torch

def prune_by_gradient_weight_product(model, pruning_rate):
    grad_weight_product_list = []
    for name, param in model.named_parameters():
        if 'weight' in name:
            # Product of gradient magnitude and weight magnitude
            grad_weight_product = torch.abs(param.grad * param.data)
            grad_weight_product_list.append(grad_weight_product)
    # Collect all product values into a single tensor
    all_product_values = torch.cat([torch.flatten(x) for x in grad_weight_product_list])
    # Compute the pruning threshold
    threshold = np.percentile(all_product_values.cpu().detach().numpy(), pruning_rate)
    # Prune the weights
    for name, param in model.named_parameters():
        if 'weight' in name:
            # Build a mask marking which weights to keep
            mask = torch.where(torch.abs(param.grad * param.data) >= threshold, 1, 0)
            # Apply the mask
            param.data *= mask.float()

pruning_rate = 50
# A fully connected layer mapping a 10-dim input to 5 dims, a ReLU, then a linear layer from 5 dims to 1
model = torch.nn.Sequential(torch.nn.Linear(10, 5), torch.nn.ReLU(), torch.nn.Linear(5, 1))
input_tensor = torch.randn(1, 10)     # create a random input tensor
# Run a minimal forward and backward pass on input_tensor to populate the gradients
output_tensor = model(input_tensor)   # forward pass
loss = torch.sum(output_tensor)       # dummy loss
loss.backward()                       # backward pass to compute gradients
prune_by_gradient_weight_product(model, pruning_rate)  # prune the model
2.2 Pruning Methods
2.2.1 Pruning frameworks
The classic framework of train → prune → fine-tune was proposed in 2015.
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np

# 1. Train the large base network
# BigModel here is just three fully connected layers
class BigModel(nn.Module):
    def __init__(self):
        super(BigModel, self).__init__()
        self.fc1 = nn.Linear(784, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Prepare the MNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

def train(model, dataloader, criterion, optimizer, device='cpu', num_epochs=10):
    model.train()
    model.to(device)
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            # Forward pass
            outputs = model(inputs.view(inputs.size(0), -1))
            loss = criterion(outputs, targets)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)}")
    return model

big_model = BigModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(big_model.parameters(), lr=1e-3)
big_model = train(big_model, train_loader, criterion, optimizer, device='cuda', num_epochs=2)

# Save the trained big network
torch.save(big_model.state_dict(), "big_model.pth")

# 2. Prune the big network into a smaller one <==================================
def prune_network(model, pruning_rate=0.5, method="global"):
    for name, param in model.named_parameters():
        if "weight" in name:
            tensor = param.data.cpu().numpy()
            if method == "global":
                # one threshold per weight tensor
                threshold = np.percentile(abs(tensor), pruning_rate * 100)
            else:  # local pruning: one threshold per row
                threshold = np.percentile(abs(tensor), pruning_rate * 100, axis=1, keepdims=True)
            mask = abs(tensor) > threshold
            param.data = torch.FloatTensor(tensor * mask.astype(float)).to(param.device)

big_model.load_state_dict(torch.load("big_model.pth"))
prune_network(big_model, pruning_rate=0.5, method="global")  # <==================================

# Save the pruned model
torch.save(big_model.state_dict(), "pruned_model.pth")

# 3. Fine-tune with a low learning rate
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(big_model.parameters(), lr=1e-4)  # <==================================
finetuned_model = train(big_model, train_loader, criterion, optimizer, device='cuda', num_epochs=10)

# Save the fine-tuned model
torch.save(finetuned_model.state_dict(), "finetuned_pruned_model.pth")
# Epoch 1, Loss: 0.2022465198550985
# Epoch 2, Loss: 0.08503768096334421
# Epoch 1, Loss: 0.03288614955859935
# Epoch 2, Loss: 0.021574671817958347
# Epoch 3, Loss: 0.015933904873507806
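To confirm the pruning rate, a quick sparsity check can be run on the pruned checkpoint (a minimal sketch; it loads the pruned_model.pth saved above, since fine-tuning without a mask lets the zeroed weights grow back):

# Sparsity check sketch: count exactly-zero weights in the pruned (not yet fine-tuned) model
check_model = BigModel()
check_model.load_state_dict(torch.load("pruned_model.pth"))
for name, param in check_model.named_parameters():
    if "weight" in name:
        zeros = (param.data == 0).sum().item()
        total = param.data.numel()
        print(f"{name}: {zeros / total:.1%} of weights are zero")  # expect about 50% per layer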
In 2018, pruning during training was proposed: pruning is performed after each epoch, and pruned weights are only set to zero, so they can still be updated (grow back) in later epochs.
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np

class BigModel(nn.Module):
    def __init__(self):
        super(BigModel, self).__init__()
        self.fc1 = nn.Linear(784, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

def prune_network(model, pruning_rate=0.5, method="global"):
    for name, param in model.named_parameters():
        if "weight" in name:
            tensor = param.data.cpu().numpy()
            if method == "global":
                threshold = np.percentile(abs(tensor), pruning_rate * 100)
            else:  # local pruning
                threshold = np.percentile(abs(tensor), pruning_rate * 100, axis=1, keepdims=True)
            mask = abs(tensor) > threshold
            param.data = torch.FloatTensor(tensor * mask.astype(float)).to(param.device)

def train_with_pruning(model, dataloader, criterion, optimizer, device='cpu', num_epochs=10, pruning_rate=0.5):
    model.train()
    model.to(device)
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            # Forward pass
            outputs = model(inputs.view(inputs.size(0), -1))
            loss = criterion(outputs, targets)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)}")
        # Prune at the end of each epoch
        prune_network(model, pruning_rate, method="global")  # <== just prunes the weights to 0; optimizer.step() still allows them to grow back
    return model

big_model = BigModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(big_model.parameters(), lr=1e-3)
big_model = train_with_pruning(big_model, train_loader, criterion, optimizer, device='cuda', num_epochs=10, pruning_rate=0.1)

# Save the trained model
torch.save(big_model.state_dict(), "trained_with_pruning_model.pth")
Actually removing the pruned structures (rather than just zeroing them) has the advantage of reducing the model's computation and memory footprint, and the reduced capacity can help prevent overfitting.
The drawbacks are that it may reduce the network's representational power and hurt accuracy, and that it requires changing the network structure, which adds complexity to implementation and fine-tuning.
# train phase
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np

# 1. Train a large base network
class BigModel(nn.Module):
    def __init__(self):
        super(BigModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 16, kernel_size=3, padding=1)
        self.fc = nn.Linear(16 * 28 * 28, 10)
        # Initialize the per-channel L1 norms as non-trainable parameters and expose them as buffers
        # so they are stored in the checkpoint and can be read back by the pruning script
        self.conv1_l1norm = nn.Parameter(torch.Tensor(32), requires_grad=False)
        self.conv2_l1norm = nn.Parameter(torch.Tensor(16), requires_grad=False)
        self.register_buffer('conv1_l1norm_buffer', self.conv1_l1norm)
        self.register_buffer('conv2_l1norm_buffer', self.conv2_l1norm)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        # Write into the registered buffers (not the parameter alias), so the values survive
        # model.to(device) and end up correctly in the saved state_dict
        self.conv1_l1norm_buffer.data = torch.sum(torch.abs(self.conv1.weight.data), dim=(1, 2, 3))
        x = torch.relu(self.conv2(x))
        self.conv2_l1norm_buffer.data = torch.sum(torch.abs(self.conv2.weight.data), dim=(1, 2, 3))
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# Training function
def train(model, dataloader, criterion, optimizer, device='cpu', num_epochs=10):
    model.train()
    model.to(device)
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)
            # Forward propagation
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            # print(f"Loss: {running_loss / len(dataloader)}")
        print(f"Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)}")
    return model

if __name__ == "__main__":
    # Prepare the MNIST dataset
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

    big_model = BigModel()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(big_model.parameters(), lr=1e-3)
    big_model = train(big_model, train_loader, criterion, optimizer, device='cuda', num_epochs=3)

    # Save the trained big network
    torch.save(big_model.state_dict(), "big_model.pth")

    # Set the input shape of the model
    dummy_input = torch.randn(1, 1, 28, 28).to("cuda")
    # Export the model to ONNX format
    torch.onnx.export(big_model, dummy_input, "big_model.onnx")
# (/home/coeyliang/miniconda3) coeyliang_pruning> python train.py
# Epoch 1, Loss: 0.14482067066501939
# Epoch 2, Loss: 0.05070804020739657
# Epoch 3, Loss: 0.03378467213614771
Pruning phase. The code below removes channels: conv1 is pruned along its first dimension (output channels) and conv2 along its second dimension (input channels).
# prune phase (assumes the BigModel definition and imports from the training script above)
model = BigModel()
model.load_state_dict(torch.load("big_model.pth"))

# Collect the L1 norms of every Conv2d layer into a list
l1norms_for_local_threshold = []
for name, m in model.named_modules():
    if isinstance(m, nn.Conv2d):
        # Name of the buffer that stores this module's per-channel L1 norms
        l1norm_buffer_name = f"{name}_l1norm_buffer"
        # Use getattr to fetch that buffer (the module's per-channel L1 norms) from the model
        l1norm = getattr(model, l1norm_buffer_name)
        l1norms_for_local_threshold.append(l1norm)

# Sort the values and take the one at the 50% position as the threshold (the 0.5 sets the pruning ratio)
T_conv1 = torch.sort(l1norms_for_local_threshold[0])[0][int(len(l1norms_for_local_threshold[0]) * 0.5)]

# Now remove the channels
# Pull the layers out for convenience
conv1 = model.conv1  # weight: [32, 1, 3, 3]
conv2 = model.conv2  # weight: [16, 32, 3, 3]
conv1_l1norm_buffer = model.conv1_l1norm_buffer
conv2_l1norm_buffer = model.conv2_l1norm_buffer

# Keep the conv1 output channels whose L1 norm is at least T_conv1
keep_idxs = torch.where(conv1_l1norm_buffer >= T_conv1)[0]
k = len(keep_idxs)

conv1.weight.data = conv1.weight.data[keep_idxs]
conv1.bias.data = conv1.bias.data[keep_idxs]
conv1_l1norm_buffer.data = conv1_l1norm_buffer.data[keep_idxs]
conv1.out_channels = k

# conv2 must drop the same input channels that were removed from conv1's outputs,
# so index its 2nd dimension with the same keep_idxs, i.e. [:, keep_idxs]
# (conv2's own output channels and its L1-norm buffer are left untouched)
conv2.weight.data = conv2.weight.data[:, keep_idxs]
conv2.in_channels = k

torch.save(model.state_dict(), "pruned_model.pth")
dummy_input = torch.randn(1, 1, 28, 28)
torch.onnx.export(model, dummy_input, "pruned_model.onnx")
# Fine-tuning follows (omitted)
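The fine-tuning step is omitted above; as a rough sketch (assuming the train function, train_loader, nn and optim from the training script are available in this scope), it mirrors the earlier train-prune-finetune recipe with a lower learning rate:

# Fine-tune sketch (hypothetical continuation; reuses train() and train_loader from the training script)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # lower learning rate for fine-tuning
model = train(model, train_loader, criterion, optimizer, device='cuda', num_epochs=3)
torch.save(model.state_dict(), "finetuned_pruned_model.pth")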
2.3.1 Sparse training
Proposed in 2018, the procedure is as follows (a standalone sketch of one prune-and-regrow step follows the list):
- Initialize a network with a random mask
- Train this pruned network for one epoch
- Remove some of the lower-magnitude weights (or any weights that fail a custom criterion)
- Regrow the same number of random weights
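Here is a minimal standalone sketch of one prune-and-regrow step on a single weight matrix (the tensors and the count n are hypothetical, for illustration only):

import torch

# Sketch of one prune-and-regrow step (hypothetical example, not the paper's code)
torch.manual_seed(0)
weight = torch.randn(4, 6)                       # a small weight matrix
mask = torch.zeros(24, dtype=torch.bool)
mask[torch.randperm(24)[:12]] = True             # 1. random mask with exactly 50% active weights
mask = mask.view(4, 6)

n = 3                                            # how many weights to prune and regrow
active_idx = torch.nonzero(mask)                 # positions of active weights
# 3. prune: among active weights, deactivate the n with the smallest magnitude
drop = torch.topk(weight[mask].abs(), n, largest=False).indices
mask[active_idx[drop][:, 0], active_idx[drop][:, 1]] = False

# 4. regrow: re-activate the same number of randomly chosen inactive positions
inactive_idx = torch.nonzero(~mask)
grow = inactive_idx[torch.randperm(len(inactive_idx))[:n]]
mask[grow[:, 0], grow[:, 1]] = True

print(mask.float().mean())                       # the fraction of active weights is unchanged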
Starting from the plain network below, let us consider how to turn its training into sparse training.
# raw net
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define the network architecture
class SparseNet(nn.Module):
    def __init__(self):
        super(SparseNet, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(-1, 784)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Initialize the network, loss function, and optimizer
sparsity_rate = 0.5
model = SparseNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop
n_epochs = 10
for epoch in range(n_epochs):
    running_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # print(f"Loss: {running_loss / (batch_idx+1)}")
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {running_loss / (batch_idx+1)}")
The accompanying figure illustrates sparse training: a subset of weights is selected and set to zero (red in the figure), while the green entries are kept (this is simply the earlier design; later work improved on it).
Below is an implementation of the 2018 approach.
# sparse net
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Define the network architecture
class SparseNet(nn.Module):
    def __init__(self, sparsity_rate, mutation_rate=0.5):
        super(SparseNet, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)
        self.sparsity_rate = sparsity_rate
        self.mutation_rate = mutation_rate
        self.initialize_masks()  # <== 1. initialize a network with a random mask

    def forward(self, x):
        x = x.view(-1, 784)
        x = x @ (self.fc1.weight * self.mask1.to(x.device)).T + self.fc1.bias
        x = torch.relu(x)
        x = x @ (self.fc2.weight * self.mask2.to(x.device)).T + self.fc2.bias
        return x

    def initialize_masks(self):
        self.mask1 = self.create_mask(self.fc1.weight, self.sparsity_rate)
        self.mask2 = self.create_mask(self.fc2.weight, self.sparsity_rate)

    def create_mask(self, weight, sparsity_rate):
        k = int(sparsity_rate * weight.numel())
        _, indices = torch.topk(weight.abs().view(-1), k, largest=False)
        mask = torch.ones_like(weight, dtype=bool)
        mask.view(-1)[indices] = False
        return mask  # <== 1. initialize a network with a random mask

    def update_masks(self):
        self.mask1 = self.mutate_mask(self.fc1.weight, self.mask1, self.mutation_rate)
        self.mask2 = self.mutate_mask(self.fc2.weight, self.mask2, self.mutation_rate)

    def mutate_mask(self, weight, mask, mutation_rate=0.5):  # weight and mask: 2d shape
        # Find the number of elements in the mask that are True
        num_true = torch.count_nonzero(mask)
        # Compute the number of elements to mutate
        mutate_num = int(mutation_rate * num_true)

        # 3) prune a certain amount of the lower-magnitude weights
        true_indices_2d = torch.where(mask == True)  # indices of the 2d mask where it is True
        true_element_1d_idx_prune = torch.topk(weight[true_indices_2d], mutate_num, largest=False)[1]
        for i in true_element_1d_idx_prune:
            mask[true_indices_2d[0][i], true_indices_2d[1][i]] = False

        # 4) regrow the same amount of random weights
        # Get the indices of the False elements in the mask
        false_indices = torch.nonzero(~mask)
        # Randomly select mutate_num indices from the false_indices tensor
        random_indices = torch.randperm(false_indices.shape[0])[:mutate_num]
        # The elements to be regrown
        regrow_indices = false_indices[random_indices]
        for regrow_idx in regrow_indices:
            mask[tuple(regrow_idx)] = True

        return mask

# Set the device to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load MNIST dataset and move batches to the device
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

sparsity_rate = 0.5
model = SparseNet(sparsity_rate).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

n_epochs = 10
for epoch in range(n_epochs):
    running_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Move the data to the device
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # print(f"Loss: {running_loss / (batch_idx+1)}")
    # Update masks
    model.update_masks()  # generate a new mask based on the updated weights
    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {running_loss / (batch_idx+1)}")