GoogLeNet

GoogLeNet

简介

GoogLeNet在2014年由Google团队提出,斩获当年ImageNet竞赛中Classification Task(分类任务)第一名。

GoogLeNet网络的优点

  • 引入了Inception结构(融合不同尺度的特征信息)
  • 使用\(1\times1\)的卷积核进行降维以及映射处理
  • 添加两个辅助分类器帮助训练
  • 丢弃全连接层,使用平均池化层(大大减少模型参数)

GoogLeNet模型架构

| type | patch size / stride | output size | depth | #1×1 | #3×3 reduce | #3×3 | #5×5 reduce | #5×5 | pool proj | params | ops |
|---|---|---|---|---|---|---|---|---|---|---|---|
| convolution | 7×7 / 2 | 112×112×64 | 1 | | | | | | | 2.7K | 34M |
| max pool | 3×3 / 2 | 56×56×64 | 0 | | | | | | | | |
| convolution | 3×3 / 1 | 56×56×192 | 2 | | 64 | 192 | | | | 112K | 360M |
| max pool | 3×3 / 2 | 28×28×192 | 0 | | | | | | | | |
| inception(3a) | | 28×28×256 | 2 | 64 | 96 | 128 | 16 | 32 | 32 | 159K | 128M |
| inception(3b) | | 28×28×480 | 2 | 128 | 128 | 192 | 32 | 96 | 64 | 380K | 304M |
| max pool | 3×3 / 2 | 14×14×480 | 0 | | | | | | | | |
| inception(4a) | | 14×14×512 | 2 | 192 | 96 | 208 | 16 | 48 | 64 | 364K | 73M |
| inception(4b) | | 14×14×512 | 2 | 160 | 112 | 224 | 24 | 64 | 64 | 437K | 88M |
| inception(4c) | | 14×14×512 | 2 | 128 | 128 | 256 | 24 | 64 | 64 | 463K | 100M |
| inception(4d) | | 14×14×528 | 2 | 112 | 144 | 288 | 32 | 64 | 64 | 580K | 119M |
| inception(4e) | | 14×14×832 | 2 | 256 | 160 | 320 | 32 | 128 | 128 | 840K | 170M |
| max pool | 3×3 / 2 | 7×7×832 | 0 | | | | | | | | |
| inception(5a) | | 7×7×832 | 2 | 256 | 160 | 320 | 32 | 128 | 128 | 1072K | 54M |
| inception(5b) | | 7×7×1024 | 2 | 384 | 192 | 384 | 48 | 128 | 128 | 1388K | 71M |
| avg pool | 7×7 / 1 | 1×1×1024 | 0 | | | | | | | | |
| dropout(40%) | | 1×1×1024 | 0 | | | | | | | | |
| linear | | 1×1×1000 | 1 | | | | | | | 1000K | 1M |
| softmax | | 1×1×1000 | 0 | | | | | | | | |

Inception结构

image.png

辅助分类器(Auxiliary Classifier)

  • An average pooling layer with \(5\times5\) filter size and stride 3, resulting in a \(4\times4\times512\) output for the (4a) stage and a \(4\times4\times528\) output for the (4d) stage.
  • A \(1\times1\) convolution with 128 filters for dimension reduction and rectified linear activation.
  • A fully connected layer with 1024 units and rectified linear activation.
  • A dropout layer with a \(70\%\) ratio of dropped outputs.
  • A linear layer with softmax loss as the classifier (predicting the same 1000 classes as the main classifier, but removed at inference time).

A schematic view of the resulting network is depicted in Figure 3.
image.png

卷积后图像大小:
\(out_{size}=(in_{size}-F_{size}+2P)/S+1\)

  • \(F_{size}\)为卷积核大小
  • \(P\)为填充大小
  • \(S\)为步长

GoogLeNet整体架构

使用PyTorch搭建GoogLeNet网络

编写Inception块

image.png

# Convolution layer immediately followed by a ReLU activation
class BasicConv2d(nn.Module):
    """2-D convolution followed by an in-place ReLU.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of convolution filters (output channels).
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (for example
            ``kernel_size``, ``stride`` and ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run the convolution, then the ReLU, and return the activations."""
        features = self.conv(x)
        activated = self.relu(features)
        return activated


# Inception block
class Inception(nn.Module):
    """Inception module: four parallel branches concatenated along channels.

    Args:
        in_channels: Channels of the input feature map.
        ch1x1: Output channels of the 1x1 convolution branch.
        ch3x3red: Channels of the 1x1 reduction placed before the 3x3 conv.
        ch3x3: Output channels of the 3x3 convolution branch.
        ch5x5red: Channels of the 1x1 reduction placed before the 5x5 conv.
        ch5x5: Output channels of the 5x5 convolution branch.
        pool_proj: Output channels of the 1x1 projection after max pooling.

    The output has ``ch1x1 + ch3x3 + ch5x5 + pool_proj`` channels and the
    same height and width as the input.
    """

    def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj):
        super(Inception, self).__init__()
        # Branch 1: 1x1 convolution
        self.branch1 = BasicConv2d(in_channels, ch1x1, kernel_size=1)

        # Branch 2: 1x1 reduction -> 3x3 convolution.
        # With stride 1, padding = (kernel_size - 1) // 2 keeps H and W unchanged.
        self.branch2 = nn.Sequential(
            BasicConv2d(in_channels, ch3x3red, kernel_size=1),
            BasicConv2d(ch3x3red, ch3x3, kernel_size=3, padding=1)
        )

        # Branch 3: 1x1 reduction -> 5x5 convolution
        self.branch3 = nn.Sequential(
            BasicConv2d(in_channels, ch5x5red, kernel_size=1),
            BasicConv2d(ch5x5red, ch5x5, kernel_size=5, padding=2)
        )

        # Branch 4: 3x3 max pooling -> 1x1 projection.
        # Pooling layers only accept integer padding (there is no 'same' mode),
        # so the size-preserving padding is given explicitly.
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            BasicConv2d(in_channels, pool_proj, kernel_size=1)
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate their outputs."""
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)
        # dim=1 is the channel dimension of an N x C x H x W tensor
        return torch.cat([branch1, branch2, branch3, branch4], 1)

编写辅助分类器

image.png

class InceptionAux(nn.Module):
    """Auxiliary classifier attached to an intermediate Inception output.

    Args:
        in_channels: Channels of the feature map fed to the classifier.
        num_classes: Number of classes predicted by the final linear layer.

    ``fc1`` expects 2048 = 128 * 4 * 4 input features, i.e. a 4x4 map after
    pooling, which is what a 14x14 input produces.
    """

    def __init__(self, in_channels, num_classes):
        super(InceptionAux, self).__init__()
        # Average pooling, 5x5 window with stride 3 (14x14 -> 4x4)
        self.avgPool = nn.AvgPool2d(kernel_size=5, stride=3)
        # 1x1 convolution reducing to 128 channels: [batch, 128, 4, 4]
        self.conv = BasicConv2d(in_channels, 128, kernel_size=1)

        # Fully connected layers
        self.fc1 = nn.Linear(2048, 1024)
        self.fc2 = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return the (un-normalized) class scores of the auxiliary head."""
        # aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14
        x = self.avgPool(x)
        # aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4
        x = self.conv(x)
        # N x 128 x 4 x 4
        x = torch.flatten(x, 1)
        # N x 2048; dropout is only active while self.training is True
        x = torch.dropout(x, 0.7, train=self.training)
        # torch.relu takes no `inplace` argument
        x = torch.relu(self.fc1(x))
        x = torch.dropout(x, 0.7, train=self.training)
        # N x 1024
        x = self.fc2(x)
        # N x num_classes
        return x

编写GoogLeNet网络

class GoogLeNet(nn.Module):
    """GoogLeNet (Inception v1) classifier.

    Args:
        num_classes: Number of output classes of every classifier head.
        aux_logits: When True, build the two auxiliary classifiers; they are
            only evaluated while the module is in training mode.
        init_weights: When True, initialize Conv2d and Linear layers with
            ``_initialize_weights``.

    ``forward`` returns ``(main, aux2, aux1)`` in training mode when
    ``aux_logits`` is True, otherwise only the main output.
    Shape comments assume a 224x224 RGB input.
    """

    def __init__(self, num_classes=1000, aux_logits=True, init_weights=False):
        super(GoogLeNet, self).__init__()
        self.aux_logits = aux_logits

        self.conv1 = BasicConv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.conv2 = BasicConv2d(64, 64, kernel_size=1)
        self.conv3 = BasicConv2d(64, 192, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception3a = Inception(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception4a = Inception(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = Inception(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = Inception(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = Inception(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = Inception(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception5a = Inception(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = Inception(832, 384, 192, 384, 48, 128, 128)

        if self.aux_logits:
            self.aux1 = InceptionAux(512, num_classes)
            self.aux2 = InceptionAux(528, num_classes)

        # The main head is needed regardless of aux_logits, so it must not be
        # nested under the branch above.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # 40% dropout before the final linear layer, as in the architecture table
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(1024, num_classes)
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        """Compute class scores (plus auxiliary scores while training)."""
        # The auxiliary heads are skipped in eval mode
        use_aux = self.training and self.aux_logits

        # N x 3 x 224 x 224
        x = self.conv1(x)
        # N x 64 x 112 x 112
        x = self.maxpool1(x)
        # N x 64 x 56 x 56
        x = self.conv2(x)
        # N x 64 x 56 x 56
        x = self.conv3(x)
        # N x 192 x 56 x 56
        x = self.maxpool2(x)

        # N x 192 x 28 x 28
        x = self.inception3a(x)
        # N x 256 x 28 x 28
        x = self.inception3b(x)
        # N x 480 x 28 x 28
        x = self.maxpool3(x)
        # N x 480 x 14 x 14
        x = self.inception4a(x)
        # N x 512 x 14 x 14
        if use_aux:
            aux1 = self.aux1(x)

        x = self.inception4b(x)
        # N x 512 x 14 x 14
        x = self.inception4c(x)
        # N x 512 x 14 x 14
        x = self.inception4d(x)
        # N x 528 x 14 x 14
        if use_aux:
            aux2 = self.aux2(x)

        x = self.inception4e(x)
        # N x 832 x 14 x 14
        x = self.maxpool4(x)
        # N x 832 x 7 x 7
        x = self.inception5a(x)
        # N x 832 x 7 x 7
        x = self.inception5b(x)
        # N x 1024 x 7 x 7

        x = self.avgpool(x)
        # N x 1024 x 1 x 1
        x = torch.flatten(x, 1)
        # N x 1024
        x = self.dropout(x)
        x = self.fc(x)
        # N x 1000 (num_classes)
        if use_aux:
            return x, aux2, aux1
        return x

    def _initialize_weights(self):
        """Kaiming-uniform init for Conv2d, N(0, 0.01) for Linear, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

模型总代码

import torch
import torch.nn as nn
import torch.nn.functional as F


# Convolution layer with a built-in ReLU activation
class BasicConv2d(nn.Module):
    """``nn.Conv2d`` followed by an in-place ``nn.ReLU``.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of convolution filters (output channels).
        **kwargs: Passed straight through to ``nn.Conv2d`` (for example
            ``kernel_size``, ``stride`` and ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu(conv(x))``."""
        conv_out = self.conv(x)
        return self.relu(conv_out)


# Inception block
class Inception(nn.Module):
    """Inception module built from four parallel branches.

    Args:
        in_channels: Channels of the input feature map.
        ch1x1: Output channels of the 1x1 convolution branch.
        ch3x3red: Channels of the 1x1 reduction placed before the 3x3 conv.
        ch3x3: Output channels of the 3x3 convolution branch.
        ch5x5red: Channels of the 1x1 reduction placed before the 5x5 conv.
        ch5x5: Output channels of the 5x5 convolution branch.
        pool_proj: Output channels of the 1x1 projection after max pooling.

    Every branch keeps the spatial size (stride 1, padding 0/1/2/1), so the
    results can be concatenated along the channel dimension.
    """

    def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj):
        super().__init__()

        # Branch 1: 1x1 convolution only
        self.branch1 = BasicConv2d(in_channels, ch1x1, kernel_size=1)

        # Branch 2: 1x1 reduction followed by a 3x3 convolution (padding=1)
        reduce_3x3 = BasicConv2d(in_channels, ch3x3red, kernel_size=1)
        conv_3x3 = BasicConv2d(ch3x3red, ch3x3, kernel_size=3, padding=1)
        self.branch2 = nn.Sequential(reduce_3x3, conv_3x3)

        # Branch 3: 1x1 reduction followed by a 5x5 convolution (padding=2)
        reduce_5x5 = BasicConv2d(in_channels, ch5x5red, kernel_size=1)
        conv_5x5 = BasicConv2d(ch5x5red, ch5x5, kernel_size=5, padding=2)
        self.branch3 = nn.Sequential(reduce_5x5, conv_5x5)

        # Branch 4: 3x3 max pooling (stride 1, padding=1) followed by a 1x1 projection
        pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        projection = BasicConv2d(in_channels, pool_proj, kernel_size=1)
        self.branch4 = nn.Sequential(pool, projection)

    def forward(self, x):
        """Apply the four branches to ``x`` and join them on the channel axis."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        outputs = [branch(x) for branch in branches]
        # Dimension 1 is the channel dimension of an N x C x H x W tensor
        return torch.cat(outputs, 1)


class InceptionAux(nn.Module):
    """Auxiliary classifier head used on intermediate Inception outputs.

    Args:
        in_channels: Channels of the feature map fed to the classifier.
        num_classes: Number of classes predicted by the final linear layer.

    ``fc1`` takes 2048 = 128 * 4 * 4 features, which corresponds to the 4x4
    map obtained by pooling a 14x14 input.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        # 5x5 average pooling with stride 3
        self.avgPool = nn.AvgPool2d(kernel_size=5, stride=3)
        # 1x1 convolution down to 128 channels -> [batch, 128, 4, 4]
        self.conv = BasicConv2d(in_channels, 128, kernel_size=1)

        # Two fully connected layers
        self.fc1 = nn.Linear(2048, 1024)
        self.fc2 = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return the un-normalized class scores of this auxiliary head."""
        # aux1: N x 512 x 14 x 14 -> N x 512 x 4 x 4
        # aux2: N x 528 x 14 x 14 -> N x 528 x 4 x 4
        pooled = self.avgPool(x)
        # N x 128 x 4 x 4 -> N x 2048
        flat = torch.flatten(self.conv(pooled), 1)
        # Dropout is active only while the module is in training mode
        flat = torch.dropout(flat, 0.7, train=self.training)
        # N x 1024
        hidden = torch.relu(self.fc1(flat))
        hidden = torch.dropout(hidden, 0.7, train=self.training)
        # N x num_classes
        scores = self.fc2(hidden)
        return scores


class GoogLeNet(nn.Module):
    """GoogLeNet (Inception v1) classifier.

    Args:
        num_classes: Number of output classes of every classifier head.
        aux_logits: When True, build the two auxiliary classifiers; they are
            only evaluated while the module is in training mode.
        init_weights: When True, initialize Conv2d and Linear layers with
            ``_initialize_weights``.

    ``forward`` returns ``(main, aux2, aux1)`` in training mode when
    ``aux_logits`` is True, otherwise only the main output.
    Shape comments assume a 224x224 RGB input.
    """

    def __init__(self, num_classes=1000, aux_logits=True, init_weights=False):
        super(GoogLeNet, self).__init__()
        self.aux_logits = aux_logits

        self.conv1 = BasicConv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.conv2 = BasicConv2d(64, 64, kernel_size=1)
        self.conv3 = BasicConv2d(64, 192, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception3a = Inception(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception4a = Inception(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = Inception(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = Inception(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = Inception(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = Inception(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception5a = Inception(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = Inception(832, 384, 192, 384, 48, 128, 128)

        if self.aux_logits:
            self.aux1 = InceptionAux(512, num_classes)
            self.aux2 = InceptionAux(528, num_classes)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        # 40% dropout before the final linear layer, as in the architecture table
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(1024, num_classes)
        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        """Compute class scores (plus auxiliary scores while training)."""
        # The auxiliary heads are skipped in eval mode
        use_aux = self.training and self.aux_logits

        # N x 3 x 224 x 224
        x = self.conv1(x)
        # N x 64 x 112 x 112
        x = self.maxpool1(x)
        # N x 64 x 56 x 56
        x = self.conv2(x)
        # N x 64 x 56 x 56
        x = self.conv3(x)
        # N x 192 x 56 x 56
        x = self.maxpool2(x)

        # N x 192 x 28 x 28
        x = self.inception3a(x)
        # N x 256 x 28 x 28
        x = self.inception3b(x)
        # N x 480 x 28 x 28
        x = self.maxpool3(x)
        # N x 480 x 14 x 14
        x = self.inception4a(x)
        # N x 512 x 14 x 14
        if use_aux:
            aux1 = self.aux1(x)

        x = self.inception4b(x)
        # N x 512 x 14 x 14
        x = self.inception4c(x)
        # N x 512 x 14 x 14
        x = self.inception4d(x)
        # N x 528 x 14 x 14
        if use_aux:
            aux2 = self.aux2(x)

        x = self.inception4e(x)
        # N x 832 x 14 x 14
        x = self.maxpool4(x)
        # N x 832 x 7 x 7
        x = self.inception5a(x)
        # N x 832 x 7 x 7
        x = self.inception5b(x)
        # N x 1024 x 7 x 7

        x = self.avgpool(x)
        # N x 1024 x 1 x 1
        x = torch.flatten(x, 1)
        # N x 1024
        x = self.dropout(x)
        x = self.fc(x)
        # N x 1000 (num_classes)
        if use_aux:
            return x, aux2, aux1
        return x

    def _initialize_weights(self):
        """Kaiming-uniform init for Conv2d, N(0, 0.01) for Linear, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            # This branch must pair with the Conv2d check above; attached to the
            # bias check it could never match a Linear module.
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

训练模型脚本

import os
import sys
import json

import torch
import torch.nn as nn
from torchvision import transforms, datasets
import torch.optim as optim
from tqdm import tqdm

from model import GoogLeNet
from utils import read_split_data, ImageDataset, compute_mean_std


def main():
    """Train GoogLeNet on the flower dataset and keep the best checkpoint.

    Each epoch runs one training pass (main loss plus the two auxiliary
    losses weighted by 0.3) and one validation pass. The weights are written
    to ``./googleNet.pth`` whenever validation accuracy improves.
    """
    # Select the training device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('using {} device.'.format(device))

    # Dataset root directory
    image_path = r"D:\卷积神经网络PPT\AlexNet\Alex_tf\data\flower_photos"

    # Split the image paths/labels into training and validation sets
    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(root=image_path)
    # Mean/std of the training set, used for normalization below
    mean_std = compute_mean_std(train_images_path)

    # Pre-processing / augmentation for the training and validation sets
    data_transform = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(224),  # random crop, then resize to 224
            transforms.RandomHorizontalFlip(),  # random horizontal flip
            transforms.ToTensor(),
            transforms.Normalize(mean_std['mean'], mean_std['std'])  # standardize
        ]),
        'val': transforms.Compose([
            transforms.Resize((224, 224)),  # resize to the network input size
            transforms.ToTensor(),
            transforms.Normalize(mean_std['mean'], mean_std['std'])
        ])
    }

    # Batch size
    batch_size = 32
    # Number of data-loading worker processes; os.cpu_count() may return None
    nw = min([os.cpu_count() or 1, batch_size if batch_size > 1 else 0, 8])
    print('using {} dataloader workers every process'.format(nw))

    # Training set and its loader
    train_dataset = ImageDataset(train_images_path, train_images_label, transform=data_transform['train'])
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               pin_memory=True,
                                               num_workers=nw)

    # Validation set and its loader
    valid_dataset = ImageDataset(val_images_path, val_images_label, transform=data_transform['val'])
    valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               pin_memory=True,
                                               num_workers=nw)
    # Number of validation samples, used to turn the hit count into an accuracy
    val_num = len(valid_dataset)

    net = GoogLeNet(num_classes=5, aux_logits=True, init_weights=True)
    # To start from the official pre-trained weights, load them into the
    # official torchvision model, not into this implementation: the official
    # model uses BN layers and different parameters, so the two cannot be mixed.
    # import torchvision
    # net = torchvision.models.googlenet(num_classes=5)
    # model_dict = net.state_dict()
    # # Pre-trained weights: https://download.pytorch.org/models/googlenet-1378be20.pth
    # pretrain_model = torch.load("googlenet.pth")
    # del_list = ["aux1.fc2.weight", "aux1.fc2.bias",
    #             "aux2.fc2.weight", "aux2.fc2.bias",
    #             "fc.weight", "fc.bias"]
    # pretrain_dict = {k: v for k, v in pretrain_model.items() if k not in del_list}
    # model_dict.update(pretrain_dict)
    # net.load_state_dict(model_dict)
    net.to(device)
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0003)

    epochs = 30
    best_acc = 0.0
    save_path = './googleNet.pth'
    train_steps = len(train_loader)
    for epoch in range(epochs):
        # train
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader, file=sys.stdout)
        for images, labels in train_bar:
            # Move the batch to the device once and reuse it for all three losses
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # In training mode the network returns (main, aux2, aux1)
            logits, aux_logits2, aux_logits1 = net(images)
            loss0 = loss_function(logits, labels)
            loss1 = loss_function(aux_logits1, labels)
            loss2 = loss_function(aux_logits2, labels)
            loss = loss0 + loss1 * 0.3 + loss2 * 0.3
            loss.backward()
            optimizer.step()

            # print statistics
            batch_loss = loss.item()
            running_loss += batch_loss

            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(epoch + 1,
                                                                     epochs,
                                                                     batch_loss)

        # validate
        net.eval()
        acc = 0.0  # number of correct predictions in this epoch
        with torch.no_grad():
            val_bar = tqdm(valid_loader, file=sys.stdout)
            for val_images, val_labels in val_bar:
                # In eval mode the network returns only the main output
                outputs = net(val_images.to(device))
                predict_y = torch.argmax(outputs, 1)
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()

        val_accurate = acc / val_num
        print('[epoch %d] train_loss: %.3f  val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, val_accurate))

        # Keep only the best-performing weights
        if val_accurate > best_acc:
            best_acc = val_accurate
            torch.save(net.state_dict(), save_path)

    print('Finished Training')


if __name__ == '__main__':
    main()

运行结果:

train epoch[29/30] loss:1.303: 100%|██████████| 92/92 [00:28<00:00,  3.19it/s]
100%|██████████| 23/23 [00:18<00:00,  1.28it/s]
[epoch 29] train_loss: 0.995  val_accuracy: 0.795
train epoch[30/30] loss:1.112: 100%|██████████| 92/92 [00:26<00:00,  3.48it/s]
100%|██████████| 23/23 [00:17<00:00,  1.35it/s]
[epoch 30] train_loss: 0.946  val_accuracy: 0.793
Finished Training

预测脚本

import os
import json
import torch
from PIL import Image
from torchvision import transforms
import matplotlib.pyplot as plt
from model import GoogLeNet


def main():
    """Classify one image with the trained GoogLeNet weights and plot the result.

    Loads ``./googleNet.pth`` and ``./class_indices.json``, prints the
    probability of every class and shows the image titled with the top class.
    """
    device = torch.device('cpu')
    # NOTE(review): the training script normalizes with statistics from
    # compute_mean_std(), while this script uses a fixed 0.5 mean/std —
    # confirm whether the two are meant to match.
    data_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
    img_path = r'D:/卷积神经网络PPT/AlexNet/Alex_Torch/img.png'
    assert os.path.exists(img_path), "file: '{}' does not exist.".format(img_path)
    # A PNG may be RGBA or grayscale; Normalize and the first convolution
    # both expect exactly three channels.
    img = Image.open(img_path).convert('RGB')
    plt.imshow(img)
    img = data_transform(img)
    # Add a leading batch dimension: [batch_size, channel, height, width]
    img = torch.unsqueeze(img, dim=0)
    # Read the class-index -> class-name mapping
    json_path = './class_indices.json'
    assert os.path.exists(json_path), "file: '{}' does not exist.".format(json_path)
    with open(json_path, 'r') as f:
        class_indict = json.load(f)
    # Build the model without auxiliary heads
    model = GoogLeNet(num_classes=5, aux_logits=False)
    # Load the trained weights; strict=False because the checkpoint also holds
    # the auxiliary-classifier weights that this model does not have
    weights_path = './googleNet.pth'
    assert os.path.exists(weights_path), "file: '{}' does not exist.".format(weights_path)
    model.load_state_dict(torch.load(weights_path, map_location=device), strict=False)

    # Switch to evaluation mode
    model.eval()
    # No gradient tracking during inference
    with torch.no_grad():
        outputs = torch.squeeze(model(img.to(device))).cpu()
        predict = torch.softmax(outputs, dim=0)
        predict_cla = torch.argmax(predict).item()
    print_res = "class: {} prob:{:.3}".format(class_indict[str(predict_cla)],
                                              predict[predict_cla].item())
    plt.grid(False)
    plt.axis(False)
    plt.title(print_res)
    for i, prob in enumerate(predict):
        print("class: {:10}   prob: {:.3}".format(class_indict[str(i)],
                                                  prob.item()))
    plt.show()


if __name__ == '__main__':
    main()

运行结果:

class: daisy        prob: 0.0728
class: dandelion    prob: 0.175
class: roses        prob: 0.0419
class: sunflowers   prob: 0.0971
class: tulips       prob: 0.613

image.png

使用TensorFlow搭建GoogLeNet

Inception块

image.png

# Inception block
class Inception(layers.Layer):
    """Inception module: four parallel branches concatenated along channels.

    Args:
        ch1x1: Filters of the 1x1 convolution branch.
        ch3x3red: Filters of the 1x1 reduction placed before the 3x3 conv.
        ch3x3: Filters of the 3x3 convolution branch.
        ch5x5red: Filters of the 1x1 reduction placed before the 5x5 conv.
        ch5x5: Filters of the 5x5 convolution branch.
        pool_proj: Filters of the 1x1 projection after max pooling.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj, **kwargs):
        super(Inception, self).__init__(**kwargs)

        self.branch1 = layers.Conv2D(ch1x1, kernel_size=1, activation='relu')
        # padding='same' keeps the output the same spatial size as the input
        self.branch2 = Sequential([
            layers.Conv2D(ch3x3red, kernel_size=1, activation='relu'),
            layers.Conv2D(ch3x3, kernel_size=3, padding='same', activation='relu')
        ])

        # The 5x5 convolution also needs padding='same'; with the default
        # 'valid' padding this branch would be 4 pixels smaller per dimension
        # and could not be concatenated with the others.
        self.branch3 = Sequential([
            layers.Conv2D(ch5x5red, kernel_size=1, activation='relu'),
            layers.Conv2D(ch5x5, kernel_size=5, padding='same', activation='relu')
        ])

        self.branch4 = Sequential([
            layers.MaxPool2D(pool_size=3, strides=1, padding='same'),
            layers.Conv2D(pool_proj, kernel_size=1, activation='relu')
        ])

    # Forward pass: run every branch, then concatenate along the channel axis
    def call(self, x, **kwargs):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)
        return layers.concatenate([branch1, branch2, branch3, branch4])

辅助分类器

image.png

# Auxiliary classifier
class InceptionAux(layers.Layer):
    """Auxiliary classifier head attached to an intermediate Inception output.

    Args:
        num_classes: Number of classes predicted by the final dense layer.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).

    Shape comments use the NHWC order of the surrounding network and assume
    14x14 input feature maps.
    """

    def __init__(self, num_classes, **kwargs):
        super(InceptionAux, self).__init__(**kwargs)
        # The attribute name must match the one used in call()
        self.averagePool = layers.AvgPool2D(pool_size=5, strides=3)
        self.conv = layers.Conv2D(128, kernel_size=1, activation='relu')

        self.fc1 = layers.Dense(1024, activation='relu')
        self.fc2 = layers.Dense(num_classes)
        self.softmax = layers.Softmax()

    # Forward pass
    def call(self, inputs, **kwargs):
        # aux1: N x 14 x 14 x 512, aux2: N x 14 x 14 x 528
        x = self.averagePool(inputs)
        # aux1: N x 4 x 4 x 512, aux2: N x 4 x 4 x 528
        x = self.conv(x)
        # N x 4 x 4 x 128
        x = layers.Flatten()(x)
        # 70% dropout, as specified for the auxiliary classifier
        x = layers.Dropout(rate=0.7)(x)
        # N x 2048
        x = self.fc1(x)
        x = layers.Dropout(rate=0.7)(x)
        # N x 1024
        x = self.fc2(x)
        # N x num_classes
        x = self.softmax(x)

        return x

GoogLeNet网络

def GoogLeNet(im_height=224, im_width=224, class_num=1000, aux_logits=False):
    """Assemble the GoogLeNet (Inception v1) Keras functional model.

    Args:
        im_height: Input image height in pixels.
        im_width: Input image width in pixels.
        class_num: Number of classes produced by every classifier head.
        aux_logits: When True, the model also outputs the two auxiliary
            classifiers attached after inception_4a and inception_4d.

    Returns:
        A ``models.Model`` whose output is ``[aux1, aux2, main]`` when
        ``aux_logits`` is True, otherwise the main softmax output only.

    Shape comments use NHWC order and assume the default 224x224 input.
    """

    def downsample(layer_name):
        # 3x3 max pooling with stride 2 and "SAME" padding
        return layers.MaxPool2D(pool_size=3, strides=2, padding="SAME", name=layer_name)

    inputs = layers.Input(shape=(im_height, im_width, 3), dtype="float32")
    aux_heads = []

    # ---- stem ----
    # (None, 224, 224, 3) -> (None, 112, 112, 64)
    net = layers.Conv2D(64, kernel_size=7, strides=2, padding="SAME", activation="relu", name="conv2d_1")(inputs)
    # -> (None, 56, 56, 64)
    net = downsample("maxpool_1")(net)
    # -> (None, 56, 56, 64)
    net = layers.Conv2D(64, kernel_size=1, activation="relu", name="conv2d_2")(net)
    # -> (None, 56, 56, 192)
    net = layers.Conv2D(192, kernel_size=3, padding="SAME", activation="relu", name="conv2d_3")(net)
    # -> (None, 28, 28, 192)
    net = downsample("maxpool_2")(net)

    # ---- stage 3 ----
    # -> (None, 28, 28, 256)
    net = Inception(64, 96, 128, 16, 32, 32, name="inception_3a")(net)
    # -> (None, 28, 28, 480)
    net = Inception(128, 128, 192, 32, 96, 64, name="inception_3b")(net)
    # -> (None, 14, 14, 480)
    net = downsample("maxpool_3")(net)

    # ---- stage 4 ----
    # -> (None, 14, 14, 512)
    net = Inception(192, 96, 208, 16, 48, 64, name="inception_4a")(net)
    if aux_logits:
        # First auxiliary head, fed by inception_4a
        aux_heads.append(InceptionAux(class_num, name="aux_1")(net))
    # -> (None, 14, 14, 512)
    net = Inception(160, 112, 224, 24, 64, 64, name="inception_4b")(net)
    # -> (None, 14, 14, 512)
    net = Inception(128, 128, 256, 24, 64, 64, name="inception_4c")(net)
    # -> (None, 14, 14, 528)
    net = Inception(112, 144, 288, 32, 64, 64, name="inception_4d")(net)
    if aux_logits:
        # Second auxiliary head, fed by inception_4d
        aux_heads.append(InceptionAux(class_num, name="aux_2")(net))
    # -> (None, 14, 14, 832)
    net = Inception(256, 160, 320, 32, 128, 128, name="inception_4e")(net)
    # -> (None, 7, 7, 832)
    net = downsample("maxpool_4")(net)

    # ---- stage 5 ----
    # -> (None, 7, 7, 832)
    net = Inception(256, 160, 320, 32, 128, 128, name="inception_5a")(net)
    # -> (None, 7, 7, 1024)
    net = Inception(384, 192, 384, 48, 128, 128, name="inception_5b")(net)

    # ---- main classifier ----
    # -> (None, 1, 1, 1024)
    net = layers.AvgPool2D(pool_size=7, strides=1, name="avgpool_1")(net)
    # -> (None, 1024)
    net = layers.Flatten(name="output_flatten")(net)
    net = layers.Dropout(rate=0.4, name="output_dropout")(net)
    # -> (None, class_num)
    net = layers.Dense(class_num, name="output_dense")(net)
    main_out = layers.Softmax(name="aux_3")(net)

    if not aux_logits:
        return models.Model(inputs=inputs, outputs=main_out)
    return models.Model(inputs=inputs, outputs=aux_heads + [main_out])

模型总代码

from tensorflow.keras import layers, models, Model, Sequential


def GoogLeNet(im_height=224, im_width=224, num_classes=1000, aux_logits=False):
    """Assemble the GoogLeNet (Inception v1) Keras functional model.

    Args:
        im_height: Input image height in pixels.
        im_width: Input image width in pixels.
        num_classes: Number of classes produced by every classifier head.
        aux_logits: When True, the model also outputs the two auxiliary
            classifiers attached after inception_4a and inception_4d.

    Returns:
        A ``models.Model`` whose output is ``[aux1, aux2, main]`` when
        ``aux_logits`` is True, otherwise the main softmax output only.

    Shape comments use [batch, height, width, channel] order and assume the
    default 224x224 input.
    """

    def downsample(layer_name):
        # 3x3 max pooling with stride 2 and "SAME" padding
        return layers.MaxPool2D(pool_size=3, strides=2, padding="SAME", name=layer_name)

    inputs = layers.Input(shape=(im_height, im_width, 3), dtype="float32")
    aux_heads = []

    # ---- stem ----
    # (None, 224, 224, 3) -> (None, 112, 112, 64)
    net = layers.Conv2D(64, kernel_size=7, strides=2, padding="SAME", activation="relu", name="conv2d_1")(inputs)
    # -> (None, 56, 56, 64)
    net = downsample("maxpool_1")(net)
    # -> (None, 56, 56, 64)
    net = layers.Conv2D(64, kernel_size=1, activation="relu", name="conv2d_2")(net)
    # -> (None, 56, 56, 192)
    net = layers.Conv2D(192, kernel_size=3, padding="SAME", activation="relu", name="conv2d_3")(net)
    # -> (None, 28, 28, 192)
    net = downsample("maxpool_2")(net)

    # ---- stage 3 ----
    # -> (None, 28, 28, 256)
    net = Inception(64, 96, 128, 16, 32, 32, name="inception_3a")(net)
    # -> (None, 28, 28, 480)
    net = Inception(128, 128, 192, 32, 96, 64, name="inception_3b")(net)
    # -> (None, 14, 14, 480)
    net = downsample("maxpool_3")(net)

    # ---- stage 4 ----
    # -> (None, 14, 14, 512)
    net = Inception(192, 96, 208, 16, 48, 64, name="inception_4a")(net)
    if aux_logits:
        # First auxiliary head, fed by inception_4a
        aux_heads.append(InceptionAux(num_classes, name="aux_1")(net))
    # -> (None, 14, 14, 512)
    net = Inception(160, 112, 224, 24, 64, 64, name="inception_4b")(net)
    # -> (None, 14, 14, 512)
    net = Inception(128, 128, 256, 24, 64, 64, name="inception_4c")(net)
    # -> (None, 14, 14, 528)
    net = Inception(112, 144, 288, 32, 64, 64, name="inception_4d")(net)
    if aux_logits:
        # Second auxiliary head, fed by inception_4d
        aux_heads.append(InceptionAux(num_classes, name="aux_2")(net))
    # -> (None, 14, 14, 832)
    net = Inception(256, 160, 320, 32, 128, 128, name="inception_4e")(net)
    # -> (None, 7, 7, 832)
    net = downsample("maxpool_4")(net)

    # ---- stage 5 ----
    # -> (None, 7, 7, 832)
    net = Inception(256, 160, 320, 32, 128, 128, name="inception_5a")(net)
    # -> (None, 7, 7, 1024)
    net = Inception(384, 192, 384, 48, 128, 128, name="inception_5b")(net)

    # ---- main classifier ----
    # -> (None, 1, 1, 1024)
    net = layers.AvgPool2D(pool_size=7, strides=1, name="avgpool_1")(net)
    # -> (None, 1024)
    net = layers.Flatten(name="output_flatten")(net)
    net = layers.Dropout(rate=0.4, name="output_dropout")(net)
    # -> (None, num_classes)
    net = layers.Dense(num_classes, name="output_dense")(net)
    main_out = layers.Softmax(name="aux_3")(net)

    if not aux_logits:
        return models.Model(inputs=inputs, outputs=main_out)
    return models.Model(inputs=inputs, outputs=aux_heads + [main_out])


class Inception(layers.Layer):
    """Inception module: four parallel branches concatenated along channels.

    Args:
        ch1x1: Filters of the 1x1 convolution branch.
        ch3x3red: Filters of the 1x1 reduction placed before the 3x3 conv.
        ch3x3: Filters of the 3x3 convolution branch.
        ch5x5red: Filters of the 1x1 reduction placed before the 5x5 conv.
        ch5x5: Filters of the 5x5 convolution branch.
        pool_proj: Filters of the 1x1 projection after max pooling.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``).

    All spatial operations use "SAME" padding with stride 1, so each branch
    keeps the input's height and width.
    """

    def __init__(self, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj, **kwargs):
        super().__init__(**kwargs)

        # Branch 1: 1x1 convolution only
        self.branch1 = layers.Conv2D(ch1x1, kernel_size=1, activation="relu")

        # Branch 2: 1x1 reduction, then 3x3 convolution
        reduce_3x3 = layers.Conv2D(ch3x3red, kernel_size=1, activation="relu")
        conv_3x3 = layers.Conv2D(ch3x3, kernel_size=3, padding="SAME", activation="relu")
        self.branch2 = Sequential([reduce_3x3, conv_3x3])

        # Branch 3: 1x1 reduction, then 5x5 convolution
        reduce_5x5 = layers.Conv2D(ch5x5red, kernel_size=1, activation="relu")
        conv_5x5 = layers.Conv2D(ch5x5, kernel_size=5, padding="SAME", activation="relu")
        self.branch3 = Sequential([reduce_5x5, conv_5x5])

        # Branch 4: 3x3 max pooling, then 1x1 projection.
        # strides is set explicitly because it defaults to pool_size.
        pool = layers.MaxPool2D(pool_size=3, strides=1, padding="SAME")
        projection = layers.Conv2D(pool_proj, kernel_size=1, activation="relu")
        self.branch4 = Sequential([pool, projection])

    def call(self, inputs, **kwargs):
        """Run the four branches on ``inputs`` and concatenate the results."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        return layers.concatenate([branch(inputs) for branch in branches])


class InceptionAux(layers.Layer):
    """Auxiliary classifier head.

    Pipeline: 5x5 average pooling with stride 3 -> 1x1 convolution (128
    filters, ReLU) -> flatten -> dropout(0.7) -> dense(1024, ReLU) ->
    dropout(0.7) -> dense(num_classes) -> softmax.

    Args:
        num_classes: number of units of the final dense layer.
        **kwargs: forwarded unchanged to ``layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, num_classes, **kwargs):
        super(InceptionAux, self).__init__(**kwargs)
        self.averagePool = layers.AvgPool2D(pool_size=5, strides=3)
        self.conv = layers.Conv2D(128, kernel_size=1, activation="relu")

        self.fc1 = layers.Dense(1024, activation="relu")
        self.fc2 = layers.Dense(num_classes)
        self.softmax = layers.Softmax()

        # Weight-free layers are created once here rather than inside call(),
        # so that no new layer objects are built on every forward pass.
        self.flatten = layers.Flatten()
        self.dropout1 = layers.Dropout(rate=0.7)
        self.dropout2 = layers.Dropout(rate=0.7)

    def call(self, inputs, training=None, **kwargs):
        """Return class probabilities for ``inputs``.

        Args:
            inputs: feature map to classify.
            training: forwarded to both dropout layers; ``None`` lets Keras
                decide from the calling context.
            **kwargs: accepted for compatibility with the original signature.

        Returns:
            Output of the softmax layer, one row of ``num_classes`` values per sample.
        """
        # Shape notes assume the default channels_last layout and a 14 x 14
        # input feature map with 512 (aux1) or 528 (aux2) channels - TODO confirm
        # against the call sites in GoogLeNet().
        x = self.averagePool(inputs)
        # aux1: N x 4 x 4 x 512, aux2: N x 4 x 4 x 528
        x = self.conv(x)
        # N x 4 x 4 x 128
        x = self.flatten(x)
        # N x 2048
        x = self.dropout1(x, training=training)
        x = self.fc1(x)
        # N x 1024
        x = self.dropout2(x, training=training)
        x = self.fc2(x)
        # N x num_classes
        x = self.softmax(x)

        return x

训练脚本

import matplotlib.pyplot as plt
from model import GoogLeNet
import tensorflow as tf
import json
import os
import time
import random
from utils import read_split_data

# Standard CUDA environment variables: enumerate devices in PCI bus order and
# expose only device 0 to this process.
os.environ.update({
    'CUDA_DEVICE_ORDER': "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': '0',
})


def main():
    """Train GoogLeNet with a hand-written training loop and keep the best weights.

    Steps:
      1. Enable GPU memory growth when a GPU is visible.
      2. Obtain the train/validation split from ``read_split_data`` and build
         two ``tf.data`` pipelines (resize, optional random flip, scale to [-1, 1]).
      3. Train for ``epochs`` epochs; the total loss is the main loss plus
         0.3 times each of the two auxiliary losses.
      4. After every epoch evaluate on the validation split and save the
         weights to ``./weights/myGoogLeNet.ckpt`` only when the validation
         loss improves on the best value seen so far.
    """
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)
            # Same effect as the interactive helper exit(-1), without relying on `site`.
            raise SystemExit(-1)
    image_path = r"D:\卷积神经网络PPT\AlexNet\Alex_tf\data\flower_photos"
    train_images_path, train_images_label, val_images_path, val_images_label = read_split_data(
        root=image_path, val_rate=0.2)
    os.makedirs('weights', exist_ok=True)
    train_num = len(train_images_path)
    valid_num = len(val_images_path)

    im_height = 224
    im_width = 224
    batch_size = 64
    epochs = 30

    # class dict
    with open('./class_indices.json') as f:
        cla_dict = json.load(f)
    num_classes = len(cla_dict)
    print('using {} images for training, {} images for validation'.format(len(train_images_path), len(val_images_path)))

    def load_image(img_path):
        """Read an image file and return it as float32 in [0, 1], resized to the input size."""
        image = tf.io.read_file(img_path)
        # channels=3 always yields an RGB tensor (grayscale files are expanded)
        # and gives the pipeline a static channel dimension.
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.convert_image_dtype(image, tf.float32)
        return tf.image.resize(image, [im_height, im_width])

    def process_train_img(img_path, label):
        """Training sample: one-hot label, random horizontal flip, pixels in [-1, 1]."""
        label = tf.one_hot(label, depth=num_classes)
        image = tf.image.random_flip_left_right(load_image(img_path))
        image = (image - 0.5) / 0.5
        return image, label

    def process_valid_img(img_path, label):
        """Validation sample: one-hot label, no augmentation, pixels in [-1, 1]."""
        label = tf.one_hot(label, depth=num_classes)
        image = (load_image(img_path) - 0.5) / 0.5
        return image, label

    AUTOTUNE = tf.data.experimental.AUTOTUNE
    # Training pipeline: shuffle over the whole set, decode in parallel, repeat forever.
    train_ds = tf.data.Dataset.from_tensor_slices((train_images_path, train_images_label))
    train_ds = train_ds.shuffle(buffer_size=train_num) \
        .map(process_train_img, num_parallel_calls=AUTOTUNE) \
        .repeat().batch(batch_size).prefetch(AUTOTUNE)
    # Validation pipeline.
    valid_ds = tf.data.Dataset.from_tensor_slices((val_images_path, val_images_label))
    valid_ds = valid_ds.map(process_valid_img, num_parallel_calls=AUTOTUNE) \
        .repeat() \
        .batch(batch_size)

    # Build the model from the values computed above instead of repeating literals.
    model = GoogLeNet(im_height=im_height, im_width=im_width, num_classes=num_classes, aux_logits=True)
    model.summary()

    # Low-level training API: explicit loss, optimizer and metric objects.
    # from_logits=False is used because the losses below are fed the model outputs directly.
    criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.0003)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_acc = tf.keras.metrics.CategoricalAccuracy(name='train_acc')

    valid_loss = tf.keras.metrics.Mean(name='valid_loss')
    valid_acc = tf.keras.metrics.CategoricalAccuracy(name='valid_acc')

    @tf.function
    def train_step(images, labels):
        """Run one optimisation step and update the training metrics."""
        with tf.GradientTape() as tape:
            aux1, aux2, output = model(images, training=True)
            loss1 = criterion(labels, aux1)
            loss2 = criterion(labels, aux2)
            loss3 = criterion(labels, output)
            # Auxiliary losses contribute with weight 0.3 each.
            loss = loss1 * 0.3 + loss2 * 0.3 + loss3
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_loss(loss)
        train_acc(labels, output)

    @tf.function
    def valid_step(images, labels):
        """Evaluate one batch using only the main output and update the validation metrics."""
        _, _, output = model(images, training=False)
        v_loss = criterion(labels, output)

        valid_loss(v_loss)
        valid_acc(labels, output)

    best_valid_loss = float('inf')
    # At least one step per epoch, even when a split is smaller than one batch.
    train_step_num = max(1, train_num // batch_size)
    valid_step_num = max(1, valid_num // batch_size)
    for epoch in range(1, epochs + 1):
        train_loss.reset_states()
        train_acc.reset_states()
        valid_loss.reset_states()
        valid_acc.reset_states()

        t1 = time.perf_counter()
        # Both datasets repeat forever, so each epoch is bounded explicitly with take().
        for images, labels in train_ds.take(train_step_num):
            train_step(images, labels)
        print(time.perf_counter() - t1)

        for images, labels in valid_ds.take(valid_step_num):
            valid_step(images, labels)

        epoch_valid_loss = float(valid_loss.result())
        template = 'Epoch\t{}\tTrain Loss\t{}\tTrain Acc\t{}\tValid Loss\t{}\tValid Acc\t{}'
        print(template.format(epoch,
                              float(train_loss.result()),
                              float(train_acc.result()) * 100,
                              epoch_valid_loss,
                              float(valid_acc.result()) * 100))
        # Remember the best loss so that only improved weights overwrite the checkpoint.
        if epoch_valid_loss < best_valid_loss:
            best_valid_loss = epoch_valid_loss
            model.save_weights('./weights/myGoogLeNet.ckpt', save_format='tf')

# Start training only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

预测脚本

import os
import json
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from model import GoogLeNet


def main():
    """Classify a single image with the trained GoogLeNet and plot the result.

    Loads the image, applies the same value scaling as the training script,
    restores the weights from ``./weights/myGoogLeNet.ckpt``, prints the
    probability of every class and shows the image titled with the top
    prediction.
    """
    im_height = 224
    im_width = 224
    # Load the image and preprocess it.
    img_path = r'D:/卷积神经网络PPT/AlexNet/Alex_Torch/img.png'
    img = Image.open(img_path).convert("RGB")

    # PIL's resize expects (width, height).
    img = img.resize((im_width, im_height))
    plt.imshow(img)

    # Scale to [0, 1] and then to [-1, 1]; the training script feeds the
    # network (image - 0.5) / 0.5, so inference must use the same range.
    img = np.asarray(img) / 255.
    img = (img - 0.5) / 0.5
    img = np.expand_dims(img, 0)

    # Mapping from class index (as a string key) to class name.
    json_path = './class_indices.json'
    with open(json_path, 'r') as f:
        class_dict = json.load(f)
    # Build the model without auxiliary outputs and load the trained weights.
    # The class count comes from the index file rather than a hard-coded literal.
    model = GoogLeNet(im_height=im_height, im_width=im_width,
                      num_classes=len(class_dict), aux_logits=False)
    weights_path = './weights/myGoogLeNet.ckpt'
    model.load_weights(weights_path)

    # Drop the batch dimension.
    result = np.squeeze(model.predict(img))
    predict_class = np.argmax(result)  # index of the highest probability
    # Report the predicted class and its probability.
    print_res = "class: {}   prob: {:.3}".format(class_dict[str(predict_class)],
                                                 result[predict_class])
    plt.title(print_res)
    for i, prob in enumerate(result):
        print("class: {:10}   prob: {:.3}".format(class_dict[str(i)],
                                                  prob))
    plt.show()


# Run the prediction only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
posted @ 2022-08-13 11:42  里列昂遗失的记事本  阅读(138)  评论(0)  编辑  收藏  举报