常用数据集WebVision介绍(附PyTorch下的简单使用)

WebVision数据集介绍

官方下载地址

WebVision数据集常用于开集/闭集噪声学习、长尾噪声学习方法在真实数据集上的评估。根据[2]的统计,干净样本占70%,OOD噪声占25%,ID噪声占5%。

由于数据集本身较大,论文中使用的都是其中很小的一部分,进入下载页面,选择《WebVision Dataset 1.0》《Resized Images (small version)》:

  • 一般需要数据集的训练集《Google Images Resized (16 GB) 》
  • 验证集《Validation Images Resized (834 MB)》
  • 这两个集合的标签《Metadata》下的《Training & Validation Labels (183 MB)》。

注意,由于测试集不提供标签,评估论文方法性能时不使用测试集,而是使用验证集。另外,大部分实验仅使用了google子集,因此这里也只使用google子集。

将下载好的数据集整理一下,放到目录 ~/data/webvision1.0 下,目录结构如下:

├─google
│  ├─q0001
│  ├─q0002
│  ├─...
│  ├─q1631
│  └─q1632
├─info
└─val_images_256

info 目录下重要的几个txt:

  1. queries_google.txt:1632行,与google目录下的文件夹相对应,每行是一个查询词。
  2. synsets.txt:1000行表示1000个类,标签 \(i(0\le i\le 999)\) 的具体含义在第 \(i+1\)行。
  3. train_filelist_google.txt:每行表示一个图片的路径和标签,路径以 google/ 开头,是相对于数据集根目录(~/data/webvision1.0)的,下面的代码也是直接将其与根目录拼接。
  4. val_filelist.txt:同样每行是一个图片的路径和标签,但路径是相对于val_images_256目录的。

在PyTorch中使用WebVision数据集

大多数论文使用mini-WebVision,即仅使用前50个类(标签0~49)。

数据集 & DataLoader

# webvision.py
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import os


class Webvision(Dataset):
    """WebVision 1.0 (Google subset) classification dataset.

    The sample list is read from ``info/train_filelist_google.txt`` when
    ``train`` is true and from ``info/val_filelist.txt`` otherwise. Each
    non-empty line holds an image path followed by an integer label; only
    samples whose label is smaller than ``num_classes`` are kept.

    Args:
        root: Dataset root directory (``~`` is expanded).
        train: Use the training list when true, the validation list otherwise.
        transform: Optional callable applied to the loaded RGB image. When
            ``None`` the image is returned untransformed.
        num_classes: Keep only labels in ``[0, num_classes)``.

    Attributes:
        data: Image paths exactly as written in the file list.
        targets: Integer labels, aligned with ``data``.
    """

    def __init__(self, root, train=True, transform=None, num_classes=50):
        root = os.path.expanduser(root)
        self.root = root
        self.transform = transform
        self.train = train

        list_name = 'train_filelist_google.txt' if train else 'val_filelist.txt'
        data, targets = [], []
        with open(os.path.join(root, 'info', list_name), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines in the list file.
                    continue
                # Split on the last whitespace run so that a path containing
                # spaces does not break the (path, label) unpacking.
                img, target = line.rsplit(None, 1)
                target = int(target)
                if target < num_classes:
                    data.append(img)
                    targets.append(target)
        self.data = data
        self.targets = targets

    def __len__(self):
        """Return the number of retained samples."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(image, target)`` for the sample at ``index``."""
        img_path = self.data[index]
        target = self.targets[index]
        if self.train:
            # Training paths are joined directly onto the dataset root.
            full_path = os.path.join(self.root, img_path)
        else:
            # Validation paths are relative to the val_images_256 directory.
            full_path = os.path.join(self.root, 'val_images_256', img_path)
        # Close the underlying file as soon as the pixels have been decoded;
        # convert() returns an independent in-memory image.
        with Image.open(full_path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, target


class WebvisionDataloader:
    """Builds the training and validation ``DataLoader`` objects for WebVision.

    Args:
        batch_size: Batch size used by both loaders.
        num_classes: Forwarded to ``Webvision`` to restrict the label range.
        num_workers: Number of loader worker processes.
        root: Dataset root directory, forwarded to ``Webvision``.
    """

    def __init__(self, batch_size=128, num_classes=50, num_workers=8, root='~/data/webvision1.0'):
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.num_workers = num_workers
        self.root = root

        # Per-channel mean / std handed to Normalize in both pipelines.
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)

        # Training: resize, random 299 crop, random flip, then tensor + normalise.
        train_steps = [
            transforms.Resize(320),
            transforms.RandomResizedCrop(299),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean, std),
        ]
        # Evaluation: resize, centre 299 crop, then tensor + normalise.
        test_steps = [
            transforms.Resize(320),
            transforms.CenterCrop(299),
            transforms.ToTensor(),
            transforms.Normalize(mean, std),
        ]
        self.transform_train = transforms.Compose(train_steps)
        self.transform_test = transforms.Compose(test_steps)

    def _loader(self, is_train, transform):
        """Create a loader for one split; only the training split is shuffled."""
        split = Webvision(root=self.root, train=is_train, transform=transform,
                          num_classes=self.num_classes)
        return DataLoader(dataset=split, batch_size=self.batch_size,
                          shuffle=is_train, num_workers=self.num_workers,
                          pin_memory=True)

    def train(self):
        """Return the shuffled training loader."""
        return self._loader(True, self.transform_train)

    def test(self):
        """Return the unshuffled validation loader."""
        return self._loader(False, self.transform_test)

网络结构

大部分论文使用InceptionResNetV2[3]作为该数据集的网络。

# InceptionResNetV2.py
import torch
from torch import nn


class BasicConv2d(nn.Module):
    def __init__(self, in_planes, out_planes, kernel_size, stride, padding: int | tuple[int, int] = 0):
        super(BasicConv2d, self).__init__()
        self.conv = nn.Conv2d(in_planes, out_planes,
                              kernel_size=kernel_size, stride=stride,
                              padding=padding, bias=False)  # verify bias false
        self.bn = nn.BatchNorm2d(out_planes,
                                 eps=0.001,  # value found in tensorflow
                                 momentum=0.1,  # default pytorch value
                                 affine=True)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        return x


class Mixed_5b(nn.Module):
    """Four parallel branches over a 192-channel input, concatenated on channels.

    The branches produce 96, 64, 96 and 64 channels, i.e. 320 in total.
    """

    def __init__(self):
        super().__init__()

        # Plain 1x1 projection.
        self.branch0 = BasicConv2d(192, 96, 1, 1)

        # 1x1 reduction, then a padded 5x5 convolution.
        self.branch1 = nn.Sequential(
            BasicConv2d(192, 48, 1, 1),
            BasicConv2d(48, 64, 5, 1, padding=2),
        )

        # 1x1 reduction, then two padded 3x3 convolutions.
        self.branch2 = nn.Sequential(
            BasicConv2d(192, 64, 1, 1),
            BasicConv2d(64, 96, 3, 1, padding=1),
            BasicConv2d(96, 96, 3, 1, padding=1),
        )

        # Size-preserving 3x3 average pooling, then a 1x1 projection.
        self.branch3 = nn.Sequential(
            nn.AvgPool2d(kernel_size=3, stride=1, padding=1, count_include_pad=False),
            BasicConv2d(192, 64, 1, 1),
        )

    def forward(self, x):
        """Run every branch on ``x`` and concatenate the results along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
        return torch.cat([branch(x) for branch in branches], dim=1)


class Block35(nn.Module):
    """Residual inception block operating on 320-channel feature maps.

    Three branches (32 + 32 + 64 channels) are concatenated, projected back to
    320 channels by a 1x1 convolution, multiplied by ``scale`` and added to the
    input before a final ReLU.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
    """

    def __init__(self, scale=1.0):
        super().__init__()

        self.scale = scale

        self.branch0 = BasicConv2d(320, 32, 1, 1)

        self.branch1 = nn.Sequential(
            BasicConv2d(320, 32, 1, 1),
            BasicConv2d(32, 32, 3, 1, padding=1),
        )

        self.branch2 = nn.Sequential(
            BasicConv2d(320, 32, 1, 1),
            BasicConv2d(32, 48, 3, 1, padding=1),
            BasicConv2d(48, 64, 3, 1, padding=1),
        )

        # 1x1 projection of the concatenated branches back to the input width.
        self.conv2d = nn.Conv2d(128, 320, kernel_size=1, stride=1)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``relu(conv(cat(branches)) * scale + x)``."""
        merged = torch.cat([self.branch0(x), self.branch1(x), self.branch2(x)], dim=1)
        residual = self.conv2d(merged)
        return self.relu(residual * self.scale + x)


class Mixed_6a(nn.Module):
    """Down-sampling block: every branch uses stride 2 on a 320-channel input.

    The two convolutional branches give 384 channels each and the max-pool
    branch keeps the 320 input channels, so the output has 1088 channels.
    """

    def __init__(self):
        super().__init__()

        # Single strided 3x3 convolution.
        self.branch0 = BasicConv2d(320, 384, 3, 2)

        # 1x1 reduction, padded 3x3, then a strided 3x3 convolution.
        self.branch1 = nn.Sequential(
            BasicConv2d(320, 256, 1, 1),
            BasicConv2d(256, 256, 3, 1, padding=1),
            BasicConv2d(256, 384, 3, 2),
        )

        # Strided max pooling of the raw input.
        self.branch2 = nn.MaxPool2d(kernel_size=3, stride=2)

    def forward(self, x):
        """Run the three branches on ``x`` and concatenate along dim 1."""
        pieces = [self.branch0(x), self.branch1(x), self.branch2(x)]
        return torch.cat(pieces, dim=1)


class Block17(nn.Module):
    """Residual inception block operating on 1088-channel feature maps.

    Two branches (192 + 192 channels) are concatenated, projected back to 1088
    channels by a 1x1 convolution, multiplied by ``scale`` and added to the
    input before a final ReLU.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
    """

    def __init__(self, scale=1.0):
        super().__init__()

        self.scale = scale

        self.branch0 = BasicConv2d(1088, 192, 1, 1)

        # 1x1 reduction followed by a 1x7 and a 7x1 convolution, each padded
        # so the spatial size is preserved.
        self.branch1 = nn.Sequential(
            BasicConv2d(1088, 128, 1, 1),
            BasicConv2d(128, 160, (1, 7), 1, padding=(0, 3)),
            BasicConv2d(160, 192, (7, 1), 1, padding=(3, 0)),
        )

        self.conv2d = nn.Conv2d(384, 1088, kernel_size=1, stride=1)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``relu(conv(cat(branches)) * scale + x)``."""
        merged = torch.cat([self.branch0(x), self.branch1(x)], dim=1)
        residual = self.conv2d(merged)
        return self.relu(residual * self.scale + x)


class Mixed_7a(nn.Module):
    """Down-sampling block: every branch uses stride 2 on a 1088-channel input.

    The convolutional branches give 384, 288 and 320 channels and the max-pool
    branch keeps the 1088 input channels, so the output has 2080 channels.
    """

    def __init__(self):
        super().__init__()

        # 1x1 reduction, then a strided 3x3 convolution.
        self.branch0 = nn.Sequential(
            BasicConv2d(1088, 256, 1, 1),
            BasicConv2d(256, 384, 3, 2),
        )

        # Same shape as branch0 but with a narrower output.
        self.branch1 = nn.Sequential(
            BasicConv2d(1088, 256, 1, 1),
            BasicConv2d(256, 288, 3, 2),
        )

        # 1x1 reduction, padded 3x3, then a strided 3x3 convolution.
        self.branch2 = nn.Sequential(
            BasicConv2d(1088, 256, 1, 1),
            BasicConv2d(256, 288, 3, 1, padding=1),
            BasicConv2d(288, 320, 3, 2),
        )

        # Strided max pooling of the raw input.
        self.branch3 = nn.MaxPool2d(kernel_size=3, stride=2)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
        return torch.cat([branch(x) for branch in branches], dim=1)


class Block8(nn.Module):
    """Residual inception block operating on 2080-channel feature maps.

    Two branches (192 + 256 channels) are concatenated, projected back to 2080
    channels by a 1x1 convolution, multiplied by ``scale`` and added to the
    input. The trailing ReLU is skipped when ``noReLU`` is true.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
        noReLU: When true, no ReLU module is created or applied.
    """

    def __init__(self, scale=1.0, noReLU=False):
        super().__init__()

        self.scale = scale
        self.noReLU = noReLU

        self.branch0 = BasicConv2d(2080, 192, 1, 1)

        # 1x1 reduction followed by a 1x3 and a 3x1 convolution, each padded
        # so the spatial size is preserved.
        self.branch1 = nn.Sequential(
            BasicConv2d(2080, 192, 1, 1),
            BasicConv2d(192, 224, (1, 3), 1, padding=(0, 1)),
            BasicConv2d(224, 256, (3, 1), 1, padding=(1, 0)),
        )

        self.conv2d = nn.Conv2d(448, 2080, kernel_size=1, stride=1)
        if not self.noReLU:
            self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return the scaled residual sum, with ReLU unless ``noReLU`` is set."""
        merged = torch.cat([self.branch0(x), self.branch1(x)], dim=1)
        summed = self.conv2d(merged) * self.scale + x
        if self.noReLU:
            return summed
        return self.relu(summed)


def _make_layer(block, num_blocks, **kwargs):
    """Chain ``num_blocks`` new ``block(**kwargs)`` instances in an ``nn.Sequential``."""
    return nn.Sequential(*(block(**kwargs) for _ in range(num_blocks)))


class InceptionResNetV2(nn.Module):
    """Inception-ResNet-v2 classifier assembled from the blocks in this module.

    The network is a convolutional stem, ``Mixed_5b``, 10 x ``Block35``,
    ``Mixed_6a``, 20 x ``Block17``, ``Mixed_7a``, 9 x ``Block8``, one final
    ``Block8`` without ReLU, a 1x1 convolution to 1536 channels, global
    average pooling and a linear classifier.

    Args:
        num_classes: Size of the final linear layer's output.
    """

    def __init__(self, num_classes=1001):
        super(InceptionResNetV2, self).__init__()
        # Special attributes. Nothing in this class reads them; they are kept
        # as-is for external code that may inspect them.
        self.input_space = None
        self.input_size = (299, 299, 3)
        self.mean = None
        self.std = None
        # Modules
        self.conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2)
        self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1)
        self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.maxpool_3a = nn.MaxPool2d(3, stride=2)
        self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1)
        self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1)
        self.maxpool_5a = nn.MaxPool2d(3, stride=2)
        self.mixed_5b = Mixed_5b()
        self.repeat = _make_layer(Block35, 10, scale=0.17)
        self.mixed_6a = Mixed_6a()
        self.repeat_1 = _make_layer(Block17, 20, scale=0.10)
        self.mixed_7a = Mixed_7a()
        self.repeat_2 = _make_layer(Block8, 9, scale=0.20)
        self.block8 = Block8(noReLU=True)
        self.conv2d_7b = BasicConv2d(2080, 1536, kernel_size=1, stride=1)
        # Global average pooling. A fixed AvgPool2d(8) only reduces the map to
        # 1x1 when it is exactly 8x8 (299x299 input); the adaptive variant
        # gives the same result in that case and keeps the classifier input at
        # 1536 features for other feature-map sizes. It has no parameters, so
        # state_dict keys are unaffected.
        self.avgpool_1a = nn.AdaptiveAvgPool2d(1)
        self.last_linear = nn.Linear(1536, num_classes)

    def features(self, input_):
        """Return the 1536-channel feature map produced by the backbone."""
        stages = (
            self.conv2d_1a, self.conv2d_2a, self.conv2d_2b, self.maxpool_3a,
            self.conv2d_3b, self.conv2d_4a, self.maxpool_5a,
            self.mixed_5b, self.repeat,
            self.mixed_6a, self.repeat_1,
            self.mixed_7a, self.repeat_2,
            self.block8, self.conv2d_7b,
        )
        x = input_
        for stage in stages:
            x = stage(x)
        return x

    def logits(self, features):
        """Pool ``features`` globally, flatten them and apply the classifier."""
        x = self.avgpool_1a(features)
        x = torch.flatten(x, 1)
        x = self.last_linear(x)
        return x

    def forward(self, input_):
        """Return class logits for the image batch ``input_``."""
        x = self.features(input_)
        x = self.logits(x)
        return x

训练

import torch
from torch import optim
from torch import nn
from torch.nn import functional as F
import time
from torch import Tensor

from InceptionResNetV2 import InceptionResNetV2
from webvision import WebvisionDataloader


def accuracy(output: Tensor, target: Tensor, topk=(1,)) -> list:
    """Compute top-k accuracy (in percent) for every ``k`` in ``topk``.

    Args:
        output: Per-class scores, one row per sample.
        target: Ground-truth class index for each sample.
        topk: The ``k`` values to evaluate.

    Returns:
        A list with one single-element tensor per ``k``, in ``topk`` order.
    """
    largest_k = max(topk)
    num_samples = target.size(0)

    # Indices of the `largest_k` highest scores per sample, transposed so that
    # row r holds every sample's rank-r prediction.
    _, ranked = output.topk(largest_k, dim=1)
    ranked = ranked.t()
    hits = ranked == target.view(1, -1).expand_as(ranked)

    # A sample counts for top-k when any of its first k ranked rows is a hit.
    return [
        hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / num_samples)
        for k in topk
    ]


# Best top-1 / top-5 accuracy seen so far across evaluate() calls.
best_acc1, best_acc5 = 0, 0


@torch.no_grad()
def evaluate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print a one-line summary.

    Loss and accuracies are averaged over samples (each batch is weighted by
    its size). The module-level ``best_acc1`` / ``best_acc5`` are updated with
    the new maxima.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        ``(avg_loss, avg_acc1, avg_acc5)`` as Python floats.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    global best_acc1, best_acc5
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA, so
    # the function also works on CPU-only machines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    total_acc1 = 0.0
    total_acc5 = 0.0
    total_samples = 0

    for x, labels in dataloader:
        x = x.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        logits = model(x)
        loss = F.cross_entropy(logits, labels)

        # accuracy() is pure tensor arithmetic, so it runs on-device; copying
        # the full logits to the CPU for every batch is unnecessary.
        acc1, acc5 = accuracy(logits, labels, topk=(1, 5))
        batch_size = x.size(0)

        total_loss += loss.item() * batch_size
        total_acc1 += acc1.item() * batch_size
        total_acc5 += acc5.item() * batch_size
        total_samples += batch_size

    if total_samples == 0:
        # Fail with a clear message rather than a bare ZeroDivisionError.
        raise ValueError('evaluate() received a dataloader with no samples')

    avg_loss = total_loss / total_samples
    avg_acc1 = total_acc1 / total_samples
    avg_acc5 = total_acc5 / total_samples
    best_acc1 = max(best_acc1, avg_acc1)
    best_acc5 = max(best_acc5, avg_acc5)
    print(f'Average Loss: {avg_loss:.4f}\t'
          f'Acc@1 {avg_acc1:.2f}(Best {best_acc1:.2f})\t'
          f'Acc@5 {avg_acc5:.2f}({best_acc5:.2f})')
    return avg_loss, avg_acc1, avg_acc5


def train(dataloader, model, optimizer, criterion):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        model: Network to train; it is switched to train mode.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Callable ``criterion(logits, labels)`` returning the loss.

    Prints the loss of the batch with index 100 (if reached) and the elapsed
    wall-clock time of the whole pass.
    """
    end = time.time()
    model.train()
    # Use the device the model already lives on instead of assuming CUDA, so
    # the loop also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    for i, (x, labels) in enumerate(dataloader):
        x = x.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        if i == 100:
            # One loss sample per epoch, taken at batch index 100.
            print(f'Loss: {loss.item():.4f}')
    print(f'Training done in {time.time() - end:.2f}s')


def main_work():
    """Train InceptionResNetV2 on WebVision and evaluate after every epoch.

    Uses SGD (momentum 0.9, weight decay 5e-4) with an initial learning rate
    of 0.1 that is multiplied by 0.1 at epochs 30 and 60, for 80 epochs.
    """
    num_classes = 50
    epochs = 80
    lr = 0.1
    # Prefer CUDA when present; otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    end = time.time()
    # Dataset and model must agree on the class count, so both use the same
    # variable instead of repeating the literal.
    dataloaders = WebvisionDataloader(num_classes=num_classes)
    train_loader = dataloaders.train()
    test_loader = dataloaders.test()
    print(f'Dataloader made in {time.time() - end:.2f}s')

    model = InceptionResNetV2(num_classes).to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss()

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60], gamma=0.1)
    for i in range(epochs):
        print(f'Epoch[{i}] starting')
        end = time.time()
        train(train_loader, model, optimizer, criterion)
        evaluate(model, test_loader)
        # Step the schedule once per epoch, after the optimizer updates.
        scheduler.step()
        print(f'Epoch[{i}] done in {time.time() - end:.2f}s')


def main():
    """Run the whole experiment and print its total wall-clock duration."""
    started_at = time.time()
    main_work()
    elapsed = time.time() - started_at
    print(f'Total time: {elapsed:.2f}s')


if __name__ == '__main__':
    main()

简单运行了几个epoch,设备为NVIDIA GeForce RTX 3090,还是非常耗时的。

Dataloader made in 0.40s
Epoch[0] starting
Loss: 3.4720
Training done in 4325.29s
Average Loss: 3.0656	Acc@1 19.28(Best 19.28)	Acc@5 51.40(51.40)
Epoch[0] done in 4374.54s
Epoch[1] starting
Loss: 3.0587
Training done in 4264.95s
Average Loss: 2.8655	Acc@1 25.56(Best 25.56)	Acc@5 54.72(54.72)
Epoch[1] done in 4310.30s

运行环境:torch 2.3.1

参考文献

  1. Li, Wen, et al. "Webvision database: Visual learning and understanding from web data." arXiv preprint arXiv:1708.02862 (2017).
  2. Albert, Paul, et al. "Addressing out-of-distribution label noise in webly-labelled data." Proceedings of the IEEE/CVF winter conference on applications of computer vision. 2022.
  3. Szegedy, Christian, et al. "Inception-v4, inception-resnet and the impact of residual connections on learning." Proceedings of the AAAI conference on artificial intelligence. Vol. 31. No. 1. 2017.
posted @ 2024-08-03 14:45  zh-jp  阅读(9)  评论(0编辑  收藏  举报