数据集WebVision 1.0 (google 子集)介绍,附PyTorch下的简单使用
WebVision数据集介绍
WebVision数据集常用于开集/闭集噪声学习、长尾噪声学习方法在真实数据集上的评估。根据[2]的统计,干净样本占70%,OOD噪声占25%,ID噪声占5%。
由于数据集本身较大,论文中使用的都是其中很小的一部分,进入下载页面,选择《WebVision Dataset 1.0》《Resized Images (small version)》:
- 一般需要数据集的训练集《Google Images Resized (16 GB) 》
- 验证集《Validation Images Resized (834 MB)》
- 这两个集合的标签《Metadata》下的《Training & Validation Labels (183 MB)》。
注意,由于测试集不提供标签,评估论文方法性能时不使用测试集,而是使用验证集。另外大部分实验仅使用了google子集,因此这里也只使用google子集。
将下载的数据集整理好,放到目录 ~/data/webvision1.0 下,目录结构如下:
├─google
│ ├─q0001
│ ├─q0002
│ ├─...
│ ├─q1631
│ └─q1632
├─info
└─val_images_256
info 目录下重要的几个txt:
- queries_google.txt:1632行,与google目录下的文件夹相对应,每行是一个查询词。
- synsets.txt:1000行表示1000个类,标签 \(i(0\le i\le 999)\) 的具体含义在第 \(i+1\)行。
- train_filelist_google.txt:每行是一张图片的路径和标签,路径相对于数据集根目录(即以 google/ 开头),因此下文代码直接将其与根目录拼接。
- val_filelist.txt:格式同上,但路径是相对于val_images_256目录的。
在PyTorch中使用WebVision数据集
大多数论文使用mini-WebVision,即仅使用前50个类(标签0~49)。
数据集 & DataLoader
# webvision.py
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import os
class Webvision(Dataset):
    """(mini-)WebVision 1.0 dataset built from the Google-subset file lists.

    Entries are read from ``info/train_filelist_google.txt`` (``train=True``)
    or ``info/val_filelist.txt`` (``train=False``) under ``root``. Each
    non-blank line holds an image path and an integer label separated by
    whitespace; only entries whose label is below ``num_classes`` are kept.

    Args:
        root: Dataset root directory; a leading ``~`` is expanded.
        train: Use the training list when true, the validation list otherwise.
        transform: Optional callable applied to the loaded RGB image. When it
            is ``None`` the image is returned untransformed.
        num_classes: Keep only samples whose label is smaller than this value.

    Raises:
        OSError: If the selected file list cannot be opened.
        ValueError: If a non-blank line does not consist of exactly two
            fields, or the label is not an integer.
    """

    def __init__(self, root, train=True, transform=None, num_classes=50):
        root = os.path.expanduser(root)
        self.root = root
        self.transform = transform
        self.train = train

        # Both lists share one format, so a single parsing loop serves both.
        list_name = 'info/train_filelist_google.txt' if train else 'info/val_filelist.txt'
        data, targets = [], []
        with open(os.path.join(root, list_name), encoding='utf-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. an extra newline at EOF).
                    continue
                img, target = fields
                target = int(target)
                if target < num_classes:
                    data.append(img)
                    targets.append(target)
        self.data = data
        self.targets = targets

    def __len__(self):
        """Return the number of retained samples."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(image, target)`` for the sample at ``index``."""
        img_path = self.data[index]
        target = self.targets[index]
        if self.train:
            # Training paths are joined directly onto the dataset root.
            full_path = os.path.join(self.root, img_path)
        else:
            # Validation paths are relative to the ``val_images_256`` folder.
            full_path = os.path.join(self.root, 'val_images_256', img_path)
        # ``convert`` returns a new image, so the source file can be closed
        # immediately instead of leaking a handle per sample.
        with Image.open(full_path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, target
class WebvisionDataloader:
    """Factory for the training and validation DataLoaders of WebVision.

    Args:
        batch_size: Batch size used by both loaders.
        num_classes: Forwarded to ``Webvision`` to select the kept labels.
        num_workers: Number of loader worker processes.
        root: Dataset root directory passed to ``Webvision``.
    """

    def __init__(self, batch_size=128, num_classes=50, num_workers=8, root='~/data/webvision1.0'):
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.num_workers = num_workers
        self.root = root

        channel_mean = (0.485, 0.456, 0.406)
        channel_std = (0.229, 0.224, 0.225)

        # Training pipeline: random crop and flip for augmentation.
        train_steps = [
            transforms.Resize(320),
            transforms.RandomResizedCrop(299),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]
        self.transform_train = transforms.Compose(train_steps)

        # Evaluation pipeline: deterministic center crop.
        test_steps = [
            transforms.Resize(320),
            transforms.CenterCrop(299),
            transforms.ToTensor(),
            transforms.Normalize(channel_mean, channel_std),
        ]
        self.transform_test = transforms.Compose(test_steps)

    def train(self):
        """Return a shuffled DataLoader over the training split."""
        train_set = Webvision(
            root=self.root,
            train=True,
            transform=self.transform_train,
            num_classes=self.num_classes,
        )
        return DataLoader(
            dataset=train_set,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=True,
        )

    def test(self):
        """Return an unshuffled DataLoader over the validation split."""
        val_set = Webvision(
            root=self.root,
            train=False,
            transform=self.transform_test,
            num_classes=self.num_classes,
        )
        return DataLoader(
            dataset=val_set,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=True,
        )
网络结构
大部分论文使用Inception-ResNet-v2[3]作为该数据集上的网络。
# InceptionResNetV2.py
import torch
from torch import nn
class BasicConv2d(nn.Module):
def __init__(self, in_planes, out_planes, kernel_size, stride, padding: int | tuple[int, int] = 0):
super(BasicConv2d, self).__init__()
self.conv = nn.Conv2d(in_planes, out_planes,
kernel_size=kernel_size, stride=stride,
padding=padding, bias=False) # verify bias false
self.bn = nn.BatchNorm2d(out_planes,
eps=0.001, # value found in tensorflow
momentum=0.1, # default pytorch value
affine=True)
self.relu = nn.ReLU(inplace=False)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x
class Mixed_5b(nn.Module):
    """Four parallel branches over a 192-channel input, concatenated on channels.

    The branches emit 96, 64, 96 and 64 channels, 320 in total. Every layer
    uses stride 1 with padding that keeps the spatial size.
    """

    def __init__(self):
        super().__init__()
        # Branch 0: single 1x1 convolution.
        self.branch0 = BasicConv2d(192, 96, kernel_size=1, stride=1)

        # Branch 1: 1x1 reduction followed by a 5x5 convolution.
        reduce_5x5 = BasicConv2d(192, 48, kernel_size=1, stride=1)
        conv_5x5 = BasicConv2d(48, 64, kernel_size=5, stride=1, padding=2)
        self.branch1 = nn.Sequential(reduce_5x5, conv_5x5)

        # Branch 2: 1x1 reduction followed by two 3x3 convolutions.
        reduce_3x3 = BasicConv2d(192, 64, kernel_size=1, stride=1)
        conv_3x3_a = BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1)
        conv_3x3_b = BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1)
        self.branch2 = nn.Sequential(reduce_3x3, conv_3x3_a, conv_3x3_b)

        # Branch 3: 3x3 average pooling followed by a 1x1 projection.
        pool = nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False)
        pool_proj = BasicConv2d(192, 64, kernel_size=1, stride=1)
        self.branch3 = nn.Sequential(pool, pool_proj)

    def forward(self, x):
        """Run all branches on ``x`` and concatenate along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
        return torch.cat(tuple(branch(x) for branch in branches), 1)
class Block35(nn.Module):
    """Residual block operating on 320-channel feature maps.

    Three branches (32 + 32 + 64 = 128 channels) are concatenated, projected
    back to 320 channels by a 1x1 convolution, scaled by ``scale`` and added
    to the input before a final ReLU.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
    """

    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = scale

        # Branch 0: single 1x1 convolution.
        self.branch0 = BasicConv2d(320, 32, kernel_size=1, stride=1)

        # Branch 1: 1x1 then 3x3.
        b1_reduce = BasicConv2d(320, 32, kernel_size=1, stride=1)
        b1_conv = BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1)
        self.branch1 = nn.Sequential(b1_reduce, b1_conv)

        # Branch 2: 1x1 then two 3x3 convolutions with growing width.
        b2_reduce = BasicConv2d(320, 32, kernel_size=1, stride=1)
        b2_conv_a = BasicConv2d(32, 48, kernel_size=3, stride=1, padding=1)
        b2_conv_b = BasicConv2d(48, 64, kernel_size=3, stride=1, padding=1)
        self.branch2 = nn.Sequential(b2_reduce, b2_conv_a, b2_conv_b)

        # Plain (biased, un-normalized) 1x1 projection back to the input width.
        self.conv2d = nn.Conv2d(128, 320, kernel_size=1, stride=1)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``relu(scale * residual(x) + x)``."""
        branches = (self.branch0, self.branch1, self.branch2)
        merged = torch.cat(tuple(branch(x) for branch in branches), 1)
        residual = self.conv2d(merged)
        return self.relu(residual * self.scale + x)
class Mixed_6a(nn.Module):
    """Downsampling block: 320 input channels -> 384 + 384 + 320 = 1088.

    Each branch ends with an unpadded stride-2 3x3 operation, so all three
    outputs share the same reduced spatial size.
    """

    def __init__(self):
        super().__init__()
        # Branch 0: stride-2 3x3 convolution.
        self.branch0 = BasicConv2d(320, 384, kernel_size=3, stride=2)

        # Branch 1: 1x1, padded 3x3, then stride-2 3x3.
        b1_reduce = BasicConv2d(320, 256, kernel_size=1, stride=1)
        b1_conv = BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1)
        b1_down = BasicConv2d(256, 384, kernel_size=3, stride=2)
        self.branch1 = nn.Sequential(b1_reduce, b1_conv, b1_down)

        # Branch 2: stride-2 max pooling keeps the 320 input channels.
        self.branch2 = nn.MaxPool2d(3, stride=2)

    def forward(self, x):
        """Run all branches on ``x`` and concatenate along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2)
        return torch.cat(tuple(branch(x) for branch in branches), 1)
class Block17(nn.Module):
    """Residual block operating on 1088-channel feature maps.

    Two branches (192 + 192 = 384 channels) are concatenated, projected back
    to 1088 channels by a 1x1 convolution, scaled by ``scale`` and added to
    the input before a final ReLU.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
    """

    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = scale

        # Branch 0: single 1x1 convolution.
        self.branch0 = BasicConv2d(1088, 192, kernel_size=1, stride=1)

        # Branch 1: 1x1 reduction, then a 1x7 and a 7x1 convolution.
        b1_reduce = BasicConv2d(1088, 128, kernel_size=1, stride=1)
        b1_conv_1x7 = BasicConv2d(128, 160, kernel_size=(1, 7), stride=1, padding=(0, 3))
        b1_conv_7x1 = BasicConv2d(160, 192, kernel_size=(7, 1), stride=1, padding=(3, 0))
        self.branch1 = nn.Sequential(b1_reduce, b1_conv_1x7, b1_conv_7x1)

        # Plain (biased, un-normalized) 1x1 projection back to the input width.
        self.conv2d = nn.Conv2d(384, 1088, kernel_size=1, stride=1)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``relu(scale * residual(x) + x)``."""
        merged = torch.cat((self.branch0(x), self.branch1(x)), 1)
        residual = self.conv2d(merged)
        return self.relu(residual * self.scale + x)
class Mixed_7a(nn.Module):
    """Downsampling block: 1088 input channels -> 384 + 288 + 320 + 1088 = 2080.

    Each branch ends with an unpadded stride-2 3x3 operation, so all four
    outputs share the same reduced spatial size.
    """

    def __init__(self):
        super().__init__()
        # Branch 0: 1x1 reduction then stride-2 3x3.
        b0_reduce = BasicConv2d(1088, 256, kernel_size=1, stride=1)
        b0_down = BasicConv2d(256, 384, kernel_size=3, stride=2)
        self.branch0 = nn.Sequential(b0_reduce, b0_down)

        # Branch 1: same shape as branch 0 but with a narrower output.
        b1_reduce = BasicConv2d(1088, 256, kernel_size=1, stride=1)
        b1_down = BasicConv2d(256, 288, kernel_size=3, stride=2)
        self.branch1 = nn.Sequential(b1_reduce, b1_down)

        # Branch 2: 1x1 reduction, padded 3x3, then stride-2 3x3.
        b2_reduce = BasicConv2d(1088, 256, kernel_size=1, stride=1)
        b2_conv = BasicConv2d(256, 288, kernel_size=3, stride=1, padding=1)
        b2_down = BasicConv2d(288, 320, kernel_size=3, stride=2)
        self.branch2 = nn.Sequential(b2_reduce, b2_conv, b2_down)

        # Branch 3: stride-2 max pooling keeps the 1088 input channels.
        self.branch3 = nn.MaxPool2d(3, stride=2)

    def forward(self, x):
        """Run all branches on ``x`` and concatenate along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
        return torch.cat(tuple(branch(x) for branch in branches), 1)
class Block8(nn.Module):
    """Residual block operating on 2080-channel feature maps.

    Two branches (192 + 256 = 448 channels) are concatenated, projected back
    to 2080 channels by a 1x1 convolution, scaled by ``scale`` and added to
    the input. The trailing ReLU is skipped when ``noReLU`` is true.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
        noReLU: When true, no ReLU module is created or applied.
    """

    def __init__(self, scale=1.0, noReLU=False):
        super().__init__()
        self.scale = scale
        self.noReLU = noReLU

        # Branch 0: single 1x1 convolution.
        self.branch0 = BasicConv2d(2080, 192, kernel_size=1, stride=1)

        # Branch 1: 1x1 reduction, then a 1x3 and a 3x1 convolution.
        b1_reduce = BasicConv2d(2080, 192, kernel_size=1, stride=1)
        b1_conv_1x3 = BasicConv2d(192, 224, kernel_size=(1, 3), stride=1, padding=(0, 1))
        b1_conv_3x1 = BasicConv2d(224, 256, kernel_size=(3, 1), stride=1, padding=(1, 0))
        self.branch1 = nn.Sequential(b1_reduce, b1_conv_1x3, b1_conv_3x1)

        # Plain (biased, un-normalized) 1x1 projection back to the input width.
        self.conv2d = nn.Conv2d(448, 2080, kernel_size=1, stride=1)
        if not self.noReLU:
            self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``scale * residual(x) + x``, passed through ReLU unless disabled."""
        merged = torch.cat((self.branch0(x), self.branch1(x)), 1)
        summed = self.conv2d(merged) * self.scale + x
        if self.noReLU:
            return summed
        return self.relu(summed)
def _make_layer(block, num_blocks, **kwargs):
layers = []
for _ in range(num_blocks):
layers.append(block(**kwargs))
return nn.Sequential(*layers)
class InceptionResNetV2(nn.Module):
    """Inception-ResNet-v2 classifier assembled from the blocks in this file.

    The network is a stem of plain convolutions and max-pools, followed by
    ``Mixed_5b``, 10 x ``Block35``, ``Mixed_6a``, 20 x ``Block17``,
    ``Mixed_7a``, 9 x ``Block8`` plus one ``Block8`` without ReLU, a 1x1
    convolution to 1536 channels, global average pooling and a linear layer.

    Args:
        num_classes: Size of the final linear layer's output.
    """

    def __init__(self, num_classes=1001):
        super().__init__()
        # Special attributes
        self.input_space = None
        self.input_size = (299, 299, 3)
        self.mean = None
        self.std = None
        # Stem
        self.conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2)
        self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1)
        self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.maxpool_3a = nn.MaxPool2d(3, stride=2)
        self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1)
        self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1)
        self.maxpool_5a = nn.MaxPool2d(3, stride=2)
        # Residual stages separated by the downsampling blocks
        self.mixed_5b = Mixed_5b()
        self.repeat = _make_layer(Block35, 10, scale=0.17)
        self.mixed_6a = Mixed_6a()
        self.repeat_1 = _make_layer(Block17, 20, scale=0.10)
        self.mixed_7a = Mixed_7a()
        self.repeat_2 = _make_layer(Block8, 9, scale=0.20)
        self.block8 = Block8(noReLU=True)
        self.conv2d_7b = BasicConv2d(2080, 1536, kernel_size=1, stride=1)
        # Global average pooling. For the 8x8 map produced by a 299x299 input
        # this equals the previous fixed AvgPool2d(8); unlike the fixed window
        # it always yields 1x1, so last_linear also works for larger inputs.
        # The layer has no parameters, so checkpoints remain compatible.
        self.avgpool_1a = nn.AdaptiveAvgPool2d(1)
        self.last_linear = nn.Linear(1536, num_classes)

    def features(self, input_):
        """Return the 1536-channel feature map for an image batch."""
        stages = (
            self.conv2d_1a,
            self.conv2d_2a,
            self.conv2d_2b,
            self.maxpool_3a,
            self.conv2d_3b,
            self.conv2d_4a,
            self.maxpool_5a,
            self.mixed_5b,
            self.repeat,
            self.mixed_6a,
            self.repeat_1,
            self.mixed_7a,
            self.repeat_2,
            self.block8,
            self.conv2d_7b,
        )
        x = input_
        for stage in stages:
            x = stage(x)
        return x

    def logits(self, features):
        """Pool a feature map to one vector per sample and classify it."""
        x = self.avgpool_1a(features)
        x = torch.flatten(x, 1)
        x = self.last_linear(x)
        return x

    def forward(self, input_):
        """Return class logits for an image batch."""
        x = self.features(input_)
        x = self.logits(x)
        return x
训练
import torch
from torch import optim
from torch import nn
from torch.nn import functional as F
import time
from torch import Tensor
from InceptionResNetV2 import InceptionResNetV2
from webvision import WebvisionDataloader
def accuracy(output: Tensor, target: Tensor, topk=(1,)) -> list:
    """Compute top-k accuracy percentages for a batch.

    Args:
        output: Per-class scores; ``topk`` is taken along dim 1.
        target: Ground-truth class indices, one per row of ``output``.
        topk: The k values to evaluate.

    Returns:
        One single-element float tensor per entry of ``topk``, holding the
        percentage of samples whose target is among the k highest scores.
    """
    largest_k = max(topk)
    num_samples = target.size(0)

    # Indices of the highest scores, transposed to [largest_k, num_samples]
    _, top_idx = output.topk(largest_k, dim=1)
    top_idx = top_idx.t()
    labels_row = target.view(1, -1).expand_as(top_idx)
    hits = top_idx.eq(labels_row)

    percent_per_hit = 100.0 / num_samples
    return [
        hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(percent_per_hit)
        for k in topk
    ]
# Best top-1 / top-5 accuracy seen across all evaluate() calls so far.
best_acc1, best_acc5 = 0, 0


@torch.no_grad()
def evaluate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print loss and accuracies.

    The model is switched to eval mode and left there. Batches are moved to
    the device of the model's first parameter. The module-level
    ``best_acc1`` / ``best_acc5`` are updated with the new averages.

    Args:
        model: Network mapping an image batch to class logits.
        dataloader: Iterable of ``(images, labels)`` batches.

    Returns:
        Tuple ``(avg_loss, avg_acc1, avg_acc5)`` averaged over all samples.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    global best_acc1, best_acc5
    model.eval()
    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device
    total_loss = 0.0
    total_acc1 = 0.0
    total_acc5 = 0.0
    total_samples = 0
    for x, labels in dataloader:
        x = x.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        logits = model(x)
        loss = F.cross_entropy(logits, labels)
        # Compute accuracy where the tensors already live; copying the full
        # logits back to the host for every batch is unnecessary.
        acc1, acc5 = accuracy(logits, labels, topk=(1, 5))
        batch_size = x.size(0)
        # Weight by batch size so a smaller final batch is averaged correctly.
        total_loss += loss.item() * batch_size
        total_acc1 += acc1.item() * batch_size
        total_acc5 += acc5.item() * batch_size
        total_samples += batch_size
    if total_samples == 0:
        raise ValueError('evaluate() received a dataloader with no samples')
    avg_loss = total_loss / total_samples
    avg_acc1 = total_acc1 / total_samples
    avg_acc5 = total_acc5 / total_samples
    best_acc1 = max(best_acc1, avg_acc1)
    best_acc5 = max(best_acc5, avg_acc5)
    print(f'Average Loss: {avg_loss:.4f}\t'
          f'Acc@1 {avg_acc1:.2f}(Best {best_acc1:.2f})\t'
          f'Acc@5 {avg_acc5:.2f}({best_acc5:.2f})')
    return avg_loss, avg_acc1, avg_acc5
def train(dataloader, model, optimizer, criterion):
    """Train ``model`` for one pass over ``dataloader``.

    Batches are moved to the device of the model's first parameter, so the
    function behaves as before for a CUDA model and also runs on CPU.

    Args:
        dataloader: Iterable of ``(images, labels)`` batches.
        model: Network mapping an image batch to class logits.
        optimizer: Optimizer over the model's parameters.
        criterion: Callable ``(logits, labels) -> loss``.
    """
    end = time.time()
    model.train()
    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device
    for i, (x, labels) in enumerate(dataloader):
        x = x.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # NOTE(review): the loss is printed only at batch index 100, so a
        # loader with 100 or fewer batches prints nothing. Confirm whether
        # periodic logging (i % 100 == 0) was intended.
        if i == 100:
            print(f'Loss: {loss.item():.4f}')
    print(f'Training done in {time.time() - end:.2f}s')
def main_work():
    """Build the data loaders and model, then train and evaluate each epoch.

    Uses SGD (momentum 0.9, weight decay 5e-4) with a learning rate that is
    multiplied by 0.1 after epochs 30 and 60.
    """
    num_classes = 50
    epochs = 80
    lr = 0.1
    end = time.time()
    # Use the same num_classes for the data and the model head, so changing
    # the value above cannot leave the two inconsistent.
    dataloaders = WebvisionDataloader(num_classes=num_classes)
    train_loader = dataloaders.train()
    test_loader = dataloaders.test()
    print(f'Dataloader made in {time.time() - end:.2f}s')
    model = InceptionResNetV2(num_classes).cuda()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60], gamma=0.1)
    for i in range(epochs):
        print(f'Epoch[{i}] starting')
        end = time.time()
        train(train_loader, model, optimizer, criterion)
        evaluate(model, test_loader)
        # Step once per epoch, after that epoch's optimizer updates.
        scheduler.step()
        print(f'Epoch[{i}] done in {time.time() - end:.2f}s')
def main():
    """Run the whole experiment and print the total wall-clock time."""
    started_at = time.time()
    main_work()
    total_seconds = time.time() - started_at
    print(f'Total time: {total_seconds:.2f}s')


if __name__ == '__main__':
    main()
简单运行了几个epoch,设备为NVIDIA GeForce RTX 3090,还是非常耗时的。
Dataloader made in 0.40s
Epoch[0] starting
Loss: 3.4720
Training done in 4325.29s
Average Loss: 3.0656 Acc@1 19.28(Best 19.28) Acc@5 51.40(51.40)
Epoch[0] done in 4374.54s
Epoch[1] starting
Loss: 3.0587
Training done in 4264.95s
Average Loss: 2.8655 Acc@1 25.56(Best 25.56) Acc@5 54.72(54.72)
Epoch[1] done in 4310.30s
运行环境:torch 2.3.1
参考文献
- Li, Wen, et al. "Webvision database: Visual learning and understanding from web data." arXiv preprint arXiv:1708.02862 (2017).
- Albert, Paul, et al. "Addressing out-of-distribution label noise in webly-labelled data." Proceedings of the IEEE/CVF winter conference on applications of computer vision. 2022.
- Szegedy, Christian, et al. "Inception-v4, inception-resnet and the impact of residual connections on learning." Proceedings of the AAAI conference on artificial intelligence. Vol. 31. No. 1. 2017.