import MinkowskiEngine as ME
from examples.minkunet import MinkUNet34C
# Build the network and move it onto the target GPU in one step
net = MinkUNet34C(3, 20, D=3).to(target_device)
接下来,创建一个新网络,用ME.MinkowskiSyncBatchNorm替换所有的ME.MinkowskiBatchNorm。这样一来,网络就可以使用较大的批处理量,并保持与单GPU训练相同的性能。
# Synchronized batch norm: convert the batch norm layers of the network
# with ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm
net = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(net)
import torch.nn.parallel as parallel
# Copy the loss layer: replicate the criterion across `devices`
criterion = nn.CrossEntropyLoss()
criterions = parallel.replicate(criterion, devices)
# Get new data: one batch per device
inputs, labels = [], []
for i in range(num_devices):
    coords, feat, label = data_loader()  # parallel data loaders can be used
    with torch.cuda.device(devices[i]):
        # Create the sparse tensor directly on the i-th device
        inputs.append(ME.SparseTensor(feat, coords, device=devices[i]))
        # Keep the labels on the same device as the matching input so the
        # per-device criterion below receives (features, labels) pairs
        labels.append(label.to(devices[i]))
# Replicate the network across the devices and run the forward passes
replicas = parallel.replicate(net, devices)
outputs = parallel.parallel_apply(replicas, inputs, devices=devices)
# Extract the feature matrix (.F) of every sparse tensor output so that a
# pytorch criterion can be applied to it
out_features = [sparse_out.F for sparse_out in outputs]
criterion_args = tuple(zip(out_features, labels))
losses = parallel.parallel_apply(criterions, criterion_args, devices=devices)
# Gather the per-device losses on the target device and average them
loss = parallel.gather(losses, target_device, dim=0).mean()
import os
import argparse
import numpy as np
from time import time
from urllib.request import urlretrieve

try:
    import open3d as o3d
except ImportError as err:
    # Re-raise with installation instructions, keeping the original cause
    raise ImportError(
        "Please install open3d-python with `pip install open3d`."
    ) from err

import torch
import torch.nn as nn
from torch.optim import SGD

import MinkowskiEngine as ME
from examples.minkunet import MinkUNet34C

import torch.nn.parallel as parallel

# Download the sample point cloud only when it is not already present.
# The guard tests for the file that is actually fetched ("1.ply").
if not os.path.isfile("1.ply"):
    urlretrieve("http://cvgl.stanford.edu/data2/minkowskiengine/1.ply", "1.ply")
# Command-line options of the example script
parser = argparse.ArgumentParser()
parser.add_argument(
    "--file_name", type=str, default="1.ply", help="point cloud file to load"
)
parser.add_argument(
    "--batch_size",
    type=int,
    default=4,
    help="per-GPU batch size used for the reported total batch size",
)
parser.add_argument(
    "--max_ngpu", type=int, default=2, help="maximum number of GPUs to use"
)

# Loaded point clouds, keyed by file name, so each file is read only once
cache = {}
def load_file(file_name, voxel_size):
    """Read a point cloud and quantize it into coordinates and features.

    The point cloud object read from ``file_name`` is stored in the
    module-level ``cache`` dict, so a file is read from disk only on its
    first use; the quantization is redone on every call.

    Args:
        file_name: path handed to ``o3d.io.read_point_cloud``.
        voxel_size: forwarded to ``ME.utils.sparse_quantize`` as
            ``quantization_size``.

    Returns:
        A ``(coordinates, features, labels)`` tuple: the two results of
        ``ME.utils.sparse_quantize`` applied to the points and colors, and
        an all-zero label tensor with ``len(features)`` entries.
    """
    cloud = cache.get(file_name)
    if cloud is None:
        cloud = o3d.io.read_point_cloud(file_name)
        cache[file_name] = cloud

    points = np.array(cloud.points, dtype=np.float32)
    colors = np.array(cloud.colors, dtype=np.float32)
    coords, feats = ME.utils.sparse_quantize(
        points, colors, quantization_size=voxel_size
    )

    # Dummy labels: all zeros, one per quantized feature row
    return coords, feats, torch.zeros(len(feats))
def generate_input(file_name, voxel_size):
    """Create one collated batch holding a single sample of ``file_name``.

    Args:
        file_name: point cloud file, passed on to ``load_file``.
        voxel_size: quantization size, passed on to ``load_file``.

    Returns:
        A ``(coordinates, features, labels)`` tuple produced by
        ``ME.utils.sparse_collate``, with the features shifted by -0.5 and
        converted with ``.float()``.
    """
    # During real training this batching is done by a parallel data loader.
    samples = [load_file(file_name, voxel_size)]
    coords_per_sample, feats_per_sample, labels_per_sample = zip(*samples)
    coordinates, features, labels = ME.utils.sparse_collate(
        coords_per_sample, feats_per_sample, labels_per_sample
    )
    # Normalize the features before they are wrapped in a sparse tensor
    normalized = (features - 0.5).float()
    return coordinates, normalized, labels
if __name__ == "__main__":
    # loss and network
    config = parser.parse_args()
    # Use at most --max_ngpu of the visible CUDA devices
    num_devices = min(config.max_ngpu, torch.cuda.device_count())
    if num_devices < 1:
        # Without this guard the script fails further down with an opaque
        # IndexError on devices[0].
        raise SystemExit(
            "No usable CUDA device: this example needs at least one GPU "
            "and --max_ngpu >= 1."
        )
    devices = list(range(num_devices))
    print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")
    print("' WARNING: This example is deprecated. '")
    print("' Please use DistributedDataParallel or pytorch-lightning'")
    print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")
    # NOTE(review): generate_input builds a single-sample batch, so
    # config.batch_size only affects this printed total - confirm whether it
    # was meant to size the batch.
    print(
        f"Testing {num_devices} GPUs. Total batch size: {num_devices * config.batch_size}"
    )

    # For copying the final loss back to one GPU
    target_device = devices[0]

    # Copy the network to GPU
    net = MinkUNet34C(3, 20, D=3)
    net = net.to(target_device)

    # Synchronized batch norm
    net = ME.MinkowskiSyncBatchNorm.convert_sync_batchnorm(net)
    optimizer = SGD(net.parameters(), lr=1e-1)

    # Copy the loss layer
    criterion = nn.CrossEntropyLoss()
    criterions = parallel.replicate(criterion, devices)
    min_time = np.inf

    for iteration in range(10):
        optimizer.zero_grad()

        # Get new data: one input sparse tensor and one label tensor per device
        inputs, all_labels = [], []
        for device in devices:
            coordinates, features, labels = generate_input(config.file_name, 0.05)
            with torch.cuda.device(device):
                inputs.append(ME.SparseTensor(features, coordinates, device=device))
                all_labels.append(labels.long().to(device))

        # The raw version of the parallel_apply; timing starts after data
        # preparation, so only replicate/forward/backward/step is measured.
        st = time()
        replicas = parallel.replicate(net, devices)
        outputs = parallel.parallel_apply(replicas, inputs, devices=devices)

        # Extract features from the sparse tensors to use a pytorch criterion
        out_features = [output.F for output in outputs]
        losses = parallel.parallel_apply(
            criterions, tuple(zip(out_features, all_labels)), devices=devices
        )
        # Collect the per-device losses on the target device and average them
        loss = parallel.gather(losses, target_device, dim=0).mean()

        # Gradient
        loss.backward()
        optimizer.step()

        t = time() - st
        min_time = min(t, min_time)
        print(
            f"Iteration: {iteration}, Loss: {loss.item()}, Time: {t}, Min time: {min_time}"
        )

        # Must clear cache at regular interval
        if iteration % 10 == 0:
            torch.cuda.empty_cache()
在4x Titan XP上使用各种批次大小进行实验,并将负载平均分配给每个GPU。例如,总批处理大小为8时:使用1个GPU,该GPU的批处理大小为8;使用2个GPU,每个GPU的批处理大小为4;使用4个GPU,每个GPU的批处理大小为2。
GPU数量 |
每个GPU的批量大小 |
每次迭代时间 |
加速(理想) |
1个GPU |
8 |
1.611秒 |
x1(x1) |
2个GPU |
4 |
0.916秒 |
x1.76(x2) |
4个GPU |
2 |
0.689秒 |
x2.34(x4) |
GPU数量 |
每个GPU的批量大小 |
每次迭代时间 |
加速(理想) |
1个GPU |
12 |
2.691秒 |
x1(x1) |
2个GPU |
6 |
1.413秒 |
x1.90(x2) |
3个GPU |
4 |
1.064秒 |
x2.53(x3) |
4个GPU |
3 |
1.006秒 |
x2.67(x4) |
GPU数量 |
每个GPU的批量大小 |
每次迭代时间 |
加速(理想) |
1个GPU |
16 |
3.543秒 |
x1(x1) |
2个GPU |
8 |
1.933秒 |
x1.83(x2) |
4个GPU |
4 |
1.322秒 |
x2.68(x4) |
GPU数量 |
每个GPU的批量大小 |
每次迭代时间 |
加速(理想) |
1个GPU |
18 |
4.391秒 |
x1(x1) |
2个GPU |
9 |
2.114秒 |
x2.08(x2) |
3个GPU |
6 |
1.660秒 |
x2.65(x3) |
GPU数量 |
每个GPU的批量大小 |
每次迭代时间 |
加速(理想) |
1个GPU |
20 |
4.639秒 |
x1(x1) |
2个GPU |
10 |
2.426秒 |
x1.91(x2) |
4个GPU |
5 |
1.707秒 |
x2.72(x4) |
GPU数量 |
每个GPU的批量大小 |
每次迭代时间 |
加速(理想) |
1个GPU |
21 |
4.894秒 |
x1(x1) |
3个GPU |
7 |
1.877秒 |
x2.61(x3) |
GPU数量 |
平均加速(理想) |
1个GPU |
x1(x1) |
2个GPU |
x1.90(x2) |
3个GPU |
x2.60(x3) |
4个GPU |
x2.60(x4) |