3D GS Source Code Walkthrough
1 Model Training
def training(dataset, opt, pipe, testing_iterations, saving_iterations, checkpoint_iterations, checkpoint, debug_from):
    first_iter = 0
    # Initialize the Gaussian model, which represents every point of the scene as a 3D Gaussian
    gaussians = GaussianModel(dataset.sh_degree)
    # Initialize the Scene object, loading the dataset and the corresponding camera parameters
    scene = Scene(dataset, gaussians)
    # Set up the optimizer and learning-rate scheduler for the Gaussian model's parameters
    gaussians.training_setup(opt)
    # If a checkpoint is given, load the model parameters from it and resume training
    if checkpoint:
        (model_params, first_iter) = torch.load(checkpoint)
        gaussians.restore(model_params, opt)
    # Set the background color, white or black depending on the dataset
    bg_color = [1, 1, 1] if dataset.white_background else [0, 0, 0]
    background = torch.tensor(bg_color, dtype=torch.float32, device="cuda")
    # Create CUDA events for timing
    iter_start = torch.cuda.Event(enable_timing=True)
    iter_end = torch.cuda.Event(enable_timing=True)
    viewpoint_stack = None
    ema_loss_for_log = 0.0
    # Create a tqdm progress bar to track training progress
    progress_bar = tqdm(range(first_iter, opt.iterations), desc="Training progress")
    first_iter += 1
    for iteration in range(first_iter, opt.iterations + 1):
        # Record the iteration start time
        iter_start.record()
        # Update the learning rate according to the current iteration
        gaussians.update_learning_rate(iteration)
        # Every 1000 iterations, raise the SH degree by one to enrich the color model
        if iteration % 1000 == 0:
            gaussians.oneupSHdegree()
        # Pick a random training camera view
        if not viewpoint_stack:
            viewpoint_stack = scene.getTrainCameras().copy()
        viewpoint_cam = viewpoint_stack.pop(randint(0, len(viewpoint_stack)-1))
        # Enable debug mode once the debug start iteration is reached
        if (iteration - 1) == debug_from:
            pipe.debug = True
        # Optionally use a random background color
        bg = torch.rand((3), device="cuda") if opt.random_background else background
        # Render the image for the current view
        render_pkg = render(viewpoint_cam, gaussians, pipe, bg)
        image, viewspace_point_tensor, visibility_filter, radii = render_pkg["render"], render_pkg["viewspace_points"], render_pkg["visibility_filter"], render_pkg["radii"]
        # Compute the loss between the rendered image and the ground-truth image
        gt_image = viewpoint_cam.original_image.cuda()
        Ll1 = l1_loss(image, gt_image)
        loss = (1.0 - opt.lambda_dssim) * Ll1 + opt.lambda_dssim * (1.0 - ssim(image, gt_image))
        loss.backward()
        # Record the iteration end time
        iter_end.record()
        with torch.no_grad():
            # Update the progress bar and the displayed loss
            ema_loss_for_log = 0.4 * loss.item() + 0.6 * ema_loss_for_log
            if iteration % 10 == 0:
                progress_bar.set_postfix({"Loss": f"{ema_loss_for_log:.{7}f}"})
                progress_bar.update(10)
            if iteration == opt.iterations:
                progress_bar.close()
            # Periodically log training data and save the model
            training_report(tb_writer, iteration, Ll1, loss, l1_loss, iter_start.elapsed_time(iter_end), testing_iterations, scene, render, (pipe, background))
            if iteration in saving_iterations:
                print("\n[ITER {}] Saving Gaussians".format(iteration))
                scene.save(iteration)
            # Within the densification interval, densify and prune the 3D Gaussians
            if iteration < opt.densify_until_iter:
                gaussians.max_radii2D[visibility_filter] = torch.max(gaussians.max_radii2D[visibility_filter], radii[visibility_filter])
                gaussians.add_densification_stats(viewspace_point_tensor, visibility_filter)
                if iteration > opt.densify_from_iter and iteration % opt.densification_interval == 0:
                    size_threshold = 20 if iteration > opt.opacity_reset_interval else None
                    gaussians.densify_and_prune(opt.densify_grad_threshold, 0.005, scene.cameras_extent, size_threshold)
                if iteration % opt.opacity_reset_interval == 0 or (dataset.white_background and iteration == opt.densify_from_iter):
                    gaussians.reset_opacity()
            # Take an optimizer step and prepare for the next iteration
            if iteration < opt.iterations:
                gaussians.optimizer.step()
                gaussians.optimizer.zero_grad(set_to_none=True)
            # Periodically save a checkpoint
            if iteration in checkpoint_iterations:
                print("\n[ITER {}] Saving Checkpoint".format(iteration))
                torch.save((gaussians.capture(), iteration), scene.model_path + "/chkpnt" + str(iteration) + ".pth")
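The loss blends an L1 term with a D-SSIM term: loss = (1 − λ_dssim) · L1 + λ_dssim · (1 − SSIM). For reference, a minimal sketch of the l1_loss helper used above, matching the usual definition in the repo's utils/loss_utils.py (the ssim there is a windowed, Gaussian-weighted SSIM and is omitted here for brevity):

import torch

def l1_loss(network_output, gt):
    # Mean absolute per-pixel difference between the render and the ground truth
    return torch.abs(network_output - gt).mean()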
2 Data Loading
class Scene:
    """
    The Scene class manages the scene's 3D model: camera parameters, point-cloud data,
    and the initialization and loading of the Gaussian model.
    """
    def __init__(self, args: ModelParams, gaussians: GaussianModel, load_iteration=None, shuffle=True, resolution_scales=[1.0]):
        """
        Initialize the scene object.
        :param args: model parameters, including the model path and source path
        :param gaussians: the Gaussian model object used for the scene's 3D representation
        :param load_iteration: which saved iteration to load; -1 means find the largest saved iteration
        :param shuffle: whether to shuffle the camera list before training
        :param resolution_scales: list of resolution scales for cameras at different resolutions
        """
        self.model_path = args.model_path  # path where model files are saved
        self.loaded_iter = None            # iteration of a loaded model, if any
        self.gaussians = gaussians         # the Gaussian model object
        # Check for and load an existing trained model
        if load_iteration:
            if load_iteration == -1:
                self.loaded_iter = searchForMaxIteration(os.path.join(self.model_path, "point_cloud"))
            else:
                self.loaded_iter = load_iteration
            print(f"Loading trained model at iteration {self.loaded_iter}")
        self.train_cameras = {}  # cameras used for training
        self.test_cameras = {}   # cameras used for testing
        # Load the scene info depending on the dataset type (COLMAP or Blender)
        if os.path.exists(os.path.join(args.source_path, "sparse")):
            scene_info = sceneLoadTypeCallbacks["Colmap"](args.source_path, args.images, args.eval)
        elif os.path.exists(os.path.join(args.source_path, "transforms_train.json")):
            print("Found transforms_train.json file, assuming Blender data set!")
            scene_info = sceneLoadTypeCallbacks["Blender"](args.source_path, args.white_background, args.eval)
        else:
            assert False, "Could not recognize scene type!"
        # Scene extent (elided in the original excerpt, present in the source); it scales
        # learning rates and densification thresholds
        self.cameras_extent = scene_info.nerf_normalization["radius"]
        # On a fresh run, initialize the 3D Gaussians from the point cloud; otherwise load the saved model
        if self.loaded_iter:
            self.gaussians.load_ply(os.path.join(self.model_path, "point_cloud", "iteration_" + str(self.loaded_iter), "point_cloud.ply"))
        else:
            self.gaussians.create_from_pcd(scene_info.point_cloud, self.cameras_extent)
        # Load training and test cameras for each resolution scale
        for resolution_scale in resolution_scales:
            print("Loading Training Cameras")
            self.train_cameras[resolution_scale] = cameraList_from_camInfos(scene_info.train_cameras, resolution_scale, args)
            print("Loading Test Cameras")
            self.test_cameras[resolution_scale] = cameraList_from_camInfos(scene_info.test_cameras, resolution_scale, args)

    def save(self, iteration):
        """
        Save the 3D Gaussian point cloud at the current iteration.
        :param iteration: the current iteration number.
        """
        point_cloud_path = os.path.join(self.model_path, f"point_cloud/iteration_{iteration}")
        self.gaussians.save_ply(os.path.join(point_cloud_path, "point_cloud.ply"))

    def getTrainCameras(self, scale=1.0):
        """
        Get the list of training cameras at the given resolution scale.
        :param scale: resolution scale
        :return: the training cameras at that scale
        """
        return self.train_cameras[scale]

sceneLoadTypeCallbacks = {
    "Colmap": readColmapSceneInfo,
    "Blender": readNerfSyntheticInfo
}
def readColmapSceneInfo(path, images, eval, llffhold=8):
    # Try to read the binary camera extrinsics and intrinsics produced by COLMAP
    try:
        cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.bin")
        cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.bin")
        cam_extrinsics = read_extrinsics_binary(cameras_extrinsic_file)
        cam_intrinsics = read_intrinsics_binary(cameras_intrinsic_file)
    except:
        # If reading the binary files fails, fall back to the text versions
        cameras_extrinsic_file = os.path.join(path, "sparse/0", "images.txt")
        cameras_intrinsic_file = os.path.join(path, "sparse/0", "cameras.txt")
        cam_extrinsics = read_extrinsics_text(cameras_extrinsic_file)
        cam_intrinsics = read_intrinsics_text(cameras_intrinsic_file)
    # Directory holding the images; defaults to "images" if not specified
    reading_dir = "images" if images is None else images
    # Read and convert the camera parameters into the internal format
    cam_infos_unsorted = readColmapCameras(cam_extrinsics=cam_extrinsics, cam_intrinsics=cam_intrinsics, images_folder=os.path.join(path, reading_dir))
    # Sort the camera infos by image name to keep the ordering deterministic
    cam_infos = sorted(cam_infos_unsorted.copy(), key=lambda x: x.image_name)
    # Split the cameras into training and test sets depending on eval mode.
    # In eval mode, every llffhold-th camera (a convention from the LLFF datasets) becomes a test camera
    if eval:
        train_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold != 0]
        test_cam_infos = [c for idx, c in enumerate(cam_infos) if idx % llffhold == 0]
    else:
        # Outside eval mode, every camera is a training camera and the test list stays empty
        train_cam_infos = cam_infos
        test_cam_infos = []
    # Compute scene-normalization parameters; they compensate for scenes of different
    # sizes and positions and make training more stable
    nerf_normalization = getNerfppNorm(train_cam_infos)
    # Try to load the point cloud, preferring the PLY file; if it does not exist,
    # convert the BIN or TXT file and save it as PLY
    ply_path = os.path.join(path, "sparse/0/points3D.ply")
    bin_path = os.path.join(path, "sparse/0/points3D.bin")
    txt_path = os.path.join(path, "sparse/0/points3D.txt")
    if not os.path.exists(ply_path):
        print("Converting point3d.bin to .ply, will happen only the first time you open the scene.")
        try:
            xyz, rgb, _ = read_points3D_binary(bin_path)
        except:
            xyz, rgb, _ = read_points3D_text(txt_path)
        storePly(ply_path, xyz, rgb)
    try:
        pcd = fetchPly(ply_path)
    except:
        pcd = None
    # Assemble the scene info: point cloud, training cameras, test cameras,
    # scene-normalization parameters and the point-cloud path
    scene_info = SceneInfo(point_cloud=pcd,
                           train_cameras=train_cam_infos,
                           test_cameras=test_cam_infos,
                           nerf_normalization=nerf_normalization,
                           ply_path=ply_path)
    return scene_info
def readColmapCameras(cam_extrinsics, cam_intrinsics, images_folder):
    cam_infos = []  # list collecting the camera infos
    # Iterate over the extrinsics of all cameras
    for idx, key in enumerate(cam_extrinsics):
        # Show reading progress on a single line
        sys.stdout.write('\r')
        sys.stdout.write("Reading camera {}/{}".format(idx+1, len(cam_extrinsics)))
        sys.stdout.flush()
        # Fetch this camera's extrinsics and intrinsics
        extr = cam_extrinsics[key]             # extrinsics of the current camera
        intr = cam_intrinsics[extr.camera_id]  # intrinsics looked up via the extrinsics' camera_id
        height = intr.height  # image height
        width = intr.width    # image width
        uid = intr.id         # unique camera identifier
        # Convert the quaternion rotation into a rotation matrix R
        R = np.transpose(qvec2rotmat(extr.qvec))
        # Translation vector from the extrinsics
        T = np.array(extr.tvec)
        # Compute the field of view (FoV) from the camera's intrinsic model
        if intr.model == "SIMPLE_PINHOLE":
            # The simple pinhole model has a single focal length
            focal_length_x = intr.params[0]
            FovY = focal2fov(focal_length_x, height)  # vertical FoV
            FovX = focal2fov(focal_length_x, width)   # horizontal FoV
        elif intr.model == "PINHOLE":
            # The pinhole model has separate focal lengths per axis
            focal_length_x = intr.params[0]
            focal_length_y = intr.params[1]
            FovY = focal2fov(focal_length_y, height)  # vertical FoV from the y focal length
            FovX = focal2fov(focal_length_x, width)   # horizontal FoV from the x focal length
        else:
            # Any other camera model is rejected
            assert False, "Colmap camera model not handled: only undistorted datasets (PINHOLE or SIMPLE_PINHOLE cameras) supported!"
        # Build the full image path
        image_path = os.path.join(images_folder, os.path.basename(extr.name))
        image_name = os.path.basename(image_path).split(".")[0]  # image name without extension
        # Open the image with PIL
        image = Image.open(image_path)
        # Create and store the camera info
        cam_info = CameraInfo(uid=uid, R=R, T=T, FovY=FovY, FovX=FovX, image=image,
                              image_path=image_path, image_name=image_name, width=width, height=height)
        cam_infos.append(cam_info)
    # Print a newline once all cameras are read
    sys.stdout.write('\n')
    # Return the assembled camera infos
    return cam_infos
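focal2fov above converts a focal length in pixels into a field-of-view angle. A minimal sketch under the standard pinhole relation (the repo defines it in utils/graphics_utils.py):

import math

def focal2fov(focal, pixels):
    # A sensor extent of `pixels` subtends an angle of 2 * atan((pixels / 2) / focal)
    return 2 * math.atan(pixels / (2 * focal))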
3 Model Construction
gaussians = GaussianModel(dataset.sh_degree)
def __init__(self, sh_degree: int):
    """
    Initialize the parameters of the 3D Gaussian model.
    :param sh_degree: maximum spherical-harmonics (SH) degree, controlling how complex the color representation can get.
    """
    # Current and maximum SH degree
    self.active_sh_degree = 0       # currently active SH degree, starts at 0
    self.max_sh_degree = sh_degree  # maximum allowed SH degree
    # Parameters of the 3D Gaussians
    self._xyz = torch.empty(0)            # centers (means) of the 3D Gaussians
    self._features_dc = torch.empty(0)    # DC (degree-0) SH coefficient, the base color
    self._features_rest = torch.empty(0)  # remaining SH coefficients, for view-dependent color detail
    self._scaling = torch.empty(0)        # scale parameters, controlling each Gaussian's extent
    self._rotation = torch.empty(0)       # rotation parameters, stored as quaternions
    self._opacity = torch.empty(0)        # opacities, controlling visibility
    self.max_radii2D = torch.empty(0)     # maximum 2D projected radius of each Gaussian
    self.xyz_gradient_accum = torch.empty(0)  # accumulated gradients of the Gaussian centers
    self.denom = torch.empty(0)           # accumulation counter: how many times each center's gradient was added; the denominator for averaging xyz_gradient_accum
    self.optimizer = None                 # optimizer that updates the parameters above
    # Set up the helper and activation functions
    self.setup_functions()

def setup_functions(self):
    """
    Define and initialize the functions used to process the 3D Gaussian parameters.
    """
    # Build the 3D covariance matrix of each Gaussian from its scale and rotation
    def build_covariance_from_scaling_rotation(scaling, scaling_modifier, rotation):
        L = build_scaling_rotation(scaling_modifier * scaling, rotation)
        actual_covariance = L @ L.transpose(1, 2)  # the actual covariance matrix
        symm = strip_symmetric(actual_covariance)  # keep only its unique (symmetric) entries
        return symm
    # Activation functions
    self.scaling_activation = torch.exp          # exp keeps the scales positive
    self.scaling_inverse_activation = torch.log  # inverse activation of the scales, used when writing them back
    self.covariance_activation = build_covariance_from_scaling_rotation  # covariance "activation"
    self.opacity_activation = torch.sigmoid      # sigmoid keeps opacities in (0, 1)
    self.inverse_opacity_activation = inverse_sigmoid  # inverse activation of the opacities
    self.rotation_activation = torch.nn.functional.normalize  # normalizes the rotation quaternions
def build_scaling_rotation(s, r):
    """
    Build the scale-rotation matrix of the 3D Gaussians.
    :param s: scale parameters.
    :param r: rotation parameters (quaternions).
    :return: the scale-rotation matrix L = R S.
    """
    L = torch.zeros((s.shape[0], 3, 3), dtype=torch.float, device="cuda")  # scale matrix
    R = build_rotation(r)  # rotation matrices
    # Put the scales on the diagonal
    L[:, 0, 0] = s[:, 0]
    L[:, 1, 1] = s[:, 1]
    L[:, 2, 2] = s[:, 2]
    L = R @ L  # apply the rotation
    return L

def strip_symmetric(sym):
    """
    Extract the unique entries of a symmetric covariance matrix.
    :param sym: covariance matrices.
    :return: their unique entries.
    """
    return strip_lowerdiag(sym)

def strip_lowerdiag(L):
    """
    Extract the six independent parameters of each covariance matrix.
    :param L: covariance matrices.
    :return: tensor of the six independent parameters.
    """
    uncertainty = torch.zeros((L.shape[0], 6), dtype=torch.float, device="cuda")
    # The upper-triangular entries of the symmetric matrix
    uncertainty[:, 0] = L[:, 0, 0]
    uncertainty[:, 1] = L[:, 0, 1]
    uncertainty[:, 2] = L[:, 0, 2]
    uncertainty[:, 3] = L[:, 1, 1]
    uncertainty[:, 4] = L[:, 1, 2]
    uncertainty[:, 5] = L[:, 2, 2]
    return uncertainty
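Taken together, covariance_activation returns the six unique entries of Σ = R S Sᵀ Rᵀ, which is symmetric and positive definite by construction. A small standalone sanity check of that identity (hypothetical code, not from the repo):

import torch

Q, _ = torch.linalg.qr(torch.randn(3, 3))  # a random orthogonal matrix, standing in for R
S = torch.diag(torch.rand(3) + 0.1)        # positive scales on the diagonal
L = Q @ S                                  # L = R S, as in build_scaling_rotation
Sigma = L @ L.T                            # = R S S^T R^T
assert torch.allclose(Sigma, Sigma.T, atol=1e-6)         # symmetric
assert (torch.linalg.eigvalsh(Sigma) > 0).all()          # positive definite
unique6 = Sigma[[0, 0, 0, 1, 1, 2], [0, 1, 2, 1, 2, 2]]  # the 6 entries strip_lowerdiag keeps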
def training_setup(self, training_args):
    """
    Set up training: initialize the gradient-accumulation buffers, configure the optimizer,
    and create the learning-rate scheduler.
    :param training_args: object holding the training hyperparameters.
    """
    # Fraction of the scene extent below which Gaussians count as "small" during densification
    self.percent_dense = training_args.percent_dense
    # Buffers that accumulate the gradients of the Gaussian centers; later used to decide
    # whether a Gaussian should be cloned or split
    self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
    self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
    # One optimizer parameter group per parameter, each with its own learning rate and name
    l = [
        {'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"},
        {'params': [self._features_dc], 'lr': training_args.feature_lr, "name": "f_dc"},
        {'params': [self._features_rest], 'lr': training_args.feature_lr / 20.0, "name": "f_rest"},
        {'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"},
        {'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"},
        {'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"}
    ]
    # Create the optimizer; Adam is used here
    self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
    # Create the learning-rate scheduler for the Gaussian centers
    self.xyz_scheduler_args = get_expon_lr_func(
        lr_init=training_args.position_lr_init * self.spatial_lr_scale,
        lr_final=training_args.position_lr_final * self.spatial_lr_scale,
        lr_delay_mult=training_args.position_lr_delay_mult,
        max_steps=training_args.position_lr_max_steps
    )
def get_expon_lr_func(lr_init, lr_final, lr_delay_steps=0, lr_delay_mult=1.0, max_steps=1000000):
    """
    Create a learning-rate schedule that adjusts the learning rate with training progress.
    :param lr_init: initial learning rate.
    :param lr_final: final learning rate.
    :param lr_delay_steps: number of delay steps during which the learning rate is damped.
    :param lr_delay_mult: delay multiplier, used to compute the damped initial learning rate.
    :param max_steps: maximum number of steps, used to normalize training progress.
    :return: a function mapping the current step to the scheduled learning rate.
    """
    def helper(step):
        # For negative steps or an all-zero schedule, return 0 (no optimization)
        if step < 0 or (lr_init == 0.0 and lr_final == 0.0):
            return 0.0
        # If delay steps are configured, compute the damped learning rate
        if lr_delay_steps > 0:
            delay_rate = lr_delay_mult + (1 - lr_delay_mult) * np.sin(
                0.5 * np.pi * np.clip(step / lr_delay_steps, 0, 1)
            )
        else:
            delay_rate = 1.0
        # Log-linear interpolation between lr_init and lr_final for a smooth decay
        t = np.clip(step / max_steps, 0, 1)
        log_lerp = np.exp(np.log(lr_init) * (1 - t) + np.log(lr_final) * t)
        # Return the scheduled learning rate
        return delay_rate * log_lerp
    return helper
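Because the interpolation happens in log space, the decay is geometric rather than linear: halfway through training, the learning rate is the geometric mean of the endpoints, not their average. With the position schedule's usual defaults (lr_init=1.6e-4, lr_final=1.6e-6, max_steps=30_000 — values quoted from the default config, worth double-checking against your version):

sched = get_expon_lr_func(lr_init=1.6e-4, lr_final=1.6e-6, max_steps=30_000)
sched(0)       # 1.6e-4
sched(15_000)  # 1.6e-5 = sqrt(1.6e-4 * 1.6e-6), the geometric mean
sched(30_000)  # 1.6e-6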
def create_from_pcd(self, pcd: BasicPointCloud, spatial_lr_scale: float):
    """
    Initialize the model parameters from point-cloud data.
    :param pcd: point cloud with point positions and colors.
    :param spatial_lr_scale: spatial learning-rate scale, which scales the position learning rate.
    """
    # Move the point positions and colors from numpy arrays to CUDA tensors
    self.spatial_lr_scale = spatial_lr_scale
    fused_point_cloud = torch.tensor(np.asarray(pcd.points)).float().cuda()  # (P, 3)
    fused_color = RGB2SH(torch.tensor(np.asarray(pcd.colors)).float().cuda())  # (P, 3)
    # Allocate the SH coefficients: (max_sh_degree + 1) ** 2 coefficients per color channel
    features = torch.zeros((fused_color.shape[0], 3, (self.max_sh_degree + 1) ** 2)).float().cuda()  # (P, 3, 16)
    features[:, :3, 0] = fused_color  # store the RGB-derived coefficients in the C0 (DC) slot
    features[:, 3:, 1:] = 0.0  # the remaining coefficients stay zero (features was created with torch.zeros)
    # Print the initial point count
    print("Number of points at initialisation : ", fused_point_cloud.shape[0])
    # Squared mean distance from each point to its nearest neighbors; sets the initial Gaussian scales
    dist2 = torch.clamp_min(distCUDA2(torch.from_numpy(np.asarray(pcd.points)).float().cuda()), 0.0000001)  # (P,)
    scales = torch.log(torch.sqrt(dist2))[..., None].repeat(1, 3)  # (P, 3)
    # Initialize every rotation to the identity quaternion (no rotation)
    rots = torch.zeros((fused_point_cloud.shape[0], 4), device="cuda")  # (P, 4)
    rots[:, 0] = 1  # real part 1 means no rotation
    # Initialize every opacity to 0.1 (stored through inverse_sigmoid)
    opacities = inverse_sigmoid(0.1 * torch.ones((fused_point_cloud.shape[0], 1), dtype=torch.float, device="cuda"))  # (P, 1)
    # Register everything computed above as trainable parameters
    self._xyz = nn.Parameter(fused_point_cloud.requires_grad_(True))  # positions
    self._features_dc = nn.Parameter(features[:, :, 0:1].transpose(1, 2).contiguous().requires_grad_(True))  # C0 SH coefficients
    self._features_rest = nn.Parameter(features[:, :, 1:].transpose(1, 2).contiguous().requires_grad_(True))  # remaining SH coefficients
    self._scaling = nn.Parameter(scales.requires_grad_(True))     # scales
    self._rotation = nn.Parameter(rots.requires_grad_(True))      # rotations
    self._opacity = nn.Parameter(opacities.requires_grad_(True))  # opacities
    self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")  # maximum 2D projected radii, initialized to 0

def RGB2SH(rgb):
    """
    Convert RGB color values into the C0 (DC) spherical-harmonic coefficient.
    :param rgb: RGB color values.
    :return: the corresponding C0 coefficients.
    """
    return (rgb - 0.5) / C0
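C0 here is the constant zeroth spherical-harmonic basis function, Y₀⁰ = 1/(2·√π) ≈ 0.2821, so the DC coefficient stores the color as an offset around mid-gray. The inverse mapping (SH2RGB in the repo's utils/sh_utils.py) is the corresponding one-liner:

C0 = 0.28209479177387814  # Y_0^0 = 1 / (2 * sqrt(pi))

def SH2RGB(sh):
    # Inverse of RGB2SH: recover an RGB value from the DC SH coefficient
    return sh * C0 + 0.5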
Adding or removing 3D Gaussians based on their gradients
def densify_and_prune(self, max_grad, min_opacity, extent, max_screen_size):
    """
    Densify and prune the 3D Gaussians.
    :param max_grad: gradient threshold above which a Gaussian is cloned or split.
    :param min_opacity: minimum opacity; Gaussians below it are deleted.
    :param extent: scene extent, used to judge whether a Gaussian is an appropriate size.
    :param max_screen_size: maximum screen-size threshold, used to prune oversized Gaussians.
    """
    # Average the accumulated center gradients and zero out NaNs
    grads = self.xyz_gradient_accum / self.denom
    grads[grads.isnan()] = 0.0
    # Clone or split based on the gradients and the size thresholds
    self.densify_and_clone(grads, max_grad, extent)
    self.densify_and_split(grads, max_grad, extent)
    # Build the prune mask for Gaussians that are no longer needed
    prune_mask = (self.get_opacity < min_opacity).squeeze()
    if max_screen_size:
        big_points_vs = self.max_radii2D > max_screen_size
        big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
        prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws)
    # Apply the prune mask
    self.prune_points(prune_mask)
    # Release cached CUDA memory
    torch.cuda.empty_cache()
def prune_points(self, mask):
    """
    Delete the 3D Gaussians that no longer qualify.
    :param mask: boolean tensor marking the Gaussians to delete.
    """
    # Build the mask of points to keep and update the optimizer's parameters
    valid_points_mask = ~mask
    optimizable_tensors = self._prune_optimizer(valid_points_mask)
    # Update the model parameters
    self._xyz = optimizable_tensors["xyz"]
    self._features_dc = optimizable_tensors["f_dc"]
    self._features_rest = optimizable_tensors["f_rest"]
    self._opacity = optimizable_tensors["opacity"]
    self._scaling = optimizable_tensors["scaling"]
    self._rotation = optimizable_tensors["rotation"]
    # Update the accumulated gradients and the other per-point buffers
    self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask]
    self.denom = self.denom[valid_points_mask]
    self.max_radii2D = self.max_radii2D[valid_points_mask]
def _prune_optimizer(self, mask):
    """
    Remove the pruned Gaussians' parameters from the optimizer.
    :param mask: boolean tensor marking the Gaussians to keep.
    :return: dict of the updated optimizable tensors.
    """
    optimizable_tensors = {}
    for group in self.optimizer.param_groups:
        stored_state = self.optimizer.state.get(group['params'][0], None)
        if stored_state is not None:
            # Shrink the optimizer state (the Adam moments) to the kept points
            stored_state["exp_avg"] = stored_state["exp_avg"][mask]
            stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask]
            # Drop the old state and re-register the masked parameter
            del self.optimizer.state[group['params'][0]]
            group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True)))
            self.optimizer.state[group['params'][0]] = stored_state
            optimizable_tensors[group["name"]] = group["params"][0]
        else:
            group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True))
            optimizable_tensors[group["name"]] = group["params"][0]
    return optimizable_tensors
def densify_and_clone(self, grads, grad_threshold, scene_extent):
    """
    Clone the 3D Gaussians whose gradient exceeds the threshold and whose scale is below the
    size threshold: such Gaussians under-reconstruct fine detail, so extra copies are added.
    """
    # Select the qualifying points
    selected_pts_mask = torch.where(torch.norm(grads, dim=-1) >= grad_threshold, True, False)
    selected_pts_mask = torch.logical_and(selected_pts_mask,
                                          torch.max(self.get_scaling, dim=1).values <= self.percent_dense * scene_extent)
    # Gather their attributes
    new_xyz = self._xyz[selected_pts_mask]                      # positions
    new_features_dc = self._features_dc[selected_pts_mask]      # DC component (base color)
    new_features_rest = self._features_rest[selected_pts_mask]  # remaining SH components
    new_opacities = self._opacity[selected_pts_mask]            # opacities
    new_scaling = self._scaling[selected_pts_mask]              # scales
    new_rotation = self._rotation[selected_pts_mask]            # rotations
    # Append the cloned Gaussians to the model
    self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation)
def densify_and_split(self, grads, grad_threshold, scene_extent, N=2):
    """
    Split the 3D Gaussians whose gradient exceeds the threshold and whose scale exceeds the
    size threshold: such Gaussians are too large and cover too much space, so they are split
    into smaller ones to recover detail.
    """
    # Initialization
    n_init_points = self.get_xyz.shape[0]
    padded_grad = torch.zeros((n_init_points), device="cuda")
    padded_grad[:grads.shape[0]] = grads.squeeze()
    # Select the qualifying points
    selected_pts_mask = torch.where(padded_grad >= grad_threshold, True, False)
    selected_pts_mask = torch.logical_and(selected_pts_mask,
                                          torch.max(self.get_scaling, dim=1).values > self.percent_dense * scene_extent)
    # Compute the attributes of the new Gaussians
    stds = self.get_scaling[selected_pts_mask].repeat(N, 1)  # scales
    means = torch.zeros((stds.size(0), 3), device="cuda")    # means (centers of the sampling distribution)
    samples = torch.normal(mean=means, std=stds)             # sample new positions
    rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N, 1, 1)  # rotations
    # Compute the new positions
    new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1)
    # Shrink the scales and carry over the other attributes
    new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N, 1) / (0.8 * N))
    new_rotation = self._rotation[selected_pts_mask].repeat(N, 1)
    new_features_dc = self._features_dc[selected_pts_mask].repeat(N, 1, 1)
    new_features_rest = self._features_rest[selected_pts_mask].repeat(N, 1, 1)
    new_opacity = self._opacity[selected_pts_mask].repeat(N, 1)
    # Append the split Gaussians to the model
    self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation)
    # Delete the original, oversized Gaussians
    prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool)))
    self.prune_points(prune_filter)
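Note the shape bookkeeping in prune_filter: by the time prune_points runs, densification_postfix has already appended N · selected_pts_mask.sum() new Gaussians, so the mask must cover the enlarged point set. The original selection marks the oversized parents for deletion, while the appended zeros keep every freshly created child.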
def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation):
    """
    Append the attributes of the newly created 3D Gaussians to the model's parameters.
    """
    d = {"xyz": new_xyz,
         "f_dc": new_features_dc,
         "f_rest": new_features_rest,
         "opacity": new_opacities,
         "scaling": new_scaling,
         "rotation": new_rotation}
    optimizable_tensors = self.cat_tensors_to_optimizer(d)
    self._xyz = optimizable_tensors["xyz"]
    self._features_dc = optimizable_tensors["f_dc"]
    self._features_rest = optimizable_tensors["f_rest"]
    self._opacity = optimizable_tensors["opacity"]
    self._scaling = optimizable_tensors["scaling"]
    self._rotation = optimizable_tensors["rotation"]
    self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
    self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
    self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
def cat_tensors_to_optimizer(self, tensors_dict):
    """
    Concatenate the new parameter tensors onto the optimizer's parameter groups.
    """
    optimizable_tensors = {}
    for group in self.optimizer.param_groups:
        extension_tensor = tensors_dict[group["name"]]
        stored_state = self.optimizer.state.get(group['params'][0], None)
        if stored_state is not None:
            # Extend the Adam moments with zeros for the new points
            stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0)
            stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0)
            del self.optimizer.state[group['params'][0]]
            group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
            self.optimizer.state[group['params'][0]] = stored_state
            optimizable_tensors[group["name"]] = group["params"][0]
        else:
            group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
            optimizable_tensors[group["name"]] = group["params"][0]
    return optimizable_tensors
def build_rotation(r):
    """
    Build rotation matrices from rotation quaternions.
    """
    norm = torch.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])
    q = r / norm[:, None]
    R = torch.zeros((q.size(0), 3, 3), device='cuda')
    # Fill in the entries of the rotation matrices
    ...
    return R
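The elided entries follow the standard unit-quaternion-to-rotation-matrix formula; the CUDA computeCov3D later in this post spells out exactly the same matrix. As a sketch, with r, x, y, z = q[:, 0], q[:, 1], q[:, 2], q[:, 3]:

R[:, 0, 0] = 1 - 2 * (y*y + z*z)
R[:, 0, 1] = 2 * (x*y - r*z)
R[:, 0, 2] = 2 * (x*z + r*y)
R[:, 1, 0] = 2 * (x*y + r*z)
R[:, 1, 1] = 1 - 2 * (x*x + z*z)
R[:, 1, 2] = 2 * (y*z - r*x)
R[:, 2, 0] = 2 * (x*z - r*y)
R[:, 2, 1] = 2 * (y*z + r*x)
R[:, 2, 2] = 1 - 2 * (x*x + y*y)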
Learning-rate update
def update_learning_rate(self, iteration):
    """
    Adjust the learning rate of the xyz parameters according to the current iteration.
    :param iteration: the current iteration number.
    """
    # Iterate over the optimizer's parameter groups
    for param_group in self.optimizer.param_groups:
        # Find the group named "xyz", i.e. the Gaussian center positions
        if param_group["name"] == "xyz":
            # Query the scheduler (a function mapping an iteration to a learning rate)
            lr = self.xyz_scheduler_args(iteration)
            # Apply the learning rate to the xyz group
            param_group['lr'] = lr
            # Return the new learning rate, e.g. for logging
            return lr
Resetting opacity
def reset_opacity(self):
    """
    Reset the opacities. Every opacity is clamped down to a small (but nonzero) value,
    which counteracts opacity saturation during training.
    """
    # inverse_sigmoid keeps the new opacities in a valid range; the cap is 0.01
    opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity) * 0.01))
    # Update the opacity parameter inside the optimizer
    optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity")
    # Write the updated opacity parameter back into the model
    self._opacity = optimizable_tensors["opacity"]

def replace_tensor_to_optimizer(self, tensor, name):
    """
    Replace a parameter tensor inside the optimizer. Used to overwrite certain model
    parameters (e.g. the opacities) while making sure the optimizer picks up the new values.
    :param tensor: the new parameter tensor.
    :param name: the parameter's name, used to locate it among the parameter groups.
    :return: dict containing the updated parameter.
    """
    optimizable_tensors = {}
    for group in self.optimizer.param_groups:
        # Locate the parameter group with the given name
        if group["name"] == name:
            # Fetch and reset the optimizer state (the Adam moments)
            stored_state = self.optimizer.state.get(group['params'][0], None)
            stored_state["exp_avg"] = torch.zeros_like(tensor)     # reset the first moment
            stored_state["exp_avg_sq"] = torch.zeros_like(tensor)  # reset the second moment
            # Drop the old state and register the new tensor as an optimizable parameter
            del self.optimizer.state[group['params'][0]]
            group["params"][0] = nn.Parameter(tensor.requires_grad_(True))
            self.optimizer.state[group['params'][0]] = stored_state
            # Store the updated parameter
            optimizable_tensors[group["name"]] = group["params"][0]
    return optimizable_tensors
4 Image Rendering
def render(viewpoint_camera, pc: GaussianModel, pipe, bg_color: torch.Tensor, scaling_modifier=1.0, override_color=None):
    """
    Render the scene.
    :param viewpoint_camera: the viewpoint camera, with FoV, image size and transform matrices.
    :param pc: the 3D Gaussian model representing the scene's point cloud.
    :param pipe: rendering pipeline configuration, e.g. debug flags.
    :param bg_color: background color; must be a tensor on the GPU.
    :param scaling_modifier: optional scale modifier applied to the 3D Gaussians.
    :param override_color: optional override color; if given, every Gaussian uses it instead of its own color.
    """
    # Create a zero tensor with requires_grad=True; after the backward pass it holds
    # the gradients of the screen-space means of the 3D Gaussians
    screenspace_points = torch.zeros_like(pc.get_xyz, dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0
    try:
        screenspace_points.retain_grad()
    except:
        pass
    # Set up the rasterization configuration
    tanfovx = math.tan(viewpoint_camera.FoVx * 0.5)  # tangent of half the horizontal FoV
    tanfovy = math.tan(viewpoint_camera.FoVy * 0.5)  # tangent of half the vertical FoV
    # Initialize the rasterization settings
    raster_settings = GaussianRasterizationSettings(
        image_height=int(viewpoint_camera.image_height),
        image_width=int(viewpoint_camera.image_width),
        tanfovx=tanfovx,
        tanfovy=tanfovy,
        bg=bg_color,
        scale_modifier=scaling_modifier,
        viewmatrix=viewpoint_camera.world_view_transform,
        projmatrix=viewpoint_camera.full_proj_transform,
        sh_degree=pc.active_sh_degree,
        campos=viewpoint_camera.camera_center,
        prefiltered=False,
        debug=pipe.debug
    )
    # Initialize the rasterizer
    rasterizer = GaussianRasterizer(raster_settings=raster_settings)
    # Fetch the 3D Gaussian centers, screen-space means and opacities
    means3D = pc.get_xyz
    means2D = screenspace_points
    opacity = pc.get_opacity
    # Either use a precomputed 3D covariance, or let the rasterizer compute it from scale/rotation.
    # The None initializations (elided in the original excerpt, present in the source) make sure
    # the unused arguments are defined at the rasterizer call below.
    scales = None
    rotations = None
    cov3D_precomp = None
    if pipe.compute_cov3D_python:
        cov3D_precomp = pc.get_covariance(scaling_modifier)
    else:
        scales = pc.get_scaling
        rotations = pc.get_rotation
    # Either use precomputed colors, or let the rasterizer evaluate the SH coefficients
    shs = None
    colors_precomp = None
    if override_color is None:
        if pipe.convert_SHs_python:
            # Convert the SH coefficients to RGB colors in Python
            ...
        else:
            shs = pc.get_features
    else:
        colors_precomp = override_color
    # Rasterize the visible 3D Gaussians into an image and obtain their screen-space radii
    rendered_image, radii = rasterizer(
        means3D=means3D,
        means2D=means2D,
        shs=shs,
        colors_precomp=colors_precomp,
        opacities=opacity,
        scales=scales,
        rotations=rotations,
        cov3D_precomp=cov3D_precomp)
    # Return the rendered image, the screen-space points, a visibility filter and the radii
    return {"render": rendered_image,
            "viewspace_points": screenspace_points,
            "visibility_filter": radii > 0,
            "radii": radii}
The module used to rasterize the 3D Gaussians
def __init__(self, raster_settings):
    """
    Initialize the rasterizer instance.
    :param raster_settings: rasterization settings: image height and width, FoV, background color, view and projection matrices, etc.
    """
    super().__init__()
    self.raster_settings = raster_settings  # store the rasterization settings

def markVisible(self, positions):
    """
    Mark whether the given positions lie inside the camera frustum, i.e. whether each
    point is visible to the camera (frustum culling).
    :param positions: point positions, typically the 3D Gaussian centers.
    :return: a boolean tensor, one entry per point.
    """
    with torch.no_grad():  # no gradients needed; this only tests visibility
        raster_settings = self.raster_settings
        # Call a C++/CUDA function to compute visibility quickly
        visible = _C.mark_visible(
            positions,
            raster_settings.viewmatrix,
            raster_settings.projmatrix)
    return visible
def forward(self, means3D, means2D, opacities, shs=None, colors_precomp=None, scales=None, rotations=None, cov3D_precomp=None):
    """
    Forward pass of the rasterizer: render the 3D Gaussians into a 2D image.
    :param means3D: centers of the 3D Gaussians.
    :param means2D: expected screen-space positions of the Gaussians, used for gradient propagation.
    :param opacities: opacities of the Gaussians.
    :param shs: SH coefficients, used to compute view-dependent color.
    :param colors_precomp: precomputed colors.
    :param scales: scale parameters of the Gaussians.
    :param rotations: rotation parameters of the Gaussians.
    :param cov3D_precomp: precomputed 3D covariance matrices.
    :return: the rasterized 2D image.
    """
    raster_settings = self.raster_settings
    # Validate the inputs
    if (shs is None and colors_precomp is None) or (shs is not None and colors_precomp is not None):
        raise Exception('Please provide exactly one of either SHs or precomputed colors!')
    if ((scales is None or rotations is None) and cov3D_precomp is None) or ((scales is not None or rotations is not None) and cov3D_precomp is not None):
        raise Exception('Please provide exactly one of either scale/rotation pair or precomputed 3D covariance!')
    # Replace missing arguments with empty tensors
    if shs is None:
        shs = torch.Tensor([])
    if colors_precomp is None:
        colors_precomp = torch.Tensor([])
    if scales is None:
        scales = torch.Tensor([])
    if rotations is None:
        rotations = torch.Tensor([])
    if cov3D_precomp is None:
        cov3D_precomp = torch.Tensor([])
    # Invoke the C++/CUDA rasterization routine to render the image
    return rasterize_gaussians(
        means3D,
        means2D,
        shs,
        colors_precomp,
        opacities,
        scales,
        rotations,
        cov3D_precomp,
        raster_settings,
    )
def rasterize_gaussians(
    means3D,
    means2D,
    sh,
    colors_precomp,
    opacities,
    scales,
    rotations,
    cov3Ds_precomp,
    raster_settings,
):
    return _RasterizeGaussians.apply(
        means3D,
        means2D,
        sh,
        colors_precomp,
        opacities,
        scales,
        rotations,
        cov3Ds_precomp,
        raster_settings,
    )
class _RasterizeGaussians(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        means3D,
        means2D,
        sh,
        colors_precomp,
        opacities,
        scales,
        rotations,
        cov3Ds_precomp,
        raster_settings,
    ):
        # Restructure arguments the way that the C++ lib expects them
        args = (
            raster_settings.bg,              # (3,) [0., 0., 0.] background color
            means3D,                         # (P, 3) XYZ mean of each 3D Gaussian
            colors_precomp,                  # precomputed color of each 3D Gaussian, if any
            opacities,                       # (P, 1) opacity of each 3D Gaussian, initialized to 0.1
            scales,                          # (P, 3) XYZ scale of each 3D Gaussian
            rotations,                       # (P, 4) [1., 0., 0., 0.] rotation quaternion of each 3D Gaussian
            raster_settings.scale_modifier,  # 1.0
            cov3Ds_precomp,                  # precomputed covariance matrix of each 3D Gaussian, if any
            raster_settings.viewmatrix,      # (4, 4) camera extrinsics, world to camera
            raster_settings.projmatrix,      # (4, 4) full projection matrix, world to image (view and projection combined, not just the intrinsics)
            raster_settings.tanfovx,         # e.g. 0.841174315841308, tangent of half the horizontal FoV
            raster_settings.tanfovy,         # e.g. 0.4717713779864031, tangent of half the vertical FoV
            raster_settings.image_height,    # e.g. 546, image height
            raster_settings.image_width,     # e.g. 979, image width
            sh,                              # (P, 16, 3) SH coefficients per 3D Gaussian: 16 coefficients for each of R, G, B
            raster_settings.sh_degree,       # SH degree: 0 -> 1 -> 2 -> 3, starts at 0 and is raised by one every 1000 iterations
            raster_settings.campos,          # (3,) e.g. [-3.9848, -0.3486, 0.1481], center of the current viewpoint camera
            raster_settings.prefiltered,     # False
            raster_settings.debug            # False
        )
        # Invoke C++/CUDA rasterizer
        if raster_settings.debug:
            cpu_args = cpu_deep_copy_tuple(args)  # Copy them before they can be corrupted
            try:
                num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args)
            except Exception as ex:
                torch.save(cpu_args, "snapshot_fw.dump")
                print("\nAn error occured in forward. Please forward snapshot_fw.dump for debugging.")
                raise ex
        else:
            num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args)
        # Keep relevant tensors for backward
        ctx.raster_settings = raster_settings
        ctx.num_rendered = num_rendered
        ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer)
        return color, radii
    @staticmethod
    def backward(ctx, grad_out_color, _):
        # Restore necessary values from context
        num_rendered = ctx.num_rendered
        raster_settings = ctx.raster_settings
        colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, sh, geomBuffer, binningBuffer, imgBuffer = ctx.saved_tensors
        # Restructure args as C++ method expects them
        args = (raster_settings.bg,
                means3D,
                radii,
                colors_precomp,
                scales,
                rotations,
                raster_settings.scale_modifier,
                cov3Ds_precomp,
                raster_settings.viewmatrix,
                raster_settings.projmatrix,
                raster_settings.tanfovx,
                raster_settings.tanfovy,
                grad_out_color,
                sh,
                raster_settings.sh_degree,
                raster_settings.campos,
                geomBuffer,
                num_rendered,
                binningBuffer,
                imgBuffer,
                raster_settings.debug)
        # Compute gradients for relevant tensors by invoking backward method
        if raster_settings.debug:
            cpu_args = cpu_deep_copy_tuple(args)  # Copy them before they can be corrupted
            try:
                grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args)
            except Exception as ex:
                torch.save(cpu_args, "snapshot_bw.dump")
                print("\nAn error occured in backward. Writing snapshot_bw.dump for debugging.\n")
                raise ex
        else:
            grad_means2D, grad_colors_precomp, grad_opacities, grad_means3D, grad_cov3Ds_precomp, grad_sh, grad_scales, grad_rotations = _C.rasterize_gaussians_backward(*args)
        grads = (
            grad_means3D,
            grad_means2D,
            grad_sh,
            grad_colors_precomp,
            grad_opacities,
            grad_scales,
            grad_rotations,
            grad_cov3Ds_precomp,
            None,
        )
        return grads
CUDA implementation of the image rendering
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("rasterize_gaussians", &RasterizeGaussiansCUDA);
    m.def("rasterize_gaussians_backward", &RasterizeGaussiansBackwardCUDA);
    m.def("mark_visible", &markVisible);
}

// Returns a lambda that resizes a torch::Tensor to N bytes and hands back a raw
// pointer to its data; used below to grow the auxiliary buffers on demand.
std::function<char* (size_t N)> resizeFunctional(torch::Tensor& t) {
    auto lambda = [&t](size_t N) {
        t.resize_({(long long)N});
        return reinterpret_cast<char*>(t.contiguous().data_ptr());
    };
    return lambda;
}
std::tuple<int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
RasterizeGaussiansCUDA(
    const torch::Tensor& background,
    const torch::Tensor& means3D,
    const torch::Tensor& colors,
    const torch::Tensor& opacity,
    const torch::Tensor& scales,
    const torch::Tensor& rotations,
    const float scale_modifier,
    const torch::Tensor& cov3D_precomp,
    const torch::Tensor& viewmatrix,
    const torch::Tensor& projmatrix,
    const float tan_fovx,
    const float tan_fovy,
    const int image_height,
    const int image_width,
    const torch::Tensor& sh,
    const int degree,
    const torch::Tensor& campos,
    const bool prefiltered,
    const bool debug)
{
    if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
        AT_ERROR("means3D must have dimensions (num_points, 3)");
    }
    const int P = means3D.size(0);
    const int H = image_height;
    const int W = image_width;
    auto int_opts = means3D.options().dtype(torch::kInt32);
    auto float_opts = means3D.options().dtype(torch::kFloat32);
    torch::Tensor out_color = torch::full({NUM_CHANNELS, H, W}, 0.0, float_opts); // (3, H, W), the image obtained by projecting and rendering all 3D Gaussians from the given viewpoint
    torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32)); // (P,)
    torch::Device device(torch::kCUDA);
    torch::TensorOptions options(torch::kByte);
    torch::Tensor geomBuffer = torch::empty({0}, options.device(device));    // (0,), buffer for the per-Gaussian state (means, scales, rotations, opacities); grown on demand
    torch::Tensor binningBuffer = torch::empty({0}, options.device(device)); // (0,), buffer for the sorting/binning state; grown on demand
    torch::Tensor imgBuffer = torch::empty({0}, options.device(device));     // (0,), buffer for the per-pixel state of the rendered image; grown on demand
    std::function<char*(size_t)> geomFunc = resizeFunctional(geomBuffer);       // resizes geomBuffer on demand and returns its data pointer
    std::function<char*(size_t)> binningFunc = resizeFunctional(binningBuffer); // resizes binningBuffer on demand and returns its data pointer
    std::function<char*(size_t)> imgFunc = resizeFunctional(imgBuffer);         // resizes imgBuffer on demand and returns its data pointer
    int rendered = 0;
    if(P != 0)
    {
        int M = 0;
        if(sh.size(0) != 0)
        {
            M = sh.size(1);
        }
        rendered = CudaRasterizer::Rasterizer::forward(
            geomFunc,
            binningFunc,
            imgFunc,
            P, degree, M,                             // number of Gaussians, SH degree, number of SH coefficients (SH encodes color)
            background.contiguous().data<float>(),    // background color, [0, 0, 0]
            W, H,                                     // image width and height
            means3D.contiguous().data<float>(),       // XYZ mean of each 3D Gaussian
            sh.contiguous().data_ptr<float>(),        // SH coefficients of each 3D Gaussian (color)
            colors.contiguous().data<float>(),        // precomputed colors, [] if unused
            opacity.contiguous().data<float>(),       // opacity of each 3D Gaussian
            scales.contiguous().data_ptr<float>(),    // XYZ scale of each 3D Gaussian
            scale_modifier,                           // scale modifier, 1.0
            rotations.contiguous().data_ptr<float>(), // rotation quaternion of each 3D Gaussian
            cov3D_precomp.contiguous().data<float>(), // precomputed covariance matrices, [] if unused
            viewmatrix.contiguous().data<float>(),    // camera extrinsics, world to camera
            projmatrix.contiguous().data<float>(),    // projection matrix, world to image
            campos.contiguous().data<float>(),        // XYZ position of the current camera center
            tan_fovx,                                 // tangent of half the horizontal FoV
            tan_fovy,                                 // tangent of half the vertical FoV
            prefiltered,                              // whether Gaussians whose centers (means) lie outside the frustum were already filtered out, False
            out_color.contiguous().data<float>(),     // output: the rendered image
            radii.contiguous().data<int>(),           // output: radius of each 2D Gaussian on the image
            debug);                                   // False
    }
    return std::make_tuple(rendered, out_color, radii, geomBuffer, binningBuffer, imgBuffer);
}
// Forward rendering procedure for differentiable rasterization
// of Gaussians.
int CudaRasterizer::Rasterizer::forward(
    std::function<char* (size_t)> geometryBuffer,
    std::function<char* (size_t)> binningBuffer,
    std::function<char* (size_t)> imageBuffer,
    const int P, int D, int M,
    const float* background,
    const int width, int height,
    const float* means3D,
    const float* shs,
    const float* colors_precomp,
    const float* opacities,
    const float* scales,
    const float scale_modifier,
    const float* rotations,
    const float* cov3D_precomp,
    const float* viewmatrix,
    const float* projmatrix,
    const float* cam_pos,
    const float tan_fovx, float tan_fovy,
    const bool prefiltered,
    float* out_color,
    int* radii,
    bool debug)
{
    const float focal_y = height / (2.0f * tan_fovy); // vertical focal length focal_y
    const float focal_x = width / (2.0f * tan_fovx);  // horizontal focal length focal_x
    size_t chunk_size = required<GeometryState>(P); // bytes needed to store the per-Gaussian parameters
    char* chunkptr = geometryBuffer(chunk_size);    // allocate that storage and return a pointer to it
    GeometryState geomState = GeometryState::fromChunk(chunkptr, P); // carve the chunk into the GeometryState members and return the initialized instance
    if (radii == nullptr)
    {
        radii = geomState.internal_radii; // fall back to the internal radii buffer
    }
    // A 3D grid (dim3 is CUDA's type for grid dimensions) determining how many blocks are
    // needed horizontally and vertically to cover the whole render target
    dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
    // Threads per block in X (horizontal) and Y (vertical)
    dim3 block(BLOCK_X, BLOCK_Y, 1);
    // Dynamically resize image-based auxiliary buffers during training
    size_t img_chunk_size = required<ImageState>(width * height); // bytes needed to store the per-pixel parameters
    char* img_chunkptr = imageBuffer(img_chunk_size);             // allocate that storage and return a pointer to it
    ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height); // carve the chunk into the ImageState members and return the initialized instance
    if (NUM_CHANNELS != 3 && colors_precomp == nullptr)
    {
        throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!");
    }
    // Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
    CHECK_CUDA(FORWARD::preprocess(
        P, D, M,               // number of Gaussians, SH degree, number of SH coefficients (SH encodes color)
        means3D,               // XYZ mean of each 3D Gaussian
        (glm::vec3*)scales,    // XYZ scale of each 3D Gaussian
        scale_modifier,        // scale modifier, 1.0
        (glm::vec4*)rotations, // rotation quaternion of each 3D Gaussian
        opacities,             // opacity of each 3D Gaussian
        shs,                   // SH coefficients of each 3D Gaussian (color)
        geomState.clamped,     // whether each Gaussian's R, G, B got clamped below 0
        cov3D_precomp,         // precomputed covariance matrices, [] if unused
        colors_precomp,        // precomputed colors, [] if unused
        viewmatrix,            // camera extrinsics, world to camera
        projmatrix,            // projection matrix, world to image
        (glm::vec3*)cam_pos,   // XYZ position of the current camera center
        width, height,         // image width and height
        focal_x, focal_y,      // horizontal / vertical focal lengths
        tan_fovx, tan_fovy,    // tangents of half the horizontal / vertical FoV
        radii,                 // output: radius of each 2D Gaussian on the image
        geomState.means2D,     // output: mean of each 2D Gaussian
        geomState.depths,      // output: depth of each 2D Gaussian
        geomState.cov3D,       // output: covariance matrix of each 3D Gaussian
        geomState.rgb,         // output: RGB color of each Gaussian
        geomState.conic_opacity, // output: inverse of each 2D covariance matrix plus the opacity, packed
        tile_grid,             // how many tiles cover the render target
        geomState.tiles_touched, // output: how many tiles each 2D Gaussian overlaps
        prefiltered            // whether Gaussians whose centers (means) lie outside the frustum were already filtered out
    ), debug)
    // Compute prefix sum over full list of touched tile counts by Gaussians
    // E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8]
    CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug)
    // Retrieve total number of Gaussian instances to launch and resize aux buffers
    int num_rendered; // total number of tiles covered by all 2D Gaussians combined
    // Copy the last element of geomState.point_offsets into host memory
    CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug);
    size_t binning_chunk_size = required<BinningState>(num_rendered);
    char* binning_chunkptr = binningBuffer(binning_chunk_size);
    BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered);
    // Store each Gaussian's [tile index | depth] keys in point_list_keys_unsorted,
    // and each Gaussian's index (which Gaussian it is) in point_list_unsorted
    // For each instance to be rendered, produce adequate [ tile | depth ] key
    // and corresponding duplicated Gaussian indices to be sorted
    duplicateWithKeys<<<(P + 255) / 256, 256>>>(
        P,
        geomState.means2D,
        geomState.depths,
        geomState.point_offsets,
        binningState.point_list_keys_unsorted,
        binningState.point_list_unsorted,
        radii,
        tile_grid)
    CHECK_CUDA(, debug)
    int bit = getHigherMsb(tile_grid.x * tile_grid.y);
    // Sort the key/value pairs formed by binningState.point_list_keys_unsorted and
    // binningState.point_list_unsorted; the sorted results land in point_list_keys and point_list.
    // list_sorting_space and sorting_size provide the scratch space the sort needs.
    // num_rendered is the number of elements; 0 and 32 + bit are the lowest and highest key bits
    // to consider, enough to cover the 32 depth bits plus the tile-ID bits.
    // Sort complete list of (duplicated) Gaussian indices by keys
    CHECK_CUDA(cub::DeviceRadixSort::SortPairs(
        binningState.list_sorting_space,
        binningState.sorting_size,
        binningState.point_list_keys_unsorted, binningState.point_list_keys,
        binningState.point_list_unsorted, binningState.point_list,
        num_rendered, 0, 32 + bit), debug)
    // Zero out all elements of imgState.ranges
    CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug);
    // Identify each tile's range in the sorted list of Gaussian IDs,
    // i.e. which Gaussian IDs belong to which tile, and where each tile starts and ends
    // Identify start and end of per-tile workloads in sorted list
    if (num_rendered > 0)
        identifyTileRanges<<<(num_rendered + 255) / 256, 256>>>(
            num_rendered,
            binningState.point_list_keys,
            imgState.ranges);
    CHECK_CUDA(, debug)
    // Let each tile blend its range of Gaussians independently in parallel
    const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb;
    CHECK_CUDA(FORWARD::render(
        tile_grid,               // how many tiles cover the render target
        block,                   // threads per block in X (horizontal) and Y (vertical)
        imgState.ranges,         // each tile's range in the sorted Gaussian-ID list
        binningState.point_list, // the sorted list of 3D Gaussian IDs
        width, height,           // image width and height
        geomState.means2D,       // the image-space center of each 2D Gaussian
        feature_ptr,             // the RGB color of each Gaussian
        geomState.conic_opacity, // inverse of each 2D covariance matrix plus the opacity, packed
        imgState.accum_alpha,    // output: final transmittance of each pixel after rendering
        imgState.n_contrib,      // output: index of the last 2D Gaussian contributing to each pixel
        background,              // background color
        out_color), debug)       // output image
    return num_rendered;
}
// Generates one key/value pair for all Gaussian / tile overlaps.
// Run once per Gaussian (1:N mapping).
__global__ void duplicateWithKeys(
    int P,
    const float2* points_xy,
    const float* depths,
    const uint32_t* offsets,
    uint64_t* gaussian_keys_unsorted,
    uint32_t* gaussian_values_unsorted,
    int* radii,
    dim3 grid)
{
    auto idx = cg::this_grid().thread_rank();
    if (idx >= P)
        return;
    if (radii[idx] > 0)
    {
        uint32_t off = (idx == 0) ? 0 : offsets[idx - 1];
        uint2 rect_min, rect_max;
        getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid);
        // Emit one key/value pair for every tile the bounding rectangle overlaps.
        // The key is | tile ID | depth |, the value is the Gaussian's ID; sorting the
        // values by this key yields a list of Gaussian IDs ordered first by tile,
        // then by depth.
        for (int y = rect_min.y; y < rect_max.y; y++)
        {
            for (int x = rect_min.x; x < rect_max.x; x++)
            {
                uint64_t key = y * grid.x + x;
                key <<= 32;
                key |= *((uint32_t*)&depths[idx]);
                gaussian_keys_unsorted[off] = key;
                gaussian_values_unsorted[off] = idx;
                off++;
            }
        }
    }
}
// Compute which tiles the current 2D Gaussian falls on
__forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
{
    rect_min = {
        min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
        min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y)))
    };
    rect_max = {
        min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))),
        min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y)))
    };
}
// Find the next-highest bit above the most significant bit (MSB) of the unsigned integer n
// Helper function to find the next-highest bit of the MSB
// on the CPU.
uint32_t getHigherMsb(uint32_t n)
{
    uint32_t msb = sizeof(n) * 4;
    uint32_t step = msb;
    while (step > 1)
    {
        step /= 2;
        if (n >> msb)
            msb += step;
        else
            msb -= step;
    }
    if (n >> msb)
        msb++;
    return msb;
}
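As a concrete example: for a 979×546 image with the default 16×16 tiles, tile_grid is 62×35 = 2170 tiles, and getHigherMsb(2170) returns 12. The radix sort above therefore only inspects bits [0, 32 + 12) of each 64-bit key — the 32 depth bits plus the 12 bits that can hold any tile ID — instead of all 64, which saves sorting passes.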
// Identify each tile's range in the sorted list of Gaussian IDs,
// i.e. which Gaussian IDs belong to which tile, and where each tile starts and ends.
// Check keys to see if it is at the start/end of one tile's range in
// the full sorted list. If yes, write start/end of this tile.
// Run once per instanced (duplicated) Gaussian ID.
__global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges)
{
    auto idx = cg::this_grid().thread_rank();
    if (idx >= L)
        return;
    // Read tile ID from key. Update start/end of tile range if at limit.
    uint64_t key = point_list_keys[idx];
    uint32_t currtile = key >> 32;
    if (idx == 0)
        ranges[currtile].x = 0;
    else
    {
        uint32_t prevtile = point_list_keys[idx - 1] >> 32;
        if (currtile != prevtile)
        {
            ranges[prevtile].y = idx;
            ranges[currtile].x = idx;
        }
    }
    if (idx == L - 1)
        ranges[currtile].y = L;
}
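A small worked example: if the tile IDs extracted from the sorted keys are [2, 2, 5, 5, 5, 7], the kernel writes ranges[2] = (0, 2), ranges[5] = (2, 5) and ranges[7] = (5, 6), i.e. each tile's half-open [start, end) span inside point_list; tiles touched by no Gaussian keep the (0, 0) range set by the preceding cudaMemset.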
// Compute the memory size needed to store data of type T.
// It simulates the allocation by calling T::fromChunk with a null pointer:
// the pointer arithmetic inside yields the total required size, plus an
// extra 128 bytes of headroom for alignment.
template <typename T>
size_t required(size_t P)
{
    char* size = nullptr;
    T::fromChunk(size, P);
    return ((size_t)size) + 128;
}

// Allocate space for an array (ptr) inside a preallocated memory block (chunk).
// chunk: reference to the current position inside the block; ptr: reference to the array pointer;
// count: number of elements; alignment: alignment requirement.
// First compute the aligned offset for ptr, then advance chunk to the next free position.
template <typename T>
static void obtain(char*& chunk, T*& ptr, std::size_t count, std::size_t alignment)
{
    std::size_t offset = (reinterpret_cast<std::uintptr_t>(chunk) + alignment - 1) & ~(alignment - 1);
    ptr = reinterpret_cast<T*>(offset);
    chunk = reinterpret_cast<char*>(ptr + count);
}
// Struct holding the per-Gaussian state
struct GeometryState
{
    size_t scan_size;
    float* depths;
    char* scanning_space;
    bool* clamped;
    int* internal_radii;
    float2* means2D;
    float* cov3D;
    float4* conic_opacity;
    float* rgb;
    uint32_t* point_offsets;
    uint32_t* tiles_touched;
    static GeometryState fromChunk(char*& chunk, size_t P);
};
// Initialize a GeometryState inside the given memory block.
// chunk: reference to a pointer into the block; P: number of elements.
// Uses obtain to carve out space for each member and returns the initialized instance.
CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P)
{
    GeometryState geom;
    obtain(chunk, geom.depths, P, 128);
    obtain(chunk, geom.clamped, P * 3, 128);
    obtain(chunk, geom.internal_radii, P, 128);
    obtain(chunk, geom.means2D, P, 128);
    obtain(chunk, geom.cov3D, P * 6, 128);
    obtain(chunk, geom.conic_opacity, P, 128);
    obtain(chunk, geom.rgb, P * 3, 128);
    obtain(chunk, geom.tiles_touched, P, 128);
    cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P);
    obtain(chunk, geom.scanning_space, geom.scan_size, 128);
    obtain(chunk, geom.point_offsets, P, 128);
    return geom;
}
struct ImageState
{
    uint2* ranges;
    uint32_t* n_contrib;
    float* accum_alpha;
    static ImageState fromChunk(char*& chunk, size_t N);
};

CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N)
{
    ImageState img;
    obtain(chunk, img.accum_alpha, N, 128);
    obtain(chunk, img.n_contrib, N, 128);
    obtain(chunk, img.ranges, N, 128);
    return img;
}
struct BinningState
{
    size_t sorting_size;                // size of the scratch buffer the sort needs
    uint64_t* point_list_keys_unsorted; // unsorted keys
    uint64_t* point_list_keys;          // sorted keys
    uint32_t* point_list_unsorted;      // unsorted Gaussian IDs
    uint32_t* point_list;               // sorted Gaussian IDs
    char* list_sorting_space;           // scratch buffer for the sort
    static BinningState fromChunk(char*& chunk, size_t P);
};
// Initialize a BinningState instance and allocate the memory it needs.
CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P)
{
    BinningState binning;
    obtain(chunk, binning.point_list, P, 128);
    obtain(chunk, binning.point_list_unsorted, P, 128);
    obtain(chunk, binning.point_list_keys, P, 128);
    obtain(chunk, binning.point_list_keys_unsorted, P, 128);
    // Called with a null scratch pointer, SortPairs only queries the required scratch
    // size and writes it to sorting_size; no sorting happens here.
    cub::DeviceRadixSort::SortPairs(
        nullptr, binning.sorting_size,
        binning.point_list_keys_unsorted, binning.point_list_keys,
        binning.point_list_unsorted, binning.point_list, P);
    obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128);
    return binning;
}
CUDA forward-pass preprocessing
void FORWARD::preprocess(int P, int D, int M,
    const float* means3D,
    const glm::vec3* scales,
    const float scale_modifier,
    const glm::vec4* rotations,
    const float* opacities,
    const float* shs,
    bool* clamped,
    const float* cov3D_precomp,
    const float* colors_precomp,
    const float* viewmatrix,
    const float* projmatrix,
    const glm::vec3* cam_pos,
    const int W, int H,
    const float focal_x, float focal_y,
    const float tan_fovx, float tan_fovy,
    int* radii,
    float2* means2D,
    float* depths,
    float* cov3Ds,
    float* rgb,
    float4* conic_opacity,
    const dim3 grid,
    uint32_t* tiles_touched,
    bool prefiltered)
{
    preprocessCUDA<NUM_CHANNELS><<<(P + 255) / 256, 256>>>(
        P, D, M,            // number of Gaussians, SH degree, number of SH coefficients (SH encodes color)
        means3D,            // XYZ mean of each 3D Gaussian
        scales,             // XYZ scale of each 3D Gaussian
        scale_modifier,     // scale modifier, 1.0
        rotations,          // rotation quaternion of each 3D Gaussian
        opacities,          // opacity of each 3D Gaussian
        shs,                // SH coefficients of each 3D Gaussian (color)
        clamped,            // output: whether each Gaussian's R, G, B got clamped below 0
        cov3D_precomp,      // precomputed covariance matrices, [] if unused
        colors_precomp,     // precomputed colors, [] if unused
        viewmatrix,         // camera extrinsics, world to camera
        projmatrix,         // projection matrix, world to image
        cam_pos,            // XYZ position of the current camera center
        W, H,               // image width and height
        tan_fovx, tan_fovy, // tangents of half the horizontal / vertical FoV
        focal_x, focal_y,   // horizontal / vertical focal lengths
        radii,              // output: radius of each 2D Gaussian on the image
        means2D,            // output: mean of each 2D Gaussian
        depths,             // output: depth of each 2D Gaussian
        cov3Ds,             // output: covariance matrix of each 3D Gaussian
        rgb,                // output: RGB color of each Gaussian
        conic_opacity,      // output: inverse of each 2D covariance matrix plus the opacity, packed
        grid,               // how many tiles cover the render target
        tiles_touched,      // output: how many tiles each 2D Gaussian overlaps
        prefiltered         // whether Gaussians whose centers (means) lie outside the frustum were already filtered out
    );
}
// Perform initial steps for each Gaussian prior to rasterization.
template<int C>
__global__ void preprocessCUDA(int P, int D, int M,
    const float* orig_points,
    const glm::vec3* scales,
    const float scale_modifier,
    const glm::vec4* rotations,
    const float* opacities,
    const float* shs,
    bool* clamped,
    const float* cov3D_precomp,
    const float* colors_precomp,
    const float* viewmatrix,
    const float* projmatrix,
    const glm::vec3* cam_pos,
    const int W, int H,
    const float tan_fovx, float tan_fovy,
    const float focal_x, float focal_y,
    int* radii,
    float2* points_xy_image,
    float* depths,
    float* cov3Ds,
    float* rgb,
    float4* conic_opacity,
    const dim3 grid,
    uint32_t* tiles_touched,
    bool prefiltered)
{
    // One thread per 3D Gaussian; threads whose index exceeds the Gaussian count
    // return immediately to prevent out-of-bounds array accesses
    auto idx = cg::this_grid().thread_rank();
    if (idx >= P)
        return;
    // Initialize radius and touched tiles to 0. If this isn't changed,
    // this Gaussian will not be processed further.
    radii[idx] = 0;
    tiles_touched[idx] = 0;
    // Test whether this Gaussian's center (its XYZ mean) lies inside the view frustum; return if not.
    // Perform near culling, quit if outside.
    float3 p_view; // the point obtained by transforming p_orig into view space with viewmatrix
    if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
        return;
    // Transform point by projecting
    float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
    // Project the Gaussian's center from world space into clip space
    float4 p_hom = transformPoint4x4(p_orig, projmatrix);
    float p_w = 1.0f / (p_hom.w + 0.0000001f);
    // Perspective division: from clip space to normalized device coordinates (NDC)
    float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
    // If 3D covariance matrix is precomputed, use it, otherwise compute
    // from scaling and rotation parameters.
    const float* cov3D;
    if (cov3D_precomp != nullptr)
    {
        cov3D = cov3D_precomp + idx * 6;
    }
    else
    {
        // Compute the covariance matrix from this Gaussian's scale and rotation parameters
        computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
        cov3D = cov3Ds + idx * 6;
    }
    // Project the 3D Gaussian onto the 2D image, yielding the 2D Gaussian's covariance matrix cov
    // Compute 2D screen-space covariance matrix
    float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix);
    // Invert the 2D covariance matrix cov
    // Invert covariance (EWA algorithm)
    float det = (cov.x * cov.z - cov.y * cov.y);
    if (det == 0.0f)
        return;
    float det_inv = 1.f / det;
    float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv };
    // Compute the eigenvalues lambda1, lambda2 of cov to obtain the 2D Gaussian's maximum radius:
    // the eigenvectors give the principal axes of the distribution, the eigenvalues its extent along them.
    // Compute extent in screen space (by finding eigenvalues of
    // 2D covariance matrix). Use extent to compute a bounding rectangle
    // of screen-space tiles that this Gaussian overlaps with. Quit if
    // rectangle covers 0 tiles.
    float mid = 0.5f * (cov.x + cov.z);
    float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
    float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
    float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2)));
    // Convert from normalized device coordinates (NDC) to pixel coordinates
    float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) };
    uint2 rect_min, rect_max;
    // Compute which tiles the 2D Gaussian falls on
    getRect(point_image, my_radius, rect_min, rect_max, grid);
    // Return immediately if no tile is hit
    if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
        return;
    // If colors have been precomputed, use them, otherwise convert
    // spherical harmonics coefficients to RGB color.
    if (colors_precomp == nullptr)
    {
        // Compute the color from this Gaussian's SH coefficients
        glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
        rgb[idx * C + 0] = result.x;
        rgb[idx * C + 1] = result.y;
        rgb[idx * C + 2] = result.z;
    }
    // Store some useful helper data for the next steps.
    depths[idx] = p_view.z;
    radii[idx] = my_radius;
    points_xy_image[idx] = point_image;
    // Inverse 2D covariance and opacity neatly pack into one float4
    conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] };
    tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x);
}
// Test whether the 3D Gaussian's center (its XYZ mean) lies inside the view frustum
__forceinline__ __device__ bool in_frustum(int idx,
    const float* orig_points,
    const float* viewmatrix,
    const float* projmatrix,
    bool prefiltered,
    float3& p_view)
{
    float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
    // Bring points to screen space
    // Project the Gaussian's center into normalized device coordinates (NDC)
    float4 p_hom = transformPoint4x4(p_orig, projmatrix);
    float p_w = 1.0f / (p_hom.w + 0.0000001f);
    float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
    // Transform the Gaussian's center into camera space
    p_view = transformPoint4x3(p_orig, viewmatrix);
    if (p_view.z <= 0.2f)// || ((p_proj.x < -1.3 || p_proj.x > 1.3 || p_proj.y < -1.3 || p_proj.y > 1.3)))
    {
        if (prefiltered)
        {
            printf("Point is filtered although prefiltered is set. This shouldn't happen!");
            __trap();
        }
        return false;
    }
    return true;
}
// Compute the Gaussian's covariance matrix from its scale and rotation parameters
// Forward method for converting scale and rotation properties of each
// Gaussian to a 3D covariance matrix in world space. Also takes care
// of quaternion normalization.
__device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D)
{
    // Create scaling matrix
    glm::mat3 S = glm::mat3(1.0f); // a 3x3 identity matrix
    S[0][0] = mod * scale.x;
    S[1][1] = mod * scale.y;
    S[2][2] = mod * scale.z;
    // Normalize quaternion to get valid rotation
    glm::vec4 q = rot;// / glm::length(rot);
    float r = q.x;
    float x = q.y;
    float y = q.z;
    float z = q.w;
    // Compute rotation matrix from quaternion
    glm::mat3 R = glm::mat3(
        1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
        2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
        2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
    );
    glm::mat3 M = S * R;
    // Compute 3D world covariance matrix Sigma
    glm::mat3 Sigma = glm::transpose(M) * M;
    // Covariance is symmetric, only store upper right
    cov3D[0] = Sigma[0][0];
    cov3D[1] = Sigma[0][1];
    cov3D[2] = Sigma[0][2];
    cov3D[3] = Sigma[1][1];
    cov3D[4] = Sigma[1][2];
    cov3D[5] = Sigma[2][2];
}
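A note on conventions: glm stores matrices column-major while the paper's math is written row-major, so the constructor above fills what reads like the transposed rotation matrix, and Sigma = glm::transpose(M) * M with M = S * R ends up computing the same Σ = R S Sᵀ Rᵀ that the Python build_covariance_from_scaling_rotation produces as L Lᵀ with L = R S.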
// Forward version of 2D covariance matrix computation
__device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix)
{
// The following models the steps outlined by equations 29
// and 31 in "EWA Splatting" (Zwicker et al., 2002).
// Additionally considers aspect / scaling of viewport.
// Transposes used to account for row-/column-major conventions.
// 将当前3D gaussian的中心点从世界坐标系投影到相机坐标系
float3 t = transformPoint4x3(mean, viewmatrix);
const float limx = 1.3f * tan_fovx;
const float limy = 1.3f * tan_fovy;
const float txtz = t.x / t.z;
const float tytz = t.y / t.z;
t.x = min(limx, max(-limx, txtz)) * t.z;
t.y = min(limy, max(-limy, tytz)) * t.z;
// 透视变换是非线性的,因为一个点的屏幕空间坐标与其深度(Z值)成非线性关系。雅可比矩阵 J 提供了一个在特定点附近的线性近似,这使得计算变得简单且高效
glm::mat3 J = glm::mat3(
focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z),
0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z),
0, 0, 0);
glm::mat3 W = glm::mat3(
viewmatrix[0], viewmatrix[4], viewmatrix[8],
viewmatrix[1], viewmatrix[5], viewmatrix[9],
viewmatrix[2], viewmatrix[6], viewmatrix[10]);
glm::mat3 T = W * J;
glm::mat3 Vrk = glm::mat3(
cov3D[0], cov3D[1], cov3D[2],
cov3D[1], cov3D[3], cov3D[4],
cov3D[2], cov3D[4], cov3D[5]);
glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T;
// Apply low-pass filter: every Gaussian should be at least
// one pixel wide/high. Discard 3rd row and column.
cov[0][0] += 0.3f;
cov[1][1] += 0.3f;
return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) };
}
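Following "EWA Splatting", the screen-space covariance computed here is the upper-left 2x2 block of

\Sigma' = J\,W\,\Sigma\,W^\top J^\top

where W is the rotation part of the view matrix and J the Jacobian (local affine approximation) of the perspective projection; the GLM transposes in the code again account for column-major storage. Adding 0.3 to the diagonal is the low-pass filter: it dilates every projected Gaussian so it covers at least roughly one pixel.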
// Compute each Gaussian's color from its spherical harmonics coefficients
// Forward method for converting the input spherical harmonics
// coefficients of each Gaussian to a simple RGB color.
__device__ glm::vec3 computeColorFromSH(int idx, int deg, int max_coeffs, const glm::vec3* means, glm::vec3 campos, const float* shs, bool* clamped)
{
// The implementation is loosely based on code for
// "Differentiable Point-Based Radiance Fields for
// Efficient View Synthesis" by Zhang et al. (2022)
glm::vec3 pos = means[idx];
glm::vec3 dir = pos - campos;
dir = dir / glm::length(dir);
glm::vec3* sh = ((glm::vec3*)shs) + idx * max_coeffs;
glm::vec3 result = SH_C0 * sh[0];
if (deg > 0)
{
float x = dir.x;
float y = dir.y;
float z = dir.z;
result = result - SH_C1 * y * sh[1] + SH_C1 * z * sh[2] - SH_C1 * x * sh[3];
if (deg > 1)
{
float xx = x * x, yy = y * y, zz = z * z;
float xy = x * y, yz = y * z, xz = x * z;
result = result +
SH_C2[0] * xy * sh[4] +
SH_C2[1] * yz * sh[5] +
SH_C2[2] * (2.0f * zz - xx - yy) * sh[6] +
SH_C2[3] * xz * sh[7] +
SH_C2[4] * (xx - yy) * sh[8];
if (deg > 2)
{
result = result +
SH_C3[0] * y * (3.0f * xx - yy) * sh[9] +
SH_C3[1] * xy * z * sh[10] +
SH_C3[2] * y * (4.0f * zz - xx - yy) * sh[11] +
SH_C3[3] * z * (2.0f * zz - 3.0f * xx - 3.0f * yy) * sh[12] +
SH_C3[4] * x * (4.0f * zz - xx - yy) * sh[13] +
SH_C3[5] * z * (xx - yy) * sh[14] +
SH_C3[6] * x * (xx - 3.0f * yy) * sh[15];
}
}
}
result += 0.5f;
// RGB colors are clamped to positive values. If values are
// clamped, we need to keep track of this for the backward pass.
clamped[3 * idx + 0] = (result.x < 0);
clamped[3 * idx + 1] = (result.y < 0);
clamped[3 * idx + 2] = (result.z < 0);
return glm::max(result, 0.0f);
}
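Written out, the kernel evaluates a real spherical-harmonics expansion of degree deg in the (normalized) direction \mathbf{d} from the camera center to the Gaussian center:

c(\mathbf{d}) = \max\!\Big(0,\ 0.5 + \sum_{l=0}^{\text{deg}} \sum_{m=-l}^{l} k_l^m\, Y_l^m(\mathbf{d})\Big)

where the k_l^m are the per-Gaussian SH coefficients (sh[0..15] for degree 3) and SH_C0 through SH_C3 are the constant prefactors of the Y_l^m basis. The +0.5 offset recenters colors around mid-gray, and the clamping to non-negative values is recorded in clamped so the backward pass can zero the corresponding gradients.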
// Determine which tiles the current 2D Gaussian overlaps
__forceinline__ __device__ void getRect(const float2 p, int max_radius, uint2& rect_min, uint2& rect_max, dim3 grid)
{
rect_min = {
min(grid.x, max((int)0, (int)((p.x - max_radius) / BLOCK_X))),
min(grid.y, max((int)0, (int)((p.y - max_radius) / BLOCK_Y)))
};
rect_max = {
min(grid.x, max((int)0, (int)((p.x + max_radius + BLOCK_X - 1) / BLOCK_X))),
min(grid.y, max((int)0, (int)((p.y + max_radius + BLOCK_Y - 1) / BLOCK_Y)))
};
}
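As a quick sanity check of the tile arithmetic, here is a minimal host-side C++ sketch (not part of the repo; it assumes the default tile size BLOCK_X = BLOCK_Y = 16 from config.h and a made-up grid size):
#include <algorithm>
#include <cstdio>
// Assumed tile size, matching BLOCK_X/BLOCK_Y in the repo's config.h
constexpr int BLOCK_X = 16, BLOCK_Y = 16;
int main()
{
    // Hypothetical 2D Gaussian: center in pixel coordinates, radius in pixels
    float px = 100.f, py = 50.f;
    int max_radius = 20;
    int grid_x = 50, grid_y = 50; // hypothetical tile grid dimensions
    // Same arithmetic as getRect: truncation for the min corner, round-up for the max corner
    int min_x = std::min(grid_x, std::max(0, (int)((px - max_radius) / BLOCK_X)));
    int min_y = std::min(grid_y, std::max(0, (int)((py - max_radius) / BLOCK_Y)));
    int max_x = std::min(grid_x, std::max(0, (int)((px + max_radius + BLOCK_X - 1) / BLOCK_X)));
    int max_y = std::min(grid_y, std::max(0, (int)((py + max_radius + BLOCK_Y - 1) / BLOCK_Y)));
    // Prints: tiles x in [5, 8), y in [1, 5), touched = 12
    std::printf("tiles x in [%d, %d), y in [%d, %d), touched = %d\n",
        min_x, max_x, min_y, max_y, (max_y - min_y) * (max_x - min_x));
}
The tiles_touched count computed this way determines how many (tile, depth)-keyed duplicates of the Gaussian enter the global sort before rasterization.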
__forceinline__ __device__ float3 transformPoint4x3(const float3& p, const float* matrix)
{
float3 transformed = {
matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
};
return transformed;
}
__forceinline__ __device__ float4 transformPoint4x4(const float3& p, const float* matrix)
{
float4 transformed = {
matrix[0] * p.x + matrix[4] * p.y + matrix[8] * p.z + matrix[12],
matrix[1] * p.x + matrix[5] * p.y + matrix[9] * p.z + matrix[13],
matrix[2] * p.x + matrix[6] * p.y + matrix[10] * p.z + matrix[14],
matrix[3] * p.x + matrix[7] * p.y + matrix[11] * p.z + matrix[15]
};
return transformed;
}
// Convert normalized device coordinates (NDC) to pixel coordinates
__forceinline__ __device__ float ndc2Pix(float v, int S)
{
return ((v + 1.0) * S - 1.0) * 0.5;
}
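Concretely, ndc2Pix maps v \in [-1, 1] to

\text{pix}(v) = \frac{(v + 1)\,S - 1}{2}

so v = -1 lands on -0.5 and v = +1 on S - 0.5: under the convention that pixel i covers [i - 0.5, i + 0.5], the NDC range spans the image exactly from the left edge of pixel 0 to the right edge of pixel S - 1.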
CUDA forward rendering
void FORWARD::render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H,
const float2* means2D,
const float* colors,
const float4* conic_opacity,
float* final_T,
uint32_t* n_contrib,
const float* bg_color,
float* out_color)
{
renderCUDA<NUM_CHANNELS> <<<grid, block>>> (
ranges, // range in the sorted Gaussian ID list covered by each tile
point_list, // sorted list of 3D Gaussian IDs
W, H, // image width and height
means2D, // center of each 2D Gaussian in image space
colors, // RGB color of each 3D Gaussian
conic_opacity, // inverse 2D covariance (conic) and opacity of each 2D Gaussian
final_T, // final transmittance of each pixel after rendering
n_contrib, // last 2D Gaussian that contributed to each pixel
bg_color, // background color
out_color); // output image
}
// Main rasterization method. Collaboratively works on one tile per
// block, each thread treats one pixel. Alternates between fetching
// and rasterizing data.
template <uint32_t CHANNELS>
__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
renderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
int W, int H,
const float2* __restrict__ points_xy_image,
const float* __restrict__ features,
const float4* __restrict__ conic_opacity,
float* __restrict__ final_T,
uint32_t* __restrict__ n_contrib,
const float* __restrict__ bg_color,
float* __restrict__ out_color)
{
// Identify current tile and associated min/max pixel range.
auto block = cg::this_thread_block();
uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
// pixel coordinates of the top-left corner of the current tile
uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
// pixel coordinates of the bottom-right corner of the current tile
uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
// coordinates of the pixel handled by this thread
uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
// linear index of this pixel
uint32_t pix_id = W * pix.y + pix.x;
// pixel coordinates as floats
float2 pixf = { (float)pix.x, (float)pix.y };
// Check if this thread is associated with a valid pixel or outside.
bool inside = pix.x < W && pix.y < H;
// Done threads can help with fetching, but don't rasterize
bool done = !inside;
// Load start/end range of IDs to process in bit sorted list.
// start and end indices of the Gaussians assigned to this tile
uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
// number of Gaussians left to process
int toDo = range.y - range.x;
// Allocate storage for batches of collectively fetched data.
__shared__ int collected_id[BLOCK_SIZE];
__shared__ float2 collected_xy[BLOCK_SIZE];
__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
// Initialize helper variables
float T = 1.0f;
uint32_t contributor = 0;
uint32_t last_contributor = 0;
float C[CHANNELS] = { 0 };
// Iterate over batches until all done or range is complete
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// End if entire block votes that it is done rasterizing
int num_done = __syncthreads_count(done);
if (num_done == BLOCK_SIZE)
break;
// Collectively fetch per-Gaussian data from global to shared
int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
// ID of the 3D Gaussian being fetched
int coll_id = point_list[range.x + progress];
collected_id[block.thread_rank()] = coll_id;
collected_xy[block.thread_rank()] = points_xy_image[coll_id];
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
}
block.sync();
// Iterate over current batch
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// Keep track of current position in range
contributor++;
// Resample using conic matrix (cf. "Surface
// Splatting" by Zwicker et al., 2001)
float2 xy = collected_xy[j]; // image-space center of the current 2D Gaussian
float2 d = { xy.x - pixf.x, xy.y - pixf.y }; // offset from the Gaussian center to the current pixel
float4 con_o = collected_conic_opacity[j]; // inverse 2D covariance (conic) and opacity of the current Gaussian
// Evaluate the Gaussian density (weight), which determines this Gaussian's contribution to the pixel
float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f)
continue;
// Eq. (2) from 3D Gaussian splatting paper.
// Obtain alpha by multiplying with Gaussian opacity
// and its exponential falloff from mean.
// Avoid numerical instabilities (see paper appendix).
float alpha = min(0.99f, con_o.w * exp(power));
if (alpha < 1.0f / 255.0f)
continue;
float test_T = T * (1 - alpha);
if (test_T < 0.0001f)
{
done = true;
continue;
}
// Eq. (3) from 3D Gaussian splatting paper.
for (int ch = 0; ch < CHANNELS; ch++)
C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
T = test_T;
// Keep track of last range entry to update this
// pixel.
last_contributor = contributor;
}
}
// All threads that treat valid pixel write out their final
// rendering data to the frame and auxiliary buffers.
if (inside)
{
final_T[pix_id] = T; // final transmittance of this pixel after rendering
n_contrib[pix_id] = last_contributor; // last 2D Gaussian that contributed to this pixel
for (int ch = 0; ch < CHANNELS; ch++)
out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch];
}
}
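The inner loop above implements Eqs. (2) and (3) of the 3D Gaussian splatting paper. With \mathbf{d}_i the offset of the pixel from Gaussian i's 2D mean, o_i its opacity and \Sigma_i'^{-1} its conic:

\alpha_i = o_i \exp\!\Big(-\tfrac{1}{2}\,\mathbf{d}_i^\top \Sigma_i'^{-1}\,\mathbf{d}_i\Big), \qquad C = \sum_{i=1}^{N} c_i\,\alpha_i\,T_i + T_{N+1}\,c_{\text{bg}}, \qquad T_i = \prod_{j<i} (1 - \alpha_j)

Traversal is front to back: each Gaussian contributes c_i \alpha_i T_i to the pixel, the transmittance shrinks by a factor (1 - \alpha_i), and whatever transmittance remains at the end is assigned to the background color.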
CUDA backward pass code
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
RasterizeGaussiansBackwardCUDA(
const torch::Tensor& background,
const torch::Tensor& means3D,
const torch::Tensor& radii,
const torch::Tensor& colors,
const torch::Tensor& scales,
const torch::Tensor& rotations,
const float scale_modifier,
const torch::Tensor& cov3D_precomp,
const torch::Tensor& viewmatrix,
const torch::Tensor& projmatrix,
const float tan_fovx,
const float tan_fovy,
const torch::Tensor& dL_dout_color,
const torch::Tensor& sh,
const int degree,
const torch::Tensor& campos,
const torch::Tensor& geomBuffer,
const int R,
const torch::Tensor& binningBuffer,
const torch::Tensor& imageBuffer,
const bool debug)
{
const int P = means3D.size(0);
const int H = dL_dout_color.size(1);
const int W = dL_dout_color.size(2);
int M = 0;
if(sh.size(0) != 0)
{
M = sh.size(1);
}
torch::Tensor dL_dmeans3D = torch::zeros({P, 3}, means3D.options());
torch::Tensor dL_dmeans2D = torch::zeros({P, 3}, means3D.options());
torch::Tensor dL_dcolors = torch::zeros({P, NUM_CHANNELS}, means3D.options());
torch::Tensor dL_dconic = torch::zeros({P, 2, 2}, means3D.options());
torch::Tensor dL_dopacity = torch::zeros({P, 1}, means3D.options());
torch::Tensor dL_dcov3D = torch::zeros({P, 6}, means3D.options());
torch::Tensor dL_dsh = torch::zeros({P, M, 3}, means3D.options());
torch::Tensor dL_dscales = torch::zeros({P, 3}, means3D.options());
torch::Tensor dL_drotations = torch::zeros({P, 4}, means3D.options());
if(P != 0)
{
CudaRasterizer::Rasterizer::backward(P, degree, M, R,
background.contiguous().data<float>(),
W, H,
means3D.contiguous().data<float>(),
sh.contiguous().data<float>(),
colors.contiguous().data<float>(),
scales.data_ptr<float>(),
scale_modifier,
rotations.data_ptr<float>(),
cov3D_precomp.contiguous().data<float>(),
viewmatrix.contiguous().data<float>(),
projmatrix.contiguous().data<float>(),
campos.contiguous().data<float>(),
tan_fovx,
tan_fovy,
radii.contiguous().data<int>(),
reinterpret_cast<char*>(geomBuffer.contiguous().data_ptr()),
reinterpret_cast<char*>(binningBuffer.contiguous().data_ptr()),
reinterpret_cast<char*>(imageBuffer.contiguous().data_ptr()),
dL_dout_color.contiguous().data<float>(),
dL_dmeans2D.contiguous().data<float>(),
dL_dconic.contiguous().data<float>(),
dL_dopacity.contiguous().data<float>(),
dL_dcolors.contiguous().data<float>(),
dL_dmeans3D.contiguous().data<float>(),
dL_dcov3D.contiguous().data<float>(),
dL_dsh.contiguous().data<float>(),
dL_dscales.contiguous().data<float>(),
dL_drotations.contiguous().data<float>(),
debug);
}
return std::make_tuple(dL_dmeans2D, dL_dcolors, dL_dopacity, dL_dmeans3D, dL_dcov3D, dL_dsh, dL_dscales, dL_drotations);
}
// Produce the gradients needed to optimize the parameters used in the forward rendering pass
void CudaRasterizer::Rasterizer::backward(
const int P, int D, int M, int R,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* campos,
const float tan_fovx, float tan_fovy,
const int* radii,
char* geom_buffer,
char* binning_buffer,
char* img_buffer,
const float* dL_dpix,
float* dL_dmean2D,
float* dL_dconic,
float* dL_dopacity,
float* dL_dcolor,
float* dL_dmean3D,
float* dL_dcov3D,
float* dL_dsh,
float* dL_dscale,
float* dL_drot,
bool debug)
{
GeometryState geomState = GeometryState::fromChunk(geom_buffer, P);
BinningState binningState = BinningState::fromChunk(binning_buffer, R);
ImageState imgState = ImageState::fromChunk(img_buffer, width * height);
if (radii == nullptr)
{
radii = geomState.internal_radii;
}
const float focal_y = height / (2.0f * tan_fovy);
const float focal_x = width / (2.0f * tan_fovx);
const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
const dim3 block(BLOCK_X, BLOCK_Y, 1);
// Compute loss gradients w.r.t. 2D mean position, conic matrix, opacity
// and RGB of Gaussians from per-pixel loss gradients. If we were given
// precomputed colors and not SHs, use them.
const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb;
CHECK_CUDA(BACKWARD::render(
tile_grid,
block,
imgState.ranges,
binningState.point_list,
width, height,
background,
geomState.means2D,
geomState.conic_opacity,
color_ptr,
imgState.accum_alpha,
imgState.n_contrib,
dL_dpix,
(float3*)dL_dmean2D,
(float4*)dL_dconic,
dL_dopacity,
dL_dcolor), debug)
// Take care of the remainder of preprocessing
const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D;
CHECK_CUDA(BACKWARD::preprocess(P, D, M,
(float3*)means3D,
radii,
shs,
geomState.clamped,
(glm::vec3*)scales,
(glm::vec4*)rotations,
scale_modifier,
cov3D_ptr,
viewmatrix,
projmatrix,
focal_x, focal_y,
tan_fovx, tan_fovy,
(glm::vec3*)campos,
(float3*)dL_dmean2D,
dL_dconic,
(glm::vec3*)dL_dmean3D,
dL_dcolor,
dL_dcov3D,
dL_dsh,
(glm::vec3*)dL_dscale,
(glm::vec4*)dL_drot), debug)
}
CUDA backward rendering code
void BACKWARD::render(
const dim3 grid, const dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H,
const float* bg_color,
const float2* means2D,
const float4* conic_opacity,
const float* colors,
const float* final_Ts,
const uint32_t* n_contrib,
const float* dL_dpixels,
float3* dL_dmean2D,
float4* dL_dconic2D,
float* dL_dopacity,
float* dL_dcolors)
{
// Launch the CUDA kernel for the backward rendering pass
renderCUDA<NUM_CHANNELS> <<<grid, block>>> (
ranges,
point_list,
W, H,
bg_color,
means2D,
conic_opacity,
colors,
final_Ts,
n_contrib,
dL_dpixels,
dL_dmean2D,
dL_dconic2D,
dL_dopacity,
dL_dcolors
);
}
// Templated kernel for the backward rendering pass.
template <uint32_t C>
__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
renderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
int W, int H,
const float* __restrict__ bg_color,
const float2* __restrict__ points_xy_image,
const float4* __restrict__ conic_opacity,
const float* __restrict__ colors,
const float* __restrict__ final_Ts,
const uint32_t* __restrict__ n_contrib,
const float* __restrict__ dL_dpixels,
float3* __restrict__ dL_dmean2D,
float4* __restrict__ dL_dconic2D,
float* __restrict__ dL_dopacity,
float* __restrict__ dL_dcolors)
{
// Redo the rasterization bookkeeping: compute the tile/block information needed below.
auto block = cg::this_thread_block();
const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
const uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
const uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
const uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
const uint32_t pix_id = W * pix.y + pix.x;
const float2 pixf = { (float)pix.x, (float)pix.y };
const bool inside = pix.x < W && pix.y < H;
// start and end indices of the Gaussians assigned to this tile
const uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
bool done = !inside;
int toDo = range.y - range.x;
__shared__ int collected_id[BLOCK_SIZE];
__shared__ float2 collected_xy[BLOCK_SIZE];
__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
__shared__ float collected_colors[C * BLOCK_SIZE];
// In the forward pass, the final value of T was stored, i.e. the product of all (1 - alpha) factors.
const float T_final = inside ? final_Ts[pix_id] : 0;
float T = T_final;
// Start from the back. The ID of the last Gaussian to contribute to each pixel is known from the forward pass
uint32_t contributor = toDo;
const int last_contributor = inside ? n_contrib[pix_id] : 0;
float accum_rec[C] = { 0 };
float dL_dpixel[C]; // loss gradient w.r.t. the current pixel
if (inside)
for (int i = 0; i < C; i++)
dL_dpixel[i] = dL_dpixels[i * H * W + pix_id];
float last_alpha = 0;
float last_color[C] = { 0 };
// Gradient of pixel coordinate w.r.t. normalized
// screen-space viewport coordinates (-1 to 1)
const float ddelx_dx = 0.5 * W;
const float ddely_dy = 0.5 * H;
// Traverse all Gaussians
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// Load auxiliary data into shared memory, starting
// from the back and loading it in reverse order.
block.sync();
const int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
const int coll_id = point_list[range.y - progress - 1];
collected_id[block.thread_rank()] = coll_id;
collected_xy[block.thread_rank()] = points_xy_image[coll_id];
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
for (int i = 0; i < C; i++)
collected_colors[i * BLOCK_SIZE + block.thread_rank()] = colors[coll_id * C + i];
}
block.sync();
// Iterate over the Gaussians in the current batch
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// Keep track of the current Gaussian ID. Skip it if it comes after the last contributor for this pixel.
contributor--;
if (contributor >= last_contributor)
continue;
// Compute blending values, as in the forward pass
const float2 xy = collected_xy[j];
const float2 d = { xy.x - pixf.x, xy.y - pixf.y };
const float4 con_o = collected_conic_opacity[j];
const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f)
continue;
const float G = exp(power);
const float alpha = min(0.99f, con_o.w * G);
if (alpha < 1.0f / 255.0f)
continue;
T = T / (1.f - alpha);
const float dchannel_dcolor = alpha * T;
// Propagate gradients to the per-Gaussian colors and keep the
// gradient w.r.t. alpha (the blending factor for a Gaussian/pixel pair)
float dL_dalpha = 0.0f;
const int global_id = collected_id[j];
for (int ch = 0; ch < C; ch++)
{
const float c = collected_colors[ch * BLOCK_SIZE + j];
// Update last color (to be used in the next iteration)
accum_rec[ch] = last_alpha * last_color[ch] + (1.f - last_alpha) * accum_rec[ch];
last_color[ch] = c;
const float dL_dchannel = dL_dpixel[ch];
dL_dalpha += (c - accum_rec[ch]) * dL_dchannel;
// Update the gradient w.r.t. the color of this Gaussian.
// Use an atomic operation, since this pixel is just one of
// potentially many affected by this Gaussian
atomicAdd(&(dL_dcolors[global_id * C + ch]), dchannel_dcolor * dL_dchannel);
}
dL_dalpha *= T;
// Update last alpha (to be used in the next iteration)
last_alpha = alpha;
// Account for the fact that alpha also influences how much of
// the background color is blended in if nothing remains to blend
float bg_dot_dpixel = 0;
for (int i = 0; i < C; i++)
bg_dot_dpixel += bg_color[i] * dL_dpixel[i];
dL_dalpha += (-T_final / (1.f - alpha)) * bg_dot_dpixel;
// Helpful reusable temporary variables
const float dL_dG = con_o.w * dL_dalpha;
const float gdx = G * d.x;
const float gdy = G * d.y;
const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
// Update gradients w.r.t. the 2D mean position of the Gaussian
atomicAdd(&dL_dmean2D[global_id].x, dL_dG * dG_ddelx * ddelx_dx);
atomicAdd(&dL_dmean2D[global_id].y, dL_dG * dG_ddely * ddely_dy);
// Update gradients w.r.t. the 2D covariance (symmetric 2x2 matrix)
atomicAdd(&dL_dconic2D[global_id].x, -0.5f * gdx * d.x * dL_dG);
atomicAdd(&dL_dconic2D[global_id].y, -0.5f * gdx * d.y * dL_dG);
atomicAdd(&dL_dconic2D[global_id].w, -0.5f * gdy * d.y * dL_dG);
// Update gradients w.r.t. the opacity of the Gaussian
atomicAdd(&(dL_dopacity[global_id]), G * dL_dalpha);
}
}
}
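The gradients accumulated above follow directly from differentiating the forward blending equation. With C = \sum_i c_i \alpha_i T_i + T_{\text{final}}\,c_{\text{bg}} and T_i = \prod_{j<i} (1 - \alpha_j):

\frac{\partial C}{\partial c_i} = \alpha_i\,T_i, \qquad \frac{\partial C}{\partial \alpha_i} = T_i\,(c_i - A_i) - \frac{T_{\text{final}}}{1 - \alpha_i}\,c_{\text{bg}}, \qquad A_i = \sum_{k>i} c_k\,\alpha_k \prod_{i<j<k} (1 - \alpha_j)

where A_i is exactly what accum_rec rebuilds while traversing back to front. From \alpha_i = o_i G_i the kernel then uses \partial\alpha_i/\partial G_i = o_i (the dL_dG term) and \partial\alpha_i/\partial o_i = G_i (the dL_dopacity term), and finally pushes dL_dG into the 2D mean and conic gradients through the quadratic form in the exponent.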
CUDA backward preprocessing code
void BACKWARD::preprocess(
int P, int D, int M,
const float3* means3D,
const int* radii,
const float* shs,
const bool* clamped,
const glm::vec3* scales,
const glm::vec4* rotations,
const float scale_modifier,
const float* cov3Ds,
const float* viewmatrix,
const float* projmatrix,
const float focal_x, float focal_y,
const float tan_fovx, float tan_fovy,
const glm::vec3* campos,
const float3* dL_dmean2D,
const float* dL_dconic,
glm::vec3* dL_dmean3D,
float* dL_dcolor,
float* dL_dcov3D,
float* dL_dsh,
glm::vec3* dL_dscale,
glm::vec4* dL_drot)
{
// Propagate gradients along the 2D conic matrix computation path.
// Somewhat long, so it gets its own kernel rather than being part of "preprocess".
// When done, the loss gradient w.r.t. the 3D means has been modified and the
// gradient w.r.t. the 3D covariance matrices has been computed.
computeCov2DCUDA <<<(P + 255) / 256, 256>>> (
P,
means3D,
radii,
cov3Ds,
focal_x,
focal_y,
tan_fovx,
tan_fovy,
viewmatrix,
dL_dconic,
(float3*)dL_dmean3D,
dL_dcov3D);
// Propagate gradients for the remaining steps: finish the 3D mean gradients,
// propagate color gradients to SH coefficients (if desired), propagate 3D
// covariance matrix gradients to scale and rotation.
preprocessCUDA<NUM_CHANNELS> <<<(P + 255) / 256, 256>>> (
P, D, M,
(float3*)means3D,
radii,
shs,
clamped,
(glm::vec3*)scales,
(glm::vec4*)rotations,
scale_modifier,
projmatrix,
campos,
(float3*)dL_dmean2D,
(glm::vec3*)dL_dmean3D,
dL_dcolor,
dL_dcov3D,
dL_dsh,
dL_dscale,
dL_drot);
}
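Putting the two kernels together, the backward preprocessing applies the chain rule through the two covariance constructions of the forward pass:

\frac{\partial L}{\partial \Sigma} \xleftarrow{\ \Sigma' = J W \Sigma W^\top J^\top\ } \frac{\partial L}{\partial \Sigma'}, \qquad \frac{\partial L}{\partial s},\ \frac{\partial L}{\partial q} \xleftarrow{\ \Sigma = R S S^\top R^\top\ } \frac{\partial L}{\partial \Sigma}

computeCov2DCUDA handles the first step (and adds the part of the 3D mean gradient that comes from J's dependence on the view-space position), while preprocessCUDA finishes the 3D mean gradients, maps color gradients back onto the SH coefficients, and maps the 3D covariance gradients onto scale s and rotation quaternion q.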
This article is from cnblogs (author: SXQ-BLOG). Please credit the original link when reposting: https://www.cnblogs.com/sxq-blog/p/18224667