视频去重关键技术-视频镜头智能分割。使用大模型TransNetV2实现自动剪辑。Python自动脚本。
视频去重和镜头智能分割是现代视频处理中的重要技术,尤其是在内容创作和管理中。下面我将为你详细解释这一过程中的关键概念、相关技术以及如何利用大模型TransNetV2来实现自动剪辑,并给出一个Python自动脚本的示例。
关键概念解释
- 视频去重:指的是从一段视频中识别并删除重复的片段,以减少冗余内容,提高视频的观赏性和信息密度。
- 镜头智能分割:是将视频分割成多个镜头(场景),每个镜头代表一个独立的场景或事件,这样可以更方便地进行分析和编辑。
- TransNetV2:是一种基于深度学习的镜头边界检测模型。它通过学习视频中的时间和空间特征,为每一帧预测其是否处于镜头切换处,从而有效地识别镜头的起始和结束位置。
- Python自动脚本:使用Python编写的程序,可以自动执行视频处理任务,如镜头分割和去重,减少人工干预。
去重手段
市面上视频去重软件有很多,但我们要把去重功能集成到自己的脚本中,所以不能使用其他软件,换言之要自己实现去重功能。
去重的手段主要有以下:
- 抽帧
- 插帧
- 裁剪
- 改分辨率
- 改画面比例
- 添加片头片尾
- 修改帧率
- 修改比特率
- 修改亮度、对比度、饱和度
- 变速
- 添加视频蒙版
- 添加图片蒙版
- 添加贴纸
- 添加转场
- 添加背景图片/视频
- 扫光
- 修改背景音乐音量
- 添加额外音轨
- 镜像反转
- 画中画
- 修改元数据
- 加水印
- 打乱镜头顺序
Python 自动脚本示例

以下是一个使用TransNetV2模型进行视频镜头智能分割的Python自动脚本示例:
main.py
import os
from moviepy.editor import VideoFileClip
from transnetv2 import TransNetV2
if __name__ == '__main__':
    # Split a user-supplied video into one .mp4 file per detected shot.
    # Output files are written to a folder named after the video, next to it.

    # Keep prompting until the path points at an existing file.
    video_path = input('请输入视频文件路径\n')
    while not os.path.isfile(video_path):
        video_path = input('请输入正确的视频文件路径\n')

    video_name = os.path.basename(video_path)
    video_name_without_ext = os.path.splitext(video_name)[0]
    video_folder = os.path.dirname(video_path)
    output_folder = os.path.join(video_folder, video_name_without_ext)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_folder, exist_ok=True)

    # Only the per-frame transition predictions are needed to build the scene list.
    model = TransNetV2()
    _, single_frame_predictions, _ = model.predict_video_2(video_path)
    scenes = model.predictions_to_scenes(single_frame_predictions)

    video_clip = VideoFileClip(video_path)
    try:
        fps = video_clip.fps
        for i, (start, end) in enumerate(scenes):
            # A scene whose start and end frame coincide maps to a zero-length
            # time range, which has nothing to encode; skip it. The index `i`
            # still advances, so the numbering of the other files is unchanged.
            if end <= start:
                continue
            # Scene boundaries are frame indices; convert them to seconds.
            segment_clip = video_clip.subclip(start / fps, end / fps)
            output_path = os.path.join(output_folder, f'{video_name_without_ext}_{i+1}.mp4')
            segment_clip.write_videofile(output_path, codec='libx264', fps=fps)
    finally:
        # Close the source clip even when encoding a segment fails.
        video_clip.close()
    input('\n任务已完成,按回车键退出……')
main.py
import os
from moviepy.editor import VideoFileClip
from transnetv2 import TransNetV2
if __name__ == '__main__':
    # Split a user-supplied video into one .mp4 file per detected shot.
    # Output files are written to a folder named after the video, next to it.

    # Keep prompting until the path points at an existing file.
    video_path = input('请输入视频文件路径\n')
    while not os.path.isfile(video_path):
        video_path = input('请输入正确的视频文件路径\n')

    video_name = os.path.basename(video_path)
    video_name_without_ext = os.path.splitext(video_name)[0]
    video_folder = os.path.dirname(video_path)
    output_folder = os.path.join(video_folder, video_name_without_ext)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_folder, exist_ok=True)

    # Only the per-frame transition predictions are needed to build the scene list.
    model = TransNetV2()
    _, single_frame_predictions, _ = model.predict_video_2(video_path)
    scenes = model.predictions_to_scenes(single_frame_predictions)

    video_clip = VideoFileClip(video_path)
    try:
        fps = video_clip.fps
        for i, (start, end) in enumerate(scenes):
            # A scene whose start and end frame coincide maps to a zero-length
            # time range, which has nothing to encode; skip it. The index `i`
            # still advances, so the numbering of the other files is unchanged.
            if end <= start:
                continue
            # Scene boundaries are frame indices; convert them to seconds.
            segment_clip = video_clip.subclip(start / fps, end / fps)
            output_path = os.path.join(output_folder, f'{video_name_without_ext}_{i+1}.mp4')
            segment_clip.write_videofile(output_path, codec='libx264', fps=fps)
    finally:
        # Close the source clip even when encoding a segment fails.
        video_clip.close()
    input('\n任务已完成,按回车键退出……')
transnetv2.py
import math
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
import numpy as np
import tensorflow as tf
from moviepy.editor import VideoFileClip
class TransNetV2:
    """Wrapper around a TransNetV2 saved model for shot-transition prediction.

    The model consumes frames of shape (27, 48, 3) and yields, per frame, a
    "single frame" score and a "many hot" score, both passed through a sigmoid.
    """

    def __init__(self, model_dir=None):
        """Load the saved model.

        Args:
            model_dir: Directory holding the saved model. When ``None``, the
                relative path ``"transnetv2-weights/"`` is used.

        Raises:
            FileNotFoundError: The default weights directory does not exist.
            IOError: Loading the saved model failed with an ``OSError``.
        """
        if model_dir is None:
            # NOTE: the default is a relative path, so it is looked up from the
            # process's current working directory, not from this module's folder.
            model_dir = "transnetv2-weights/"
            if not os.path.isdir(model_dir):
                raise FileNotFoundError(f"[TransNetV2] ERROR: {model_dir} is not a directory.")
            else:
                print(f"[TransNetV2] Using weights from {model_dir}.")

        # (height, width, channels) expected for every input frame.
        self._input_size = (27, 48, 3)
        try:
            self._model = tf.saved_model.load(model_dir)
        except OSError as exc:
            raise IOError(f"[TransNetV2] It seems that files in {model_dir} are corrupted or missing. "
                          f"Re-download them manually and retry. For more info, see: "
                          f"https://github.com/soCzech/TransNetV2/issues/1#issuecomment-647357796") from exc

    def predict_raw(self, frames: np.ndarray):
        """Run the model on one batch of frame windows.

        Args:
            frames: Array of shape [batch, frames, height, width, 3] whose last
                three dimensions equal ``self._input_size``.

        Returns:
            ``(single_frame_pred, all_frames_pred)``: the sigmoid of the model's
            logits and the sigmoid of its ``"many_hot"`` output.
        """
        assert len(frames.shape) == 5 and frames.shape[2:] == self._input_size, \
            "[TransNetV2] Input shape must be [batch, frames, height, width, 3]."
        frames = tf.cast(frames, tf.float32)
        logits, dict_ = self._model(frames)
        single_frame_pred = tf.sigmoid(logits)
        all_frames_pred = tf.sigmoid(dict_["many_hot"])
        return single_frame_pred, all_frames_pred

    def predict_frames(self, frames: np.ndarray):
        """Predict transition scores for every frame of a video.

        The video is processed in overlapping windows of 100 frames advanced by
        50 frames; only the middle 50 predictions of each window are kept.

        Args:
            frames: Array of shape [frames, height, width, 3] whose last three
                dimensions equal ``self._input_size``.

        Returns:
            ``(single_frame_pred, all_frames_pred)``: two 1-D arrays, each
            truncated to ``len(frames)`` entries.
        """
        assert len(frames.shape) == 4 and frames.shape[1:] == self._input_size, \
            "[TransNetV2] Input shape must be [frames, height, width, 3]."

        def input_iterator():
            # return windows of size 100 where the first/last 25 frames are from the previous/next batch
            # the first and last window must be padded by copies of the first and last frame of the video
            no_padded_frames_start = 25
            no_padded_frames_end = 25 + 50 - (len(frames) % 50 if len(frames) % 50 != 0 else 50)  # 25 - 74

            start_frame = np.expand_dims(frames[0], 0)
            end_frame = np.expand_dims(frames[-1], 0)
            padded_inputs = np.concatenate(
                [start_frame] * no_padded_frames_start + [frames] + [end_frame] * no_padded_frames_end, 0
            )

            ptr = 0
            while ptr + 100 <= len(padded_inputs):
                out = padded_inputs[ptr:ptr + 100]
                ptr += 50
                yield out[np.newaxis]

        predictions = []

        for inp in input_iterator():
            single_frame_pred, all_frames_pred = self.predict_raw(inp)
            # Keep only the central 50 frames of the window; the outer 25 on
            # each side are overlap with the neighbouring windows (or padding).
            predictions.append((single_frame_pred.numpy()[0, 25:75, 0],
                                all_frames_pred.numpy()[0, 25:75, 0]))

            print("\r[TransNetV2] Processing video frames {}/{}".format(
                min(len(predictions) * 50, len(frames)), len(frames)
            ), end="")
        print("\n")

        single_frame_pred = np.concatenate([single_ for single_, all_ in predictions])
        all_frames_pred = np.concatenate([all_ for single_, all_ in predictions])

        return single_frame_pred[:len(frames)], all_frames_pred[:len(frames)]  # remove extra padded frames

    def predict_video(self, video_fn: str):
        """Extract frames with the ``ffmpeg`` Python wrapper and predict transitions.

        Args:
            video_fn: Path of the video file.

        Returns:
            ``(video, single_frame_pred, all_frames_pred)`` where ``video`` is
            the uint8 frame array reshaped to [-1, 27, 48, 3].

        Raises:
            ModuleNotFoundError: The ``ffmpeg`` module cannot be imported.
        """
        try:
            import ffmpeg
        except ModuleNotFoundError:
            raise ModuleNotFoundError("For `predict_video` function `ffmpeg` needs to be installed in order to extract "
                                      "individual frames from video file. Install `ffmpeg` command line tool and then "
                                      "install python wrapper by `pip install ffmpeg-python`.")

        print("[TransNetV2] Extracting frames from {}".format(video_fn))
        video_stream, err = ffmpeg.input(video_fn).output(
            "pipe:", format="rawvideo", pix_fmt="rgb24", s="48x27"
        ).run(capture_stdout=True, capture_stderr=True)

        video = np.frombuffer(video_stream, np.uint8).reshape([-1, 27, 48, 3])
        return (video, *self.predict_frames(video))

    def predict_video_2(self, video_fn: str):
        """Extract frames with moviepy and predict transitions.

        Frames are sampled at ``t / fps`` for every integer ``t`` below
        ``int(duration * fps)``, where ``duration`` is the clip duration
        rounded down to one decimal place.

        Args:
            video_fn: Path of the video file.

        Returns:
            ``(video, single_frame_pred, all_frames_pred)`` where ``video`` is
            the array built from the sampled frames.
        """
        print("[TransNetV2] Extracting frames from {}".format(video_fn))
        # The target resolution matches the first two entries of self._input_size.
        clip = VideoFileClip(video_fn, target_resolution=(27, 48))
        try:
            duration = math.floor(clip.duration * 10) / 10
            fps = clip.fps
            frames = []
            for frame_idx in range(0, int(duration * fps)):
                frame = clip.get_frame(frame_idx / fps)
                if len(frame) != 0:  # skip empty frames
                    frames.append(frame)
        finally:
            # Close the clip even if decoding fails, so the reader is not leaked.
            clip.close()

        video = np.array(frames)
        return (video, *self.predict_frames(video))

    @staticmethod
    def predictions_to_scenes(predictions: np.ndarray, threshold: float = 0.5):
        """Convert per-frame transition scores into ``[start, end]`` frame ranges.

        Args:
            predictions: 1-D array of per-frame scores.
            threshold: Scores strictly greater than this count as a transition.

        Returns:
            ``np.int32`` array of shape [n_scenes, 2]. When no scene is found,
            the single range ``[0, len(predictions) - 1]`` is returned.
        """
        predictions = (predictions > threshold).astype(np.uint8)

        scenes = []
        t, t_prev, start = -1, 0, 0
        for i, t in enumerate(predictions):
            # A 1 -> 0 step marks the first frame of a new scene.
            if t_prev == 1 and t == 0:
                start = i
            # A 0 -> 1 step (other than at frame 0) closes the current scene.
            if t_prev == 0 and t == 1 and i != 0:
                scenes.append([start, i])
            t_prev = t
        # Close the trailing scene when the video does not end on a transition.
        if t == 0:
            scenes.append([start, i])

        # just fix if all predictions are 1
        if len(scenes) == 0:
            return np.array([[0, len(predictions) - 1]], dtype=np.int32)

        return np.array(scenes, dtype=np.int32)

    @staticmethod
    def visualize_predictions(frames: np.ndarray, predictions):
        """Render all frames in a grid with prediction bars beside each frame.

        Args:
            frames: Array of shape [frames, height, width, channels].
            predictions: One prediction array, or a sequence of them; each adds
                one pixel column to the right of every frame.

        Returns:
            A ``PIL.Image`` with 25 frames per row.
        """
        from PIL import Image, ImageDraw

        if isinstance(predictions, np.ndarray):
            predictions = [predictions]

        ih, iw, ic = frames.shape[1:]
        width = 25

        # pad frames so that length of the video is divisible by width
        # pad frames also by len(predictions) pixels in width in order to show predictions
        pad_with = width - len(frames) % width if len(frames) % width != 0 else 0
        frames = np.pad(frames, [(0, pad_with), (0, 1), (0, len(predictions)), (0, 0)])

        predictions = [np.pad(x, (0, pad_with)) for x in predictions]
        height = len(frames) // width

        img = frames.reshape([height, width, ih + 1, iw + len(predictions), ic])
        img = np.concatenate(np.split(
            np.concatenate(np.split(img, height), axis=2)[0], width
        ), axis=2)[0, :-1]

        img = Image.fromarray(img)
        draw = ImageDraw.Draw(img)

        # iterate over all frames
        for i, pred in enumerate(zip(*predictions)):
            x, y = i % width, i // width
            x, y = x * (iw + len(predictions)) + iw, y * (ih + 1) + ih - 1

            # we can visualize multiple predictions per single frame
            for j, p in enumerate(pred):
                color = [0, 0, 0]
                color[(j + 1) % 3] = 255

                value = round(p * (ih - 1))
                if value != 0:
                    draw.line((x + j, y, x + j, y - value), fill=tuple(color), width=1)
        return img
def main():
    """Command-line entry point: predict shot transitions for each given video.

    For every input file this writes ``<file>.predictions.txt`` and
    ``<file>.scenes.txt``; with ``--visualize`` it also writes
    ``<file>.vis.png``. A file is skipped when either text output already exists.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("files", type=str, nargs="+", help="path to video files to process")
    parser.add_argument("--weights", type=str, default=None,
                        help="path to TransNet V2 weights, tries to infer the location if not specified")
    parser.add_argument('--visualize', action="store_true",
                        help="save a png file with prediction visualization for each extracted video")
    args = parser.parse_args()

    model = TransNetV2(args.weights)

    for file in args.files:
        predictions_path = file + ".predictions.txt"
        scenes_path = file + ".scenes.txt"

        # Never overwrite results from an earlier run.
        if os.path.exists(predictions_path) or os.path.exists(scenes_path):
            print(f"[TransNetV2] {file}.predictions.txt or {file}.scenes.txt already exists. "
                  f"Skipping video {file}.", file=sys.stderr)
            continue

        video_frames, single_frame_predictions, all_frame_predictions = \
            model.predict_video(file)

        # One row per frame: column 0 is the single-frame score, column 1 the many-hot score.
        predictions = np.stack([single_frame_predictions, all_frame_predictions], 1)
        np.savetxt(predictions_path, predictions, fmt="%.6f")

        scenes = model.predictions_to_scenes(single_frame_predictions)
        np.savetxt(scenes_path, scenes, fmt="%d")

        if not args.visualize:
            continue

        vis_path = file + ".vis.png"
        if os.path.exists(vis_path):
            print(f"[TransNetV2] {file}.vis.png already exists. "
                  f"Skipping visualization of video {file}.", file=sys.stderr)
            continue

        pil_image = model.visualize_predictions(
            video_frames, predictions=(single_frame_predictions, all_frame_predictions))
        pil_image.save(vis_path)


if __name__ == "__main__":
    main()
完整源码
下载地址:https://pan.quark.cn/s/935a6f314af5
即刻收集
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· winform 绘制太阳,地球,月球 运作规律
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 上周热点回顾(3.3-3.9)