下面的 Python 脚本会遍历指定目录下的所有 .srt 文件,去掉字幕文本中的英文等非中文字符、多余的空行和多余的空格。脚本保留字幕索引、字幕时间线以及字幕中的中文内容;字幕内部各行之间的换行符会保留,字幕块之间多余的空行会被去掉。处理后的内容保存为新的 .srt 文件(文件名以 _cleaned 结尾)。

代码:

import os
import re

def is_index_line(line):
    """Return True when *line* consists solely of decimal digits.

    Used to recognise the numeric counter line of a subtitle entry. Because
    ``$`` also matches just before a final newline, a line read with its
    trailing ``\\n`` still qualifies.
    """
    digits_only = re.match(r'^\d+$', line)
    return digits_only is not None

def is_time_line(line):
    """Return True when *line* contains the ``-->`` arrow of a time range."""
    arrow = re.search(r'-->', line)
    return arrow is not None

def clean_srt_file(file_path):
    """Strip non-Chinese text from one .srt file and save a cleaned copy.

    Index lines and time lines are kept (minus surrounding whitespace). From
    every other line inside a subtitle only characters in the range
    U+4E00-U+9FA5 are kept; lines left empty by that filter are dropped.
    Subtitle blocks are separated by exactly one blank line in the output, no
    matter how many blank lines the input had.

    The result is written next to the input as ``<name>_cleaned<ext>``, so the
    input file itself is never overwritten.

    Args:
        file_path: Path of the subtitle file to read (UTF-8, BOM tolerated).

    Returns:
        None. The path of the written file is printed.

    Note:
        A subtitle text line made up only of digits cannot be told apart from
        an index line by ``is_index_line`` and is therefore kept as a header.
    """
    # 'utf-8-sig' discards a leading BOM; with plain 'utf-8' the BOM stays
    # glued to the first index line, which then fails the digit check.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    new_content = []       # output lines, stored without line terminators
    in_subtitle = False    # True once a header of the current block was seen
    after_header = False   # True while the last emitted line is a header

    for raw_line in lines:
        line = raw_line.strip()

        if not line:
            # A blank line ends the current block; the separator itself is
            # emitted lazily when the next header shows up, so runs of blank
            # lines collapse into one and none is written at the very end.
            in_subtitle = False
            after_header = False
            continue

        if is_index_line(line) or is_time_line(line):
            # Start of a new block unless this header directly follows
            # another header (index line followed by its time line).
            if new_content and not after_header:
                new_content.append('')
            new_content.append(line)
            in_subtitle = True
            after_header = True
        elif in_subtitle:
            # Keep only the Chinese characters of the subtitle text.
            chinese_only_line = re.sub(r'[^\u4e00-\u9fa5\n]', '', line)
            if chinese_only_line:
                new_content.append(chinese_only_line)
                after_header = False

    # Build the output name from the real extension: str.replace would touch
    # every '.srt' in the path (directory names included) and would return the
    # input path unchanged when the extension is missing or upper-case.
    root, extension = os.path.splitext(file_path)
    new_file_path = f'{root}_cleaned{extension}'
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.writelines(f'{entry}\n' for entry in new_content)

    print(f'Processed and saved cleaned file to: {new_file_path}')

def process_directory(directory_path):
    """Clean every .srt file located directly inside *directory_path*.

    Sub-directories are not descended into. Entries that are not regular files
    are skipped, and so are files already named ``*_cleaned.srt``: those are
    outputs of an earlier run, and processing them again would pile up
    ``*_cleaned_cleaned.srt`` copies.

    Args:
        directory_path: Directory to scan.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            listed.
    """
    for filename in os.listdir(directory_path):
        if not filename.endswith('.srt') or filename.endswith('_cleaned.srt'):
            continue
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            # e.g. a folder whose name happens to end in '.srt'
            continue
        clean_srt_file(file_path)

# Directory whose .srt files are to be cleaned (hard-coded Windows path; edit
# it to point at your own subtitle folder).
directory_path = r'C:\caijian\29-51'
# Module-level call: the directory is processed as soon as this script runs.
process_directory(directory_path)

 

代码2(改进):

import os
import re

def clean_and_save_srt(file_path):
    """Keep only Chinese text and punctuation in one .srt file; save a copy.

    Index lines and time lines are kept (minus surrounding whitespace). From
    every other line inside a subtitle only CJK ideographs (U+4E00-U+9FA5) and
    a small set of punctuation marks are kept; lines left empty by that filter
    are dropped. Subtitle blocks are separated by exactly one blank line in
    the output, including blocks whose text was removed entirely.

    The result is written next to the input as ``<name>_cleaned<ext>``, so the
    input file itself is never overwritten.

    Args:
        file_path: Path of the subtitle file to read (UTF-8, BOM tolerated).

    Returns:
        None. The path of the written file is printed.
    """
    # Everything NOT matched by this class is deleted, whitespace included.
    # NOTE(review): the previous class listed only the ASCII forms of
    # , ? ! ( ) : " so the fullwidth forms used in Chinese text were being
    # deleted; both forms are kept now. Written as escapes so the pattern
    # survives tools that normalise fullwidth punctuation.
    removable = re.compile(
        '[^'
        r'\u4e00-\u9fa5'                         # CJK ideographs
        r',?!():"'                               # ASCII marks kept before
        r'\u3002\u3001\u300a\u300b\u3010\u3011'  # 。 、 《 》 【 】
        r'\uff0c\uff1f\uff01\uff1a\uff08\uff09'  # fullwidth , ? ! : ( )
        r'\u201c\u201d'                          # curly double quotes
        ']'
    )

    # 'utf-8-sig' discards a leading BOM; with plain 'utf-8' the BOM stays
    # glued to the first index line (strip() does not remove it), which then
    # fails the digit check.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    new_content = []            # output lines, without line terminators
    in_subtitle_block = False   # True once a header of the block was seen
    after_header = False        # True while the last emitted line is a header

    for raw_line in lines:
        line = raw_line.strip()

        if not line:
            # A blank line ends the current block. The separator is emitted
            # lazily with the next header, so repeated blank lines collapse
            # and blocks without any kept text are still separated.
            in_subtitle_block = False
            after_header = False
            continue

        if is_index_line(line) or is_time_line(line):
            # Start of a new block unless this header directly follows
            # another header (index line followed by its time line).
            if new_content and not after_header:
                new_content.append('')
            new_content.append(line)
            in_subtitle_block = True
            after_header = True
        elif in_subtitle_block:
            cleaned_line = removable.sub('', line)
            if cleaned_line:
                new_content.append(cleaned_line)
                after_header = False

    # Build the output name from the real extension: str.replace would touch
    # every '.srt' in the path (directory names included) and would return the
    # input path unchanged when the extension is missing or upper-case.
    root, extension = os.path.splitext(file_path)
    new_file_path = f'{root}_cleaned{extension}'
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        new_file.writelines(f'{entry}\n' for entry in new_content)

    print(f'Processed and saved cleaned file to: {new_file_path}')

def is_index_line(line):
    """Tell whether *line* is a subtitle counter, i.e. nothing but digits."""
    if re.match(r'^\d+$', line):
        return True
    return False

def is_time_line(line):
    """Tell whether *line* carries the ``-->`` separator of a time range."""
    arrow_found = re.search(r'-->', line)
    return True if arrow_found else False

def process_directory(directory_path):
    """Clean every .srt file located directly inside *directory_path*.

    Sub-directories are not descended into. Entries that are not regular files
    are skipped, and so are files already named ``*_cleaned.srt``: those are
    outputs of an earlier run, and processing them again would pile up
    ``*_cleaned_cleaned.srt`` copies.

    Args:
        directory_path: Directory to scan.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot be
            listed.
    """
    for filename in os.listdir(directory_path):
        is_subtitle = filename.endswith('.srt')
        is_previous_output = filename.endswith('_cleaned.srt')
        if not is_subtitle or is_previous_output:
            continue
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            clean_and_save_srt(file_path)

# Directory whose .srt files are to be cleaned (hard-coded Windows path; edit
# it to point at your own subtitle folder).
directory_path = r'C:\caijian\29-51'
# Module-level call: the directory is processed as soon as this script runs.
process_directory(directory_path)

 

posted on 2024-10-23 13:08  大话人生  阅读(7)  评论(0)  编辑  收藏  举报