本 Python 脚本遍历指定目录下的所有 .srt 文件,移除其中不必要的英文字符、空行和空格。脚本会保留字幕索引、字幕时间线,以及字幕文本中的中文内容(字幕文本部分只保留中文)。同一条字幕内部各行之间的换行符会被保留,字幕与字幕之间只保留一个空行作为分隔,多余的空行会被去掉。处理后的内容将另存为新的 .srt 文件(在原文件名后加上 _cleaned 后缀)。
代码1:
import os
import re

# Matches every character that is neither a CJK unified ideograph in the
# range U+4E00-U+9FA5 nor a newline; these are removed from subtitle text.
_NON_CHINESE_RE = re.compile(r'[^\u4e00-\u9fa5\n]')

_SRT_EXT = '.srt'
_CLEANED_TAG = '_cleaned'


def is_index_line(line):
    """Return True if *line* consists solely of digits (an SRT cue number)."""
    return bool(re.match(r'^\d+$', line))


def is_time_line(line):
    """Return True if *line* contains the SRT timing arrow ``-->``."""
    return '-->' in line


def _is_header_line(lines, pos):
    """Return True if ``lines[pos]`` is a cue header (timeline or cue number).

    A digit-only line counts as a cue number only when the very next line is
    a timeline; otherwise it is subtitle text that happens to be numeric
    (e.g. a "3", "2", "1" countdown) and must not be emitted as an index.
    """
    line = lines[pos]
    if is_time_line(line):
        return True
    return (
        is_index_line(line)
        and pos + 1 < len(lines)
        and is_time_line(lines[pos + 1])
    )


def _clean_srt_lines(lines):
    """Return the cleaned output lines for already-stripped input *lines*.

    Cue numbers and timelines are kept verbatim, subtitle text is reduced to
    its Chinese characters, and exactly one empty line separates consecutive
    cues (also when a cue ends up with no text at all).
    """
    cleaned = []
    in_cue = False          # a header was seen and no blank line since
    prev_was_header = False  # previous non-blank line was a header line
    for pos, line in enumerate(lines):
        if not line:
            # A blank line terminates the current cue.
            in_cue = False
            prev_was_header = False
            continue
        if _is_header_line(lines, pos):
            # Start of a new cue: separate it from whatever came before,
            # but never split a cue number from its own timeline.
            if cleaned and not prev_was_header:
                cleaned.append('')
            cleaned.append(line)
            in_cue = True
            prev_was_header = True
            continue
        prev_was_header = False
        if in_cue:
            chinese_only = _NON_CHINESE_RE.sub('', line)
            if chinese_only:
                cleaned.append(chinese_only)
    return cleaned


def _cleaned_path(file_path):
    """Return *file_path* with ``_cleaned`` inserted before the extension.

    Only the final extension is touched, so a ``.srt`` occurring elsewhere in
    the path (for instance in a directory name) is left alone, and a path
    without ``.srt`` never maps back onto the input file.
    """
    root, ext = os.path.splitext(file_path)
    return f'{root}{_CLEANED_TAG}{ext}'


def clean_srt_file(file_path: str) -> str:
    """Write a Chinese-only copy of the SRT file at *file_path*.

    The copy keeps cue numbers and timelines, keeps only the Chinese
    characters of each subtitle text line, and separates cues with a single
    empty line. It is saved next to the input as ``<name>_cleaned.srt``.

    Args:
        file_path: Path of the UTF-8 encoded ``.srt`` file to clean.

    Returns:
        The path of the file that was written.
    """
    # 'utf-8-sig' drops a leading BOM, which would otherwise keep the first
    # cue number from being recognised as an index line.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = [raw.strip() for raw in file]

    new_content = _clean_srt_lines(lines)

    new_file_path = _cleaned_path(file_path)
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        if new_content:
            new_file.write('\n'.join(new_content) + '\n')
    print(f'Processed and saved cleaned file to: {new_file_path}')
    return new_file_path


def process_directory(directory_path: str) -> None:
    """Clean every ``.srt`` file directly inside *directory_path*.

    Files whose name already ends in ``_cleaned.srt`` are skipped so that
    running the script again does not produce ``*_cleaned_cleaned.srt``.
    """
    for filename in sorted(os.listdir(directory_path)):
        lowered = filename.lower()
        if not lowered.endswith(_SRT_EXT):
            continue
        if lowered.endswith(_CLEANED_TAG + _SRT_EXT):
            continue
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            clean_srt_file(file_path)


# Directory to process when the file is run as a script.
directory_path = r'C:\caijian\29-51'

if __name__ == '__main__':
    process_directory(directory_path)
代码2(改进):
import os
import re

# Matches every character that is NOT in the keep-set: CJK unified ideographs
# in U+4E00-U+9FA5 plus a handful of punctuation marks. Whitespace is not in
# the keep-set, so a single substitution also removes all spaces.
# NOTE(review): the set lists `,。?!` twice; presumably one group was meant
# to hold different (e.g. full-width) forms -- confirm against the intended
# punctuation list.
_DISCARD_RE = re.compile(r'[^\u4e00-\u9fa5,。?!、《》()【】:",。?!]')

_SRT_EXT = '.srt'
_CLEANED_TAG = '_cleaned'


def is_index_line(line):
    """Return True if *line* consists solely of digits (an SRT cue number)."""
    return bool(re.match(r'^\d+$', line))


def is_time_line(line):
    """Return True if *line* contains the SRT timing arrow ``-->``."""
    return '-->' in line


def _is_header_line(lines, pos):
    """Return True if ``lines[pos]`` is a cue header (timeline or cue number).

    A digit-only line counts as a cue number only when the very next line is
    a timeline; otherwise it is subtitle text that happens to be numeric and
    must not be emitted as an index.
    """
    line = lines[pos]
    if is_time_line(line):
        return True
    return (
        is_index_line(line)
        and pos + 1 < len(lines)
        and is_time_line(lines[pos + 1])
    )


def _clean_srt_lines(lines):
    """Return the cleaned output lines for already-stripped input *lines*.

    Cue numbers and timelines are kept verbatim, subtitle text is reduced to
    Chinese characters and the kept punctuation, and exactly one empty line
    separates consecutive cues -- also when a cue's text is removed entirely.
    """
    cleaned = []
    in_subtitle_block = False  # a header was seen and no blank line since
    prev_was_header = False    # previous non-blank line was a header line
    for pos, line in enumerate(lines):
        if not line:
            # A blank line terminates the current cue.
            in_subtitle_block = False
            prev_was_header = False
            continue
        if _is_header_line(lines, pos):
            # Start of a new cue: separate it from whatever came before,
            # but never split a cue number from its own timeline.
            if cleaned and not prev_was_header:
                cleaned.append('')
            cleaned.append(line)
            in_subtitle_block = True
            prev_was_header = True
            continue
        prev_was_header = False
        if in_subtitle_block:
            kept = _DISCARD_RE.sub('', line)
            if kept:
                cleaned.append(kept)
    return cleaned


def _cleaned_path(file_path):
    """Return *file_path* with ``_cleaned`` inserted before the extension.

    Only the final extension is touched, so a ``.srt`` occurring elsewhere in
    the path (for instance in a directory name) is left alone, and a path
    without ``.srt`` never maps back onto the input file.
    """
    root, ext = os.path.splitext(file_path)
    return f'{root}{_CLEANED_TAG}{ext}'


def clean_and_save_srt(file_path: str) -> str:
    """Write a cleaned copy of the SRT file at *file_path*.

    The copy keeps cue numbers and timelines, keeps only Chinese characters
    and a small set of punctuation marks in subtitle text, and separates cues
    with a single empty line. It is saved next to the input as
    ``<name>_cleaned.srt``.

    Args:
        file_path: Path of the UTF-8 encoded ``.srt`` file to clean.

    Returns:
        The path of the file that was written.
    """
    # 'utf-8-sig' drops a leading BOM; str.strip() does not remove U+FEFF, so
    # without this the first cue number would not be recognised and be lost.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = [raw.strip() for raw in file]

    new_content = _clean_srt_lines(lines)

    new_file_path = _cleaned_path(file_path)
    with open(new_file_path, 'w', encoding='utf-8') as new_file:
        if new_content:
            # End the file with a newline so the last cue is a complete line.
            new_file.write('\n'.join(new_content) + '\n')
    print(f'Processed and saved cleaned file to: {new_file_path}')
    return new_file_path


def process_directory(directory_path: str) -> None:
    """Clean every ``.srt`` file directly inside *directory_path*.

    Files whose name already ends in ``_cleaned.srt`` are skipped so that
    running the script again does not produce ``*_cleaned_cleaned.srt``.
    """
    for filename in sorted(os.listdir(directory_path)):
        lowered = filename.lower()
        if not lowered.endswith(_SRT_EXT):
            continue
        if lowered.endswith(_CLEANED_TAG + _SRT_EXT):
            continue
        file_path = os.path.join(directory_path, filename)
        if os.path.isfile(file_path):
            clean_and_save_srt(file_path)


# Directory to process when the file is run as a script.
directory_path = r'C:\caijian\29-51'

if __name__ == '__main__':
    process_directory(directory_path)