python实现批量将文件的编码格式进行转换
用python3实现批量将文件的编码格式进行转换;
需要指定四个参数,
1、搜索的根路径
2、文件的类型(正则表达式来处理)
3、源编码格式
4、目标编码格式
#!/usr/bin/env python # -*- coding: utf-8 -*- import os import codecs import chardet import re class TextDetect: path_root = '' file_pattern = '' def __init__(self, path_root, file_pattern): self.path_root = path_root self.file_pattern = file_pattern def print_member(): print("path_root = %s"%(self.path_root)) print("file_pattern = %s"%(self.file_pattern)) def convert(self,file, in_enc="GBK", out_enc="UTF-8"): """ 该程序用于将目录下的文件从指定格式转换到指定格式,默认的是GBK转到utf-8 :param file: 文件路径 :param in_enc: 输入文件格式 :param out_enc: 输出文件格式 :return: """ in_enc = in_enc.upper() out_enc = out_enc.upper() try: print("convert [ " + file.split('\\')[-1] + " ].....From " + in_enc + " --> " + out_enc) f = codecs.open(file, 'r', in_enc, "ignore") new_content = f.read() codecs.open(file, 'w', out_enc).write(new_content) except IOError as err: print("I/O error: {0}".format(err)) def detect(self, in_enc="GBK", out_enc="UTF-8"): for root, dirs, files in os.walk(self.path_root, topdown=True): for item in files: match = re.match(self.file_pattern, item, re.IGNORECASE) if not match: continue item_path = os.path.join(root,item) print("find file name = %s"%(item)) with open(item_path, "rb") as f: data = f.read() codeType = chardet.detect(data)['encoding'] print("%s's codeType is %s"%(item, codeType)) if in_enc == "GBK": #GBK特殊处理一下 if codeType == 'GB2312' or codeType == 'GBK' or codeType == 'GB18030': print("%s's codeType is %s,change encode!"%(item,codeType)) self.convert(item_path, codeType, out_enc) else: if codeType == in_enc: print("%s's codeType is %s,change encode!"%(item,codeType)) self.convert(item_path, codeType, out_enc) if __name__ == "__main__": # 使用时填写这四个参数即可 # 处理的根路径 path_root = "C:\\WorkSpace\\MyProjects\\python_basic_practice\\test" # 要进行格式转换的文件正则表达式,匹配某一类型的文件 search_file_pattern = ".*\.[ch]" # 要转换的源编码 in_enc = "GBK" # 要转换的目的编码 out_enc = "UTF-8" my_detect=TextDetect(path_root, search_file_pattern) my_detect.detect(in_enc, out_enc)
知行合一