# python编码探测-优化版 (Python encoding detection - optimized version)

import os
import chardet
from tkinter import filedialog
from concurrent.futures import ThreadPoolExecutor

def get_all_chardet(filename, max_bytes=1048576):
    """Detect and print the character encoding of a single file.

    At most ``max_bytes`` bytes are read from the start of ``filename`` in
    binary mode and handed to ``chardet.detect``. The outcome (detected
    encoding, unreadable file, empty file, or failed detection) is reported
    with ``print``.

    Args:
        filename: Path of the file to inspect.
        max_bytes: Maximum number of bytes to read; defaults to 1 MiB.

    Returns:
        None.
    """
    try:
        # The with-statement guarantees the handle is closed after reading.
        with open(filename, mode='rb') as f:
            data = f.read(max_bytes)
    except Exception as e:
        # Deliberately broad: this function is submitted to a thread pool
        # whose futures are never inspected, so an exception escaping from
        # here would be lost without any message.
        print(f"无法读取文件 {filename},错误信息:{e}")
        return

    if not data:
        print(f"文件 {filename} 内容为空")
        return

    # Look the encoding up by key rather than by position in the result
    # dict, so the output does not depend on the order of chardet's keys.
    encoding = chardet.detect(data)['encoding']
    if not encoding:
        print(f"文件 {filename} 的编码检测失败")
        return
    print(f"{filename} 编码为: {encoding}")

def all_chardet_files(Folderpath):
    """Report the encoding of every file below ``Folderpath``.

    The directory tree is walked with ``os.walk`` and each file path is
    submitted to a thread pool that runs ``get_all_chardet`` on it. The
    function returns once the pool has been shut down by the
    with-statement.

    Args:
        Folderpath: Root directory to scan recursively.
    """
    with ThreadPoolExecutor() as pool:
        for root, _subdirs, names in os.walk(Folderpath):
            for name in names:
                pool.submit(get_all_chardet, os.path.join(root, name))

def by_filetype(Folderpath):
    """Prompt for a file extension and report the encoding of matching files.

    The directory tree below ``Folderpath`` is walked with ``os.walk``;
    every file whose extension equals the requested one is submitted to a
    thread pool that runs ``get_all_chardet`` on it.

    Args:
        Folderpath: Root directory to scan recursively.
    """
    filetype = input('输入指定文件类型,例如.xml: ').strip()
    # os.path.splitext returns the extension including its leading dot, so
    # an answer such as "xml" could never match; turn it into ".xml". An
    # empty answer is left alone so it still selects extension-less files.
    if filetype and not filetype.startswith('.'):
        filetype = '.' + filetype

    with ThreadPoolExecutor() as executor:
        for filepath, _dirnames, filenames in os.walk(Folderpath):
            for filename in filenames:
                if os.path.splitext(filename)[1] == filetype:
                    full_path = os.path.join(filepath, filename)
                    executor.submit(get_all_chardet, full_path)

def get_specified_chardet(filename, b, max_bytes=1048576):
    """Print ``filename`` when its detected encoding equals ``b``.

    At most ``max_bytes`` bytes are read from the start of the file in
    binary mode and handed to ``chardet.detect``. Unreadable and empty
    files are reported with ``print``; files whose encoding differs from
    ``b`` (or could not be detected) produce no output.

    Args:
        filename: Path of the file to inspect.
        b: Encoding name to look for, e.g. ``"utf-8"``.
        max_bytes: Maximum number of bytes to read; defaults to 1 MiB.

    Returns:
        None.
    """
    try:
        with open(filename, mode='rb') as f:
            data = f.read(max_bytes)
    except Exception as e:
        # Deliberately broad: this function is submitted to a thread pool
        # whose futures are never inspected, so an exception escaping from
        # here would be lost without any message.
        print(f"无法读取文件 {filename},错误信息:{e}")
        return

    if not data:
        print(f"文件 {filename} 内容为空")
        return

    # Look the encoding up by key rather than by position in the result
    # dict, so the check does not depend on the order of chardet's keys.
    detected = chardet.detect(data)['encoding']
    if detected is None:
        return
    # Encoding names are case-insensitive labels; compare them that way so
    # an answer like "UTF-8" still matches a detected "utf-8".
    if detected.lower() == b.strip().lower():
        print(f"文件 {filename} 编码为指定的 {b}")

def specified_chardet_files(Folderpath):
    """Prompt for an encoding and list the files detected as that encoding.

    After printing a heading, the directory tree below ``Folderpath`` is
    walked with ``os.walk`` and every file path is submitted, together
    with the requested encoding, to a thread pool that runs
    ``get_specified_chardet``.

    Args:
        Folderpath: Root directory to scan recursively.
    """
    wanted = input("请输入需要检测的编码: ")
    print(f"编码是 {wanted} 的文件如下:")
    with ThreadPoolExecutor() as pool:
        for root, _subdirs, names in os.walk(Folderpath):
            for name in names:
                pool.submit(get_specified_chardet, os.path.join(root, name), wanted)

def get_no_specified_chardet(filename, b, max_bytes=1048576):
    """Print ``filename`` when its detected encoding is not ``b``.

    At most ``max_bytes`` bytes are read from the start of the file in
    binary mode and handed to ``chardet.detect``. Unreadable and empty
    files are reported with ``print``. A file whose encoding could not be
    detected is also reported as not being ``b``.

    Args:
        filename: Path of the file to inspect.
        b: Encoding name the file is compared against, e.g. ``"utf-8"``.
        max_bytes: Maximum number of bytes to read; defaults to 1 MiB.

    Returns:
        None.
    """
    try:
        with open(filename, mode='rb') as f:
            data = f.read(max_bytes)
    except Exception as e:
        # Deliberately broad: this function is submitted to a thread pool
        # whose futures are never inspected, so an exception escaping from
        # here would be lost without any message.
        print(f"无法读取文件 {filename},错误信息:{e}")
        return

    if not data:
        print(f"文件 {filename} 内容为空")
        return

    # Look the encoding up by key rather than by position in the result
    # dict, so the check does not depend on the order of chardet's keys.
    detected = chardet.detect(data)['encoding']
    # Encoding names are case-insensitive labels; compare them that way so
    # a detected "utf-8" is not reported as differing from "UTF-8".
    if detected is None or detected.lower() != b.strip().lower():
        print(f"文件 {filename} 编码不是 {b}")

def no_specified_chardet_files(Folderpath):
    """Prompt for an encoding and list the files not detected as it.

    After printing a heading, the directory tree below ``Folderpath`` is
    walked with ``os.walk`` and every file path is submitted, together
    with the entered encoding, to a thread pool that runs
    ``get_no_specified_chardet``.

    Args:
        Folderpath: Root directory to scan recursively.
    """
    excluded = input("请输入需要检测的编码: ")
    print(f"编码不是 {excluded} 的文件如下:")
    with ThreadPoolExecutor() as pool:
        for root, _subdirs, names in os.walk(Folderpath):
            for name in names:
                pool.submit(get_no_specified_chardet, os.path.join(root, name), excluded)

def case(folder=None):
    """Show the menu, read one option and run the matching scan.

    Args:
        folder: Directory to scan. When omitted, the module-level
            ``Folderpath`` assigned by the script entry point is used, so
            existing ``case()`` calls keep working.

    Returns:
        None. An answer that is not one of the numbers 1-4 is reported
        with ``print`` and no scan is started.
    """
    print("utf-8  GB2312  ascii 等")
    print("1. 输出所有文件的编码")
    print("2. 输出指定类型文件的编码")
    print("3. 输出指定编码的文件")
    print("4. 输出非指定编码的文件")

    # int() raises ValueError for non-numeric text; report it instead of
    # letting the traceback end the program.
    try:
        choice = int(input("请输入选项:"))
    except ValueError:
        choice = None
    if choice not in (1, 2, 3, 4):
        print("无效的选项,请输入 1 到 4 之间的数字")
        return

    # Resolve the global only after validation, so an invalid answer never
    # touches it.
    if folder is None:
        folder = Folderpath

    if choice == 1:
        all_chardet_files(folder)
    elif choice == 2:
        by_filetype(folder)
    elif choice == 3:
        specified_chardet_files(folder)
    else:
        no_specified_chardet_files(folder)

# Script entry point: choose a folder, then run the menu until the user
# answers anything other than "y".
if __name__ == '__main__':
    print("输入需要检测的路径")
    # Folderpath is module-level because case() reads it as a global.
    Folderpath = filedialog.askdirectory()
    if not Folderpath:
        # NOTE(review): askdirectory() is expected to return an empty
        # value when the dialog is cancelled - confirm against the tkinter
        # documentation. There is nothing to scan in that case.
        print("未选择文件夹,程序退出")
    else:
        print("检测的路径是 " + Folderpath)
        case()
        # Ask again after every run. Asking only once before the loop
        # meant that answering "y" repeated the menu forever.
        while input("是否继续?y or exit").lower() == 'y':
            case()
# posted @ 2024-09-23 13:51  孙犯困  阅读(15)  评论(0)  编辑  收藏  举报