python magic文件格式识别

import os
import magic

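# Chosen approach: libmagic's textual description is fairly complete, so it is matched against known
# keywords; when a keyword is found in the description, that keyword is taken as the file type.
# If no keyword matches, the previously determined file type (the extension) is kept.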

# a = magic.from_file(path)
# b = magic.from_buffer(open(path).read(1024))
# c = magic.from_file(path, mime=True)
# print(a)

# Keyword table for file-type identification.
# Each key is a category label and each value lists the keywords that belong
# to that category. Insertion order (of categories and of keywords within a
# category) is kept exactly as declared, since consumers iterate over it.
FILE_TYPE = dict(
    Executables=[
        'exe', 'mz', 'msi', 'coff', 'elf', 'krnl', 'rpm', 'linux', 'macho',
    ],
    Documents=[
        'ps', 'rtf', 'odp', 'ods', 'odt', 'hwp', 'gul', 'ebook', 'latex',
    ],
    Code=[
        'php', 'python', 'perl', 'ruby', 'cpp', 'java', 'shell', 'pascal',
        'awk', 'dyalog', 'fortran', 'java-bytecode',
    ],
    Bundles=[
        'zip', 'gzip', 'bzip', 'rzip', 'dzip', '7-zip', 'cab', 'jar', 'rar',
        'mscompress', 'ace', 'arj', 'asd', 'blackhole', 'kgb',
    ],
    Other=['bat', 'cmd'],
)


def identify_file_type(path="D:/scripts/file/CmdHelperService.7z"):
    """
    Identify a file's type from its extension and its libmagic description.

    The extension is extracted first. Files without an extension, or whose
    extension is in the exclusion list, are returned as-is without calling
    libmagic. Otherwise the description from ``magic.from_file`` is printed
    and checked against a few special cases (dll, vmdk) and then against the
    keywords in ``FILE_TYPE``; when nothing matches, the extension is kept.

    :param path: path of the file to identify; defaults to the sample archive
        that was previously hard-coded, so zero-argument calls keep working
    :return: the detected type as a string (a keyword, 'dll', 'vmdk', or the
        extension without its dot); an empty string when the path has no
        extension
    """

    # Extensions that are trusted as-is and never handed to libmagic.
    exclude_file_type = ('txt', 'pdf', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'com', 'mf')

    extension = os.path.splitext(path)[1]
    if not extension:
        # Always return a string; returning the raw splitext() tuple here
        # would give callers an inconsistent type.
        return ''

    file_type = extension.strip('.')
    # Compare case-insensitively so that e.g. "REPORT.TXT" is excluded too.
    if file_type.lower() in exclude_file_type:
        return file_type

    file_type_info = magic.from_file(path)
    print(file_type_info)

    # TODO: special types that keyword matching on the libmagic description
    # cannot identify accurately; extend this list as new cases are found.
    # These return immediately so the generic keyword loop below cannot
    # overwrite them (a DLL description would otherwise match 'exe').
    if 'DLL' in file_type_info:
        return 'dll'
    if 'VMware4 disk image' in file_type_info:
        return 'vmdk'

    # Generic keyword match. There is deliberately no early exit: the last
    # matching keyword in FILE_TYPE wins, which lets '7-zip' (listed after
    # 'zip') override the shorter match.
    for file_types in FILE_TYPE.values():
        for f_type in file_types:
            if (f_type.upper() in file_type_info
                    or f_type.capitalize() in file_type_info
                    or f_type in file_type_info):
                file_type = f_type

    return file_type


# Script entry: run the identification once and print the resulting type.
file_type = identify_file_type()
print(file_type)


# Sample output (presumably for the .7z path used inside identify_file_type; confirm):
# 7-zip archive data, version 0.4
# 7-zip
                      
posted @ 2021-10-11 11:52  嗨,阿良  阅读(502)  评论(0编辑  收藏  举报