[转]Python之基于十六进制判断文件类型

核心代码：

#!/usr/bin/env python# -*- coding: utf-8 -*-# @Author  : sukimport structfrom io import BytesIO# 支持文件类型# 用16进制字符串的目的是可以知道文件头是多少字节# 各种文件头的长度不一样，少则2字符，长则8字符def typeList(types):    type_dict = {'jpg': ['FFD8FFE000104A464946'],                 'png': ['89504E470D0A1A0A0000'],                 'gif': ['47494638396126026F01'],                 'tif': ['49492A00227105008037'],                 'bmp': ['424D8E1B030000000000'],                 'dwg': ['41433130313500000000'],                 'html': ['3C21444F435459504520'],                 'htm': ['3C21646F637479706520'],                 'css': ['48544D4C207B0D0A0942'],                 'js': ['696B2E71623D696B2E71'],                 'rtf': ['7B5C727466315C616E73'],                 'psd': ['38425053000100000000'],                 'eml': ['46726F6D3A203D3F6762'],                 'wps': ['D0CF11E0A1B11AE10000'],                 'mdb': ['5374616E64617264204A'],                 'ps': '[252150532D41646F6265]',                 'pdf': ['255044462D312E'],                 'rmvb': ['2E524D46000000120001'],                 'flv': ['464C5601050000000900'],                 'mp4': ['00000020667479706D70'],                 'mp3': ['49443303000000002176'],                 'mpg': ['000001BA210001000180'],                 'wmv': ['3026B2758E66CF11A6D9'],                 'wav': ['52494646E27807005741'],                 'avi': ['52494646D07D60074156'],                 'mid': ['4D546864000000060001'],                 'zip': ['504B0304140000000800', '504B0304140000080800', '504B03040A0000080000'],                 'rar': ['526172211A0700CF9073'],                 'ini': ['235468697320636F6E66'],                 'jar': ['504B03040A0000000000'],                 'exe': ['4D5A9000030000000400'],                 'jsp': ['3C25402070616765206C'],                 'mf': ['4D616E69666573742D56'],                 'xml': ['3C3F786D6C2076657273'],                 'sql': ['494E5345525420494E54'],                 'java': ['7061636B616765207765'],                 'bat': ['406563686F206F66660D'],                 'gz': ['1F8B0800000000000000'],                 'properties': ['6C6F67346A2E726F6F74'],                 'class': ['CAFEBABE0000002E0041'],                 'chm': ['49545346030000006000'],                 'mxp': ['04000000010000001300'],                 'docx': ['504B0304140006000800', '504B03040A0000000000'],                 'torrent': ['6431303A637265617465'],                 'mov': ['6D6F6F76'],                 'wpd': ['FF575043'],                 'dbx': ['CFAD12FEC5FD746F'],                 'pst': ['2142444E'],                 'qdf': ['AC9EBD8F'],                 'pwl': ['E3828596'],                 'ram': ['2E7261FD']                 }    ret = {}    for k_hex, v_prefix in type_dict.items():        if k_hex in types:            ret[k_hex] = v_prefix    return ret# 字节码转16进制字符串def bytes2hex(bytes):    num = len(bytes)    hexstr = u""    for i in range(num):        t = u"%x" % bytes[i]        if len(t) % 2:            hexstr += u"0"        hexstr += t    return hexstr.upper()# 获取文件类型def file_type(filename):    binfile = open(filename, 'rb')  # 必需二制字读取    tl = typeList(types=["jpg", "zip", "docx"])    ftype = None    for type_name, hcode_list in tl.items():        flag = False        for hcode in hcode_list:            numOfBytes = int(len(hcode) / 2)  # 需要读多少字节            binfile.seek(0)  # 每次读取都要回到文件头，不然会一直往后读取            hbytes = struct.unpack_from("B" * numOfBytes, binfile.read(numOfBytes))  # 一个 "B"表示一个字节            f_hcode = bytes2hex(hbytes)  # 如果判断不出来，打印出这个值，往字典增加即可            # print("上传数据流hex", s_hcode, '=', "代码字典hex", hcode)  # 如果判断不出来，打印出这个值，往字典增加即可            if f_hcode == hcode:                flag = True                break        if flag:            ftype = type_name            break    binfile.close()    return ftype# 获取字节流类型def stream_type(stream, types):    """    :param stream:流数据    :param types:需要判断文件类型,格式:["jpg","jpn"]    :return:    """    tl = typeList(types=types)    ftype = None    for type_name, hcode_list in tl.items():        flag = False        for hcode in hcode_list:            numOfBytes = int(len(hcode) / 2)  # 需要读多少字节            hbytes = struct.unpack_from("B" * numOfBytes, stream[0:numOfBytes])  # 一个 "B"表示一个字节            s_hcode = bytes2hex(hbytes)            # print("上传数据流hex", s_hcode, '=', "代码字典hex", hcode)  # 如果判断不出来，打印出这个值，往字典增加即可            if s_hcode == hcode:                flag = True                break        if flag:            ftype = type_name            break    return ftypedef stream_split(stream, count=3):    """    主要处理流是分段获取的数据    :param stream: 块流    :param count: 取多少段合成来判断类型,默认三段    :return:    """    block_stream = BytesIO()    temp = 1    for block in stream:        block_stream.write(block)        if temp == count:            break        temp += 1    return block_stream.getvalue()

is_file_type.py

type_dict字典，根据自己上传的文件，来填写，数据来自互联网。

基于Flask的上传示例

 
@index.route('/upload', methods=['GET', 'POST'])
def upload():
 
    if request.method == 'GET':
        return render_template('upload.html')
 
    upload_obj = request.files.get('code_file')
 
    if not upload_obj:
        return '没有选择文件上传'
 
    ret = stream_type(stream_split(upload_obj.stream), ["jpg", "png", "pdf"])
 
    if not ret:
        return '上传失败，文件类型不匹配，类型必须 "jpg" or "png" or "pdf"'
 
    file_name = upload_obj.filename
    upload_obj.save(os.path.join('files', file_name))
 
    return '上传文件成功'

upload.html

 



{% extends 'layout.html' %}
{% block content %}
<h1>上传代码</h1>
<form action="" method="post" enctype="multipart/form-data">
    <input type="file" name="code_file">
    <input type="submit" value="上传"></input>
</form>
{% endblock %}

开始上传文件：

上传不在列表中的文件类型

上传在列表中的文件类型

---------------------
作者：小粉优化大师
来源：CNBLOGS
原文：https://www.cnblogs.com/ygbh/p/11918876.html
版权声明：本文为作者原创文章，转载请附上博文链接！

posted @ 2021-12-16 15:34 寂寞圣贤阅读(145) 评论(0) 收藏举报

刷新页面返回顶部

寂寞圣贤

[转]Python之基于十六进制判断文件类型

公告