Python之codecs模块的使用

 codecs模块的作用

codecs模块提供了用于在不同编码之间转换文本数据的编码器和解码器。

1、将数据编码为十六进制,并按指定的间隔进行切片

import binascii

def to_hex(t, nbytes):
    """Format binary data as space-separated groups of hex digits.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes shown in each space-separated group.

    Returns:
        A bytes object with the hex digits of ``t`` split into groups of
        ``nbytes`` bytes; the last group may be shorter.
    """
    # Every byte is rendered as two hex digits.
    group_width = nbytes * 2
    digits = binascii.hexlify(t)

    groups = []
    for offset in range(0, len(digits), group_width):
        groups.append(digits[offset:offset + group_width])
    return b' '.join(groups)

if __name__ == '__main__':
    # Dump the same payload grouped one byte, then two bytes, at a time.
    for group_size in (1, 2):
        print(to_hex(b'abcdef', group_size))
codecs_to_hex.py

运行效果

b'61 62 63 64 65 66'
b'6162 6364 6566'

2、编码UTF-8和UTF-16的示例

import binascii
import unicodedata


def to_hex(t, nbytes):
    """Return the hex digits of ``t`` with a space after every ``nbytes`` bytes.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes per space-separated group.

    Returns:
        The grouped hex digits as a bytes object.
    """
    step = nbytes * 2  # two hex digits per byte
    hexed = binascii.hexlify(t)
    starts = range(0, len(hexed), step)
    return b' '.join([hexed[i:i + step] for i in starts])


if __name__ == '__main__':
    text = 'français'
    print('原数据    :{!r}'.format(text))

    # List every character with its name from the Unicode database; the
    # character itself is used as the fallback when it has no name.
    for c in text:
        char_name = unicodedata.name(c, c)
        print(' {!r}: {}'.format(c, char_name))

    # UTF-8 is dumped byte by byte, UTF-16 in two-byte groups.
    utf8_hex = to_hex(text.encode('utf-8'), 1)
    print('UTF-8 : {}'.format(utf8_hex))
    utf16_hex = to_hex(text.encode('utf-16'), 2)
    print('UTF-16 : {}'.format(utf16_hex))
codecs_encodings.py

运行效果

原数据    :'français'
 'f': LATIN SMALL LETTER F
 'r': LATIN SMALL LETTER R
 'a': LATIN SMALL LETTER A
 'n': LATIN SMALL LETTER N
 'ç': LATIN SMALL LETTER C WITH CEDILLA
 'a': LATIN SMALL LETTER A
 'i': LATIN SMALL LETTER I
 's': LATIN SMALL LETTER S
UTF-8 : b'66 72 61 6e c3 a7 61 69 73'
UTF-16 : b'fffe 6600 7200 6100 6e00 e700 6100 6900 7300'

3、解码的示例

import binascii
import unicodedata


def to_hex(t, nbytes):
    """Format binary data as space-separated groups of hex digits.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes shown in each space-separated group.

    Returns:
        A bytes object with the hex digits of ``t`` split into groups of
        ``nbytes`` bytes; the last group may be shorter.
    """
    # Every byte is rendered as two hex digits.
    group_width = nbytes * 2
    digits = binascii.hexlify(t)

    groups = []
    for offset in range(0, len(digits), group_width):
        groups.append(digits[offset:offset + group_width])
    return b' '.join(groups)


if __name__ == '__main__':
    text = 'français'
    # Round trip: str -> UTF-8 bytes -> str.
    encoded = text.encode('utf-8')
    decoded = encoded.decode('utf-8')

    print('原来的数据 :', repr(text))
    encoded_hex = to_hex(encoded, 1)
    print('编码过的内容 :', encoded_hex, type(encoded))
    print('解码的内容 :', repr(decoded), type(decoded))
codecs_decode.py

运行效果

原来的数据 : 'français'
编码过的内容 : b'66 72 61 6e c3 a7 61 69 73' <class 'bytes'>
解码的内容 : 'français' <class 'str'>

4、codecs模块打开文件设置编码格式写入内容

import binascii
import codecs
import sys


def to_hex(t, nbytes):
    """Return the hex digits of ``t`` with a space after every ``nbytes`` bytes.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes per space-separated group.

    Returns:
        The grouped hex digits as a bytes object.
    """
    step = nbytes * 2  # two hex digits per byte
    hexed = binascii.hexlify(t)
    starts = range(0, len(hexed), step)
    return b' '.join([hexed[i:i + step] for i in starts])


# The target encoding is taken from the first command-line argument and
# also determines the name of the file that is written.
encoding = sys.argv[1]
filename = encoding + '.txt'
print('写入的文件名', filename)

# Open the file through codecs so that the text written to it is encoded
# with the requested encoding.
with codecs.open(filename, mode='w', encoding=encoding) as wf:
    wf.write('français')

# Bytes per code unit, used to group the hex dump below.  UTF-32 code
# units are four bytes wide; any other encoding is dumped byte by byte.
nbytes = {
    'utf-8': 1,
    'utf-16': 2,
    'utf-32': 4,
}.get(encoding, 1)

# Read the file back in binary mode to show the bytes actually stored.
print('读取文件内容')
with open(filename, mode='rb') as rf:
    print(to_hex(rf.read(), nbytes))
codecs_open_write.py

 运行效果

写入的文件名 utf-8.txt
读取文件内容
b'66 72 61 6e c3 a7 61 69 73'

5、设置解码格式读取文件内容

import codecs
import sys

# The encoding, and therefore the file name, comes from the command line.
encoding = sys.argv[1]
filename = encoding + '.txt'
print('读取的文件内容', filename)

# Open the file through codecs so that the bytes read from it are decoded
# with the requested encoding.
with codecs.open(filename, mode='r', encoding=encoding) as rf:
    content = rf.read()
    print(repr(content))
codecs_open_read.py

 运行效果

读取的文件内容 utf-8.txt
'français'

6、打印出字节序

import codecs

import binascii


def to_hex(t, nbytes):
    """Format binary data as space-separated groups of hex digits.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes shown in each space-separated group.

    Returns:
        A bytes object with the hex digits of ``t`` split into groups of
        ``nbytes`` bytes; the last group may be shorter.
    """
    # Every byte is rendered as two hex digits.
    chars_per_item = nbytes * 2
    hex_version = binascii.hexlify(t)
    return b' '.join(
        hex_version[start:start + chars_per_item]
        for start in range(0, len(hex_version), chars_per_item)
    )


# Names of the byte-order-mark constants looked up on the codecs module.
BOM_TYPES = [
    'BOM', 'BOM_BE', 'BOM_LE',
    'BOM_UTF8',
    'BOM_UTF16', 'BOM_UTF16_BE', 'BOM_UTF16_LE',
    'BOM_UTF32', 'BOM_UTF32_BE', 'BOM_UTF32_LE',
]

for name in BOM_TYPES:
    # Fetch each constant by name and dump it in two-byte groups.
    bom_value = getattr(codecs, name)
    print('{:12} : {}'.format(name, to_hex(bom_value, 2)))
codecs_bom.py

运行效果

BOM          : b'fffe'
BOM_BE       : b'feff'
BOM_LE       : b'fffe'
BOM_UTF8     : b'efbb bf'
BOM_UTF16    : b'fffe'
BOM_UTF16_BE : b'feff'
BOM_UTF16_LE : b'fffe'
BOM_UTF32    : b'fffe 0000'
BOM_UTF32_BE : b'0000 feff'
BOM_UTF32_LE : b'fffe 0000'

7、codecs模块,字节排序由解码器在编解码器中自动检测和处理,但是可以在编码时指定显式排序。

import binascii
import codecs


def to_hex(t, nbytes):
    """Format binary data as space-separated groups of hex digits.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes shown in each space-separated group.

    Returns:
        A bytes object with the hex digits of ``t`` split into groups of
        ``nbytes`` bytes; the last group may be shorter.
    """
    # Every byte is rendered as two hex digits.
    group_width = nbytes * 2
    digits = binascii.hexlify(t)

    groups = []
    for offset in range(0, len(digits), group_width):
        groups.append(digits[offset:offset + group_width])
    return b' '.join(groups)


# Deliberately pick the byte order that is NOT the native one:
# codecs.BOM_UTF16 is compared with the big-endian BOM to decide.
native_is_big_endian = codecs.BOM_UTF16 == codecs.BOM_UTF16_BE
bom, encoding = (
    (codecs.BOM_UTF16_LE, 'utf_16_le')
    if native_is_big_endian
    else (codecs.BOM_UTF16_BE, 'utf_16_be')
)

print('Native order', to_hex(codecs.BOM_UTF16, 2))
print('Selected order', to_hex(bom, 2))

# Encode with the explicitly chosen byte order.
encoded_text = 'français'.encode(encoding)
print('{:14} : {}'.format(encoding, to_hex(encoded_text, 2)))

# Write the BOM by hand, followed by the encoded text.
with open('nonnative-encoded.txt', mode='wb') as wf:
    for part in (bom, encoded_text):
        wf.write(part)
codecs_bom_create_file.py

 运行效果

Native order b'fffe'
Selected order b'feff'
utf_16_be      : b'0066 0072 0061 006e 00e7 0061 0069 0073'

8、codecs模块,在打开文件时没有指定字节顺序,因此解码器使用文件前两个字节中的BOM值来确定它

import binascii
import codecs


def to_hex(t, nbytes):
    """Return the hex digits of ``t`` with a space after every ``nbytes`` bytes.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes per space-separated group.

    Returns:
        The grouped hex digits as a bytes object.
    """
    step = nbytes * 2  # two hex digits per byte
    hexed = binascii.hexlify(t)
    starts = range(0, len(hexed), step)
    return b' '.join([hexed[i:i + step] for i in starts])


# Show the raw bytes of the file, BOM included.
with open('nonnative-encoded.txt', mode='rb') as raw_file:
    raw_bytes = raw_file.read()
print('Raw  :', to_hex(raw_bytes, 2))

# Re-open the file as generic 'utf-16', without naming a byte order.
with codecs.open('nonnative-encoded.txt', encoding='utf-16') as text_file:
    decoded_text = text_file.read()

print('解码的数据', repr(decoded_text))
codecs_bom_detection.py

 运行效果

Raw  : b'feff 0066 0072 0061 006e 00e7 0061 0069 0073'
解码的数据 'français'

 9、错误处理模式

错误处理模式 描述
strict 如果数据无法转换,则抛出一个异常。
replace 将无法转换的数据替换为一个特殊的标记字符。
ignore 忽略(跳过)无法转换的数据。
xmlcharrefreplace 替换为XML字符引用(仅编码)
backslashreplace 替换为反斜杠转义序列(仅编码)

10、编码错误的处理

import codecs
import sys

# Error-handling mode, taken from the first command-line argument.
error_handling = sys.argv[1]

text = 'français'

try:
    # Write the text through an ASCII-encoding file handle; what happens
    # to characters ASCII cannot represent is governed by the error mode.
    with codecs.open('encode_error.txt', mode='w',
                     encoding='ascii',
                     errors=error_handling) as f:
        f.write(text)
except UnicodeEncodeError as err:
    print('ERROR:', err)
else:
    # Writing succeeded: show the bytes that ended up in the file.
    with open('encode_error.txt', 'rb') as f:
        file_bytes = f.read()
        print('File contents: {!r}'.format(file_bytes))
codecs_encode_error.py

测试效果

$ python3 codecs_encode_error.py strict
ERROR: 'ascii' codec can't encode character '\xe7' in position 4: ordinal not in range(128)

$ python3 codecs_encode_error.py replace
File contents: b'fran?ais'

$ python3 codecs_encode_error.py ignore
File contents: b'franais'

$ python3 codecs_encode_error.py xmlcharrefreplace
File contents: b'fran&#231;ais'

11、解码错误的处理

import codecs
import sys
import binascii


def to_hex(t, nbytes):
    """Format binary data as space-separated groups of hex digits.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes shown in each space-separated group.

    Returns:
        A bytes object with the hex digits of ``t`` split into groups of
        ``nbytes`` bytes; the last group may be shorter.
    """
    # Every byte is rendered as two hex digits.
    group_width = nbytes * 2
    digits = binascii.hexlify(t)

    groups = []
    for offset in range(0, len(digits), group_width):
        groups.append(digits[offset:offset + group_width])
    return b' '.join(groups)


# Error-handling mode, taken from the first command-line argument.
error_handling = sys.argv[1]

text = 'français'
print('源数据     :', repr(text))

# Store the text encoded as UTF-16.
with codecs.open('decode_error.txt', mode='w',
                 encoding='utf-16') as f:
    f.write(text)

# Dump the stored bytes as hex, one byte per group.
with open('decode_error.txt', 'rb') as f:
    stored_bytes = f.read()
    print('File contents:', to_hex(stored_bytes, 1))

# Read the file back with the wrong encoding (UTF-8) on purpose; the
# error mode decides whether this raises or yields altered text.
with codecs.open('decode_error.txt', mode='r',
                 encoding='utf-8',
                 errors=error_handling) as f:
    try:
        data = f.read()
    except UnicodeDecodeError as err:
        print('ERROR:', err)
    else:
        print('Read         :', repr(data))
codecs_decode_error.py

测试效果(以 replace 模式运行:python3 codecs_decode_error.py replace)

源数据     : 'français'
File contents: b'ff fe 66 00 72 00 61 00 6e 00 e7 00 61 00 69 00 73 00'
Read         : '��f\x00r\x00a\x00n\x00�\x00a\x00i\x00s\x00'

 12、使用codecs.EncodedFile在io.BytesIO缓冲区上进行读写时转换编码的示例

import codecs
import io
import binascii


def to_hex(t, nbytes):
    """Return the hex digits of ``t`` with a space after every ``nbytes`` bytes.

    Args:
        t: Bytes-like object to render.
        nbytes: Number of bytes per space-separated group.

    Returns:
        The grouped hex digits as a bytes object.
    """
    step = nbytes * 2  # two hex digits per byte
    hexed = binascii.hexlify(t)
    starts = range(0, len(hexed), step)
    return b' '.join([hexed[i:i + step] for i in starts])


data = 'français'

utf8 = data.encode('utf-8')
print('将utf-8编码结果转换十六进制,并且设置1个字节用空格分割', to_hex(utf8, 1))

# data_encoding is the encoding of the bytes passed to write() and
# returned by read(); file_encoding is the encoding of the bytes held by
# the wrapped file object.
transcoding = dict(data_encoding='utf-8', file_encoding='utf-16')

# Writing UTF-8 bytes through the wrapper stores UTF-16 in the buffer.
output = io.BytesIO()
transcoding_writer = codecs.EncodedFile(output, **transcoding)
transcoding_writer.write(utf8)

utf16 = output.getvalue()
print('使用file_encoding编码获取的值', to_hex(utf16, 2))

# Reading through the wrapper turns the stored UTF-16 back into UTF-8.
buffer = io.BytesIO(utf16)
transcoding_reader = codecs.EncodedFile(buffer, **transcoding)
recoded = transcoding_reader.read()
print('使用data_encoding编码获取的值', to_hex(recoded, 1))
codecs_encodedfile.py

测试效果

将utf-8编码结果转换十六进制,并且设置1个字节用空格分割 b'66 72 61 6e c3 a7 61 69 73'
使用file_encoding编码获取的值 b'fffe 6600 7200 6100 6e00 e700 6100 6900 7300'
使用data_encoding编码获取的值 b'66 72 61 6e c3 a7 61 69 73'

13、非unicode的编码示例

import codecs
import io

text = 'abcdefghijklmnopqrstuvwxyz'

# Wrap an in-memory text buffer in a stream writer for the 'rot_13' codec,
# so everything written is transformed before it reaches the buffer.
buffer = io.StringIO()
make_rot13_writer = codecs.getwriter('rot_13')
stream = make_rot13_writer(buffer)
stream.write(text)
stream.flush()

print('源数据', text)
rotated = buffer.getvalue()
print('ROT_13', rotated)
codecs_rot13.py

测试效果

源数据 abcdefghijklmnopqrstuvwxyz
ROT_13 nopqrstuvwxyzabcdefghijklm

14、利用zlib编码进行数据的压缩与解压

import codecs
import io

text = b'abcdefghijklmnopqrstuvwxyz\n' * 50

# Compress: write the data through a stream writer for the 'zlib' codec.
buffer = io.BytesIO()
make_zlib_writer = codecs.getwriter('zlib')
stream = make_zlib_writer(buffer)
stream.write(text)
stream.flush()

print('源数据长度', len(text))

compressed_data = buffer.getvalue()
print('zlib压缩后的数据长度', len(compressed_data))

# Decompress: read the compressed bytes back through a stream reader.
buffer = io.BytesIO(compressed_data)
make_zlib_reader = codecs.getreader('zlib')
stream = make_zlib_reader(buffer)

first_line = stream.readline()
print('读取第一行', repr(first_line))

# The first line was already consumed, so prepend it to the remainder.
remainder = stream.read()
uncompressed_data = first_line + remainder
print('解压后的数据长度', len(uncompressed_data))
print('与源数据进行比较', text == uncompressed_data)
codecs_zlib.py

 测试效果

源数据长度 1350
zlib压缩后的数据长度 48
读取第一行 b'abcdefghijklmnopqrstuvwxyz\n'
解压后的数据长度 1350
与源数据进行比较 True

15、增量bz2编码的示例

import codecs
import sys

text = b'abcdefghijklmnopqrstuvwxyz\n'
repetitions = 50

print('文本长度 :', len(text))
print('重复次数 :', repetitions)
print('乘于重复次数的长度:', len(text) * repetitions)

# Feed the text to the incremental encoder one repetition at a time.  A
# call that returns no output prints a dot; a call that returns
# compressed data prints the size of that data.
encoder = codecs.getincrementalencoder('bz2')()
encoded = []
print('编码:')
last = repetitions - 1
for i in range(repetitions):
    # final must be True only on the last chunk of input.
    en_c = encoder.encode(text, final=(i == last))
    if en_c:
        print('\nEncoded: {} bytes'.format(len(en_c)))
        encoded.append(en_c)
    else:
        sys.stdout.write('.')

all_encoded = b''.join(encoded)
print('总的编码长度', len(all_encoded))

# Feed the compressed stream to the incremental decoder one byte at a
# time, printing a dot for every byte that yields no output yet.
print('解码')
decoder = codecs.getincrementaldecoder('bz2')()
decoded = []
for i, b in enumerate(all_encoded):
    # The input being iterated is the compressed stream, so the final
    # flag belongs on its last byte, not on the length of the source text.
    final = (i + 1) == len(all_encoded)
    c = decoder.decode(bytes([b]), final)
    if c:
        print('\nDecoded : {}'.format(len(c)))
        decoded.append(c)
    else:
        sys.stdout.write('.')
restored = b''.join(decoded)
print('\n解压后的总长度', len(restored))
codecs_incremental_bz2.py

测试效果

文本长度 : 27
重复次数 : 50
乘于重复次数的长度: 1350
编码:
.................................................
Encoded: 99 bytes
总的编码长度 99
解码
........................................................................................
Decoded : 1350
..........
解压后的总长度 1350

16、网络通讯示例:通过socket交互的文本数据在发送前编码为字节流、接收后再解码

import socketserver
import socket
import threading
import codecs


class Echo(socketserver.BaseRequestHandler):
    """Request handler that sends back whatever the client sent."""

    def handle(self):
        """Receive up to 1024 bytes from the client and echo them back."""
        data = self.request.recv(1024)
        # sendall() keeps sending until the whole buffer has gone out;
        # a plain send() may transmit only part of it.
        self.request.sendall(data)


class PassThrough:
    """File-like wrapper that logs the data passing through read()/write().

    Every call is forwarded to the wrapped object ``other``.
    """

    def __init__(self, other):
        self.other = other

    def write(self, data):
        """Print the data being written, then forward it to the wrapped object."""
        print('写入', repr(data))
        written = self.other.write(data)
        return written

    def read(self, size=-1):
        """Read from the wrapped object, print what was read and return it."""
        # The label is printed before the read and the data after it, so
        # both end up on one line.
        print('Reading :', end=' ')
        chunk = self.other.read(size)
        print(repr(chunk))
        return chunk

    def flush(self):
        """Forward flush() to the wrapped object."""
        result = self.other.flush()
        return result

    def close(self):
        """Forward close() to the wrapped object."""
        result = self.other.close()
        return result


if __name__ == '__main__':
    # Port 0 lets the OS pick a free port, so the demo cannot fail because
    # a fixed port is already in use; the real address is read back below.
    address = ('localhost', 0)
    server = socketserver.TCPServer(address, Echo)
    ip, port = server.server_address

    # Run the server in a daemon thread so it never blocks interpreter
    # exit.  (Thread.setDaemon() is deprecated in favour of daemon=True.)
    task = threading.Thread(target=server.serve_forever, daemon=True)
    task.start()

    sk = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sk.connect((ip, port))

        # Wrap the socket's file objects: PassThrough logs the raw bytes,
        # and the codecs reader/writer convert between str and UTF-8.
        read_file = sk.makefile('rb')
        incoming = codecs.getreader('utf-8')(PassThrough(read_file))

        write_file = sk.makefile('wb')
        outgoing = codecs.getwriter('utf-8')(PassThrough(write_file))

        # Send the text; flush so the buffered bytes actually go out.
        text = 'français'
        print('Sending :', repr(text))
        outgoing.write(text)
        outgoing.flush()

        # Receive the echoed data.
        response = incoming.read()
        print('Received:', repr(response))
    finally:
        # Clean up even if the exchange failed: close the client socket,
        # stop the serve_forever() loop and release the listening socket.
        sk.close()
        server.shutdown()
        server.server_close()
codecs_socket.py

测试效果

Sending : 'français'
写入 b'fran\xc3\xa7ais'
Reading : b'fran\xc3\xa7ais'
Reading : b''
Received: 'français'

17、大写转小写,小写转大写的示例

import string


# string.ascii_lowercase : abcdefghijklmnopqrstuvwxyz
# string.ascii_uppercase : ABCDEFGHIJKLMNOPQRSTUVWXYZ
def invertcaps(text):
    """Return ``text`` with ASCII lowercase and uppercase letters swapped.

    Characters outside the two ASCII letter sets are left unchanged.
    """
    flipped = []
    for ch in text:
        if ch in string.ascii_lowercase:
            flipped.append(ch.upper())
        elif ch in string.ascii_uppercase:
            flipped.append(ch.lower())
        else:
            flipped.append(ch)
    return ''.join(flipped)


if __name__ == '__main__':
    # Invert the case of two sample strings.
    for sample in ('ABCdef', 'abcDEF'):
        print(invertcaps(sample))
codecs_invertcaps.py

测试效果

abcDEF
ABCdef

18、自定义映射表,大写转小写,小写转大写的示例

import string
import codecs

# Start from a table that maps each value in range(256) to itself.
decoding_map = codecs.make_identity_dict(range(256))

# (lowercase ordinal, uppercase ordinal) for every ASCII letter.
pairs = [
    (ord(lower_char), ord(upper_char))
    for lower_char, upper_char in zip(string.ascii_lowercase,
                                      string.ascii_uppercase)
]

# Make every letter map to its opposite case, in both directions.
decoding_map.update((upper, lower) for lower, upper in pairs)
decoding_map.update((lower, upper) for lower, upper in pairs)

# Build a separate encoding map from the decoding map.
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    encoded_result = codecs.charmap_encode('abcDEF', 'strict',
                                           encoding_map)
    print(encoded_result)
    decoded_result = codecs.charmap_decode(b'abcDEF', 'strict',
                                           decoding_map)
    print(decoded_result)
    print(encoding_map == decoding_map)
codecs_invertcaps_charmap.py

测试效果

(b'ABCdef', 6)
('ABCdef', 6)
True

19、自定义映射表,大写转小写,小写转大写的错误的处理

import codecs
import string

# Identity table: each value in range(256) maps to itself.
decoding_map = codecs.make_identity_dict(range(256))

# Ordinal pairs (lowercase, uppercase) for the ASCII letters.
pairs = [
    (ord(lower_char), ord(upper_char))
    for lower_char, upper_char in zip(string.ascii_lowercase,
                                      string.ascii_uppercase)
]

# Rewrite the letter entries so upper maps to lower and lower to upper.
decoding_map.update(
    (source, target)
    for lower, upper in pairs
    for source, target in ((upper, lower), (lower, upper))
)

# Derive a separate encoding map from the decoding map.
encoding_map = codecs.make_encoding_map(decoding_map)

if __name__ == '__main__':
    text = 'pi: \u03c0'

    # Encode the same text under each error-handling mode; U+03C0 has no
    # entry in the map, so the mode decides the outcome.
    for error in ('ignore', 'replace', 'strict'):
        try:
            encoded = codecs.charmap_encode(text, error, encoding_map)
        except UnicodeEncodeError as err:
            encoded = str(err)
        print('{:7}: {}'.format(error, encoded))
codecs_invertcaps_error.py

测试效果

ignore : (b'PI: ', 5)
replace: (b'PI: ?', 5)
strict : 'charmap' codec can't encode character '\u03c0' in position 4: character maps to <undefined>

20、自定义搜索函数,用于搜索模块支持编码的格式

import codecs
import encodings


def search1(encoding):
    """Codec search function that only logs the lookup and finds nothing."""
    print('search1:', 'Searching for:', encoding)


def search2(encoding):
    """Codec search function that only logs the lookup and finds nothing."""
    print('search2:', 'Searching for:', encoding)


# Register both search functions with the codec registry.
for search_function in (search1, search2):
    codecs.register(search_function)

utf8 = codecs.lookup('utf-8')
print('UTF-8:', utf8)

# Looking up an unknown name ends in LookupError.
try:
    unknown = codecs.lookup('no-such-encoding')
except LookupError as err:
    print('ERROR:', err)
codecs_register.py

测试效果

UTF-8: <codecs.CodecInfo object for encoding utf-8 at 0x2189e635fa8>
search1: Searching for: no-such-encoding
search2: Searching for: no-such-encoding
ERROR: unknown encoding: no-such-encoding

21、利用codecs模块扩展自定义编码功能的示例

import codecs
import string

# Identity table: each value in range(256) maps to itself.
decoding_map = codecs.make_identity_dict(range(256))

# (lowercase ordinal, uppercase ordinal) for every ASCII letter.
pairs = [
    (ord(lower_char), ord(upper_char))
    for lower_char, upper_char in zip(string.ascii_lowercase,
                                      string.ascii_uppercase)
]

# Update the table so each letter maps to its opposite case.
decoding_map.update((upper, lower) for lower, upper in pairs)
decoding_map.update((lower, upper) for lower, upper in pairs)

# Build a separate encoding map from the decoding map.
encoding_map = codecs.make_encoding_map(decoding_map)


class InvertCapsCodec(codecs.Codec):
    """Stateless encoder/decoder built on the module-level charmap tables."""

    def encode(self, input, errors='strict'):
        """Charmap-encode ``input`` with ``encoding_map``."""
        result = codecs.charmap_encode(input, errors, encoding_map)
        return result

    def decode(self, input, errors='strict'):
        """Charmap-decode ``input`` with ``decoding_map``."""
        result = codecs.charmap_decode(input, errors, decoding_map)
        return result


class InvertCapsIncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that charmap-encodes each chunk independently."""

    def encode(self, input, final=False):
        """Return the encoded form of ``input``; ``final`` is not used."""
        # charmap_encode returns (data, length consumed); only the data
        # is handed back to the caller.
        encoded_chunk, _consumed = codecs.charmap_encode(
            input, self.errors, encoding_map)
        return encoded_chunk


class InvertCapsIncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that charmap-decodes each chunk independently."""

    def decode(self, input, final=False):
        """Return the decoded form of ``input``; ``final`` is not used."""
        # charmap_decode returns (data, length consumed); only the data
        # is handed back to the caller.
        decoded_chunk, _consumed = codecs.charmap_decode(
            input, self.errors, decoding_map)
        return decoded_chunk


class InvertCapsStreamReader(InvertCapsCodec, codecs.StreamReader):
    """Stream reader that takes encode/decode from InvertCapsCodec."""


class InvertCapsStreamWriter(InvertCapsCodec, codecs.StreamWriter):
    """Stream writer that takes encode/decode from InvertCapsCodec."""


def find_invertcaps(encoding):
    """Codec search function for the 'invertcaps' encoding.

    Returns:
        A ``codecs.CodecInfo`` describing the codec when ``encoding`` is
        ``'invertcaps'``, otherwise ``None``.
    """
    # Any other name is not ours.
    if encoding != 'invertcaps':
        return None

    codec_info = codecs.CodecInfo(
        name='invertcaps',
        encode=InvertCapsCodec().encode,
        decode=InvertCapsCodec().decode,
        incrementalencoder=InvertCapsIncrementalEncoder,
        incrementaldecoder=InvertCapsIncrementalDecoder,
        streamreader=InvertCapsStreamReader,
        streamwriter=InvertCapsStreamWriter,
    )
    return codec_info


# Register the search function so that 'invertcaps' can be looked up.
codecs.register(find_invertcaps)

if __name__ == '__main__':

    # Stateless encoder function
    text = 'abcDEF'
    encoder = codecs.getencoder('invertcaps')
    encoded_text, consumed = encoder(text)
    print('Encoded "{}" to "{}", consuming {} characters'.format(
        text, encoded_text, consumed))

    # Stream writer
    import io

    buffer = io.BytesIO()
    make_writer = codecs.getwriter('invertcaps')
    writer = make_writer(buffer)
    print('StreamWriter for io buffer: ')
    print('  writing "abcDEF"')
    writer.write('abcDEF')
    print('  buffer contents: ', buffer.getvalue())

    # Incremental decoder: feed the encoded bytes one at a time, then
    # finish with an empty chunk flagged as final.
    decoder_factory = codecs.getincrementaldecoder('invertcaps')
    decoder = decoder_factory()
    decoded_text_parts = [
        decoder.decode(bytes([c]), final=False)
        for c in encoded_text
    ]
    decoded_text_parts.append(decoder.decode(b'', final=True))
    decoded_text = ''.join(decoded_text_parts)
    print('IncrementalDecoder converted {!r} to {!r}'.format(
        encoded_text, decoded_text))
codecs_invertcaps_register.py

测试效果

Encoded "abcDEF" to "b'ABCdef'", consuming 6 characters
StreamWriter for io buffer: 
  writing "abcDEF"
  buffer contents:  b'ABCdef'
IncrementalDecoder converted b'ABCdef' to 'abcDEF'

 

posted @ 2020-05-09 12:49  小粉优化大师  阅读(2049)  评论(0编辑  收藏  举报