Common Modules in Python 3
Outline >>
- time & datetime module
- random module
- os module
- sys module
- shelve module
- shutil module
- xml module
- configparser module
- hashlib & hmac modules
- zipfile & tarfile modules
- PyYAML module
- re regular expressions
time & datetime module
```python
#!/usr/bin/env python
import time, datetime

"""
Common standard library: time

Three representations of time:
1. Timestamp: seconds elapsed since 1970-01-01 00:00:00 (the epoch);
2. Formatted time string;
3. Tuple (struct_time): a struct_time tuple has 9 elements.

Format directives:
%a  abbreviated weekday name (locale-dependent)
%A  full weekday name
%b  abbreviated month name
%B  full month name
%c  locale-appropriate date and time representation
%d  day of the month (01 - 31)
%H  hour, 24-hour clock (00 - 23)
%I  hour, 12-hour clock (01 - 12)
%j  day of the year (001 - 366)
%m  month (01 - 12)
%M  minute (00 - 59)
%p  locale equivalent of AM/PM
%S  second (00 - 61, allowing for leap seconds)
%U  week number of the year (00 - 53), Sunday starts the week; all days
    before the first Sunday fall in week 0
%w  weekday as a number (0 - 6, 0 is Sunday)
%W  same as %U, except the week starts on Monday
%x  locale-appropriate date representation
%X  locale-appropriate time representation
%y  year without century (00 - 99)
%Y  full year
%Z  time zone name (empty string if none)
%%  a literal '%' character
"""

# print(help(time))
# print(help(time.ctime))  # look up the usage of a specific function

# Current time as a timestamp
print(time.time())

# CPU time; time.clock() was removed in Python 3.8, use time.process_time()
print(time.process_time())

# Sleep for the given number of seconds
# time.sleep(1)

# Tuple-format time in UTC: time.gmtime(x), x is a timestamp
print(time.gmtime())

# Tuple-format time in local time (UTC+8 here), the one we usually want:
# time.localtime(x), x is a timestamp
print(time.localtime())

x = time.localtime()
print("x:", x)

# Format a tuple-format time into a custom string:
# time.strftime(str_format, x), str_format: format, x: tuple time
print(time.strftime("%Y-%m-%d %H:%M:%S", x))

# Format a timestamp as a string like: Tue Jun 16 11:53:31 2009
print(time.ctime(1245124411))

# Access individual fields of a struct_time: year / month / day ...
print(x.tm_year, x.tm_mon, x.tm_mday, x.tm_hour, x.tm_min, x.tm_sec)

# Convert a tuple-format time to a timestamp
print(time.mktime(x))

# Convert a UTC timestamp into struct_time format
print(time.gmtime(time.time() - 86640))

# Convert a UTC struct_time into the given string format
print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

"""
datetime module:
"""
print("date arithmetic with datetime".center(50, "~"))

# Returns e.g. 2018-01-20 23:20:49.418354
print(datetime.datetime.now())

# Convert a timestamp directly into a date, e.g. 2018-01-20
print(datetime.date.fromtimestamp(time.time()))

# Current time + 3 days
print(datetime.datetime.now() + datetime.timedelta(3))
# Current time - 3 days
print(datetime.datetime.now() + datetime.timedelta(-3))
# Current time + 3 hours
print(datetime.datetime.now() + datetime.timedelta(hours=3))
# Current time + 30 minutes
print(datetime.datetime.now() + datetime.timedelta(minutes=30))

c_time = datetime.datetime.now()
# Replace individual fields of a datetime
print(c_time.replace(minute=54, hour=5))
```
Diagram of conversions between the time representations:
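The diagram boils down to the three representations above (timestamp, struct_time, formatted string) and the functions that map between them; a minimal sketch of the full round trip, using only standard `time` functions:

```python
import time

ts = time.time()                              # timestamp (float seconds)
st = time.localtime(ts)                       # timestamp    -> struct_time
s = time.strftime("%Y-%m-%d %H:%M:%S", st)    # struct_time  -> string
st2 = time.strptime(s, "%Y-%m-%d %H:%M:%S")   # string       -> struct_time
ts2 = time.mktime(st2)                        # struct_time  -> timestamp
print(ts, s, ts2)
```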
random module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import random

"""
random module:
"""

# Generate a random float n in the range 0 <= n < 1.0
print(random.random())

# random.randint(a, b): generate an integer in the given range, where a is
# the lower bound and b the upper bound: a <= n <= b
print(random.randint(1, 10))

# random.randrange([start], stop[, step]):
# pick a random number from the set that counts up from start by step.
# E.g. random.randrange(10, 100, 2) effectively picks from the sequence
# [10, 12, 14, 16, ... 96, 98]; it is equivalent in outcome to
# random.choice(range(10, 100, 2)).
print(random.randrange(1, 10))
print(random.choice(range(10, 100, 2)))

# random.choice(sequence): pick one random element from a sequence.
# "Sequence" is not a specific type in Python but a family of types:
# list, tuple and str are all sequences.
print(random.choice("abcdef"))
print(random.choice("学习Python的小伙伴"))                      # e.g. 伙
print(random.choice(["JGood", "is", "a", "handsome", "boy"]))  # e.g. boy -- list
print(random.choice(("Tuple", "List", "Dict")))                # e.g. Tuple

# random.sample(sequence, k): pick a slice of the given length at random;
# sample() does not modify the original sequence.
print(random.sample([1, 2, 3, 4, 5, 6, 7, 8, 9], 5))  # [2, 1, 9, 5, 7]

# Random integer:
print(random.randint(0, 99))  # 70

# Random even number between 0 and 100:
print(random.randrange(0, 101, 2))  # 4

# Random floats:
print(random.random())        # 0.2746445568079129
print(random.uniform(1, 10))  # 9.887001463194844

# Random character:
print(random.choice('abcdefg&#%^*f'))  # e

# Pick a given number of characters:
print(random.sample('abcdefghij123', 3))  # ['3', 'j', 'i']

# Random string from a list:
print(random.choice(['apple', 'pear', 'peach', 'orange', 'lemon']))  # peach

# Shuffle
items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
print(items)           # [1, 2, 3, 4, 5, 6, 7, 8, 9]
random.shuffle(items)
print(items)           # [8, 3, 6, 1, 4, 9, 5, 7, 2]

"""
Generate a verification code: a random code of the given length made up of
digits, uppercase letters and lowercase letters.
"""
def produce_check_code(scope=6):
    check_code = ""
    for i in range(scope):
        tmp = random.randint(0, 10)
        if tmp < 6:
            tmp = random.randint(0, 9)          # digit
        elif tmp > 8:
            tmp = chr(random.randint(65, 90))   # uppercase letter
        else:
            tmp = chr(random.randint(97, 122))  # lowercase letter
        check_code += str(tmp)
    return check_code

print(produce_check_code(8))
```
```text
0.21786963196954112
3
2
34
b
的
JGood
List
[7, 2, 6, 4, 8]
12
14
0.5355914470942843
3.3065568721321013
%
['2', 'g', 'f']
pear
[1, 2, 3, 4, 5, 6, 7, 8, 9]
[6, 7, 5, 9, 1, 2, 3, 4, 8]
D626EbYt
```
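A more compact variant of the same verification-code idea, a minimal sketch assuming Python 3.6+ for `random.choices`, with the character pool built from the standard `string` constants:

```python
import random
import string

def produce_check_code(scope=6):
    # Pool of digits + uppercase + lowercase letters
    pool = string.digits + string.ascii_letters
    # choices() samples with replacement, so characters may repeat
    return ''.join(random.choices(pool, k=scope))

print(produce_check_code(8))  # e.g. 'kD3t9QxZ'
```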
os module
Provides an interface for making calls to the operating system:
```python
# Author: Allister.Liu
#!/usr/bin/env python
import os

"""
os module:
"""
path = "E:/logo/ic2c/logo.png"

# Get the current working directory, i.e. the directory the Python script
# runs in === linux: pwd
print(os.getcwd())

# Change the current working directory; like cd in a shell
# os.chdir("dirname")

# The current directory: ('.')
print(os.curdir)

# The parent of the current directory as a string: ('..')
print(os.pardir)

# Create a multi-level (recursive) directory tree
# os.makedirs('dirname1/dirname2')

# If the directory is empty, delete it and recurse up to the parent; if that
# is empty too, delete it as well, and so on
# os.removedirs('dirname1')

# Create a single directory; like mkdir dirname in a shell
# os.mkdir('dirname')

# Delete a single empty directory; raises an error if it is not empty; like
# rmdir dirname in a shell
# os.rmdir('dirname')

# List all files and subdirectories in the given directory, including hidden
# files, as a list
print(os.listdir('E:/logo'))

# Delete a file
# os.remove()

# Rename a file/directory
# os.rename("oldname", "newname")

# Get file/directory information
# os.stat('path/filename')

# The OS-specific path separator: "\\" on Windows, "/" on Linux
os.sep
# The line terminator of the current platform: "\r\n" on Windows, "\n" on Linux
os.linesep
# The separator used in path lists, e.g. in the PATH environment variable
os.pathsep
# String naming the current platform: 'nt' on Windows, 'posix' on Linux
os.name

# Run a shell command and show its output directly
os.system("dir")

# Get the system environment variables
print(os.environ)

# Return the normalized absolute version of path
print(os.path.abspath(path))

# Split path into a (directory, filename) two-tuple
print(os.path.split(path))

# Return the directory part of path, i.e. the first element of
# os.path.split(path)
print(os.path.dirname(path))

# Return the final file name of path. If path ends with / or \, this returns
# an empty string; i.e. the second element of os.path.split(path)
print(os.path.basename(path))

# Return True if path exists, False otherwise
print(os.path.exists(path))

# Return True if path is an absolute path
print(os.path.isabs(path))

# Return True if path is an existing file, False otherwise
print(os.path.isfile(path))

# Return True if path is an existing directory, False otherwise
print(os.path.isdir(path))

# Join multiple path components; parameters before the last absolute path
# are ignored
# os.path.join(path1[, path2[, ...]])

# Return the last access time of the file or directory path points to
print(os.path.getatime(path))

# Return the last modification time of the file or directory path points to
print(os.path.getmtime(path))
```
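The functions above inspect a single path; for walking a whole directory tree, `os.walk` is the usual tool. A minimal sketch, reusing the example `E:/logo` directory from above:

```python
import os

# Walk the tree top-down; each step yields the current directory, its
# subdirectory names and its file names
for dirpath, dirnames, filenames in os.walk("E:/logo"):
    for filename in filenames:
        print(os.path.join(dirpath, filename))
```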
sys module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import sys

print(help(sys))

# Command-line arguments as a list; the first element is the path of the
# program itself
sys.argv

# Exit the program; exit(0) is a normal exit
# sys.exit(0)

# Version information of the Python interpreter
print(sys.version)

# The largest int value
print(sys.maxsize)

# The module search path, initialized from the PYTHONPATH environment variable
print(sys.path)

# The name of the operating-system platform
print(sys.platform)

# Write without a trailing newline (useful for progress bars)
sys.stdout.write('please:')
val = sys.stdin.readline()[:-1]
print(val)
```
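Because `sys.stdout.write` does not append a newline, repeated writes with a carriage return can redraw a single line in place, which is how simple progress bars work. A minimal sketch (the `\r` and `flush()` are the essential parts):

```python
import sys
import time

for i in range(101):
    # '\r' moves the cursor back to the start of the line, so each write
    # overdraws the previous one
    sys.stdout.write("\r[%-50s] %d%%" % ('#' * (i // 2), i))
    sys.stdout.flush()
    time.sleep(0.05)
sys.stdout.write("\n")
```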
shelve module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import shelve
import os, datetime

"""
shelve module: a simple key/value module that persists in-memory data to a
file; it can persist any Python data format that pickle supports.
"""

file_path = "datas"
# Create the directory if it does not exist
if not os.path.exists(file_path):
    os.mkdir(file_path)

# Open a shelf file
d = shelve.open(file_path + "/shelve_file.data")


class Test(object):
    def __init__(self, n):
        self.n = n


t1 = Test(123)
t2 = Test(123334)

names = ["Allister", "Linde", "Heddy", "Daty"]

# Persist a list under the key "names"
d["names"] = names
# Persist class instances
d["t1"] = t1
d["t2"] = t2
d["date"] = datetime.datetime.now()

"""
Read the file contents back
"""
# Get a value by key
print(d.get("names"))
print(d.get("t1"))
print(d.get("date"))
print(d.items())

d.close()
```
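One shelve caveat: mutating a stored object in place is not persisted by default. A minimal sketch of the two common workarounds, assuming the shelf file created above:

```python
import shelve

# Option 1: re-assign the key after mutating a copy
with shelve.open("datas/shelve_file.data") as d:
    names = d["names"]
    names.append("Eve")
    d["names"] = names          # the write happens on assignment

# Option 2: open with writeback=True so cached entries are written back
# automatically on sync()/close()
with shelve.open("datas/shelve_file.data", writeback=True) as d:
    d["names"].append("Frank")  # the in-place mutation is now persisted
```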
shutil module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import shutil

"""
shutil module:

shutil.copyfileobj(fsrc, fdst[, length]): copy the contents of one file
    object into another; can copy just part of the content;
shutil.copyfile(src, dst): copy a file;
shutil.copymode(src, dst): copy only the permission bits; content, group and
    owner stay unchanged;
shutil.copystat(src, dst): copy status information: mode bits, atime, mtime,
    flags;
shutil.copy(src, dst): copy the file and its permissions;
shutil.copy2(src, dst): copy the file plus its status information,
    permissions, etc.;
shutil.rmtree(path[, ignore_errors[, onerror]]): recursively delete a
    directory tree;
shutil.move(src, dst): recursively move a file or directory;
shutil.copytree(src, dst, symlinks=False, ignore=None): recursively copy a
    directory tree;
shutil.make_archive(base_name, format, ...): create an archive, e.g. zip or
    tar, and return its path;
    base_name: the archive file name, or a path to it. A bare name saves to
        the current directory, a path saves to that location,
        e.g. ic2c                 => saved to the current directory;
        e.g. /Users/Allister/ic2c => saved to /Users/Allister/;
    format: the archive type: "zip", "tar", "bztar", "gztar";
    root_dir: the directory to archive (defaults to the current directory);
    owner: the user, defaults to the current user;
    group: the group, defaults to the current group;
    logger: used for logging, usually a logging.Logger object;
"""

"""
Copy "笔记.data" to "笔记1.data"
"""
with open("笔记.data", "r", encoding="utf-8") as f1:
    with open("笔记1.data", "w", encoding="utf-8") as f2:
        shutil.copyfileobj(f1, f2)

# copyfile opens and copies the files itself; no need to open them first
# shutil.copyfile("笔记.data", "笔记2.data")

# Recursively copy every file under a directory
# shutil.copytree("../day4", "../day5/copys")

# Delete the directory tree copied above
# shutil.rmtree("copys")

# Create an archive and return its path
# print(shutil.make_archive("H:/wx/432", "zip", root_dir="H:/PycharmProjects/python_tutorial/"))
```
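shutil can also unpack what `make_archive` produces; a minimal round-trip sketch, with hypothetical file and directory names:

```python
import shutil

# Pack ./project into project_backup.zip in the current directory
archive = shutil.make_archive("project_backup", "zip", root_dir="project")
print(archive)

# Unpack it again; the format is inferred from the file extension
shutil.unpack_archive(archive, extract_dir="restored_project")
```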
xml module
Sample document (datas/xml_test.xml):

```xml
<data>
    <country name="Liechtenstein">
        <rank updated="yes">2</rank>
        <year updated="yes">2009</year>
        <gdppc>141100</gdppc>
        <neighbor direction="E" name="Austria" />
        <neighbor direction="W" name="Switzerland" />
    </country>
    <country name="Singapore">
        <rank updated="yes">5</rank>
        <year updated="yes">2012</year>
        <gdppc>59900</gdppc>
        <neighbor direction="N" name="Malaysia" />
    </country>
    <country name="Panama">
        <rank updated="yes">69</rank>
        <year updated="yes">2012</year>
        <gdppc>13600</gdppc>
        <neighbor direction="W" name="Costa Rica" />
        <neighbor direction="E" name="Colombia" />
    </country>
</data>
```
```python
# Author: Allister.Liu
#!/usr/bin/env python
import xml.etree.ElementTree as ET

"""
xml processing module: XML is a protocol for exchanging data between
different languages or programs, much like JSON, though JSON is simpler to
use. Back in the dark ages before JSON existed, XML was the only choice, and
to this day the interfaces of many traditional companies, e.g. in the
financial industry, are still mainly XML.
"""

# The XML protocol is supported in every language; in Python it can be
# handled with the following module
tree = ET.parse("datas/xml_test.xml")
root = tree.getroot()
print("root node:", root.tag)

# print("walk the xml document".center(50, "~"))
# # Walk the whole xml document
# for child in root:
#     print(child.tag, child.attrib)
#     for i in child:
#         print(i.tag, i.text)
#
# print("year nodes".center(50, "~"))
# # Iterate over the year nodes only
# for node in root.iter('year'):
#     print(node.tag, node.text)

"""
Modify and delete xml document content
"""
# Modify
for node in root.iter('year'):
    new_year = int(node.text) + 1
    node.text = str(new_year)
    node.set("updated", "yes")

tree.write("datas/xmltest.xml")

# Delete nodes
for country in root.findall('country'):
    rank = int(country.find('rank').text)
    if rank > 50:
        root.remove(country)

tree.write('datas/output.xml')

"""
Create an xml document
"""
new_xml = ET.Element("namelist")
name = ET.SubElement(new_xml, "name", attrib={"enrolled": "yes"})
age = ET.SubElement(name, "age", attrib={"checked": "no"})
sex = ET.SubElement(name, "sex")
sex.text = '33'

name2 = ET.SubElement(new_xml, "name", attrib={"enrolled": "no"})
age = ET.SubElement(name2, "age")
age.text = '19'

et = ET.ElementTree(new_xml)  # build a document object
et.write("datas/test.xml", encoding="utf-8", xml_declaration=True)

ET.dump(new_xml)  # print the generated document
```
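ElementTree also understands a small XPath subset, which is often tidier than nested loops for targeted lookups. A minimal sketch against the sample document above:

```python
import xml.etree.ElementTree as ET

tree = ET.parse("datas/xml_test.xml")
root = tree.getroot()

# All neighbors anywhere in the tree whose direction attribute is "E"
for neighbor in root.findall(".//neighbor[@direction='E']"):
    print(neighbor.get("name"))

# The rank of a specific country, selected by attribute
rank = root.find("country[@name='Singapore']/rank")
print(rank.text)
```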
configparser module
- Generating a config file:
```python
# Author: Allister.Liu
#!/usr/bin/env python
import configparser

"""
A mysql-style configuration file:
"""
config = configparser.ConfigParser()

# First way to assign a section
config["client"] = {'port': '3306', 'default-character-set': 'utf8'}

# Second way
config['mysqld'] = {}
config['mysqld']['port'] = '3306'
config['mysqld']['character_set_server'] = 'utf8'
config['mysqld']['collation-server'] = 'utf8_general_ci'
config['mysqld']['lower_case_table_names'] = '1'
config['mysqld']['max_connections'] = '200'

# Third way
config['mysqld_safe'] = {}
topsecret = config['mysqld_safe']
topsecret['log-error'] = '/usr/local/mysql/error.log'

config['mysqld']['datadir'] = '/usr/local/mysql/data'

with open('datas/my.ini', 'w') as configfile:
    config.write(configfile)
```
- Reading a config file:
```python
# Author: Allister.Liu
#!/usr/bin/env python
import configparser

"""
Reading with configparser:
"""
config = configparser.ConfigParser()

# Read the file; returns the list of paths successfully parsed
config.read('datas/my.ini')

# The section names in the file
print(config.sections())  # ['client', 'mysqld', 'mysqld_safe', 'logs']

# Check whether a section exists in the file
print("mysqld" in config)  # True

# Get a value under a section
print(config["mysqld"]["port"])            # 3306
print(config["mysqld_safe"]["log-error"])  # /usr/local/mysql/error.log

topsecret = config["mysqld_safe"]
print(topsecret["log-error"])              # /usr/local/mysql/error.log

print("iterate over the config file".center(50, "~"))
for key in config["mysqld"]:
    print(key)

# The options of a section as a list of tuples
# [('port', '3306'), ('character_set_server', 'utf8'), ('collation-server', 'utf8_general_ci'), ('lower_case_table_names', '1'), ('max_connections', '200'), ('datadir', '/usr/local/mysql/data')]
print(config.items("mysqld"))

print(" modify ".center(50, "#"))

# Delete the mysqld section and rewrite the file
# sec = config.remove_section('mysqld')  # the section to delete
# config.write(open('datas/my.ini', "w"))
#
# # Check whether a section exists
# sec = config.has_section('mysqld')
# print(sec)
#
# # Add a section; raises an error if it already exists
# sec = config.add_section('logs')
# config.write(open('datas/my.ini', "w"))

# Add log_path under the logs section
config.set('logs', 'log_path', "/usr/logs")
config.write(open('datas/my.ini', "w"))
```
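Option values always come back as strings; configparser's typed getters handle the conversion and allow fallbacks for missing keys. A minimal sketch reusing datas/my.ini from above (the `wait_timeout` key is hypothetical):

```python
import configparser

config = configparser.ConfigParser()
config.read('datas/my.ini')

# Typed getters parse the raw string values
port = config.getint('mysqld', 'port')                         # 3306 as an int
lower = config.getboolean('mysqld', 'lower_case_table_names')  # '1' -> True

# fallback avoids a NoOptionError when the key is absent
timeout = config.getint('mysqld', 'wait_timeout', fallback=28800)

print(port, lower, timeout)
```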
hashlib & hmac modules
```python
# Author: Allister.Liu
#!/usr/bin/env python
import hashlib

"""
hashlib module: used for hash/digest operations. In 3.x it replaces the old
md5 and sha modules, and mainly provides the SHA1, SHA224, SHA256, SHA384,
SHA512 and MD5 algorithms.
"""
m1 = hashlib.md5()
m1.update("asdfghjkl".encode("utf-8"))

# Binary digest
print(m1.digest())
# Hex digest
print(m1.hexdigest())

# ######## md5 ########
print(" md5 ".center(50, "#"))
hash = hashlib.md5()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha1 ########
print(" sha1 ".center(50, "#"))
hash = hashlib.sha1()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha256 ########
print(" sha256 ".center(50, "#"))
hash = hashlib.sha256()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha384 ########
print(" sha384 ".center(50, "#"))
hash = hashlib.sha384()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha512 ########
print(" sha512 ".center(50, "#"))
hash = hashlib.sha512()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

"""
Python also has an hmac module, which internally combines the key and the
message before hashing.

A hashed message authentication code (HMAC) is an authentication mechanism
based on a message authentication code (MAC). With HMAC, the two parties of
a conversation verify a message's authenticity via a shared secret key K.

It is commonly used to protect messages in network communication: both
sides first agree on a key, like a secret handshake; the sender hashes the
message with the key, and the receiver hashes the received plaintext with
the same key and compares the result against the sender's value. A match
verifies both the integrity of the message and the legitimacy of the sender.
"""
import hmac

# digestmod is required from Python 3.8 onwards
h = hmac.new('中华好儿女'.encode("utf-8"), '美丽的山河'.encode("utf-8"),
             digestmod='md5')
print(h.hexdigest())
```
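For storing passwords, a single unsalted digest like the MD5/SHA examples above is easy to brute-force; hashlib provides a salted, iterated key-derivation function for that use case. A minimal sketch with `hashlib.pbkdf2_hmac`:

```python
import hashlib
import os

password = "admin".encode("utf-8")
salt = os.urandom(16)  # a fresh random salt per password

# 100000 rounds of HMAC-SHA256; store the salt together with the derived key
dk = hashlib.pbkdf2_hmac('sha256', password, salt, 100000)
print(salt.hex(), dk.hex())
```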
zipfile & tarfile modules
```python
# Author: Allister.Liu
#!/usr/bin/env python

"""
zip compress / decompress
"""
import zipfile

# Compress
z = zipfile.ZipFile('Allister.zip', 'w')
z.write('笔记.data')
z.write('sys_test.py')
z.close()

# Decompress
z = zipfile.ZipFile('Allister.zip', 'r')
z.extractall()
z.close()

"""
tar pack / unpack
"""
import tarfile

# Pack
tar = tarfile.open('your.tar', 'w')
tar.add('/home/dsa.tools/mysql.zip', arcname='mysql.zip')
tar.add('/Users/wupeiqi/PycharmProjects/cmdb.zip', arcname='cmdb.zip')
tar.close()

# Unpack
tar = tarfile.open('your.tar', 'r')
tar.extractall()  # an extraction path can be given
tar.close()
```
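Note that `ZipFile(..., 'w')` stores members uncompressed (ZIP_STORED) by default, and a bare `.tar` is uncompressed too. A sketch of the compressed variants with hypothetical file names, using context managers so the archives close automatically:

```python
import zipfile
import tarfile

# DEFLATE-compressed zip (requires the zlib module)
with zipfile.ZipFile('notes.zip', 'w', compression=zipfile.ZIP_DEFLATED) as z:
    z.write('notes.txt')

# gzip-compressed tar: the 'w:gz' mode selects the compression
with tarfile.open('notes.tar.gz', 'w:gz') as tar:
    tar.add('notes.txt')

# Reading back; 'r:*' auto-detects the compression
with tarfile.open('notes.tar.gz', 'r:*') as tar:
    tar.extractall()
```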
a. zipfile
1 """ 2 Read and write ZIP files. 3 4 XXX references to utf-8 need further investigation. 5 """ 6 import io 7 import os 8 import re 9 import importlib.util 10 import sys 11 import time 12 import stat 13 import shutil 14 import struct 15 import binascii 16 17 try: 18 import threading 19 except ImportError: 20 import dummy_threading as threading 21 22 try: 23 import zlib # We may need its compression method 24 crc32 = zlib.crc32 25 except ImportError: 26 zlib = None 27 crc32 = binascii.crc32 28 29 try: 30 import bz2 # We may need its compression method 31 except ImportError: 32 bz2 = None 33 34 try: 35 import lzma # We may need its compression method 36 except ImportError: 37 lzma = None 38 39 __all__ = ["BadZipFile", "BadZipfile", "error", 40 "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA", 41 "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"] 42 43 class BadZipFile(Exception): 44 pass 45 46 47 class LargeZipFile(Exception): 48 """ 49 Raised when writing a zipfile, the zipfile requires ZIP64 extensions 50 and those extensions are disabled. 51 """ 52 53 error = BadZipfile = BadZipFile # Pre-3.2 compatibility names 54 55 56 ZIP64_LIMIT = (1 << 31) - 1 57 ZIP_FILECOUNT_LIMIT = (1 << 16) - 1 58 ZIP_MAX_COMMENT = (1 << 16) - 1 59 60 # constants for Zip file compression methods 61 ZIP_STORED = 0 62 ZIP_DEFLATED = 8 63 ZIP_BZIP2 = 12 64 ZIP_LZMA = 14 65 # Other ZIP compression methods not supported 66 67 DEFAULT_VERSION = 20 68 ZIP64_VERSION = 45 69 BZIP2_VERSION = 46 70 LZMA_VERSION = 63 71 # we recognize (but not necessarily support) all features up to that version 72 MAX_EXTRACT_VERSION = 63 73 74 # Below are some formats and associated data for reading/writing headers using 75 # the struct module. The names and structures of headers/records are those used 76 # in the PKWARE description of the ZIP file format: 77 # http://www.pkware.com/documents/casestudies/APPNOTE.TXT 78 # (URL valid as of January 2008) 79 80 # The "end of central directory" structure, magic number, size, and indices 81 # (section V.I in the format document) 82 structEndArchive = b"<4s4H2LH" 83 stringEndArchive = b"PK\005\006" 84 sizeEndCentDir = struct.calcsize(structEndArchive) 85 86 _ECD_SIGNATURE = 0 87 _ECD_DISK_NUMBER = 1 88 _ECD_DISK_START = 2 89 _ECD_ENTRIES_THIS_DISK = 3 90 _ECD_ENTRIES_TOTAL = 4 91 _ECD_SIZE = 5 92 _ECD_OFFSET = 6 93 _ECD_COMMENT_SIZE = 7 94 # These last two indices are not part of the structure as defined in the 95 # spec, but they are used internally by this module as a convenience 96 _ECD_COMMENT = 8 97 _ECD_LOCATION = 9 98 99 # The "central directory" structure, magic number, size, and indices 100 # of entries in the structure (section V.F in the format document) 101 structCentralDir = "<4s4B4HL2L5H2L" 102 stringCentralDir = b"PK\001\002" 103 sizeCentralDir = struct.calcsize(structCentralDir) 104 105 # indexes of entries in the central directory structure 106 _CD_SIGNATURE = 0 107 _CD_CREATE_VERSION = 1 108 _CD_CREATE_SYSTEM = 2 109 _CD_EXTRACT_VERSION = 3 110 _CD_EXTRACT_SYSTEM = 4 111 _CD_FLAG_BITS = 5 112 _CD_COMPRESS_TYPE = 6 113 _CD_TIME = 7 114 _CD_DATE = 8 115 _CD_CRC = 9 116 _CD_COMPRESSED_SIZE = 10 117 _CD_UNCOMPRESSED_SIZE = 11 118 _CD_FILENAME_LENGTH = 12 119 _CD_EXTRA_FIELD_LENGTH = 13 120 _CD_COMMENT_LENGTH = 14 121 _CD_DISK_NUMBER_START = 15 122 _CD_INTERNAL_FILE_ATTRIBUTES = 16 123 _CD_EXTERNAL_FILE_ATTRIBUTES = 17 124 _CD_LOCAL_HEADER_OFFSET = 18 125 126 # The "local file header" structure, magic number, size, and indices 127 # (section V.A in the format 
document) 128 structFileHeader = "<4s2B4HL2L2H" 129 stringFileHeader = b"PK\003\004" 130 sizeFileHeader = struct.calcsize(structFileHeader) 131 132 _FH_SIGNATURE = 0 133 _FH_EXTRACT_VERSION = 1 134 _FH_EXTRACT_SYSTEM = 2 135 _FH_GENERAL_PURPOSE_FLAG_BITS = 3 136 _FH_COMPRESSION_METHOD = 4 137 _FH_LAST_MOD_TIME = 5 138 _FH_LAST_MOD_DATE = 6 139 _FH_CRC = 7 140 _FH_COMPRESSED_SIZE = 8 141 _FH_UNCOMPRESSED_SIZE = 9 142 _FH_FILENAME_LENGTH = 10 143 _FH_EXTRA_FIELD_LENGTH = 11 144 145 # The "Zip64 end of central directory locator" structure, magic number, and size 146 structEndArchive64Locator = "<4sLQL" 147 stringEndArchive64Locator = b"PK\x06\x07" 148 sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator) 149 150 # The "Zip64 end of central directory" record, magic number, size, and indices 151 # (section V.G in the format document) 152 structEndArchive64 = "<4sQ2H2L4Q" 153 stringEndArchive64 = b"PK\x06\x06" 154 sizeEndCentDir64 = struct.calcsize(structEndArchive64) 155 156 _CD64_SIGNATURE = 0 157 _CD64_DIRECTORY_RECSIZE = 1 158 _CD64_CREATE_VERSION = 2 159 _CD64_EXTRACT_VERSION = 3 160 _CD64_DISK_NUMBER = 4 161 _CD64_DISK_NUMBER_START = 5 162 _CD64_NUMBER_ENTRIES_THIS_DISK = 6 163 _CD64_NUMBER_ENTRIES_TOTAL = 7 164 _CD64_DIRECTORY_SIZE = 8 165 _CD64_OFFSET_START_CENTDIR = 9 166 167 def _check_zipfile(fp): 168 try: 169 if _EndRecData(fp): 170 return True # file has correct magic number 171 except OSError: 172 pass 173 return False 174 175 def is_zipfile(filename): 176 """Quickly see if a file is a ZIP file by checking the magic number. 177 178 The filename argument may be a file or file-like object too. 179 """ 180 result = False 181 try: 182 if hasattr(filename, "read"): 183 result = _check_zipfile(fp=filename) 184 else: 185 with open(filename, "rb") as fp: 186 result = _check_zipfile(fp) 187 except OSError: 188 pass 189 return result 190 191 def _EndRecData64(fpin, offset, endrec): 192 """ 193 Read the ZIP64 end-of-archive records and use that to update endrec 194 """ 195 try: 196 fpin.seek(offset - sizeEndCentDir64Locator, 2) 197 except OSError: 198 # If the seek fails, the file is not large enough to contain a ZIP64 199 # end-of-archive record, so just return the end record we were given. 200 return endrec 201 202 data = fpin.read(sizeEndCentDir64Locator) 203 if len(data) != sizeEndCentDir64Locator: 204 return endrec 205 sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) 206 if sig != stringEndArchive64Locator: 207 return endrec 208 209 if diskno != 0 or disks != 1: 210 raise BadZipFile("zipfiles that span multiple disks are not supported") 211 212 # Assume no 'zip64 extensible data' 213 fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) 214 data = fpin.read(sizeEndCentDir64) 215 if len(data) != sizeEndCentDir64: 216 return endrec 217 sig, sz, create_version, read_version, disk_num, disk_dir, \ 218 dircount, dircount2, dirsize, diroffset = \ 219 struct.unpack(structEndArchive64, data) 220 if sig != stringEndArchive64: 221 return endrec 222 223 # Update the original endrec using data from the ZIP64 record 224 endrec[_ECD_SIGNATURE] = sig 225 endrec[_ECD_DISK_NUMBER] = disk_num 226 endrec[_ECD_DISK_START] = disk_dir 227 endrec[_ECD_ENTRIES_THIS_DISK] = dircount 228 endrec[_ECD_ENTRIES_TOTAL] = dircount2 229 endrec[_ECD_SIZE] = dirsize 230 endrec[_ECD_OFFSET] = diroffset 231 return endrec 232 233 234 def _EndRecData(fpin): 235 """Return data from the "End of Central Directory" record, or None. 
236 237 The data is a list of the nine items in the ZIP "End of central dir" 238 record followed by a tenth item, the file seek offset of this record.""" 239 240 # Determine file size 241 fpin.seek(0, 2) 242 filesize = fpin.tell() 243 244 # Check to see if this is ZIP file with no archive comment (the 245 # "end of central directory" structure should be the last item in the 246 # file if this is the case). 247 try: 248 fpin.seek(-sizeEndCentDir, 2) 249 except OSError: 250 return None 251 data = fpin.read() 252 if (len(data) == sizeEndCentDir and 253 data[0:4] == stringEndArchive and 254 data[-2:] == b"\000\000"): 255 # the signature is correct and there's no comment, unpack structure 256 endrec = struct.unpack(structEndArchive, data) 257 endrec=list(endrec) 258 259 # Append a blank comment and record start offset 260 endrec.append(b"") 261 endrec.append(filesize - sizeEndCentDir) 262 263 # Try to read the "Zip64 end of central directory" structure 264 return _EndRecData64(fpin, -sizeEndCentDir, endrec) 265 266 # Either this is not a ZIP file, or it is a ZIP file with an archive 267 # comment. Search the end of the file for the "end of central directory" 268 # record signature. The comment is the last item in the ZIP file and may be 269 # up to 64K long. It is assumed that the "end of central directory" magic 270 # number does not appear in the comment. 271 maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) 272 fpin.seek(maxCommentStart, 0) 273 data = fpin.read() 274 start = data.rfind(stringEndArchive) 275 if start >= 0: 276 # found the magic number; attempt to unpack and interpret 277 recData = data[start:start+sizeEndCentDir] 278 if len(recData) != sizeEndCentDir: 279 # Zip file is corrupted. 280 return None 281 endrec = list(struct.unpack(structEndArchive, recData)) 282 commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file 283 comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize] 284 endrec.append(comment) 285 endrec.append(maxCommentStart + start) 286 287 # Try to read the "Zip64 end of central directory" structure 288 return _EndRecData64(fpin, maxCommentStart + start - filesize, 289 endrec) 290 291 # Unable to find a valid end of central directory structure 292 return None 293 294 295 class ZipInfo (object): 296 """Class with attributes describing each file in the ZIP archive.""" 297 298 __slots__ = ( 299 'orig_filename', 300 'filename', 301 'date_time', 302 'compress_type', 303 'comment', 304 'extra', 305 'create_system', 306 'create_version', 307 'extract_version', 308 'reserved', 309 'flag_bits', 310 'volume', 311 'internal_attr', 312 'external_attr', 313 'header_offset', 314 'CRC', 315 'compress_size', 316 'file_size', 317 '_raw_time', 318 ) 319 320 def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): 321 self.orig_filename = filename # Original file name in archive 322 323 # Terminate the file name at the first null byte. Null bytes in file 324 # names are used as tricks by viruses in archives. 325 null_byte = filename.find(chr(0)) 326 if null_byte >= 0: 327 filename = filename[0:null_byte] 328 # This is used to ensure paths in generated ZIP files always use 329 # forward slashes as the directory separator, as required by the 330 # ZIP format specification. 
331 if os.sep != "/" and os.sep in filename: 332 filename = filename.replace(os.sep, "/") 333 334 self.filename = filename # Normalized file name 335 self.date_time = date_time # year, month, day, hour, min, sec 336 337 if date_time[0] < 1980: 338 raise ValueError('ZIP does not support timestamps before 1980') 339 340 # Standard values: 341 self.compress_type = ZIP_STORED # Type of compression for the file 342 self.comment = b"" # Comment for each file 343 self.extra = b"" # ZIP extra data 344 if sys.platform == 'win32': 345 self.create_system = 0 # System which created ZIP archive 346 else: 347 # Assume everything else is unix-y 348 self.create_system = 3 # System which created ZIP archive 349 self.create_version = DEFAULT_VERSION # Version which created ZIP archive 350 self.extract_version = DEFAULT_VERSION # Version needed to extract archive 351 self.reserved = 0 # Must be zero 352 self.flag_bits = 0 # ZIP flag bits 353 self.volume = 0 # Volume number of file header 354 self.internal_attr = 0 # Internal attributes 355 self.external_attr = 0 # External file attributes 356 # Other attributes are set by class ZipFile: 357 # header_offset Byte offset to the file header 358 # CRC CRC-32 of the uncompressed file 359 # compress_size Size of the compressed file 360 # file_size Size of the uncompressed file 361 362 def __repr__(self): 363 result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)] 364 if self.compress_type != ZIP_STORED: 365 result.append(' compress_type=%s' % 366 compressor_names.get(self.compress_type, 367 self.compress_type)) 368 hi = self.external_attr >> 16 369 lo = self.external_attr & 0xFFFF 370 if hi: 371 result.append(' filemode=%r' % stat.filemode(hi)) 372 if lo: 373 result.append(' external_attr=%#x' % lo) 374 isdir = self.filename[-1:] == '/' 375 if not isdir or self.file_size: 376 result.append(' file_size=%r' % self.file_size) 377 if ((not isdir or self.compress_size) and 378 (self.compress_type != ZIP_STORED or 379 self.file_size != self.compress_size)): 380 result.append(' compress_size=%r' % self.compress_size) 381 result.append('>') 382 return ''.join(result) 383 384 def FileHeader(self, zip64=None): 385 """Return the per-file header as a string.""" 386 dt = self.date_time 387 dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 388 dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 389 if self.flag_bits & 0x08: 390 # Set these to zero because we write them after the file data 391 CRC = compress_size = file_size = 0 392 else: 393 CRC = self.CRC 394 compress_size = self.compress_size 395 file_size = self.file_size 396 397 extra = self.extra 398 399 min_version = 0 400 if zip64 is None: 401 zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT 402 if zip64: 403 fmt = '<HHQQ' 404 extra = extra + struct.pack(fmt, 405 1, struct.calcsize(fmt)-4, file_size, compress_size) 406 if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: 407 if not zip64: 408 raise LargeZipFile("Filesize would require ZIP64 extensions") 409 # File is larger than what fits into a 4 byte integer, 410 # fall back to the ZIP64 extension 411 file_size = 0xffffffff 412 compress_size = 0xffffffff 413 min_version = ZIP64_VERSION 414 415 if self.compress_type == ZIP_BZIP2: 416 min_version = max(BZIP2_VERSION, min_version) 417 elif self.compress_type == ZIP_LZMA: 418 min_version = max(LZMA_VERSION, min_version) 419 420 self.extract_version = max(min_version, self.extract_version) 421 self.create_version = max(min_version, self.create_version) 422 filename, flag_bits = 
self._encodeFilenameFlags() 423 header = struct.pack(structFileHeader, stringFileHeader, 424 self.extract_version, self.reserved, flag_bits, 425 self.compress_type, dostime, dosdate, CRC, 426 compress_size, file_size, 427 len(filename), len(extra)) 428 return header + filename + extra 429 430 def _encodeFilenameFlags(self): 431 try: 432 return self.filename.encode('ascii'), self.flag_bits 433 except UnicodeEncodeError: 434 return self.filename.encode('utf-8'), self.flag_bits | 0x800 435 436 def _decodeExtra(self): 437 # Try to decode the extra field. 438 extra = self.extra 439 unpack = struct.unpack 440 while len(extra) >= 4: 441 tp, ln = unpack('<HH', extra[:4]) 442 if tp == 1: 443 if ln >= 24: 444 counts = unpack('<QQQ', extra[4:28]) 445 elif ln == 16: 446 counts = unpack('<QQ', extra[4:20]) 447 elif ln == 8: 448 counts = unpack('<Q', extra[4:12]) 449 elif ln == 0: 450 counts = () 451 else: 452 raise RuntimeError("Corrupt extra field %s"%(ln,)) 453 454 idx = 0 455 456 # ZIP64 extension (large files and/or large archives) 457 if self.file_size in (0xffffffffffffffff, 0xffffffff): 458 self.file_size = counts[idx] 459 idx += 1 460 461 if self.compress_size == 0xFFFFFFFF: 462 self.compress_size = counts[idx] 463 idx += 1 464 465 if self.header_offset == 0xffffffff: 466 old = self.header_offset 467 self.header_offset = counts[idx] 468 idx+=1 469 470 extra = extra[ln+4:] 471 472 473 class _ZipDecrypter: 474 """Class to handle decryption of files stored within a ZIP archive. 475 476 ZIP supports a password-based form of encryption. Even though known 477 plaintext attacks have been found against it, it is still useful 478 to be able to get data out of such a file. 479 480 Usage: 481 zd = _ZipDecrypter(mypwd) 482 plain_char = zd(cypher_char) 483 plain_text = map(zd, cypher_text) 484 """ 485 486 def _GenerateCRCTable(): 487 """Generate a CRC-32 table. 488 489 ZIP encryption uses the CRC32 one-byte primitive for scrambling some 490 internal keys. We noticed that a direct implementation is faster than 491 relying on binascii.crc32(). 
492 """ 493 poly = 0xedb88320 494 table = [0] * 256 495 for i in range(256): 496 crc = i 497 for j in range(8): 498 if crc & 1: 499 crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly 500 else: 501 crc = ((crc >> 1) & 0x7FFFFFFF) 502 table[i] = crc 503 return table 504 crctable = None 505 506 def _crc32(self, ch, crc): 507 """Compute the CRC32 primitive on one byte.""" 508 return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ch) & 0xff] 509 510 def __init__(self, pwd): 511 if _ZipDecrypter.crctable is None: 512 _ZipDecrypter.crctable = _ZipDecrypter._GenerateCRCTable() 513 self.key0 = 305419896 514 self.key1 = 591751049 515 self.key2 = 878082192 516 for p in pwd: 517 self._UpdateKeys(p) 518 519 def _UpdateKeys(self, c): 520 self.key0 = self._crc32(c, self.key0) 521 self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295 522 self.key1 = (self.key1 * 134775813 + 1) & 4294967295 523 self.key2 = self._crc32((self.key1 >> 24) & 255, self.key2) 524 525 def __call__(self, c): 526 """Decrypt a single character.""" 527 assert isinstance(c, int) 528 k = self.key2 | 2 529 c = c ^ (((k * (k^1)) >> 8) & 255) 530 self._UpdateKeys(c) 531 return c 532 533 534 class LZMACompressor: 535 536 def __init__(self): 537 self._comp = None 538 539 def _init(self): 540 props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1}) 541 self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[ 542 lzma._decode_filter_properties(lzma.FILTER_LZMA1, props) 543 ]) 544 return struct.pack('<BBH', 9, 4, len(props)) + props 545 546 def compress(self, data): 547 if self._comp is None: 548 return self._init() + self._comp.compress(data) 549 return self._comp.compress(data) 550 551 def flush(self): 552 if self._comp is None: 553 return self._init() + self._comp.flush() 554 return self._comp.flush() 555 556 557 class LZMADecompressor: 558 559 def __init__(self): 560 self._decomp = None 561 self._unconsumed = b'' 562 self.eof = False 563 564 def decompress(self, data): 565 if self._decomp is None: 566 self._unconsumed += data 567 if len(self._unconsumed) <= 4: 568 return b'' 569 psize, = struct.unpack('<H', self._unconsumed[2:4]) 570 if len(self._unconsumed) <= 4 + psize: 571 return b'' 572 573 self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[ 574 lzma._decode_filter_properties(lzma.FILTER_LZMA1, 575 self._unconsumed[4:4 + psize]) 576 ]) 577 data = self._unconsumed[4 + psize:] 578 del self._unconsumed 579 580 result = self._decomp.decompress(data) 581 self.eof = self._decomp.eof 582 return result 583 584 585 compressor_names = { 586 0: 'store', 587 1: 'shrink', 588 2: 'reduce', 589 3: 'reduce', 590 4: 'reduce', 591 5: 'reduce', 592 6: 'implode', 593 7: 'tokenize', 594 8: 'deflate', 595 9: 'deflate64', 596 10: 'implode', 597 12: 'bzip2', 598 14: 'lzma', 599 18: 'terse', 600 19: 'lz77', 601 97: 'wavpack', 602 98: 'ppmd', 603 } 604 605 def _check_compression(compression): 606 if compression == ZIP_STORED: 607 pass 608 elif compression == ZIP_DEFLATED: 609 if not zlib: 610 raise RuntimeError( 611 "Compression requires the (missing) zlib module") 612 elif compression == ZIP_BZIP2: 613 if not bz2: 614 raise RuntimeError( 615 "Compression requires the (missing) bz2 module") 616 elif compression == ZIP_LZMA: 617 if not lzma: 618 raise RuntimeError( 619 "Compression requires the (missing) lzma module") 620 else: 621 raise RuntimeError("That compression method is not supported") 622 623 624 def _get_compressor(compress_type): 625 if compress_type == ZIP_DEFLATED: 626 return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, 627 
zlib.DEFLATED, -15) 628 elif compress_type == ZIP_BZIP2: 629 return bz2.BZ2Compressor() 630 elif compress_type == ZIP_LZMA: 631 return LZMACompressor() 632 else: 633 return None 634 635 636 def _get_decompressor(compress_type): 637 if compress_type == ZIP_STORED: 638 return None 639 elif compress_type == ZIP_DEFLATED: 640 return zlib.decompressobj(-15) 641 elif compress_type == ZIP_BZIP2: 642 return bz2.BZ2Decompressor() 643 elif compress_type == ZIP_LZMA: 644 return LZMADecompressor() 645 else: 646 descr = compressor_names.get(compress_type) 647 if descr: 648 raise NotImplementedError("compression type %d (%s)" % (compress_type, descr)) 649 else: 650 raise NotImplementedError("compression type %d" % (compress_type,)) 651 652 653 class _SharedFile: 654 def __init__(self, file, pos, close, lock): 655 self._file = file 656 self._pos = pos 657 self._close = close 658 self._lock = lock 659 660 def read(self, n=-1): 661 with self._lock: 662 self._file.seek(self._pos) 663 data = self._file.read(n) 664 self._pos = self._file.tell() 665 return data 666 667 def close(self): 668 if self._file is not None: 669 fileobj = self._file 670 self._file = None 671 self._close(fileobj) 672 673 # Provide the tell method for unseekable stream 674 class _Tellable: 675 def __init__(self, fp): 676 self.fp = fp 677 self.offset = 0 678 679 def write(self, data): 680 n = self.fp.write(data) 681 self.offset += n 682 return n 683 684 def tell(self): 685 return self.offset 686 687 def flush(self): 688 self.fp.flush() 689 690 def close(self): 691 self.fp.close() 692 693 694 class ZipExtFile(io.BufferedIOBase): 695 """File-like object for reading an archive member. 696 Is returned by ZipFile.open(). 697 """ 698 699 # Max size supported by decompressor. 700 MAX_N = 1 << 31 - 1 701 702 # Read from compressed files in 4k blocks. 703 MIN_READ_SIZE = 4096 704 705 # Search for universal newlines or line chunks. 706 PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)') 707 708 def __init__(self, fileobj, mode, zipinfo, decrypter=None, 709 close_fileobj=False): 710 self._fileobj = fileobj 711 self._decrypter = decrypter 712 self._close_fileobj = close_fileobj 713 714 self._compress_type = zipinfo.compress_type 715 self._compress_left = zipinfo.compress_size 716 self._left = zipinfo.file_size 717 718 self._decompressor = _get_decompressor(self._compress_type) 719 720 self._eof = False 721 self._readbuffer = b'' 722 self._offset = 0 723 724 self._universal = 'U' in mode 725 self.newlines = None 726 727 # Adjust read size for encrypted files since the first 12 bytes 728 # are for the encryption/password information. 729 if self._decrypter is not None: 730 self._compress_left -= 12 731 732 self.mode = mode 733 self.name = zipinfo.filename 734 735 if hasattr(zipinfo, 'CRC'): 736 self._expected_crc = zipinfo.CRC 737 self._running_crc = crc32(b'') 738 else: 739 self._expected_crc = None 740 741 def __repr__(self): 742 result = ['<%s.%s' % (self.__class__.__module__, 743 self.__class__.__qualname__)] 744 if not self.closed: 745 result.append(' name=%r mode=%r' % (self.name, self.mode)) 746 if self._compress_type != ZIP_STORED: 747 result.append(' compress_type=%s' % 748 compressor_names.get(self._compress_type, 749 self._compress_type)) 750 else: 751 result.append(' [closed]') 752 result.append('>') 753 return ''.join(result) 754 755 def readline(self, limit=-1): 756 """Read and return a line from the stream. 757 758 If limit is specified, at most limit bytes will be read. 
759 """ 760 761 if not self._universal and limit < 0: 762 # Shortcut common case - newline found in buffer. 763 i = self._readbuffer.find(b'\n', self._offset) + 1 764 if i > 0: 765 line = self._readbuffer[self._offset: i] 766 self._offset = i 767 return line 768 769 if not self._universal: 770 return io.BufferedIOBase.readline(self, limit) 771 772 line = b'' 773 while limit < 0 or len(line) < limit: 774 readahead = self.peek(2) 775 if readahead == b'': 776 return line 777 778 # 779 # Search for universal newlines or line chunks. 780 # 781 # The pattern returns either a line chunk or a newline, but not 782 # both. Combined with peek(2), we are assured that the sequence 783 # '\r\n' is always retrieved completely and never split into 784 # separate newlines - '\r', '\n' due to coincidental readaheads. 785 # 786 match = self.PATTERN.search(readahead) 787 newline = match.group('newline') 788 if newline is not None: 789 if self.newlines is None: 790 self.newlines = [] 791 if newline not in self.newlines: 792 self.newlines.append(newline) 793 self._offset += len(newline) 794 return line + b'\n' 795 796 chunk = match.group('chunk') 797 if limit >= 0: 798 chunk = chunk[: limit - len(line)] 799 800 self._offset += len(chunk) 801 line += chunk 802 803 return line 804 805 def peek(self, n=1): 806 """Returns buffered bytes without advancing the position.""" 807 if n > len(self._readbuffer) - self._offset: 808 chunk = self.read(n) 809 if len(chunk) > self._offset: 810 self._readbuffer = chunk + self._readbuffer[self._offset:] 811 self._offset = 0 812 else: 813 self._offset -= len(chunk) 814 815 # Return up to 512 bytes to reduce allocation overhead for tight loops. 816 return self._readbuffer[self._offset: self._offset + 512] 817 818 def readable(self): 819 return True 820 821 def read(self, n=-1): 822 """Read and return up to n bytes. 823 If the argument is omitted, None, or negative, data is read and returned until EOF is reached.. 824 """ 825 if n is None or n < 0: 826 buf = self._readbuffer[self._offset:] 827 self._readbuffer = b'' 828 self._offset = 0 829 while not self._eof: 830 buf += self._read1(self.MAX_N) 831 return buf 832 833 end = n + self._offset 834 if end < len(self._readbuffer): 835 buf = self._readbuffer[self._offset:end] 836 self._offset = end 837 return buf 838 839 n = end - len(self._readbuffer) 840 buf = self._readbuffer[self._offset:] 841 self._readbuffer = b'' 842 self._offset = 0 843 while n > 0 and not self._eof: 844 data = self._read1(n) 845 if n < len(data): 846 self._readbuffer = data 847 self._offset = n 848 buf += data[:n] 849 break 850 buf += data 851 n -= len(data) 852 return buf 853 854 def _update_crc(self, newdata): 855 # Update the CRC using the given data. 
856 if self._expected_crc is None: 857 # No need to compute the CRC if we don't have a reference value 858 return 859 self._running_crc = crc32(newdata, self._running_crc) 860 # Check the CRC if we're at the end of the file 861 if self._eof and self._running_crc != self._expected_crc: 862 raise BadZipFile("Bad CRC-32 for file %r" % self.name) 863 864 def read1(self, n): 865 """Read up to n bytes with at most one read() system call.""" 866 867 if n is None or n < 0: 868 buf = self._readbuffer[self._offset:] 869 self._readbuffer = b'' 870 self._offset = 0 871 while not self._eof: 872 data = self._read1(self.MAX_N) 873 if data: 874 buf += data 875 break 876 return buf 877 878 end = n + self._offset 879 if end < len(self._readbuffer): 880 buf = self._readbuffer[self._offset:end] 881 self._offset = end 882 return buf 883 884 n = end - len(self._readbuffer) 885 buf = self._readbuffer[self._offset:] 886 self._readbuffer = b'' 887 self._offset = 0 888 if n > 0: 889 while not self._eof: 890 data = self._read1(n) 891 if n < len(data): 892 self._readbuffer = data 893 self._offset = n 894 buf += data[:n] 895 break 896 if data: 897 buf += data 898 break 899 return buf 900 901 def _read1(self, n): 902 # Read up to n compressed bytes with at most one read() system call, 903 # decrypt and decompress them. 904 if self._eof or n <= 0: 905 return b'' 906 907 # Read from file. 908 if self._compress_type == ZIP_DEFLATED: 909 ## Handle unconsumed data. 910 data = self._decompressor.unconsumed_tail 911 if n > len(data): 912 data += self._read2(n - len(data)) 913 else: 914 data = self._read2(n) 915 916 if self._compress_type == ZIP_STORED: 917 self._eof = self._compress_left <= 0 918 elif self._compress_type == ZIP_DEFLATED: 919 n = max(n, self.MIN_READ_SIZE) 920 data = self._decompressor.decompress(data, n) 921 self._eof = (self._decompressor.eof or 922 self._compress_left <= 0 and 923 not self._decompressor.unconsumed_tail) 924 if self._eof: 925 data += self._decompressor.flush() 926 else: 927 data = self._decompressor.decompress(data) 928 self._eof = self._decompressor.eof or self._compress_left <= 0 929 930 data = data[:self._left] 931 self._left -= len(data) 932 if self._left <= 0: 933 self._eof = True 934 self._update_crc(data) 935 return data 936 937 def _read2(self, n): 938 if self._compress_left <= 0: 939 return b'' 940 941 n = max(n, self.MIN_READ_SIZE) 942 n = min(n, self._compress_left) 943 944 data = self._fileobj.read(n) 945 self._compress_left -= len(data) 946 if not data: 947 raise EOFError 948 949 if self._decrypter is not None: 950 data = bytes(map(self._decrypter, data)) 951 return data 952 953 def close(self): 954 try: 955 if self._close_fileobj: 956 self._fileobj.close() 957 finally: 958 super().close() 959 960 961 class ZipFile: 962 """ Class with methods to open, read, write, close, list zip files. 963 964 z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True) 965 966 file: Either the path to the file, or a file-like object. 967 If it is a path, the file will be opened and closed by ZipFile. 968 mode: The mode can be either read 'r', write 'w', exclusive create 'x', 969 or append 'a'. 970 compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib), 971 ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma). 972 allowZip64: if True ZipFile will create files with ZIP64 extensions when 973 needed, otherwise it will raise an exception when this would 974 be necessary. 
975 976 """ 977 978 fp = None # Set here since __del__ checks it 979 _windows_illegal_name_trans_table = None 980 981 def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True): 982 """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', 983 or append 'a'.""" 984 if mode not in ('r', 'w', 'x', 'a'): 985 raise RuntimeError("ZipFile requires mode 'r', 'w', 'x', or 'a'") 986 987 _check_compression(compression) 988 989 self._allowZip64 = allowZip64 990 self._didModify = False 991 self.debug = 0 # Level of printing: 0 through 3 992 self.NameToInfo = {} # Find file info given name 993 self.filelist = [] # List of ZipInfo instances for archive 994 self.compression = compression # Method of compression 995 self.mode = mode 996 self.pwd = None 997 self._comment = b'' 998 999 # Check if we were passed a file-like object 1000 if isinstance(file, str): 1001 # No, it's a filename 1002 self._filePassed = 0 1003 self.filename = file 1004 modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b', 1005 'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'} 1006 filemode = modeDict[mode] 1007 while True: 1008 try: 1009 self.fp = io.open(file, filemode) 1010 except OSError: 1011 if filemode in modeDict: 1012 filemode = modeDict[filemode] 1013 continue 1014 raise 1015 break 1016 else: 1017 self._filePassed = 1 1018 self.fp = file 1019 self.filename = getattr(file, 'name', None) 1020 self._fileRefCnt = 1 1021 self._lock = threading.RLock() 1022 self._seekable = True 1023 1024 try: 1025 if mode == 'r': 1026 self._RealGetContents() 1027 elif mode in ('w', 'x'): 1028 # set the modified flag so central directory gets written 1029 # even if no files are added to the archive 1030 self._didModify = True 1031 try: 1032 self.start_dir = self.fp.tell() 1033 except (AttributeError, OSError): 1034 self.fp = _Tellable(self.fp) 1035 self.start_dir = 0 1036 self._seekable = False 1037 else: 1038 # Some file-like objects can provide tell() but not seek() 1039 try: 1040 self.fp.seek(self.start_dir) 1041 except (AttributeError, OSError): 1042 self._seekable = False 1043 elif mode == 'a': 1044 try: 1045 # See if file is a zip file 1046 self._RealGetContents() 1047 # seek to start of directory and overwrite 1048 self.fp.seek(self.start_dir) 1049 except BadZipFile: 1050 # file is not a zip file, just append 1051 self.fp.seek(0, 2) 1052 1053 # set the modified flag so central directory gets written 1054 # even if no files are added to the archive 1055 self._didModify = True 1056 self.start_dir = self.fp.tell() 1057 else: 1058 raise RuntimeError("Mode must be 'r', 'w', 'x', or 'a'") 1059 except: 1060 fp = self.fp 1061 self.fp = None 1062 self._fpclose(fp) 1063 raise 1064 1065 def __enter__(self): 1066 return self 1067 1068 def __exit__(self, type, value, traceback): 1069 self.close() 1070 1071 def __repr__(self): 1072 result = ['<%s.%s' % (self.__class__.__module__, 1073 self.__class__.__qualname__)] 1074 if self.fp is not None: 1075 if self._filePassed: 1076 result.append(' file=%r' % self.fp) 1077 elif self.filename is not None: 1078 result.append(' filename=%r' % self.filename) 1079 result.append(' mode=%r' % self.mode) 1080 else: 1081 result.append(' [closed]') 1082 result.append('>') 1083 return ''.join(result) 1084 1085 def _RealGetContents(self): 1086 """Read in the table of contents for the ZIP file.""" 1087 fp = self.fp 1088 try: 1089 endrec = _EndRecData(fp) 1090 except OSError: 1091 raise BadZipFile("File is not a zip file") 1092 if not endrec: 1093 raise BadZipFile("File is not a zip file") 1094 if 
self.debug > 1: 1095 print(endrec) 1096 size_cd = endrec[_ECD_SIZE] # bytes in central directory 1097 offset_cd = endrec[_ECD_OFFSET] # offset of central directory 1098 self._comment = endrec[_ECD_COMMENT] # archive comment 1099 1100 # "concat" is zero, unless zip was concatenated to another file 1101 concat = endrec[_ECD_LOCATION] - size_cd - offset_cd 1102 if endrec[_ECD_SIGNATURE] == stringEndArchive64: 1103 # If Zip64 extension structures are present, account for them 1104 concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) 1105 1106 if self.debug > 2: 1107 inferred = concat + offset_cd 1108 print("given, inferred, offset", offset_cd, inferred, concat) 1109 # self.start_dir: Position of start of central directory 1110 self.start_dir = offset_cd + concat 1111 fp.seek(self.start_dir, 0) 1112 data = fp.read(size_cd) 1113 fp = io.BytesIO(data) 1114 total = 0 1115 while total < size_cd: 1116 centdir = fp.read(sizeCentralDir) 1117 if len(centdir) != sizeCentralDir: 1118 raise BadZipFile("Truncated central directory") 1119 centdir = struct.unpack(structCentralDir, centdir) 1120 if centdir[_CD_SIGNATURE] != stringCentralDir: 1121 raise BadZipFile("Bad magic number for central directory") 1122 if self.debug > 2: 1123 print(centdir) 1124 filename = fp.read(centdir[_CD_FILENAME_LENGTH]) 1125 flags = centdir[5] 1126 if flags & 0x800: 1127 # UTF-8 file names extension 1128 filename = filename.decode('utf-8') 1129 else: 1130 # Historical ZIP filename encoding 1131 filename = filename.decode('cp437') 1132 # Create ZipInfo instance to store file information 1133 x = ZipInfo(filename) 1134 x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) 1135 x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) 1136 x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] 1137 (x.create_version, x.create_system, x.extract_version, x.reserved, 1138 x.flag_bits, x.compress_type, t, d, 1139 x.CRC, x.compress_size, x.file_size) = centdir[1:12] 1140 if x.extract_version > MAX_EXTRACT_VERSION: 1141 raise NotImplementedError("zip file version %.1f" % 1142 (x.extract_version / 10)) 1143 x.volume, x.internal_attr, x.external_attr = centdir[15:18] 1144 # Convert date/time code to (year, month, day, hour, min, sec) 1145 x._raw_time = t 1146 x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, 1147 t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) 1148 1149 x._decodeExtra() 1150 x.header_offset = x.header_offset + concat 1151 self.filelist.append(x) 1152 self.NameToInfo[x.filename] = x 1153 1154 # update total bytes read from central directory 1155 total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] 1156 + centdir[_CD_EXTRA_FIELD_LENGTH] 1157 + centdir[_CD_COMMENT_LENGTH]) 1158 1159 if self.debug > 2: 1160 print("total", total) 1161 1162 1163 def namelist(self): 1164 """Return a list of file names in the archive.""" 1165 return [data.filename for data in self.filelist] 1166 1167 def infolist(self): 1168 """Return a list of class ZipInfo instances for files in the 1169 archive.""" 1170 return self.filelist 1171 1172 def printdir(self, file=None): 1173 """Print a table of contents for the zip file.""" 1174 print("%-46s %19s %12s" % ("File Name", "Modified ", "Size"), 1175 file=file) 1176 for zinfo in self.filelist: 1177 date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] 1178 print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size), 1179 file=file) 1180 1181 def testzip(self): 1182 """Read all the files and check the CRC.""" 1183 chunk_size = 2 ** 20 1184 for zinfo in self.filelist: 1185 try: 1186 # Read by chunks, to avoid an 
OverflowError or a 1187 # MemoryError with very large embedded files. 1188 with self.open(zinfo.filename, "r") as f: 1189 while f.read(chunk_size): # Check CRC-32 1190 pass 1191 except BadZipFile: 1192 return zinfo.filename 1193 1194 def getinfo(self, name): 1195 """Return the instance of ZipInfo given 'name'.""" 1196 info = self.NameToInfo.get(name) 1197 if info is None: 1198 raise KeyError( 1199 'There is no item named %r in the archive' % name) 1200 1201 return info 1202 1203 def setpassword(self, pwd): 1204 """Set default password for encrypted files.""" 1205 if pwd and not isinstance(pwd, bytes): 1206 raise TypeError("pwd: expected bytes, got %s" % type(pwd)) 1207 if pwd: 1208 self.pwd = pwd 1209 else: 1210 self.pwd = None 1211 1212 @property 1213 def comment(self): 1214 """The comment text associated with the ZIP file.""" 1215 return self._comment 1216 1217 @comment.setter 1218 def comment(self, comment): 1219 if not isinstance(comment, bytes): 1220 raise TypeError("comment: expected bytes, got %s" % type(comment)) 1221 # check for valid comment length 1222 if len(comment) > ZIP_MAX_COMMENT: 1223 import warnings 1224 warnings.warn('Archive comment is too long; truncating to %d bytes' 1225 % ZIP_MAX_COMMENT, stacklevel=2) 1226 comment = comment[:ZIP_MAX_COMMENT] 1227 self._comment = comment 1228 self._didModify = True 1229 1230 def read(self, name, pwd=None): 1231 """Return file bytes (as a string) for name.""" 1232 with self.open(name, "r", pwd) as fp: 1233 return fp.read() 1234 1235 def open(self, name, mode="r", pwd=None): 1236 """Return file-like object for 'name'.""" 1237 if mode not in ("r", "U", "rU"): 1238 raise RuntimeError('open() requires mode "r", "U", or "rU"') 1239 if 'U' in mode: 1240 import warnings 1241 warnings.warn("'U' mode is deprecated", 1242 DeprecationWarning, 2) 1243 if pwd and not isinstance(pwd, bytes): 1244 raise TypeError("pwd: expected bytes, got %s" % type(pwd)) 1245 if not self.fp: 1246 raise RuntimeError( 1247 "Attempt to read ZIP archive that was already closed") 1248 1249 # Make sure we have an info object 1250 if isinstance(name, ZipInfo): 1251 # 'name' is already an info object 1252 zinfo = name 1253 else: 1254 # Get info object for name 1255 zinfo = self.getinfo(name) 1256 1257 self._fileRefCnt += 1 1258 zef_file = _SharedFile(self.fp, zinfo.header_offset, self._fpclose, self._lock) 1259 try: 1260 # Skip the file header: 1261 fheader = zef_file.read(sizeFileHeader) 1262 if len(fheader) != sizeFileHeader: 1263 raise BadZipFile("Truncated file header") 1264 fheader = struct.unpack(structFileHeader, fheader) 1265 if fheader[_FH_SIGNATURE] != stringFileHeader: 1266 raise BadZipFile("Bad magic number for file header") 1267 1268 fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) 1269 if fheader[_FH_EXTRA_FIELD_LENGTH]: 1270 zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) 1271 1272 if zinfo.flag_bits & 0x20: 1273 # Zip 2.7: compressed patched data 1274 raise NotImplementedError("compressed patched data (flag bit 5)") 1275 1276 if zinfo.flag_bits & 0x40: 1277 # strong encryption 1278 raise NotImplementedError("strong encryption (flag bit 6)") 1279 1280 if zinfo.flag_bits & 0x800: 1281 # UTF-8 filename 1282 fname_str = fname.decode("utf-8") 1283 else: 1284 fname_str = fname.decode("cp437") 1285 1286 if fname_str != zinfo.orig_filename: 1287 raise BadZipFile( 1288 'File name in directory %r and header %r differ.' 
1289 % (zinfo.orig_filename, fname)) 1290 1291 # check for encrypted flag & handle password 1292 is_encrypted = zinfo.flag_bits & 0x1 1293 zd = None 1294 if is_encrypted: 1295 if not pwd: 1296 pwd = self.pwd 1297 if not pwd: 1298 raise RuntimeError("File %s is encrypted, password " 1299 "required for extraction" % name) 1300 1301 zd = _ZipDecrypter(pwd) 1302 # The first 12 bytes in the cypher stream is an encryption header 1303 # used to strengthen the algorithm. The first 11 bytes are 1304 # completely random, while the 12th contains the MSB of the CRC, 1305 # or the MSB of the file time depending on the header type 1306 # and is used to check the correctness of the password. 1307 header = zef_file.read(12) 1308 h = list(map(zd, header[0:12])) 1309 if zinfo.flag_bits & 0x8: 1310 # compare against the file type from extended local headers 1311 check_byte = (zinfo._raw_time >> 8) & 0xff 1312 else: 1313 # compare against the CRC otherwise 1314 check_byte = (zinfo.CRC >> 24) & 0xff 1315 if h[11] != check_byte: 1316 raise RuntimeError("Bad password for file", name) 1317 1318 return ZipExtFile(zef_file, mode, zinfo, zd, True) 1319 except: 1320 zef_file.close() 1321 raise 1322 1323 def extract(self, member, path=None, pwd=None): 1324 """Extract a member from the archive to the current working directory, 1325 using its full name. Its file information is extracted as accurately 1326 as possible. `member' may be a filename or a ZipInfo object. You can 1327 specify a different directory using `path'. 1328 """ 1329 if not isinstance(member, ZipInfo): 1330 member = self.getinfo(member) 1331 1332 if path is None: 1333 path = os.getcwd() 1334 1335 return self._extract_member(member, path, pwd) 1336 1337 def extractall(self, path=None, members=None, pwd=None): 1338 """Extract all members from the archive to the current working 1339 directory. `path' specifies a different directory to extract to. 1340 `members' is optional and must be a subset of the list returned 1341 by namelist(). 1342 """ 1343 if members is None: 1344 members = self.namelist() 1345 1346 for zipinfo in members: 1347 self.extract(zipinfo, path, pwd) 1348 1349 @classmethod 1350 def _sanitize_windows_name(cls, arcname, pathsep): 1351 """Replace bad characters and remove trailing dots from parts.""" 1352 table = cls._windows_illegal_name_trans_table 1353 if not table: 1354 illegal = ':<>|"?*' 1355 table = str.maketrans(illegal, '_' * len(illegal)) 1356 cls._windows_illegal_name_trans_table = table 1357 arcname = arcname.translate(table) 1358 # remove trailing dots 1359 arcname = (x.rstrip('.') for x in arcname.split(pathsep)) 1360 # rejoin, removing empty parts. 1361 arcname = pathsep.join(x for x in arcname if x) 1362 return arcname 1363 1364 def _extract_member(self, member, targetpath, pwd): 1365 """Extract the ZipInfo object 'member' to a physical 1366 file on the path targetpath. 1367 """ 1368 # build the destination pathname, replacing 1369 # forward slashes to platform specific separators. 1370 arcname = member.filename.replace('/', os.path.sep) 1371 1372 if os.path.altsep: 1373 arcname = arcname.replace(os.path.altsep, os.path.sep) 1374 # interpret absolute pathname as relative, remove drive letter or 1375 # UNC path, redundant separators, "." and ".." components. 
1376 arcname = os.path.splitdrive(arcname)[1] 1377 invalid_path_parts = ('', os.path.curdir, os.path.pardir) 1378 arcname = os.path.sep.join(x for x in arcname.split(os.path.sep) 1379 if x not in invalid_path_parts) 1380 if os.path.sep == '\\': 1381 # filter illegal characters on Windows 1382 arcname = self._sanitize_windows_name(arcname, os.path.sep) 1383 1384 targetpath = os.path.join(targetpath, arcname) 1385 targetpath = os.path.normpath(targetpath) 1386 1387 # Create all upper directories if necessary. 1388 upperdirs = os.path.dirname(targetpath) 1389 if upperdirs and not os.path.exists(upperdirs): 1390 os.makedirs(upperdirs) 1391 1392 if member.filename[-1] == '/': 1393 if not os.path.isdir(targetpath): 1394 os.mkdir(targetpath) 1395 return targetpath 1396 1397 with self.open(member, pwd=pwd) as source, \ 1398 open(targetpath, "wb") as target: 1399 shutil.copyfileobj(source, target) 1400 1401 return targetpath 1402 1403 def _writecheck(self, zinfo): 1404 """Check for errors before writing a file to the archive.""" 1405 if zinfo.filename in self.NameToInfo: 1406 import warnings 1407 warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3) 1408 if self.mode not in ('w', 'x', 'a'): 1409 raise RuntimeError("write() requires mode 'w', 'x', or 'a'") 1410 if not self.fp: 1411 raise RuntimeError( 1412 "Attempt to write ZIP archive that was already closed") 1413 _check_compression(zinfo.compress_type) 1414 if not self._allowZip64: 1415 requires_zip64 = None 1416 if len(self.filelist) >= ZIP_FILECOUNT_LIMIT: 1417 requires_zip64 = "Files count" 1418 elif zinfo.file_size > ZIP64_LIMIT: 1419 requires_zip64 = "Filesize" 1420 elif zinfo.header_offset > ZIP64_LIMIT: 1421 requires_zip64 = "Zipfile size" 1422 if requires_zip64: 1423 raise LargeZipFile(requires_zip64 + 1424 " would require ZIP64 extensions") 1425 1426 def write(self, filename, arcname=None, compress_type=None): 1427 """Put the bytes from filename into the archive under the name 1428 arcname.""" 1429 if not self.fp: 1430 raise RuntimeError( 1431 "Attempt to write to ZIP archive that was already closed") 1432 1433 st = os.stat(filename) 1434 isdir = stat.S_ISDIR(st.st_mode) 1435 mtime = time.localtime(st.st_mtime) 1436 date_time = mtime[0:6] 1437 # Create ZipInfo instance to store file information 1438 if arcname is None: 1439 arcname = filename 1440 arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) 1441 while arcname[0] in (os.sep, os.altsep): 1442 arcname = arcname[1:] 1443 if isdir: 1444 arcname += '/' 1445 zinfo = ZipInfo(arcname, date_time) 1446 zinfo.external_attr = (st[0] & 0xFFFF) << 16 # Unix attributes 1447 if isdir: 1448 zinfo.compress_type = ZIP_STORED 1449 elif compress_type is None: 1450 zinfo.compress_type = self.compression 1451 else: 1452 zinfo.compress_type = compress_type 1453 1454 zinfo.file_size = st.st_size 1455 zinfo.flag_bits = 0x00 1456 with self._lock: 1457 if self._seekable: 1458 self.fp.seek(self.start_dir) 1459 zinfo.header_offset = self.fp.tell() # Start of header bytes 1460 if zinfo.compress_type == ZIP_LZMA: 1461 # Compressed data includes an end-of-stream (EOS) marker 1462 zinfo.flag_bits |= 0x02 1463 1464 self._writecheck(zinfo) 1465 self._didModify = True 1466 1467 if isdir: 1468 zinfo.file_size = 0 1469 zinfo.compress_size = 0 1470 zinfo.CRC = 0 1471 zinfo.external_attr |= 0x10 # MS-DOS directory flag 1472 self.filelist.append(zinfo) 1473 self.NameToInfo[zinfo.filename] = zinfo 1474 self.fp.write(zinfo.FileHeader(False)) 1475 self.start_dir = self.fp.tell() 1476 return 1477 1478 cmpr 
= _get_compressor(zinfo.compress_type) 1479 if not self._seekable: 1480 zinfo.flag_bits |= 0x08 1481 with open(filename, "rb") as fp: 1482 # Must overwrite CRC and sizes with correct data later 1483 zinfo.CRC = CRC = 0 1484 zinfo.compress_size = compress_size = 0 1485 # Compressed size can be larger than uncompressed size 1486 zip64 = self._allowZip64 and \ 1487 zinfo.file_size * 1.05 > ZIP64_LIMIT 1488 self.fp.write(zinfo.FileHeader(zip64)) 1489 file_size = 0 1490 while 1: 1491 buf = fp.read(1024 * 8) 1492 if not buf: 1493 break 1494 file_size = file_size + len(buf) 1495 CRC = crc32(buf, CRC) 1496 if cmpr: 1497 buf = cmpr.compress(buf) 1498 compress_size = compress_size + len(buf) 1499 self.fp.write(buf) 1500 if cmpr: 1501 buf = cmpr.flush() 1502 compress_size = compress_size + len(buf) 1503 self.fp.write(buf) 1504 zinfo.compress_size = compress_size 1505 else: 1506 zinfo.compress_size = file_size 1507 zinfo.CRC = CRC 1508 zinfo.file_size = file_size 1509 if zinfo.flag_bits & 0x08: 1510 # Write CRC and file sizes after the file data 1511 fmt = '<LQQ' if zip64 else '<LLL' 1512 self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size, 1513 zinfo.file_size)) 1514 self.start_dir = self.fp.tell() 1515 else: 1516 if not zip64 and self._allowZip64: 1517 if file_size > ZIP64_LIMIT: 1518 raise RuntimeError('File size has increased during compressing') 1519 if compress_size > ZIP64_LIMIT: 1520 raise RuntimeError('Compressed size larger than uncompressed size') 1521 # Seek backwards and write file header (which will now include 1522 # correct CRC and file sizes) 1523 self.start_dir = self.fp.tell() # Preserve current position in file 1524 self.fp.seek(zinfo.header_offset) 1525 self.fp.write(zinfo.FileHeader(zip64)) 1526 self.fp.seek(self.start_dir) 1527 self.filelist.append(zinfo) 1528 self.NameToInfo[zinfo.filename] = zinfo 1529 1530 def writestr(self, zinfo_or_arcname, data, compress_type=None): 1531 """Write a file into the archive. The contents is 'data', which 1532 may be either a 'str' or a 'bytes' instance; if it is a 'str', 1533 it is encoded as UTF-8 first. 
1534 'zinfo_or_arcname' is either a ZipInfo instance or 1535 the name of the file in the archive.""" 1536 if isinstance(data, str): 1537 data = data.encode("utf-8") 1538 if not isinstance(zinfo_or_arcname, ZipInfo): 1539 zinfo = ZipInfo(filename=zinfo_or_arcname, 1540 date_time=time.localtime(time.time())[:6]) 1541 zinfo.compress_type = self.compression 1542 if zinfo.filename[-1] == '/': 1543 zinfo.external_attr = 0o40775 << 16 # drwxrwxr-x 1544 zinfo.external_attr |= 0x10 # MS-DOS directory flag 1545 else: 1546 zinfo.external_attr = 0o600 << 16 # ?rw------- 1547 else: 1548 zinfo = zinfo_or_arcname 1549 1550 if not self.fp: 1551 raise RuntimeError( 1552 "Attempt to write to ZIP archive that was already closed") 1553 1554 zinfo.file_size = len(data) # Uncompressed size 1555 with self._lock: 1556 if self._seekable: 1557 self.fp.seek(self.start_dir) 1558 zinfo.header_offset = self.fp.tell() # Start of header data 1559 if compress_type is not None: 1560 zinfo.compress_type = compress_type 1561 zinfo.header_offset = self.fp.tell() # Start of header data 1562 if compress_type is not None: 1563 zinfo.compress_type = compress_type 1564 if zinfo.compress_type == ZIP_LZMA: 1565 # Compressed data includes an end-of-stream (EOS) marker 1566 zinfo.flag_bits |= 0x02 1567 1568 self._writecheck(zinfo) 1569 self._didModify = True 1570 zinfo.CRC = crc32(data) # CRC-32 checksum 1571 co = _get_compressor(zinfo.compress_type) 1572 if co: 1573 data = co.compress(data) + co.flush() 1574 zinfo.compress_size = len(data) # Compressed size 1575 else: 1576 zinfo.compress_size = zinfo.file_size 1577 zip64 = zinfo.file_size > ZIP64_LIMIT or \ 1578 zinfo.compress_size > ZIP64_LIMIT 1579 if zip64 and not self._allowZip64: 1580 raise LargeZipFile("Filesize would require ZIP64 extensions") 1581 self.fp.write(zinfo.FileHeader(zip64)) 1582 self.fp.write(data) 1583 if zinfo.flag_bits & 0x08: 1584 # Write CRC and file sizes after the file data 1585 fmt = '<LQQ' if zip64 else '<LLL' 1586 self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size, 1587 zinfo.file_size)) 1588 self.fp.flush() 1589 self.start_dir = self.fp.tell() 1590 self.filelist.append(zinfo) 1591 self.NameToInfo[zinfo.filename] = zinfo 1592 1593 def __del__(self): 1594 """Call the "close()" method in case the user forgot.""" 1595 self.close() 1596 1597 def close(self): 1598 """Close the file, and for mode 'w', 'x' and 'a' write the ending 1599 records.""" 1600 if self.fp is None: 1601 return 1602 1603 try: 1604 if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records 1605 with self._lock: 1606 if self._seekable: 1607 self.fp.seek(self.start_dir) 1608 self._write_end_record() 1609 finally: 1610 fp = self.fp 1611 self.fp = None 1612 self._fpclose(fp) 1613 1614 def _write_end_record(self): 1615 for zinfo in self.filelist: # write central directory 1616 dt = zinfo.date_time 1617 dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 1618 dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 1619 extra = [] 1620 if zinfo.file_size > ZIP64_LIMIT \ 1621 or zinfo.compress_size > ZIP64_LIMIT: 1622 extra.append(zinfo.file_size) 1623 extra.append(zinfo.compress_size) 1624 file_size = 0xffffffff 1625 compress_size = 0xffffffff 1626 else: 1627 file_size = zinfo.file_size 1628 compress_size = zinfo.compress_size 1629 1630 if zinfo.header_offset > ZIP64_LIMIT: 1631 extra.append(zinfo.header_offset) 1632 header_offset = 0xffffffff 1633 else: 1634 header_offset = zinfo.header_offset 1635 1636 extra_data = zinfo.extra 1637 min_version = 0 1638 if extra: 1639 # 
Append a ZIP64 field to the extra's 1640 extra_data = struct.pack( 1641 '<HH' + 'Q'*len(extra), 1642 1, 8*len(extra), *extra) + extra_data 1643 1644 min_version = ZIP64_VERSION 1645 1646 if zinfo.compress_type == ZIP_BZIP2: 1647 min_version = max(BZIP2_VERSION, min_version) 1648 elif zinfo.compress_type == ZIP_LZMA: 1649 min_version = max(LZMA_VERSION, min_version) 1650 1651 extract_version = max(min_version, zinfo.extract_version) 1652 create_version = max(min_version, zinfo.create_version) 1653 try: 1654 filename, flag_bits = zinfo._encodeFilenameFlags() 1655 centdir = struct.pack(structCentralDir, 1656 stringCentralDir, create_version, 1657 zinfo.create_system, extract_version, zinfo.reserved, 1658 flag_bits, zinfo.compress_type, dostime, dosdate, 1659 zinfo.CRC, compress_size, file_size, 1660 len(filename), len(extra_data), len(zinfo.comment), 1661 0, zinfo.internal_attr, zinfo.external_attr, 1662 header_offset) 1663 except DeprecationWarning: 1664 print((structCentralDir, stringCentralDir, create_version, 1665 zinfo.create_system, extract_version, zinfo.reserved, 1666 zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, 1667 zinfo.CRC, compress_size, file_size, 1668 len(zinfo.filename), len(extra_data), len(zinfo.comment), 1669 0, zinfo.internal_attr, zinfo.external_attr, 1670 header_offset), file=sys.stderr) 1671 raise 1672 self.fp.write(centdir) 1673 self.fp.write(filename) 1674 self.fp.write(extra_data) 1675 self.fp.write(zinfo.comment) 1676 1677 pos2 = self.fp.tell() 1678 # Write end-of-zip-archive record 1679 centDirCount = len(self.filelist) 1680 centDirSize = pos2 - self.start_dir 1681 centDirOffset = self.start_dir 1682 requires_zip64 = None 1683 if centDirCount > ZIP_FILECOUNT_LIMIT: 1684 requires_zip64 = "Files count" 1685 elif centDirOffset > ZIP64_LIMIT: 1686 requires_zip64 = "Central directory offset" 1687 elif centDirSize > ZIP64_LIMIT: 1688 requires_zip64 = "Central directory size" 1689 if requires_zip64: 1690 # Need to write the ZIP64 end-of-archive records 1691 if not self._allowZip64: 1692 raise LargeZipFile(requires_zip64 + 1693 " would require ZIP64 extensions") 1694 zip64endrec = struct.pack( 1695 structEndArchive64, stringEndArchive64, 1696 44, 45, 45, 0, 0, centDirCount, centDirCount, 1697 centDirSize, centDirOffset) 1698 self.fp.write(zip64endrec) 1699 1700 zip64locrec = struct.pack( 1701 structEndArchive64Locator, 1702 stringEndArchive64Locator, 0, pos2, 1) 1703 self.fp.write(zip64locrec) 1704 centDirCount = min(centDirCount, 0xFFFF) 1705 centDirSize = min(centDirSize, 0xFFFFFFFF) 1706 centDirOffset = min(centDirOffset, 0xFFFFFFFF) 1707 1708 endrec = struct.pack(structEndArchive, stringEndArchive, 1709 0, 0, centDirCount, centDirCount, 1710 centDirSize, centDirOffset, len(self._comment)) 1711 self.fp.write(endrec) 1712 self.fp.write(self._comment) 1713 self.fp.flush() 1714 1715 def _fpclose(self, fp): 1716 assert self._fileRefCnt > 0 1717 self._fileRefCnt -= 1 1718 if not self._fileRefCnt and not self._filePassed: 1719 fp.close() 1720 1721 1722 class PyZipFile(ZipFile): 1723 """Class to create ZIP archives with Python library files and packages.""" 1724 1725 def __init__(self, file, mode="r", compression=ZIP_STORED, 1726 allowZip64=True, optimize=-1): 1727 ZipFile.__init__(self, file, mode=mode, compression=compression, 1728 allowZip64=allowZip64) 1729 self._optimize = optimize 1730 1731 def writepy(self, pathname, basename="", filterfunc=None): 1732 """Add all files from "pathname" to the ZIP archive. 
1733 1734 If pathname is a package directory, search the directory and 1735 all package subdirectories recursively for all *.py and enter 1736 the modules into the archive. If pathname is a plain 1737 directory, listdir *.py and enter all modules. Else, pathname 1738 must be a Python *.py file and the module will be put into the 1739 archive. Added modules are always module.pyc. 1740 This method will compile the module.py into module.pyc if 1741 necessary. 1742 If filterfunc(pathname) is given, it is called with every argument. 1743 When it is False, the file or directory is skipped. 1744 """ 1745 if filterfunc and not filterfunc(pathname): 1746 if self.debug: 1747 label = 'path' if os.path.isdir(pathname) else 'file' 1748 print('%s "%s" skipped by filterfunc' % (label, pathname)) 1749 return 1750 dir, name = os.path.split(pathname) 1751 if os.path.isdir(pathname): 1752 initname = os.path.join(pathname, "__init__.py") 1753 if os.path.isfile(initname): 1754 # This is a package directory, add it 1755 if basename: 1756 basename = "%s/%s" % (basename, name) 1757 else: 1758 basename = name 1759 if self.debug: 1760 print("Adding package in", pathname, "as", basename) 1761 fname, arcname = self._get_codename(initname[0:-3], basename) 1762 if self.debug: 1763 print("Adding", arcname) 1764 self.write(fname, arcname) 1765 dirlist = os.listdir(pathname) 1766 dirlist.remove("__init__.py") 1767 # Add all *.py files and package subdirectories 1768 for filename in dirlist: 1769 path = os.path.join(pathname, filename) 1770 root, ext = os.path.splitext(filename) 1771 if os.path.isdir(path): 1772 if os.path.isfile(os.path.join(path, "__init__.py")): 1773 # This is a package directory, add it 1774 self.writepy(path, basename, 1775 filterfunc=filterfunc) # Recursive call 1776 elif ext == ".py": 1777 if filterfunc and not filterfunc(path): 1778 if self.debug: 1779 print('file "%s" skipped by filterfunc' % path) 1780 continue 1781 fname, arcname = self._get_codename(path[0:-3], 1782 basename) 1783 if self.debug: 1784 print("Adding", arcname) 1785 self.write(fname, arcname) 1786 else: 1787 # This is NOT a package directory, add its files at top level 1788 if self.debug: 1789 print("Adding files from directory", pathname) 1790 for filename in os.listdir(pathname): 1791 path = os.path.join(pathname, filename) 1792 root, ext = os.path.splitext(filename) 1793 if ext == ".py": 1794 if filterfunc and not filterfunc(path): 1795 if self.debug: 1796 print('file "%s" skipped by filterfunc' % path) 1797 continue 1798 fname, arcname = self._get_codename(path[0:-3], 1799 basename) 1800 if self.debug: 1801 print("Adding", arcname) 1802 self.write(fname, arcname) 1803 else: 1804 if pathname[-3:] != ".py": 1805 raise RuntimeError( 1806 'Files added with writepy() must end with ".py"') 1807 fname, arcname = self._get_codename(pathname[0:-3], basename) 1808 if self.debug: 1809 print("Adding file", arcname) 1810 self.write(fname, arcname) 1811 1812 def _get_codename(self, pathname, basename): 1813 """Return (filename, archivename) for the path. 1814 1815 Given a module name path, return the correct file path and 1816 archive name, compiling if necessary. For example, given 1817 /python/lib/string, return (/python/lib/string.pyc, string). 
1818 """ 1819 def _compile(file, optimize=-1): 1820 import py_compile 1821 if self.debug: 1822 print("Compiling", file) 1823 try: 1824 py_compile.compile(file, doraise=True, optimize=optimize) 1825 except py_compile.PyCompileError as err: 1826 print(err.msg) 1827 return False 1828 return True 1829 1830 file_py = pathname + ".py" 1831 file_pyc = pathname + ".pyc" 1832 pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='') 1833 pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1) 1834 pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2) 1835 if self._optimize == -1: 1836 # legacy mode: use whatever file is present 1837 if (os.path.isfile(file_pyc) and 1838 os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime): 1839 # Use .pyc file. 1840 arcname = fname = file_pyc 1841 elif (os.path.isfile(pycache_opt0) and 1842 os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime): 1843 # Use the __pycache__/*.pyc file, but write it to the legacy pyc 1844 # file name in the archive. 1845 fname = pycache_opt0 1846 arcname = file_pyc 1847 elif (os.path.isfile(pycache_opt1) and 1848 os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime): 1849 # Use the __pycache__/*.pyc file, but write it to the legacy pyc 1850 # file name in the archive. 1851 fname = pycache_opt1 1852 arcname = file_pyc 1853 elif (os.path.isfile(pycache_opt2) and 1854 os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime): 1855 # Use the __pycache__/*.pyc file, but write it to the legacy pyc 1856 # file name in the archive. 1857 fname = pycache_opt2 1858 arcname = file_pyc 1859 else: 1860 # Compile py into PEP 3147 pyc file. 1861 if _compile(file_py): 1862 if sys.flags.optimize == 0: 1863 fname = pycache_opt0 1864 elif sys.flags.optimize == 1: 1865 fname = pycache_opt1 1866 else: 1867 fname = pycache_opt2 1868 arcname = file_pyc 1869 else: 1870 fname = arcname = file_py 1871 else: 1872 # new mode: use given optimization level 1873 if self._optimize == 0: 1874 fname = pycache_opt0 1875 arcname = file_pyc 1876 else: 1877 arcname = file_pyc 1878 if self._optimize == 1: 1879 fname = pycache_opt1 1880 elif self._optimize == 2: 1881 fname = pycache_opt2 1882 else: 1883 msg = "invalid value for 'optimize': {!r}".format(self._optimize) 1884 raise ValueError(msg) 1885 if not (os.path.isfile(fname) and 1886 os.stat(fname).st_mtime >= os.stat(file_py).st_mtime): 1887 if not _compile(file_py, optimize=self._optimize): 1888 fname = arcname = file_py 1889 archivename = os.path.split(arcname)[1] 1890 if basename: 1891 archivename = "%s/%s" % (basename, archivename) 1892 return (fname, archivename) 1893 1894 1895 def main(args = None): 1896 import textwrap 1897 USAGE=textwrap.dedent("""\ 1898 Usage: 1899 zipfile.py -l zipfile.zip # Show listing of a zipfile 1900 zipfile.py -t zipfile.zip # Test if a zipfile is valid 1901 zipfile.py -e zipfile.zip target # Extract zipfile into target dir 1902 zipfile.py -c zipfile.zip src ... 
# Create zipfile from sources 1903 """) 1904 if args is None: 1905 args = sys.argv[1:] 1906 1907 if not args or args[0] not in ('-l', '-c', '-e', '-t'): 1908 print(USAGE) 1909 sys.exit(1) 1910 1911 if args[0] == '-l': 1912 if len(args) != 2: 1913 print(USAGE) 1914 sys.exit(1) 1915 with ZipFile(args[1], 'r') as zf: 1916 zf.printdir() 1917 1918 elif args[0] == '-t': 1919 if len(args) != 2: 1920 print(USAGE) 1921 sys.exit(1) 1922 with ZipFile(args[1], 'r') as zf: 1923 badfile = zf.testzip() 1924 if badfile: 1925 print("The following enclosed file is corrupted: {!r}".format(badfile)) 1926 print("Done testing") 1927 1928 elif args[0] == '-e': 1929 if len(args) != 3: 1930 print(USAGE) 1931 sys.exit(1) 1932 1933 with ZipFile(args[1], 'r') as zf: 1934 zf.extractall(args[2]) 1935 1936 elif args[0] == '-c': 1937 if len(args) < 3: 1938 print(USAGE) 1939 sys.exit(1) 1940 1941 def addToZip(zf, path, zippath): 1942 if os.path.isfile(path): 1943 zf.write(path, zippath, ZIP_DEFLATED) 1944 elif os.path.isdir(path): 1945 if zippath: 1946 zf.write(path, zippath) 1947 for nm in os.listdir(path): 1948 addToZip(zf, 1949 os.path.join(path, nm), os.path.join(zippath, nm)) 1950 # else: ignore 1951 1952 with ZipFile(args[1], 'w') as zf: 1953 for path in args[2:]: 1954 zippath = os.path.basename(path) 1955 if not zippath: 1956 zippath = os.path.basename(os.path.dirname(path)) 1957 if zippath in ('', os.curdir, os.pardir): 1958 zippath = '' 1959 addToZip(zf, path, zippath) 1960 1961 if __name__ == "__main__": 1962 main()
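Reading the implementation is instructive, but everyday use needs only a few calls. A minimal sketch (demo.zip, a.txt and the unpacked/ directory are assumed example names):

# !/usr/bin/env python
import zipfile

# Write: create an archive; ZIP_DEFLATED enables zlib compression.
with zipfile.ZipFile("demo.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write("a.txt")                     # add an existing file from disk
    zf.writestr("note.txt", "hello zip")  # add in-memory data (str is UTF-8 encoded)

# Read: list, verify and extract.
with zipfile.ZipFile("demo.zip", "r") as zf:
    print(zf.namelist())     # ['a.txt', 'note.txt']
    print(zf.read("note.txt"))  # b'hello zip'
    print(zf.testzip())      # None if every CRC checks out
    zf.extractall("unpacked")   # extract everything into ./unpacked

The same operations are available from the shell through the command-line interface noted above, e.g. python -m zipfile -l demo.zip.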
b、tarfile
(Full source of CPython's Lib/tarfile.py omitted here for readability. After the license header it defines the tar format constants (BLOCKSIZE = 512, the ustar/GNU/pax magic strings and member type flags), helper functions for header fields (stn()/nts() for null-terminated strings, nti()/itn() for the octal and base-256 number encodings, calc_chksums()), the exception hierarchy rooted at TarError (ReadError, CompressionError, StreamError, HeaderError and friends), the internal stream machinery (_LowLevelFile, _Stream with transparent gzip/bzip2/xz support, _StreamProxy, _FileInFile, ExFileObject), the TarInfo class that models one archive member including GNU long-name, sparse and pax extended headers, and the TarFile class with the public open()/getmembers()/add()/extract() API. The readable source ships with every Python installation as Lib/tarfile.py.)
1436 if format is not None: 1437 self.format = format 1438 if tarinfo is not None: 1439 self.tarinfo = tarinfo 1440 if dereference is not None: 1441 self.dereference = dereference 1442 if ignore_zeros is not None: 1443 self.ignore_zeros = ignore_zeros 1444 if encoding is not None: 1445 self.encoding = encoding 1446 self.errors = errors 1447 1448 if pax_headers is not None and self.format == PAX_FORMAT: 1449 self.pax_headers = pax_headers 1450 else: 1451 self.pax_headers = {} 1452 1453 if debug is not None: 1454 self.debug = debug 1455 if errorlevel is not None: 1456 self.errorlevel = errorlevel 1457 1458 # Init datastructures. 1459 self.closed = False 1460 self.members = [] # list of members as TarInfo objects 1461 self._loaded = False # flag if all members have been read 1462 self.offset = self.fileobj.tell() 1463 # current position in the archive file 1464 self.inodes = {} # dictionary caching the inodes of 1465 # archive members already added 1466 1467 try: 1468 if self.mode == "r": 1469 self.firstmember = None 1470 self.firstmember = self.next() 1471 1472 if self.mode == "a": 1473 # Move to the end of the archive, 1474 # before the first empty block. 1475 while True: 1476 self.fileobj.seek(self.offset) 1477 try: 1478 tarinfo = self.tarinfo.fromtarfile(self) 1479 self.members.append(tarinfo) 1480 except EOFHeaderError: 1481 self.fileobj.seek(self.offset) 1482 break 1483 except HeaderError as e: 1484 raise ReadError(str(e)) 1485 1486 if self.mode in ("a", "w", "x"): 1487 self._loaded = True 1488 1489 if self.pax_headers: 1490 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) 1491 self.fileobj.write(buf) 1492 self.offset += len(buf) 1493 except: 1494 if not self._extfileobj: 1495 self.fileobj.close() 1496 self.closed = True 1497 raise 1498 1499 #-------------------------------------------------------------------------- 1500 # Below are the classmethods which act as alternate constructors to the 1501 # TarFile class. The open() method is the only one that is needed for 1502 # public use; it is the "super"-constructor and is able to select an 1503 # adequate "sub"-constructor for a particular compression using the mapping 1504 # from OPEN_METH. 1505 # 1506 # This concept allows one to subclass TarFile without losing the comfort of 1507 # the super-constructor. A sub-constructor is registered and made available 1508 # by adding it to the mapping in OPEN_METH. 1509 1510 @classmethod 1511 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): 1512 """Open a tar archive for reading, writing or appending. Return 1513 an appropriate TarFile class. 
1514 1515 mode: 1516 'r' or 'r:*' open for reading with transparent compression 1517 'r:' open for reading exclusively uncompressed 1518 'r:gz' open for reading with gzip compression 1519 'r:bz2' open for reading with bzip2 compression 1520 'r:xz' open for reading with lzma compression 1521 'a' or 'a:' open for appending, creating the file if necessary 1522 'w' or 'w:' open for writing without compression 1523 'w:gz' open for writing with gzip compression 1524 'w:bz2' open for writing with bzip2 compression 1525 'w:xz' open for writing with lzma compression 1526 1527 'x' or 'x:' create a tarfile exclusively without compression, raise 1528 an exception if the file is already created 1529 'x:gz' create a gzip compressed tarfile, raise an exception 1530 if the file is already created 1531 'x:bz2' create a bzip2 compressed tarfile, raise an exception 1532 if the file is already created 1533 'x:xz' create an lzma compressed tarfile, raise an exception 1534 if the file is already created 1535 1536 'r|*' open a stream of tar blocks with transparent compression 1537 'r|' open an uncompressed stream of tar blocks for reading 1538 'r|gz' open a gzip compressed stream of tar blocks 1539 'r|bz2' open a bzip2 compressed stream of tar blocks 1540 'r|xz' open an lzma compressed stream of tar blocks 1541 'w|' open an uncompressed stream for writing 1542 'w|gz' open a gzip compressed stream for writing 1543 'w|bz2' open a bzip2 compressed stream for writing 1544 'w|xz' open an lzma compressed stream for writing 1545 """ 1546 1547 if not name and not fileobj: 1548 raise ValueError("nothing to open") 1549 1550 if mode in ("r", "r:*"): 1551 # Find out which *open() is appropriate for opening the file. 1552 for comptype in cls.OPEN_METH: 1553 func = getattr(cls, cls.OPEN_METH[comptype]) 1554 if fileobj is not None: 1555 saved_pos = fileobj.tell() 1556 try: 1557 return func(name, "r", fileobj, **kwargs) 1558 except (ReadError, CompressionError) as e: 1559 if fileobj is not None: 1560 fileobj.seek(saved_pos) 1561 continue 1562 raise ReadError("file could not be opened successfully") 1563 1564 elif ":" in mode: 1565 filemode, comptype = mode.split(":", 1) 1566 filemode = filemode or "r" 1567 comptype = comptype or "tar" 1568 1569 # Select the *open() function according to 1570 # given compression. 1571 if comptype in cls.OPEN_METH: 1572 func = getattr(cls, cls.OPEN_METH[comptype]) 1573 else: 1574 raise CompressionError("unknown compression type %r" % comptype) 1575 return func(name, filemode, fileobj, **kwargs) 1576 1577 elif "|" in mode: 1578 filemode, comptype = mode.split("|", 1) 1579 filemode = filemode or "r" 1580 comptype = comptype or "tar" 1581 1582 if filemode not in ("r", "w"): 1583 raise ValueError("mode must be 'r' or 'w'") 1584 1585 stream = _Stream(name, filemode, comptype, fileobj, bufsize) 1586 try: 1587 t = cls(name, filemode, stream, **kwargs) 1588 except: 1589 stream.close() 1590 raise 1591 t._extfileobj = False 1592 return t 1593 1594 elif mode in ("a", "w", "x"): 1595 return cls.taropen(name, mode, fileobj, **kwargs) 1596 1597 raise ValueError("undiscernible mode") 1598 1599 @classmethod 1600 def taropen(cls, name, mode="r", fileobj=None, **kwargs): 1601 """Open uncompressed tar archive name for reading or writing. 
1602 """ 1603 if mode not in ("r", "a", "w", "x"): 1604 raise ValueError("mode must be 'r', 'a', 'w' or 'x'") 1605 return cls(name, mode, fileobj, **kwargs) 1606 1607 @classmethod 1608 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1609 """Open gzip compressed tar archive name for reading or writing. 1610 Appending is not allowed. 1611 """ 1612 if mode not in ("r", "w", "x"): 1613 raise ValueError("mode must be 'r', 'w' or 'x'") 1614 1615 try: 1616 import gzip 1617 gzip.GzipFile 1618 except (ImportError, AttributeError): 1619 raise CompressionError("gzip module is not available") 1620 1621 try: 1622 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj) 1623 except OSError: 1624 if fileobj is not None and mode == 'r': 1625 raise ReadError("not a gzip file") 1626 raise 1627 1628 try: 1629 t = cls.taropen(name, mode, fileobj, **kwargs) 1630 except OSError: 1631 fileobj.close() 1632 if mode == 'r': 1633 raise ReadError("not a gzip file") 1634 raise 1635 except: 1636 fileobj.close() 1637 raise 1638 t._extfileobj = False 1639 return t 1640 1641 @classmethod 1642 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1643 """Open bzip2 compressed tar archive name for reading or writing. 1644 Appending is not allowed. 1645 """ 1646 if mode not in ("r", "w", "x"): 1647 raise ValueError("mode must be 'r', 'w' or 'x'") 1648 1649 try: 1650 import bz2 1651 except ImportError: 1652 raise CompressionError("bz2 module is not available") 1653 1654 fileobj = bz2.BZ2File(fileobj or name, mode, 1655 compresslevel=compresslevel) 1656 1657 try: 1658 t = cls.taropen(name, mode, fileobj, **kwargs) 1659 except (OSError, EOFError): 1660 fileobj.close() 1661 if mode == 'r': 1662 raise ReadError("not a bzip2 file") 1663 raise 1664 except: 1665 fileobj.close() 1666 raise 1667 t._extfileobj = False 1668 return t 1669 1670 @classmethod 1671 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs): 1672 """Open lzma compressed tar archive name for reading or writing. 1673 Appending is not allowed. 1674 """ 1675 if mode not in ("r", "w", "x"): 1676 raise ValueError("mode must be 'r', 'w' or 'x'") 1677 1678 try: 1679 import lzma 1680 except ImportError: 1681 raise CompressionError("lzma module is not available") 1682 1683 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset) 1684 1685 try: 1686 t = cls.taropen(name, mode, fileobj, **kwargs) 1687 except (lzma.LZMAError, EOFError): 1688 fileobj.close() 1689 if mode == 'r': 1690 raise ReadError("not an lzma file") 1691 raise 1692 except: 1693 fileobj.close() 1694 raise 1695 t._extfileobj = False 1696 return t 1697 1698 # All *open() methods are registered here. 1699 OPEN_METH = { 1700 "tar": "taropen", # uncompressed tar 1701 "gz": "gzopen", # gzip compressed tar 1702 "bz2": "bz2open", # bzip2 compressed tar 1703 "xz": "xzopen" # lzma compressed tar 1704 } 1705 1706 #-------------------------------------------------------------------------- 1707 # The public methods which TarFile provides: 1708 1709 def close(self): 1710 """Close the TarFile. In write-mode, two finishing zero blocks are 1711 appended to the archive. 
1712 """ 1713 if self.closed: 1714 return 1715 1716 self.closed = True 1717 try: 1718 if self.mode in ("a", "w", "x"): 1719 self.fileobj.write(NUL * (BLOCKSIZE * 2)) 1720 self.offset += (BLOCKSIZE * 2) 1721 # fill up the end with zero-blocks 1722 # (like option -b20 for tar does) 1723 blocks, remainder = divmod(self.offset, RECORDSIZE) 1724 if remainder > 0: 1725 self.fileobj.write(NUL * (RECORDSIZE - remainder)) 1726 finally: 1727 if not self._extfileobj: 1728 self.fileobj.close() 1729 1730 def getmember(self, name): 1731 """Return a TarInfo object for member `name'. If `name' can not be 1732 found in the archive, KeyError is raised. If a member occurs more 1733 than once in the archive, its last occurrence is assumed to be the 1734 most up-to-date version. 1735 """ 1736 tarinfo = self._getmember(name) 1737 if tarinfo is None: 1738 raise KeyError("filename %r not found" % name) 1739 return tarinfo 1740 1741 def getmembers(self): 1742 """Return the members of the archive as a list of TarInfo objects. The 1743 list has the same order as the members in the archive. 1744 """ 1745 self._check() 1746 if not self._loaded: # if we want to obtain a list of 1747 self._load() # all members, we first have to 1748 # scan the whole archive. 1749 return self.members 1750 1751 def getnames(self): 1752 """Return the members of the archive as a list of their names. It has 1753 the same order as the list returned by getmembers(). 1754 """ 1755 return [tarinfo.name for tarinfo in self.getmembers()] 1756 1757 def gettarinfo(self, name=None, arcname=None, fileobj=None): 1758 """Create a TarInfo object from the result of os.stat or equivalent 1759 on an existing file. The file is either named by `name', or 1760 specified as a file object `fileobj' with a file descriptor. If 1761 given, `arcname' specifies an alternative name for the file in the 1762 archive, otherwise, the name is taken from the 'name' attribute of 1763 'fileobj', or the 'name' argument. The name should be a text 1764 string. 1765 """ 1766 self._check("awx") 1767 1768 # When fileobj is given, replace name by 1769 # fileobj's real name. 1770 if fileobj is not None: 1771 name = fileobj.name 1772 1773 # Building the name of the member in the archive. 1774 # Backward slashes are converted to forward slashes, 1775 # Absolute paths are turned to relative paths. 1776 if arcname is None: 1777 arcname = name 1778 drv, arcname = os.path.splitdrive(arcname) 1779 arcname = arcname.replace(os.sep, "/") 1780 arcname = arcname.lstrip("/") 1781 1782 # Now, fill the TarInfo object with 1783 # information specific for the file. 1784 tarinfo = self.tarinfo() 1785 tarinfo.tarfile = self # Not needed 1786 1787 # Use os.stat or os.lstat, depending on platform 1788 # and if symlinks shall be resolved. 1789 if fileobj is None: 1790 if hasattr(os, "lstat") and not self.dereference: 1791 statres = os.lstat(name) 1792 else: 1793 statres = os.stat(name) 1794 else: 1795 statres = os.fstat(fileobj.fileno()) 1796 linkname = "" 1797 1798 stmd = statres.st_mode 1799 if stat.S_ISREG(stmd): 1800 inode = (statres.st_ino, statres.st_dev) 1801 if not self.dereference and statres.st_nlink > 1 and \ 1802 inode in self.inodes and arcname != self.inodes[inode]: 1803 # Is it a hardlink to an already 1804 # archived file? 1805 type = LNKTYPE 1806 linkname = self.inodes[inode] 1807 else: 1808 # The inode is added only if its valid. 1809 # For win32 it is always 0. 
1810 type = REGTYPE 1811 if inode[0]: 1812 self.inodes[inode] = arcname 1813 elif stat.S_ISDIR(stmd): 1814 type = DIRTYPE 1815 elif stat.S_ISFIFO(stmd): 1816 type = FIFOTYPE 1817 elif stat.S_ISLNK(stmd): 1818 type = SYMTYPE 1819 linkname = os.readlink(name) 1820 elif stat.S_ISCHR(stmd): 1821 type = CHRTYPE 1822 elif stat.S_ISBLK(stmd): 1823 type = BLKTYPE 1824 else: 1825 return None 1826 1827 # Fill the TarInfo object with all 1828 # information we can get. 1829 tarinfo.name = arcname 1830 tarinfo.mode = stmd 1831 tarinfo.uid = statres.st_uid 1832 tarinfo.gid = statres.st_gid 1833 if type == REGTYPE: 1834 tarinfo.size = statres.st_size 1835 else: 1836 tarinfo.size = 0 1837 tarinfo.mtime = statres.st_mtime 1838 tarinfo.type = type 1839 tarinfo.linkname = linkname 1840 if pwd: 1841 try: 1842 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0] 1843 except KeyError: 1844 pass 1845 if grp: 1846 try: 1847 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0] 1848 except KeyError: 1849 pass 1850 1851 if type in (CHRTYPE, BLKTYPE): 1852 if hasattr(os, "major") and hasattr(os, "minor"): 1853 tarinfo.devmajor = os.major(statres.st_rdev) 1854 tarinfo.devminor = os.minor(statres.st_rdev) 1855 return tarinfo 1856 1857 def list(self, verbose=True, *, members=None): 1858 """Print a table of contents to sys.stdout. If `verbose' is False, only 1859 the names of the members are printed. If it is True, an `ls -l'-like 1860 output is produced. `members' is optional and must be a subset of the 1861 list returned by getmembers(). 1862 """ 1863 self._check() 1864 1865 if members is None: 1866 members = self 1867 for tarinfo in members: 1868 if verbose: 1869 _safe_print(stat.filemode(tarinfo.mode)) 1870 _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid, 1871 tarinfo.gname or tarinfo.gid)) 1872 if tarinfo.ischr() or tarinfo.isblk(): 1873 _safe_print("%10s" % 1874 ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor))) 1875 else: 1876 _safe_print("%10d" % tarinfo.size) 1877 _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ 1878 % time.localtime(tarinfo.mtime)[:6]) 1879 1880 _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else "")) 1881 1882 if verbose: 1883 if tarinfo.issym(): 1884 _safe_print("-> " + tarinfo.linkname) 1885 if tarinfo.islnk(): 1886 _safe_print("link to " + tarinfo.linkname) 1887 print() 1888 1889 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None): 1890 """Add the file `name' to the archive. `name' may be any type of file 1891 (directory, fifo, symbolic link, etc.). If given, `arcname' 1892 specifies an alternative name for the file in the archive. 1893 Directories are added recursively by default. This can be avoided by 1894 setting `recursive' to False. `exclude' is a function that should 1895 return True for each filename to be excluded. `filter' is a function 1896 that expects a TarInfo object argument and returns the changed 1897 TarInfo object, if it returns None the TarInfo object will be 1898 excluded from the archive. 1899 """ 1900 self._check("awx") 1901 1902 if arcname is None: 1903 arcname = name 1904 1905 # Exclude pathnames. 1906 if exclude is not None: 1907 import warnings 1908 warnings.warn("use the filter argument instead", 1909 DeprecationWarning, 2) 1910 if exclude(name): 1911 self._dbg(2, "tarfile: Excluded %r" % name) 1912 return 1913 1914 # Skip if somebody tries to archive the archive... 
1915 if self.name is not None and os.path.abspath(name) == self.name: 1916 self._dbg(2, "tarfile: Skipped %r" % name) 1917 return 1918 1919 self._dbg(1, name) 1920 1921 # Create a TarInfo object from the file. 1922 tarinfo = self.gettarinfo(name, arcname) 1923 1924 if tarinfo is None: 1925 self._dbg(1, "tarfile: Unsupported type %r" % name) 1926 return 1927 1928 # Change or exclude the TarInfo object. 1929 if filter is not None: 1930 tarinfo = filter(tarinfo) 1931 if tarinfo is None: 1932 self._dbg(2, "tarfile: Excluded %r" % name) 1933 return 1934 1935 # Append the tar header and data to the archive. 1936 if tarinfo.isreg(): 1937 with bltn_open(name, "rb") as f: 1938 self.addfile(tarinfo, f) 1939 1940 elif tarinfo.isdir(): 1941 self.addfile(tarinfo) 1942 if recursive: 1943 for f in os.listdir(name): 1944 self.add(os.path.join(name, f), os.path.join(arcname, f), 1945 recursive, exclude, filter=filter) 1946 1947 else: 1948 self.addfile(tarinfo) 1949 1950 def addfile(self, tarinfo, fileobj=None): 1951 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is 1952 given, it should be a binary file, and tarinfo.size bytes are read 1953 from it and added to the archive. You can create TarInfo objects 1954 directly, or by using gettarinfo(). 1955 """ 1956 self._check("awx") 1957 1958 tarinfo = copy.copy(tarinfo) 1959 1960 buf = tarinfo.tobuf(self.format, self.encoding, self.errors) 1961 self.fileobj.write(buf) 1962 self.offset += len(buf) 1963 1964 # If there's data to follow, append it. 1965 if fileobj is not None: 1966 copyfileobj(fileobj, self.fileobj, tarinfo.size) 1967 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE) 1968 if remainder > 0: 1969 self.fileobj.write(NUL * (BLOCKSIZE - remainder)) 1970 blocks += 1 1971 self.offset += blocks * BLOCKSIZE 1972 1973 self.members.append(tarinfo) 1974 1975 def extractall(self, path=".", members=None, *, numeric_owner=False): 1976 """Extract all members from the archive to the current working 1977 directory and set owner, modification time and permissions on 1978 directories afterwards. `path' specifies a different directory 1979 to extract to. `members' is optional and must be a subset of the 1980 list returned by getmembers(). If `numeric_owner` is True, only 1981 the numbers for user/group names are used and not the names. 1982 """ 1983 directories = [] 1984 1985 if members is None: 1986 members = self 1987 1988 for tarinfo in members: 1989 if tarinfo.isdir(): 1990 # Extract directories with a safe mode. 1991 directories.append(tarinfo) 1992 tarinfo = copy.copy(tarinfo) 1993 tarinfo.mode = 0o700 1994 # Do not set_attrs directories, as we will do that further down 1995 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), 1996 numeric_owner=numeric_owner) 1997 1998 # Reverse sort directories. 1999 directories.sort(key=lambda a: a.name) 2000 directories.reverse() 2001 2002 # Set correct owner, mtime and filemode on directories. 2003 for tarinfo in directories: 2004 dirpath = os.path.join(path, tarinfo.name) 2005 try: 2006 self.chown(tarinfo, dirpath, numeric_owner=numeric_owner) 2007 self.utime(tarinfo, dirpath) 2008 self.chmod(tarinfo, dirpath) 2009 except ExtractError as e: 2010 if self.errorlevel > 1: 2011 raise 2012 else: 2013 self._dbg(1, "tarfile: %s" % e) 2014 2015 def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): 2016 """Extract a member from the archive to the current working directory, 2017 using its full name. Its file information is extracted as accurately 2018 as possible. 
`member' may be a filename or a TarInfo object. You can 2019 specify a different directory using `path'. File attributes (owner, 2020 mtime, mode) are set unless `set_attrs' is False. If `numeric_owner` 2021 is True, only the numbers for user/group names are used and not 2022 the names. 2023 """ 2024 self._check("r") 2025 2026 if isinstance(member, str): 2027 tarinfo = self.getmember(member) 2028 else: 2029 tarinfo = member 2030 2031 # Prepare the link target for makelink(). 2032 if tarinfo.islnk(): 2033 tarinfo._link_target = os.path.join(path, tarinfo.linkname) 2034 2035 try: 2036 self._extract_member(tarinfo, os.path.join(path, tarinfo.name), 2037 set_attrs=set_attrs, 2038 numeric_owner=numeric_owner) 2039 except OSError as e: 2040 if self.errorlevel > 0: 2041 raise 2042 else: 2043 if e.filename is None: 2044 self._dbg(1, "tarfile: %s" % e.strerror) 2045 else: 2046 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename)) 2047 except ExtractError as e: 2048 if self.errorlevel > 1: 2049 raise 2050 else: 2051 self._dbg(1, "tarfile: %s" % e) 2052 2053 def extractfile(self, member): 2054 """Extract a member from the archive as a file object. `member' may be 2055 a filename or a TarInfo object. If `member' is a regular file or a 2056 link, an io.BufferedReader object is returned. Otherwise, None is 2057 returned. 2058 """ 2059 self._check("r") 2060 2061 if isinstance(member, str): 2062 tarinfo = self.getmember(member) 2063 else: 2064 tarinfo = member 2065 2066 if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: 2067 # Members with unknown types are treated as regular files. 2068 return self.fileobject(self, tarinfo) 2069 2070 elif tarinfo.islnk() or tarinfo.issym(): 2071 if isinstance(self.fileobj, _Stream): 2072 # A small but ugly workaround for the case that someone tries 2073 # to extract a (sym)link as a file-object from a non-seekable 2074 # stream of tar blocks. 2075 raise StreamError("cannot extract (sym)link as file object") 2076 else: 2077 # A (sym)link's file object is its target's file object. 2078 return self.extractfile(self._find_link_target(tarinfo)) 2079 else: 2080 # If there's no data associated with the member (directory, chrdev, 2081 # blkdev, etc.), return None instead of a file object. 2082 return None 2083 2084 def _extract_member(self, tarinfo, targetpath, set_attrs=True, 2085 numeric_owner=False): 2086 """Extract the TarInfo object tarinfo to a physical 2087 file called targetpath. 2088 """ 2089 # Fetch the TarInfo object for the given name 2090 # and build the destination pathname, replacing 2091 # forward slashes to platform specific separators. 2092 targetpath = targetpath.rstrip("/") 2093 targetpath = targetpath.replace("/", os.sep) 2094 2095 # Create all upper directories. 2096 upperdirs = os.path.dirname(targetpath) 2097 if upperdirs and not os.path.exists(upperdirs): 2098 # Create directories that are not part of the archive with 2099 # default permissions. 
2100 os.makedirs(upperdirs) 2101 2102 if tarinfo.islnk() or tarinfo.issym(): 2103 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname)) 2104 else: 2105 self._dbg(1, tarinfo.name) 2106 2107 if tarinfo.isreg(): 2108 self.makefile(tarinfo, targetpath) 2109 elif tarinfo.isdir(): 2110 self.makedir(tarinfo, targetpath) 2111 elif tarinfo.isfifo(): 2112 self.makefifo(tarinfo, targetpath) 2113 elif tarinfo.ischr() or tarinfo.isblk(): 2114 self.makedev(tarinfo, targetpath) 2115 elif tarinfo.islnk() or tarinfo.issym(): 2116 self.makelink(tarinfo, targetpath) 2117 elif tarinfo.type not in SUPPORTED_TYPES: 2118 self.makeunknown(tarinfo, targetpath) 2119 else: 2120 self.makefile(tarinfo, targetpath) 2121 2122 if set_attrs: 2123 self.chown(tarinfo, targetpath, numeric_owner) 2124 if not tarinfo.issym(): 2125 self.chmod(tarinfo, targetpath) 2126 self.utime(tarinfo, targetpath) 2127 2128 #-------------------------------------------------------------------------- 2129 # Below are the different file methods. They are called via 2130 # _extract_member() when extract() is called. They can be replaced in a 2131 # subclass to implement other functionality. 2132 2133 def makedir(self, tarinfo, targetpath): 2134 """Make a directory called targetpath. 2135 """ 2136 try: 2137 # Use a safe mode for the directory, the real mode is set 2138 # later in _extract_member(). 2139 os.mkdir(targetpath, 0o700) 2140 except FileExistsError: 2141 pass 2142 2143 def makefile(self, tarinfo, targetpath): 2144 """Make a file called targetpath. 2145 """ 2146 source = self.fileobj 2147 source.seek(tarinfo.offset_data) 2148 with bltn_open(targetpath, "wb") as target: 2149 if tarinfo.sparse is not None: 2150 for offset, size in tarinfo.sparse: 2151 target.seek(offset) 2152 copyfileobj(source, target, size, ReadError) 2153 target.seek(tarinfo.size) 2154 target.truncate() 2155 else: 2156 copyfileobj(source, target, tarinfo.size, ReadError) 2157 2158 def makeunknown(self, tarinfo, targetpath): 2159 """Make a file from a TarInfo object with an unknown type 2160 at targetpath. 2161 """ 2162 self.makefile(tarinfo, targetpath) 2163 self._dbg(1, "tarfile: Unknown file type %r, " \ 2164 "extracted as regular file." % tarinfo.type) 2165 2166 def makefifo(self, tarinfo, targetpath): 2167 """Make a fifo called targetpath. 2168 """ 2169 if hasattr(os, "mkfifo"): 2170 os.mkfifo(targetpath) 2171 else: 2172 raise ExtractError("fifo not supported by system") 2173 2174 def makedev(self, tarinfo, targetpath): 2175 """Make a character or block device called targetpath. 2176 """ 2177 if not hasattr(os, "mknod") or not hasattr(os, "makedev"): 2178 raise ExtractError("special devices not supported by system") 2179 2180 mode = tarinfo.mode 2181 if tarinfo.isblk(): 2182 mode |= stat.S_IFBLK 2183 else: 2184 mode |= stat.S_IFCHR 2185 2186 os.mknod(targetpath, mode, 2187 os.makedev(tarinfo.devmajor, tarinfo.devminor)) 2188 2189 def makelink(self, tarinfo, targetpath): 2190 """Make a (symbolic) link called targetpath. If it cannot be created 2191 (platform limitation), we try to make a copy of the referenced file 2192 instead of a link. 2193 """ 2194 try: 2195 # For systems that support symbolic and hard links. 2196 if tarinfo.issym(): 2197 os.symlink(tarinfo.linkname, targetpath) 2198 else: 2199 # See extract(). 
2200 if os.path.exists(tarinfo._link_target): 2201 os.link(tarinfo._link_target, targetpath) 2202 else: 2203 self._extract_member(self._find_link_target(tarinfo), 2204 targetpath) 2205 except symlink_exception: 2206 try: 2207 self._extract_member(self._find_link_target(tarinfo), 2208 targetpath) 2209 except KeyError: 2210 raise ExtractError("unable to resolve link inside archive") 2211 2212 def chown(self, tarinfo, targetpath, numeric_owner): 2213 """Set owner of targetpath according to tarinfo. If numeric_owner 2214 is True, use .gid/.uid instead of .gname/.uname. 2215 """ 2216 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0: 2217 # We have to be root to do so. 2218 if numeric_owner: 2219 g = tarinfo.gid 2220 u = tarinfo.uid 2221 else: 2222 try: 2223 g = grp.getgrnam(tarinfo.gname)[2] 2224 except KeyError: 2225 g = tarinfo.gid 2226 try: 2227 u = pwd.getpwnam(tarinfo.uname)[2] 2228 except KeyError: 2229 u = tarinfo.uid 2230 try: 2231 if tarinfo.issym() and hasattr(os, "lchown"): 2232 os.lchown(targetpath, u, g) 2233 else: 2234 os.chown(targetpath, u, g) 2235 except OSError as e: 2236 raise ExtractError("could not change owner") 2237 2238 def chmod(self, tarinfo, targetpath): 2239 """Set file permissions of targetpath according to tarinfo. 2240 """ 2241 if hasattr(os, 'chmod'): 2242 try: 2243 os.chmod(targetpath, tarinfo.mode) 2244 except OSError as e: 2245 raise ExtractError("could not change mode") 2246 2247 def utime(self, tarinfo, targetpath): 2248 """Set modification time of targetpath according to tarinfo. 2249 """ 2250 if not hasattr(os, 'utime'): 2251 return 2252 try: 2253 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) 2254 except OSError as e: 2255 raise ExtractError("could not change modification time") 2256 2257 #-------------------------------------------------------------------------- 2258 def next(self): 2259 """Return the next member of the archive as a TarInfo object, when 2260 TarFile is opened for reading. Return None if there is no more 2261 available. 2262 """ 2263 self._check("ra") 2264 if self.firstmember is not None: 2265 m = self.firstmember 2266 self.firstmember = None 2267 return m 2268 2269 # Advance the file pointer. 2270 if self.offset != self.fileobj.tell(): 2271 self.fileobj.seek(self.offset - 1) 2272 if not self.fileobj.read(1): 2273 raise ReadError("unexpected end of data") 2274 2275 # Read the next block. 2276 tarinfo = None 2277 while True: 2278 try: 2279 tarinfo = self.tarinfo.fromtarfile(self) 2280 except EOFHeaderError as e: 2281 if self.ignore_zeros: 2282 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2283 self.offset += BLOCKSIZE 2284 continue 2285 except InvalidHeaderError as e: 2286 if self.ignore_zeros: 2287 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2288 self.offset += BLOCKSIZE 2289 continue 2290 elif self.offset == 0: 2291 raise ReadError(str(e)) 2292 except EmptyHeaderError: 2293 if self.offset == 0: 2294 raise ReadError("empty file") 2295 except TruncatedHeaderError as e: 2296 if self.offset == 0: 2297 raise ReadError(str(e)) 2298 except SubsequentHeaderError as e: 2299 raise ReadError(str(e)) 2300 break 2301 2302 if tarinfo is not None: 2303 self.members.append(tarinfo) 2304 else: 2305 self._loaded = True 2306 2307 return tarinfo 2308 2309 #-------------------------------------------------------------------------- 2310 # Little helper methods: 2311 2312 def _getmember(self, name, tarinfo=None, normalize=False): 2313 """Find an archive member by name from bottom to top. 2314 If tarinfo is given, it is used as the starting point. 
2315 """ 2316 # Ensure that all members have been loaded. 2317 members = self.getmembers() 2318 2319 # Limit the member search list up to tarinfo. 2320 if tarinfo is not None: 2321 members = members[:members.index(tarinfo)] 2322 2323 if normalize: 2324 name = os.path.normpath(name) 2325 2326 for member in reversed(members): 2327 if normalize: 2328 member_name = os.path.normpath(member.name) 2329 else: 2330 member_name = member.name 2331 2332 if name == member_name: 2333 return member 2334 2335 def _load(self): 2336 """Read through the entire archive file and look for readable 2337 members. 2338 """ 2339 while True: 2340 tarinfo = self.next() 2341 if tarinfo is None: 2342 break 2343 self._loaded = True 2344 2345 def _check(self, mode=None): 2346 """Check if TarFile is still open, and if the operation's mode 2347 corresponds to TarFile's mode. 2348 """ 2349 if self.closed: 2350 raise OSError("%s is closed" % self.__class__.__name__) 2351 if mode is not None and self.mode not in mode: 2352 raise OSError("bad operation for mode %r" % self.mode) 2353 2354 def _find_link_target(self, tarinfo): 2355 """Find the target member of a symlink or hardlink member in the 2356 archive. 2357 """ 2358 if tarinfo.issym(): 2359 # Always search the entire archive. 2360 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname))) 2361 limit = None 2362 else: 2363 # Search the archive before the link, because a hard link is 2364 # just a reference to an already archived file. 2365 linkname = tarinfo.linkname 2366 limit = tarinfo 2367 2368 member = self._getmember(linkname, tarinfo=limit, normalize=True) 2369 if member is None: 2370 raise KeyError("linkname %r not found" % linkname) 2371 return member 2372 2373 def __iter__(self): 2374 """Provide an iterator object. 2375 """ 2376 if self._loaded: 2377 return iter(self.members) 2378 else: 2379 return TarIter(self) 2380 2381 def _dbg(self, level, msg): 2382 """Write debugging output to sys.stderr. 2383 """ 2384 if level <= self.debug: 2385 print(msg, file=sys.stderr) 2386 2387 def __enter__(self): 2388 self._check() 2389 return self 2390 2391 def __exit__(self, type, value, traceback): 2392 if type is None: 2393 self.close() 2394 else: 2395 # An exception occurred. We must not call close() because 2396 # it would try to write end-of-archive blocks and padding. 2397 if not self._extfileobj: 2398 self.fileobj.close() 2399 self.closed = True 2400 # class TarFile 2401 2402 class TarIter: 2403 """Iterator Class. 2404 2405 for tarinfo in TarFile(...): 2406 suite... 2407 """ 2408 2409 def __init__(self, tarfile): 2410 """Construct a TarIter object. 2411 """ 2412 self.tarfile = tarfile 2413 self.index = 0 2414 def __iter__(self): 2415 """Return iterator object. 2416 """ 2417 return self 2418 def __next__(self): 2419 """Return the next item using TarFile's next() method. 2420 When all members have been read, set TarFile as _loaded. 2421 """ 2422 # Fix for SF #1100429: Under rare circumstances it can 2423 # happen that getmembers() is called during iteration, 2424 # which will cause TarIter to stop prematurely. 
2425 2426 if self.index == 0 and self.tarfile.firstmember is not None: 2427 tarinfo = self.tarfile.next() 2428 elif self.index < len(self.tarfile.members): 2429 tarinfo = self.tarfile.members[self.index] 2430 elif not self.tarfile._loaded: 2431 tarinfo = self.tarfile.next() 2432 if not tarinfo: 2433 self.tarfile._loaded = True 2434 raise StopIteration 2435 else: 2436 raise StopIteration 2437 self.index += 1 2438 return tarinfo 2439 2440 #-------------------- 2441 # exported functions 2442 #-------------------- 2443 def is_tarfile(name): 2444 """Return True if name points to a tar archive that we 2445 are able to handle, else return False. 2446 """ 2447 try: 2448 t = open(name) 2449 t.close() 2450 return True 2451 except TarError: 2452 return False 2453 2454 open = TarFile.open 2455 2456 2457 def main(): 2458 import argparse 2459 2460 description = 'A simple command line interface for tarfile module.' 2461 parser = argparse.ArgumentParser(description=description) 2462 parser.add_argument('-v', '--verbose', action='store_true', default=False, 2463 help='Verbose output') 2464 group = parser.add_mutually_exclusive_group() 2465 group.add_argument('-l', '--list', metavar='<tarfile>', 2466 help='Show listing of a tarfile') 2467 group.add_argument('-e', '--extract', nargs='+', 2468 metavar=('<tarfile>', '<output_dir>'), 2469 help='Extract tarfile into target dir') 2470 group.add_argument('-c', '--create', nargs='+', 2471 metavar=('<name>', '<file>'), 2472 help='Create tarfile from sources') 2473 group.add_argument('-t', '--test', metavar='<tarfile>', 2474 help='Test if a tarfile is valid') 2475 args = parser.parse_args() 2476 2477 if args.test: 2478 src = args.test 2479 if is_tarfile(src): 2480 with open(src, 'r') as tar: 2481 tar.getmembers() 2482 print(tar.getmembers(), file=sys.stderr) 2483 if args.verbose: 2484 print('{!r} is a tar archive.'.format(src)) 2485 else: 2486 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2487 2488 elif args.list: 2489 src = args.list 2490 if is_tarfile(src): 2491 with TarFile.open(src, 'r:*') as tf: 2492 tf.list(verbose=args.verbose) 2493 else: 2494 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2495 2496 elif args.extract: 2497 if len(args.extract) == 1: 2498 src = args.extract[0] 2499 curdir = os.curdir 2500 elif len(args.extract) == 2: 2501 src, curdir = args.extract 2502 else: 2503 parser.exit(1, parser.format_help()) 2504 2505 if is_tarfile(src): 2506 with TarFile.open(src, 'r:*') as tf: 2507 tf.extractall(path=curdir) 2508 if args.verbose: 2509 if curdir == '.': 2510 msg = '{!r} file is extracted.'.format(src) 2511 else: 2512 msg = ('{!r} file is extracted ' 2513 'into {!r} directory.').format(src, curdir) 2514 print(msg) 2515 else: 2516 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2517 2518 elif args.create: 2519 tar_name = args.create.pop(0) 2520 _, ext = os.path.splitext(tar_name) 2521 compressions = { 2522 # gz 2523 '.gz': 'gz', 2524 '.tgz': 'gz', 2525 # xz 2526 '.xz': 'xz', 2527 '.txz': 'xz', 2528 # bz2 2529 '.bz2': 'bz2', 2530 '.tbz': 'bz2', 2531 '.tbz2': 'bz2', 2532 '.tb2': 'bz2', 2533 } 2534 tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w' 2535 tar_files = args.create 2536 2537 with TarFile.open(tar_name, tar_mode) as tf: 2538 for file_name in tar_files: 2539 tf.add(file_name) 2540 2541 if args.verbose: 2542 print('{!r} file created.'.format(tar_name)) 2543 2544 else: 2545 parser.exit(1, parser.format_help()) 2546 2547 if __name__ == '__main__': 2548 main()
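A minimal usage sketch built on that API; the archive and directory names (demo.tar.gz, mydir, unpacked) are illustrative placeholders:

import tarfile

# Pack a directory into a gzip-compressed archive; "w:gz" routes to gzopen().
with tarfile.open("demo.tar.gz", "w:gz") as tar:
    tar.add("mydir")  # directories are added recursively by default

# Read it back; "r:*" autodetects the compression.
with tarfile.open("demo.tar.gz", "r:*") as tar:
    print(tar.getnames())            # member names, in archive order
    tar.extractall(path="unpacked")  # owner/mtime/mode are restored afterwards

Note that extractall() trusts the member paths stored in the archive, so it should not be used on archives from untrusted sources.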
PyYAML module
Python can also handle the YAML document format quite easily; it just requires installing a third-party module. See the documentation: http://pyyaml.org/wiki/PyYAMLDocumentation.
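A minimal round-trip sketch, assuming PyYAML has been installed (e.g. pip install PyYAML); safe_load/safe_dump are preferred over load/dump because they refuse to construct arbitrary Python objects:

import yaml  # provided by the PyYAML package

doc = """
name: Allister
skills:
  - Python
  - Linux
"""

data = yaml.safe_load(doc)   # YAML text -> Python objects (here a dict)
print(data["skills"])        # ['Python', 'Linux']

# Python objects -> YAML text; block style instead of inline {}/[]
print(yaml.safe_dump(data, default_flow_style=False))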
re regular expressions
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : re_test.py
# @Author: Allister.Liu
# @Date  : 2018/1/22
# @Desc  : regular expressions

import re

r"""
Regular expressions:
'.'      matches any character except \n; with the DOTALL flag it also matches newlines
'^'      matches the start of the string; with re.MULTILINE it also matches right after each newline, e.g. re.search(r"^a", "\nabc\neee", flags=re.MULTILINE)
'$'      matches the end of the string; re.search("foo$", "bfoo\nsdfsf", flags=re.MULTILINE).group() matches as well
'*'      matches the preceding character 0 or more times; re.findall("ab*", "cabb3abcbbac") gives ['abb', 'ab', 'a']
'+'      matches the preceding character 1 or more times; re.findall("ab+", "ab+cd+abb+bba") gives ['ab', 'abb']
'?'      matches the preceding character 0 or 1 time
'{m}'    matches the preceding character exactly m times
'{n,m}'  matches the preceding character n to m times; re.findall("ab{1,3}", "abb abc abbcbbb") gives ['abb', 'ab', 'abb']
'|'      matches the pattern on either side; re.search("abc|ABC", "ABCBabcCD").group() gives 'ABC'
'(...)'  group matching; re.search("(abc){2}a(123|456)c", "abcabca456c").group() gives 'abcabca456c'
'\A'     matches only at the start of the string; re.search(r"\Aabc", "alexabc") finds nothing
'\Z'     matches the end of the string, same as '$'
'\d'     matches a digit 0-9
'\D'     matches a non-digit
'\w'     matches [A-Za-z0-9]
'\W'     matches anything not in [A-Za-z0-9]
'\s'     matches whitespace such as \t, \n, \r; re.search(r"\s+", "ab\tc1\n3").group() gives '\t'

re.match    match from the beginning of the string
re.search   search the whole string for the first match
re.findall  return every match as a list
re.split    split the string wherever the pattern matches
re.sub      replace matches with the given string

Match flags:
re.I: ignore case
re.M: multiline mode, changes the behaviour of '^' and '$'
re.S: dot-all mode, changes the behaviour of '.'
"""

# Starts with "A", then 1-7 word characters, one or more digits, and word characters ending in "n"
print(re.match(r"^A\w{1,7}\d+\w*n$", "Allister12365HaoSen"))

# 18-digit ID card: 17 digits followed by a digit, "x" or "X"
print(re.match(r"^\d{17}(\d|x|X){1}$", "42210319630213275X"))

# 15-digit ID card: 15 digits from the start
print(re.match(r"^\d{15}", "422103196302132"))

# Starts with "A", one or more letters, followed by an "r"
print(re.search(r"^A[a-zA-Z]+r", "Allister123Allister&ds"))

# '?' matches the preceding character 0 or 1 time
print(re.search(r"aaa?", "aaEEEEaaa"))  # aa
print(re.findall(r"abf?.", "abf%dafsgaabfterftw"))

# Split into a list on the matched delimiter
print(re.split(r"[0-9]+", "rf123Allister89ljp"))  # ['rf', 'Allister', 'ljp']

# sub(pattern, repl, string, count=0, flags=0): replace matches, optionally limiting the number of replacements
print(re.sub(r"[0-9]+", "|", "rf123Allister89ljp", 5))  # rf|Allister|ljp

"""
Split an ID card number into province, city, county, year, month and day.
"""
# {'province': '51', 'city': '09', 'county': '21', 'year': '1990', 'month': '08', 'day': '06'}
print(re.search(r"(?P<province>[0-9]{2})(?P<city>[0-9]{2})(?P<county>[0-9]{2})"
                r"(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})",
                "51092119900806181X").groupdict())

# re.I: ignore case
print(re.search(r"[a-z]+", "abcdEFg", re.I))  # abcdEFg
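Two points worth knowing beyond the calls above: re.compile() pre-compiles a pattern for reuse, and the re.M/re.S flags change how '^' and '.' behave. A small sketch (the phone-number pattern is just illustrative):

import re

pattern = re.compile(r"\d{3}-\d{4}")  # compile once, reuse many times
print(pattern.findall("call 555-1234 or 555-9876"))  # ['555-1234', '555-9876']

text = "first\nsecond"
print(re.findall(r"^\w+", text))         # ['first']: '^' matches only at the start
print(re.findall(r"^\w+", text, re.M))   # ['first', 'second']: '^' matches at each line
print(re.search(r"first.second", text))                # None: '.' does not match '\n'
print(re.search(r"first.second", text, re.S).group())  # matches across the newline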