Common Modules in Python 3
Outline >>
- time & datetime module
- random module
- os module
- sys module
- shelve module
- shutil module
- xml module
- configparser module
- hashlib & hmac modules
- zipfile & tarfile modules
- PyYAML module
- re regular expressions
time & datetime module
```python
#!/usr/bin/env python
import time, datetime

"""
Common standard library: time

Three representations of time:
1. Timestamp: seconds elapsed since 1970-01-01 00:00:00 (the epoch);
2. Formatted time string;
3. Tuple (struct_time): a struct_time tuple has 9 elements.

Format directives:
%a  abbreviated weekday name (locale-dependent)
%A  full weekday name
%b  abbreviated month name
%B  full month name
%c  locale-appropriate date and time representation
%d  day of the month (01 - 31)
%H  hour, 24-hour clock (00 - 23)
%I  hour, 12-hour clock (01 - 12)
%j  day of the year (001 - 366)
%m  month (01 - 12)
%M  minute (00 - 59)
%p  locale equivalent of AM/PM
%S  second (00 - 61, allowing for leap seconds)
%U  week number of the year (00 - 53), Sunday starts the week; all days
    before the first Sunday fall in week 0
%w  weekday as a number (0 - 6, 0 is Sunday)
%W  same as %U, except the week starts on Monday
%x  locale-appropriate date representation
%X  locale-appropriate time representation
%y  year without century (00 - 99)
%Y  full year
%Z  time zone name (empty string if none)
%%  a literal '%' character
"""

# print(help(time))
# print(help(time.ctime))  # look up the usage of a specific function

# Current time as a timestamp
print(time.time())

# CPU time; time.clock() was removed in Python 3.8, use time.process_time()
print(time.process_time())

# Sleep for the given number of seconds
# time.sleep(1)

# Tuple-format time in UTC: time.gmtime(x), x is a timestamp
print(time.gmtime())

# Tuple-format time in local time (UTC+8 here), the one we usually want:
# time.localtime(x), x is a timestamp
print(time.localtime())

x = time.localtime()
print("x:", x)

# Format a tuple-format time into a custom string:
# time.strftime(str_format, x), str_format: format, x: tuple time
print(time.strftime("%Y-%m-%d %H:%M:%S", x))

# Format a timestamp as a string like: Tue Jun 16 11:53:31 2009
print(time.ctime(1245124411))

# Access individual fields of a struct_time: year / month / day ...
print(x.tm_year, x.tm_mon, x.tm_mday, x.tm_hour, x.tm_min, x.tm_sec)

# Convert a tuple-format time to a timestamp
print(time.mktime(x))

# Convert a UTC timestamp into struct_time format
print(time.gmtime(time.time() - 86640))

# Convert a UTC struct_time into the given string format
print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

"""
datetime module:
"""
print("date arithmetic with datetime".center(50, "~"))

# Returns e.g. 2018-01-20 23:20:49.418354
print(datetime.datetime.now())

# Convert a timestamp directly into a date, e.g. 2018-01-20
print(datetime.date.fromtimestamp(time.time()))

# Current time + 3 days
print(datetime.datetime.now() + datetime.timedelta(3))
# Current time - 3 days
print(datetime.datetime.now() + datetime.timedelta(-3))
# Current time + 3 hours
print(datetime.datetime.now() + datetime.timedelta(hours=3))
# Current time + 30 minutes
print(datetime.datetime.now() + datetime.timedelta(minutes=30))

c_time = datetime.datetime.now()
# Replace individual fields of a datetime
print(c_time.replace(minute=54, hour=5))
```
Diagram of conversions between the time representations:
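The diagram boils down to the three representations above (timestamp, struct_time, formatted string) and the functions that map between them; a minimal sketch of the full round trip, using only standard `time` functions:

```python
import time

ts = time.time()                              # timestamp (float seconds)
st = time.localtime(ts)                       # timestamp    -> struct_time
s = time.strftime("%Y-%m-%d %H:%M:%S", st)    # struct_time  -> string
st2 = time.strptime(s, "%Y-%m-%d %H:%M:%S")   # string       -> struct_time
ts2 = time.mktime(st2)                        # struct_time  -> timestamp
print(ts, s, ts2)
```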
random module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import random

"""
random module:
"""

# Generate a random float n in the range 0 <= n < 1.0
print(random.random())

# random.randint(a, b): generate an integer in the given range, where a is
# the lower bound and b the upper bound: a <= n <= b
print(random.randint(1, 10))

# random.randrange([start], stop[, step]):
# pick a random number from the set that counts up from start by step.
# E.g. random.randrange(10, 100, 2) effectively picks from the sequence
# [10, 12, 14, 16, ... 96, 98]; it is equivalent in outcome to
# random.choice(range(10, 100, 2)).
print(random.randrange(1, 10))
print(random.choice(range(10, 100, 2)))

# random.choice(sequence): pick one random element from a sequence.
# "Sequence" is not a specific type in Python but a family of types:
# list, tuple and str are all sequences.
print(random.choice("abcdef"))
print(random.choice("学习Python的小伙伴"))                      # e.g. 伙
print(random.choice(["JGood", "is", "a", "handsome", "boy"]))  # e.g. boy -- list
print(random.choice(("Tuple", "List", "Dict")))                # e.g. Tuple

# random.sample(sequence, k): pick a slice of the given length at random;
# sample() does not modify the original sequence.
print(random.sample([1, 2, 3, 4, 5, 6, 7, 8, 9], 5))  # [2, 1, 9, 5, 7]

# Random integer:
print(random.randint(0, 99))  # 70

# Random even number between 0 and 100:
print(random.randrange(0, 101, 2))  # 4

# Random floats:
print(random.random())        # 0.2746445568079129
print(random.uniform(1, 10))  # 9.887001463194844

# Random character:
print(random.choice('abcdefg&#%^*f'))  # e

# Pick a given number of characters:
print(random.sample('abcdefghij123', 3))  # ['3', 'j', 'i']

# Random string from a list:
print(random.choice(['apple', 'pear', 'peach', 'orange', 'lemon']))  # peach

# Shuffle
items = [1, 2, 3, 4, 5, 6, 7, 8, 9]
print(items)           # [1, 2, 3, 4, 5, 6, 7, 8, 9]
random.shuffle(items)
print(items)           # [8, 3, 6, 1, 4, 9, 5, 7, 2]

"""
Generate a verification code: a random code of the given length made up of
digits, uppercase letters and lowercase letters.
"""
def produce_check_code(scope=6):
    check_code = ""
    for i in range(scope):
        tmp = random.randint(0, 10)
        if tmp < 6:
            tmp = random.randint(0, 9)          # digit
        elif tmp > 8:
            tmp = chr(random.randint(65, 90))   # uppercase letter
        else:
            tmp = chr(random.randint(97, 122))  # lowercase letter
        check_code += str(tmp)
    return check_code

print(produce_check_code(8))
```
```text
0.21786963196954112
3
2
34
b
的
JGood
List
[7, 2, 6, 4, 8]
12
14
0.5355914470942843
3.3065568721321013
%
['2', 'g', 'f']
pear
[1, 2, 3, 4, 5, 6, 7, 8, 9]
[6, 7, 5, 9, 1, 2, 3, 4, 8]
D626EbYt
```
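A more compact variant of the same verification-code idea, a minimal sketch assuming Python 3.6+ for `random.choices`, with the character pool built from the standard `string` constants:

```python
import random
import string

def produce_check_code(scope=6):
    # Pool of digits + uppercase + lowercase letters
    pool = string.digits + string.ascii_letters
    # choices() samples with replacement, so characters may repeat
    return ''.join(random.choices(pool, k=scope))

print(produce_check_code(8))  # e.g. 'kD3t9QxZ'
```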
os module
Provides an interface for making calls to the operating system:
```python
# Author: Allister.Liu
#!/usr/bin/env python
import os

"""
os module:
"""
path = "E:/logo/ic2c/logo.png"

# Get the current working directory, i.e. the directory the Python script
# runs in === linux: pwd
print(os.getcwd())

# Change the current working directory; like cd in a shell
# os.chdir("dirname")

# The current directory: ('.')
print(os.curdir)

# The parent of the current directory as a string: ('..')
print(os.pardir)

# Create a multi-level (recursive) directory tree
# os.makedirs('dirname1/dirname2')

# If the directory is empty, delete it and recurse up to the parent; if that
# is empty too, delete it as well, and so on
# os.removedirs('dirname1')

# Create a single directory; like mkdir dirname in a shell
# os.mkdir('dirname')

# Delete a single empty directory; raises an error if it is not empty; like
# rmdir dirname in a shell
# os.rmdir('dirname')

# List all files and subdirectories in the given directory, including hidden
# files, as a list
print(os.listdir('E:/logo'))

# Delete a file
# os.remove()

# Rename a file/directory
# os.rename("oldname", "newname")

# Get file/directory information
# os.stat('path/filename')

# The OS-specific path separator: "\\" on Windows, "/" on Linux
os.sep
# The line terminator of the current platform: "\r\n" on Windows, "\n" on Linux
os.linesep
# The separator used in path lists, e.g. in the PATH environment variable
os.pathsep
# String naming the current platform: 'nt' on Windows, 'posix' on Linux
os.name

# Run a shell command and show its output directly
os.system("dir")

# Get the system environment variables
print(os.environ)

# Return the normalized absolute version of path
print(os.path.abspath(path))

# Split path into a (directory, filename) two-tuple
print(os.path.split(path))

# Return the directory part of path, i.e. the first element of
# os.path.split(path)
print(os.path.dirname(path))

# Return the final file name of path. If path ends with / or \, this returns
# an empty string; i.e. the second element of os.path.split(path)
print(os.path.basename(path))

# Return True if path exists, False otherwise
print(os.path.exists(path))

# Return True if path is an absolute path
print(os.path.isabs(path))

# Return True if path is an existing file, False otherwise
print(os.path.isfile(path))

# Return True if path is an existing directory, False otherwise
print(os.path.isdir(path))

# Join multiple path components; parameters before the last absolute path
# are ignored
# os.path.join(path1[, path2[, ...]])

# Return the last access time of the file or directory path points to
print(os.path.getatime(path))

# Return the last modification time of the file or directory path points to
print(os.path.getmtime(path))
```
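The functions above inspect a single path; for walking a whole directory tree, `os.walk` is the usual tool. A minimal sketch, reusing the example `E:/logo` directory from above:

```python
import os

# Walk the tree top-down; each step yields the current directory, its
# subdirectory names and its file names
for dirpath, dirnames, filenames in os.walk("E:/logo"):
    for filename in filenames:
        print(os.path.join(dirpath, filename))
```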
sys module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import sys

print(help(sys))

# Command-line arguments as a list; the first element is the path of the
# program itself
sys.argv

# Exit the program; exit(0) is a normal exit
# sys.exit(0)

# Version information of the Python interpreter
print(sys.version)

# The largest int value
print(sys.maxsize)

# The module search path, initialized from the PYTHONPATH environment variable
print(sys.path)

# The name of the operating-system platform
print(sys.platform)

# Write without a trailing newline (useful for progress bars)
sys.stdout.write('please:')
val = sys.stdin.readline()[:-1]
print(val)
```
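Because `sys.stdout.write` does not append a newline, repeated writes with a carriage return can redraw a single line in place, which is how simple progress bars work. A minimal sketch (the `\r` and `flush()` are the essential parts):

```python
import sys
import time

for i in range(101):
    # '\r' moves the cursor back to the start of the line, so each write
    # overdraws the previous one
    sys.stdout.write("\r[%-50s] %d%%" % ('#' * (i // 2), i))
    sys.stdout.flush()
    time.sleep(0.05)
sys.stdout.write("\n")
```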
shelve module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import shelve
import os, datetime

"""
shelve module: a simple key/value module that persists in-memory data to a
file; it can persist any Python data format that pickle supports.
"""

file_path = "datas"
# Create the directory if it does not exist
if not os.path.exists(file_path):
    os.mkdir(file_path)

# Open a shelf file
d = shelve.open(file_path + "/shelve_file.data")


class Test(object):
    def __init__(self, n):
        self.n = n


t1 = Test(123)
t2 = Test(123334)

names = ["Allister", "Linde", "Heddy", "Daty"]

# Persist a list under the key "names"
d["names"] = names
# Persist class instances
d["t1"] = t1
d["t2"] = t2
d["date"] = datetime.datetime.now()

"""
Read the file contents back
"""
# Get a value by key
print(d.get("names"))
print(d.get("t1"))
print(d.get("date"))
print(d.items())

d.close()
```
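One shelve caveat: mutating a stored object in place is not persisted by default. A minimal sketch of the two common workarounds, assuming the shelf file created above:

```python
import shelve

# Option 1: re-assign the key after mutating a copy
with shelve.open("datas/shelve_file.data") as d:
    names = d["names"]
    names.append("Eve")
    d["names"] = names          # the write happens on assignment

# Option 2: open with writeback=True so cached entries are written back
# automatically on sync()/close()
with shelve.open("datas/shelve_file.data", writeback=True) as d:
    d["names"].append("Frank")  # the in-place mutation is now persisted
```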
shutil module
```python
# Author: Allister.Liu
#!/usr/bin/env python
import shutil

"""
shutil module:

shutil.copyfileobj(fsrc, fdst[, length]): copy the contents of one file
    object into another; can copy just part of the content;
shutil.copyfile(src, dst): copy a file;
shutil.copymode(src, dst): copy only the permission bits; content, group and
    owner stay unchanged;
shutil.copystat(src, dst): copy status information: mode bits, atime, mtime,
    flags;
shutil.copy(src, dst): copy the file and its permissions;
shutil.copy2(src, dst): copy the file plus its status information,
    permissions, etc.;
shutil.rmtree(path[, ignore_errors[, onerror]]): recursively delete a
    directory tree;
shutil.move(src, dst): recursively move a file or directory;
shutil.copytree(src, dst, symlinks=False, ignore=None): recursively copy a
    directory tree;
shutil.make_archive(base_name, format, ...): create an archive, e.g. zip or
    tar, and return its path;
    base_name: the archive file name, or a path to it. A bare name saves to
        the current directory, a path saves to that location,
        e.g. ic2c                 => saved to the current directory;
        e.g. /Users/Allister/ic2c => saved to /Users/Allister/;
    format: the archive type: "zip", "tar", "bztar", "gztar";
    root_dir: the directory to archive (defaults to the current directory);
    owner: the user, defaults to the current user;
    group: the group, defaults to the current group;
    logger: used for logging, usually a logging.Logger object;
"""

"""
Copy "笔记.data" to "笔记1.data"
"""
with open("笔记.data", "r", encoding="utf-8") as f1:
    with open("笔记1.data", "w", encoding="utf-8") as f2:
        shutil.copyfileobj(f1, f2)

# copyfile opens and copies the files itself; no need to open them first
# shutil.copyfile("笔记.data", "笔记2.data")

# Recursively copy every file under a directory
# shutil.copytree("../day4", "../day5/copys")

# Delete the directory tree copied above
# shutil.rmtree("copys")

# Create an archive and return its path
# print(shutil.make_archive("H:/wx/432", "zip", root_dir="H:/PycharmProjects/python_tutorial/"))
```
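shutil can also unpack what `make_archive` produces; a minimal round-trip sketch, with hypothetical file and directory names:

```python
import shutil

# Pack ./project into project_backup.zip in the current directory
archive = shutil.make_archive("project_backup", "zip", root_dir="project")
print(archive)

# Unpack it again; the format is inferred from the file extension
shutil.unpack_archive(archive, extract_dir="restored_project")
```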
xml module
Sample document (datas/xml_test.xml):

```xml
<data>
    <country name="Liechtenstein">
        <rank updated="yes">2</rank>
        <year updated="yes">2009</year>
        <gdppc>141100</gdppc>
        <neighbor direction="E" name="Austria" />
        <neighbor direction="W" name="Switzerland" />
    </country>
    <country name="Singapore">
        <rank updated="yes">5</rank>
        <year updated="yes">2012</year>
        <gdppc>59900</gdppc>
        <neighbor direction="N" name="Malaysia" />
    </country>
    <country name="Panama">
        <rank updated="yes">69</rank>
        <year updated="yes">2012</year>
        <gdppc>13600</gdppc>
        <neighbor direction="W" name="Costa Rica" />
        <neighbor direction="E" name="Colombia" />
    </country>
</data>
```
```python
# Author: Allister.Liu
#!/usr/bin/env python
import xml.etree.ElementTree as ET

"""
xml processing module: XML is a protocol for exchanging data between
different languages or programs, much like JSON, though JSON is simpler to
use. Back in the dark ages before JSON existed, XML was the only choice, and
to this day the interfaces of many traditional companies, e.g. in the
financial industry, are still mainly XML.
"""

# The XML protocol is supported in every language; in Python it can be
# handled with the following module
tree = ET.parse("datas/xml_test.xml")
root = tree.getroot()
print("root node:", root.tag)

# print("walk the xml document".center(50, "~"))
# # Walk the whole xml document
# for child in root:
#     print(child.tag, child.attrib)
#     for i in child:
#         print(i.tag, i.text)
#
# print("year nodes".center(50, "~"))
# # Iterate over the year nodes only
# for node in root.iter('year'):
#     print(node.tag, node.text)

"""
Modify and delete xml document content
"""
# Modify
for node in root.iter('year'):
    new_year = int(node.text) + 1
    node.text = str(new_year)
    node.set("updated", "yes")

tree.write("datas/xmltest.xml")

# Delete nodes
for country in root.findall('country'):
    rank = int(country.find('rank').text)
    if rank > 50:
        root.remove(country)

tree.write('datas/output.xml')

"""
Create an xml document
"""
new_xml = ET.Element("namelist")
name = ET.SubElement(new_xml, "name", attrib={"enrolled": "yes"})
age = ET.SubElement(name, "age", attrib={"checked": "no"})
sex = ET.SubElement(name, "sex")
sex.text = '33'

name2 = ET.SubElement(new_xml, "name", attrib={"enrolled": "no"})
age = ET.SubElement(name2, "age")
age.text = '19'

et = ET.ElementTree(new_xml)  # build a document object
et.write("datas/test.xml", encoding="utf-8", xml_declaration=True)

ET.dump(new_xml)  # print the generated document
```
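ElementTree also understands a small XPath subset, which is often tidier than nested loops for targeted lookups. A minimal sketch against the sample document above:

```python
import xml.etree.ElementTree as ET

tree = ET.parse("datas/xml_test.xml")
root = tree.getroot()

# All neighbors anywhere in the tree whose direction attribute is "E"
for neighbor in root.findall(".//neighbor[@direction='E']"):
    print(neighbor.get("name"))

# The rank of a specific country, selected by attribute
rank = root.find("country[@name='Singapore']/rank")
print(rank.text)
```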
configparser module
- Generating a config file:
```python
# Author: Allister.Liu
#!/usr/bin/env python
import configparser

"""
A mysql-style configuration file:
"""
config = configparser.ConfigParser()

# First way to assign a section
config["client"] = {'port': '3306', 'default-character-set': 'utf8'}

# Second way
config['mysqld'] = {}
config['mysqld']['port'] = '3306'
config['mysqld']['character_set_server'] = 'utf8'
config['mysqld']['collation-server'] = 'utf8_general_ci'
config['mysqld']['lower_case_table_names'] = '1'
config['mysqld']['max_connections'] = '200'

# Third way
config['mysqld_safe'] = {}
topsecret = config['mysqld_safe']
topsecret['log-error'] = '/usr/local/mysql/error.log'

config['mysqld']['datadir'] = '/usr/local/mysql/data'

with open('datas/my.ini', 'w') as configfile:
    config.write(configfile)
```
- Reading a config file:
```python
# Author: Allister.Liu
#!/usr/bin/env python
import configparser

"""
Reading with configparser:
"""
config = configparser.ConfigParser()

# Read the file; returns the list of paths successfully parsed
config.read('datas/my.ini')

# The section names in the file
print(config.sections())  # ['client', 'mysqld', 'mysqld_safe', 'logs']

# Check whether a section exists in the file
print("mysqld" in config)  # True

# Get a value under a section
print(config["mysqld"]["port"])            # 3306
print(config["mysqld_safe"]["log-error"])  # /usr/local/mysql/error.log

topsecret = config["mysqld_safe"]
print(topsecret["log-error"])              # /usr/local/mysql/error.log

print("iterate over the config file".center(50, "~"))
for key in config["mysqld"]:
    print(key)

# The options of a section as a list of tuples
# [('port', '3306'), ('character_set_server', 'utf8'), ('collation-server', 'utf8_general_ci'), ('lower_case_table_names', '1'), ('max_connections', '200'), ('datadir', '/usr/local/mysql/data')]
print(config.items("mysqld"))

print(" modify ".center(50, "#"))

# Delete the mysqld section and rewrite the file
# sec = config.remove_section('mysqld')  # the section to delete
# config.write(open('datas/my.ini', "w"))
#
# # Check whether a section exists
# sec = config.has_section('mysqld')
# print(sec)
#
# # Add a section; raises an error if it already exists
# sec = config.add_section('logs')
# config.write(open('datas/my.ini', "w"))

# Add log_path under the logs section
config.set('logs', 'log_path', "/usr/logs")
config.write(open('datas/my.ini', "w"))
```
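Option values always come back as strings; configparser's typed getters handle the conversion and allow fallbacks for missing keys. A minimal sketch reusing datas/my.ini from above (the `wait_timeout` key is hypothetical):

```python
import configparser

config = configparser.ConfigParser()
config.read('datas/my.ini')

# Typed getters parse the raw string values
port = config.getint('mysqld', 'port')                         # 3306 as an int
lower = config.getboolean('mysqld', 'lower_case_table_names')  # '1' -> True

# fallback avoids a NoOptionError when the key is absent
timeout = config.getint('mysqld', 'wait_timeout', fallback=28800)

print(port, lower, timeout)
```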
hashlib & hmac modules
```python
# Author: Allister.Liu
#!/usr/bin/env python
import hashlib

"""
hashlib module: used for hash/digest operations. In 3.x it replaces the old
md5 and sha modules, and mainly provides the SHA1, SHA224, SHA256, SHA384,
SHA512 and MD5 algorithms.
"""
m1 = hashlib.md5()
m1.update("asdfghjkl".encode("utf-8"))

# Binary digest
print(m1.digest())
# Hex digest
print(m1.hexdigest())

# ######## md5 ########
print(" md5 ".center(50, "#"))
hash = hashlib.md5()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha1 ########
print(" sha1 ".center(50, "#"))
hash = hashlib.sha1()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha256 ########
print(" sha256 ".center(50, "#"))
hash = hashlib.sha256()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha384 ########
print(" sha384 ".center(50, "#"))
hash = hashlib.sha384()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

# ######## sha512 ########
print(" sha512 ".center(50, "#"))
hash = hashlib.sha512()
hash.update('admin'.encode("utf-8"))
print(hash.hexdigest())

"""
Python also has an hmac module, which internally combines the key and the
message before hashing.

A hashed message authentication code (HMAC) is an authentication mechanism
based on a message authentication code (MAC). With HMAC, the two parties of
a conversation verify a message's authenticity via a shared secret key K.

It is commonly used to protect messages in network communication: both
sides first agree on a key, like a secret handshake; the sender hashes the
message with the key, and the receiver hashes the received plaintext with
the same key and compares the result against the sender's value. A match
verifies both the integrity of the message and the legitimacy of the sender.
"""
import hmac

# digestmod is required from Python 3.8 onwards
h = hmac.new('中华好儿女'.encode("utf-8"), '美丽的山河'.encode("utf-8"),
             digestmod='md5')
print(h.hexdigest())
```
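For storing passwords, a single unsalted digest like the MD5/SHA examples above is easy to brute-force; hashlib provides a salted, iterated key-derivation function for that use case. A minimal sketch with `hashlib.pbkdf2_hmac`:

```python
import hashlib
import os

password = "admin".encode("utf-8")
salt = os.urandom(16)  # a fresh random salt per password

# 100000 rounds of HMAC-SHA256; store the salt together with the derived key
dk = hashlib.pbkdf2_hmac('sha256', password, salt, 100000)
print(salt.hex(), dk.hex())
```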
zipfile & tarfile modules
```python
# Author: Allister.Liu
#!/usr/bin/env python

"""
zip compress / decompress
"""
import zipfile

# Compress
z = zipfile.ZipFile('Allister.zip', 'w')
z.write('笔记.data')
z.write('sys_test.py')
z.close()

# Decompress
z = zipfile.ZipFile('Allister.zip', 'r')
z.extractall()
z.close()

"""
tar pack / unpack
"""
import tarfile

# Pack
tar = tarfile.open('your.tar', 'w')
tar.add('/home/dsa.tools/mysql.zip', arcname='mysql.zip')
tar.add('/Users/wupeiqi/PycharmProjects/cmdb.zip', arcname='cmdb.zip')
tar.close()

# Unpack
tar = tarfile.open('your.tar', 'r')
tar.extractall()  # an extraction path can be given
tar.close()
```
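Note that `ZipFile(..., 'w')` stores members uncompressed (ZIP_STORED) by default, and a bare `.tar` is uncompressed too. A sketch of the compressed variants with hypothetical file names, using context managers so the archives close automatically:

```python
import zipfile
import tarfile

# DEFLATE-compressed zip (requires the zlib module)
with zipfile.ZipFile('notes.zip', 'w', compression=zipfile.ZIP_DEFLATED) as z:
    z.write('notes.txt')

# gzip-compressed tar: the 'w:gz' mode selects the compression
with tarfile.open('notes.tar.gz', 'w:gz') as tar:
    tar.add('notes.txt')

# Reading back; 'r:*' auto-detects the compression
with tarfile.open('notes.tar.gz', 'r:*') as tar:
    tar.extractall()
```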
a. zipfile
1 """ 2 Read and write ZIP files. 3 4 XXX references to utf-8 need further investigation. 5 """ 6 import io 7 import os 8 import re 9 import importlib.util 10 import sys 11 import time 12 import stat 13 import shutil 14 import struct 15 import binascii 16 17 try: 18 import threading 19 except ImportError: 20 import dummy_threading as threading 21 22 try: 23 import zlib # We may need its compression method 24 crc32 = zlib.crc32 25 except ImportError: 26 zlib = None 27 crc32 = binascii.crc32 28 29 try: 30 import bz2 # We may need its compression method 31 except ImportError: 32 bz2 = None 33 34 try: 35 import lzma # We may need its compression method 36 except ImportError: 37 lzma = None 38 39 __all__ = ["BadZipFile", "BadZipfile", "error", 40 "ZIP_STORED", "ZIP_DEFLATED", "ZIP_BZIP2", "ZIP_LZMA", 41 "is_zipfile", "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile"] 42 43 class BadZipFile(Exception): 44 pass 45 46 47 class LargeZipFile(Exception): 48 """ 49 Raised when writing a zipfile, the zipfile requires ZIP64 extensions 50 and those extensions are disabled. 51 """ 52 53 error = BadZipfile = BadZipFile # Pre-3.2 compatibility names 54 55 56 ZIP64_LIMIT = (1 << 31) - 1 57 ZIP_FILECOUNT_LIMIT = (1 << 16) - 1 58 ZIP_MAX_COMMENT = (1 << 16) - 1 59 60 # constants for Zip file compression methods 61 ZIP_STORED = 0 62 ZIP_DEFLATED = 8 63 ZIP_BZIP2 = 12 64 ZIP_LZMA = 14 65 # Other ZIP compression methods not supported 66 67 DEFAULT_VERSION = 20 68 ZIP64_VERSION = 45 69 BZIP2_VERSION = 46 70 LZMA_VERSION = 63 71 # we recognize (but not necessarily support) all features up to that version 72 MAX_EXTRACT_VERSION = 63 73 74 # Below are some formats and associated data for reading/writing headers using 75 # the struct module. The names and structures of headers/records are those used 76 # in the PKWARE description of the ZIP file format: 77 # http://www.pkware.com/documents/casestudies/APPNOTE.TXT 78 # (URL valid as of January 2008) 79 80 # The "end of central directory" structure, magic number, size, and indices 81 # (section V.I in the format document) 82 structEndArchive = b"<4s4H2LH" 83 stringEndArchive = b"PK\005\006" 84 sizeEndCentDir = struct.calcsize(structEndArchive) 85 86 _ECD_SIGNATURE = 0 87 _ECD_DISK_NUMBER = 1 88 _ECD_DISK_START = 2 89 _ECD_ENTRIES_THIS_DISK = 3 90 _ECD_ENTRIES_TOTAL = 4 91 _ECD_SIZE = 5 92 _ECD_OFFSET = 6 93 _ECD_COMMENT_SIZE = 7 94 # These last two indices are not part of the structure as defined in the 95 # spec, but they are used internally by this module as a convenience 96 _ECD_COMMENT = 8 97 _ECD_LOCATION = 9 98 99 # The "central directory" structure, magic number, size, and indices 100 # of entries in the structure (section V.F in the format document) 101 structCentralDir = "<4s4B4HL2L5H2L" 102 stringCentralDir = b"PK\001\002" 103 sizeCentralDir = struct.calcsize(structCentralDir) 104 105 # indexes of entries in the central directory structure 106 _CD_SIGNATURE = 0 107 _CD_CREATE_VERSION = 1 108 _CD_CREATE_SYSTEM = 2 109 _CD_EXTRACT_VERSION = 3 110 _CD_EXTRACT_SYSTEM = 4 111 _CD_FLAG_BITS = 5 112 _CD_COMPRESS_TYPE = 6 113 _CD_TIME = 7 114 _CD_DATE = 8 115 _CD_CRC = 9 116 _CD_COMPRESSED_SIZE = 10 117 _CD_UNCOMPRESSED_SIZE = 11 118 _CD_FILENAME_LENGTH = 12 119 _CD_EXTRA_FIELD_LENGTH = 13 120 _CD_COMMENT_LENGTH = 14 121 _CD_DISK_NUMBER_START = 15 122 _CD_INTERNAL_FILE_ATTRIBUTES = 16 123 _CD_EXTERNAL_FILE_ATTRIBUTES = 17 124 _CD_LOCAL_HEADER_OFFSET = 18 125 126 # The "local file header" structure, magic number, size, and indices 127 # (section V.A in the format 
document) 128 structFileHeader = "<4s2B4HL2L2H" 129 stringFileHeader = b"PK\003\004" 130 sizeFileHeader = struct.calcsize(structFileHeader) 131 132 _FH_SIGNATURE = 0 133 _FH_EXTRACT_VERSION = 1 134 _FH_EXTRACT_SYSTEM = 2 135 _FH_GENERAL_PURPOSE_FLAG_BITS = 3 136 _FH_COMPRESSION_METHOD = 4 137 _FH_LAST_MOD_TIME = 5 138 _FH_LAST_MOD_DATE = 6 139 _FH_CRC = 7 140 _FH_COMPRESSED_SIZE = 8 141 _FH_UNCOMPRESSED_SIZE = 9 142 _FH_FILENAME_LENGTH = 10 143 _FH_EXTRA_FIELD_LENGTH = 11 144 145 # The "Zip64 end of central directory locator" structure, magic number, and size 146 structEndArchive64Locator = "<4sLQL" 147 stringEndArchive64Locator = b"PK\x06\x07" 148 sizeEndCentDir64Locator = struct.calcsize(structEndArchive64Locator) 149 150 # The "Zip64 end of central directory" record, magic number, size, and indices 151 # (section V.G in the format document) 152 structEndArchive64 = "<4sQ2H2L4Q" 153 stringEndArchive64 = b"PK\x06\x06" 154 sizeEndCentDir64 = struct.calcsize(structEndArchive64) 155 156 _CD64_SIGNATURE = 0 157 _CD64_DIRECTORY_RECSIZE = 1 158 _CD64_CREATE_VERSION = 2 159 _CD64_EXTRACT_VERSION = 3 160 _CD64_DISK_NUMBER = 4 161 _CD64_DISK_NUMBER_START = 5 162 _CD64_NUMBER_ENTRIES_THIS_DISK = 6 163 _CD64_NUMBER_ENTRIES_TOTAL = 7 164 _CD64_DIRECTORY_SIZE = 8 165 _CD64_OFFSET_START_CENTDIR = 9 166 167 def _check_zipfile(fp): 168 try: 169 if _EndRecData(fp): 170 return True # file has correct magic number 171 except OSError: 172 pass 173 return False 174 175 def is_zipfile(filename): 176 """Quickly see if a file is a ZIP file by checking the magic number. 177 178 The filename argument may be a file or file-like object too. 179 """ 180 result = False 181 try: 182 if hasattr(filename, "read"): 183 result = _check_zipfile(fp=filename) 184 else: 185 with open(filename, "rb") as fp: 186 result = _check_zipfile(fp) 187 except OSError: 188 pass 189 return result 190 191 def _EndRecData64(fpin, offset, endrec): 192 """ 193 Read the ZIP64 end-of-archive records and use that to update endrec 194 """ 195 try: 196 fpin.seek(offset - sizeEndCentDir64Locator, 2) 197 except OSError: 198 # If the seek fails, the file is not large enough to contain a ZIP64 199 # end-of-archive record, so just return the end record we were given. 200 return endrec 201 202 data = fpin.read(sizeEndCentDir64Locator) 203 if len(data) != sizeEndCentDir64Locator: 204 return endrec 205 sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) 206 if sig != stringEndArchive64Locator: 207 return endrec 208 209 if diskno != 0 or disks != 1: 210 raise BadZipFile("zipfiles that span multiple disks are not supported") 211 212 # Assume no 'zip64 extensible data' 213 fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) 214 data = fpin.read(sizeEndCentDir64) 215 if len(data) != sizeEndCentDir64: 216 return endrec 217 sig, sz, create_version, read_version, disk_num, disk_dir, \ 218 dircount, dircount2, dirsize, diroffset = \ 219 struct.unpack(structEndArchive64, data) 220 if sig != stringEndArchive64: 221 return endrec 222 223 # Update the original endrec using data from the ZIP64 record 224 endrec[_ECD_SIGNATURE] = sig 225 endrec[_ECD_DISK_NUMBER] = disk_num 226 endrec[_ECD_DISK_START] = disk_dir 227 endrec[_ECD_ENTRIES_THIS_DISK] = dircount 228 endrec[_ECD_ENTRIES_TOTAL] = dircount2 229 endrec[_ECD_SIZE] = dirsize 230 endrec[_ECD_OFFSET] = diroffset 231 return endrec 232 233 234 def _EndRecData(fpin): 235 """Return data from the "End of Central Directory" record, or None. 
236 237 The data is a list of the nine items in the ZIP "End of central dir" 238 record followed by a tenth item, the file seek offset of this record.""" 239 240 # Determine file size 241 fpin.seek(0, 2) 242 filesize = fpin.tell() 243 244 # Check to see if this is ZIP file with no archive comment (the 245 # "end of central directory" structure should be the last item in the 246 # file if this is the case). 247 try: 248 fpin.seek(-sizeEndCentDir, 2) 249 except OSError: 250 return None 251 data = fpin.read() 252 if (len(data) == sizeEndCentDir and 253 data[0:4] == stringEndArchive and 254 data[-2:] == b"\000\000"): 255 # the signature is correct and there's no comment, unpack structure 256 endrec = struct.unpack(structEndArchive, data) 257 endrec=list(endrec) 258 259 # Append a blank comment and record start offset 260 endrec.append(b"") 261 endrec.append(filesize - sizeEndCentDir) 262 263 # Try to read the "Zip64 end of central directory" structure 264 return _EndRecData64(fpin, -sizeEndCentDir, endrec) 265 266 # Either this is not a ZIP file, or it is a ZIP file with an archive 267 # comment. Search the end of the file for the "end of central directory" 268 # record signature. The comment is the last item in the ZIP file and may be 269 # up to 64K long. It is assumed that the "end of central directory" magic 270 # number does not appear in the comment. 271 maxCommentStart = max(filesize - (1 << 16) - sizeEndCentDir, 0) 272 fpin.seek(maxCommentStart, 0) 273 data = fpin.read() 274 start = data.rfind(stringEndArchive) 275 if start >= 0: 276 # found the magic number; attempt to unpack and interpret 277 recData = data[start:start+sizeEndCentDir] 278 if len(recData) != sizeEndCentDir: 279 # Zip file is corrupted. 280 return None 281 endrec = list(struct.unpack(structEndArchive, recData)) 282 commentSize = endrec[_ECD_COMMENT_SIZE] #as claimed by the zip file 283 comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize] 284 endrec.append(comment) 285 endrec.append(maxCommentStart + start) 286 287 # Try to read the "Zip64 end of central directory" structure 288 return _EndRecData64(fpin, maxCommentStart + start - filesize, 289 endrec) 290 291 # Unable to find a valid end of central directory structure 292 return None 293 294 295 class ZipInfo (object): 296 """Class with attributes describing each file in the ZIP archive.""" 297 298 __slots__ = ( 299 'orig_filename', 300 'filename', 301 'date_time', 302 'compress_type', 303 'comment', 304 'extra', 305 'create_system', 306 'create_version', 307 'extract_version', 308 'reserved', 309 'flag_bits', 310 'volume', 311 'internal_attr', 312 'external_attr', 313 'header_offset', 314 'CRC', 315 'compress_size', 316 'file_size', 317 '_raw_time', 318 ) 319 320 def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)): 321 self.orig_filename = filename # Original file name in archive 322 323 # Terminate the file name at the first null byte. Null bytes in file 324 # names are used as tricks by viruses in archives. 325 null_byte = filename.find(chr(0)) 326 if null_byte >= 0: 327 filename = filename[0:null_byte] 328 # This is used to ensure paths in generated ZIP files always use 329 # forward slashes as the directory separator, as required by the 330 # ZIP format specification. 
331 if os.sep != "/" and os.sep in filename: 332 filename = filename.replace(os.sep, "/") 333 334 self.filename = filename # Normalized file name 335 self.date_time = date_time # year, month, day, hour, min, sec 336 337 if date_time[0] < 1980: 338 raise ValueError('ZIP does not support timestamps before 1980') 339 340 # Standard values: 341 self.compress_type = ZIP_STORED # Type of compression for the file 342 self.comment = b"" # Comment for each file 343 self.extra = b"" # ZIP extra data 344 if sys.platform == 'win32': 345 self.create_system = 0 # System which created ZIP archive 346 else: 347 # Assume everything else is unix-y 348 self.create_system = 3 # System which created ZIP archive 349 self.create_version = DEFAULT_VERSION # Version which created ZIP archive 350 self.extract_version = DEFAULT_VERSION # Version needed to extract archive 351 self.reserved = 0 # Must be zero 352 self.flag_bits = 0 # ZIP flag bits 353 self.volume = 0 # Volume number of file header 354 self.internal_attr = 0 # Internal attributes 355 self.external_attr = 0 # External file attributes 356 # Other attributes are set by class ZipFile: 357 # header_offset Byte offset to the file header 358 # CRC CRC-32 of the uncompressed file 359 # compress_size Size of the compressed file 360 # file_size Size of the uncompressed file 361 362 def __repr__(self): 363 result = ['<%s filename=%r' % (self.__class__.__name__, self.filename)] 364 if self.compress_type != ZIP_STORED: 365 result.append(' compress_type=%s' % 366 compressor_names.get(self.compress_type, 367 self.compress_type)) 368 hi = self.external_attr >> 16 369 lo = self.external_attr & 0xFFFF 370 if hi: 371 result.append(' filemode=%r' % stat.filemode(hi)) 372 if lo: 373 result.append(' external_attr=%#x' % lo) 374 isdir = self.filename[-1:] == '/' 375 if not isdir or self.file_size: 376 result.append(' file_size=%r' % self.file_size) 377 if ((not isdir or self.compress_size) and 378 (self.compress_type != ZIP_STORED or 379 self.file_size != self.compress_size)): 380 result.append(' compress_size=%r' % self.compress_size) 381 result.append('>') 382 return ''.join(result) 383 384 def FileHeader(self, zip64=None): 385 """Return the per-file header as a string.""" 386 dt = self.date_time 387 dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 388 dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 389 if self.flag_bits & 0x08: 390 # Set these to zero because we write them after the file data 391 CRC = compress_size = file_size = 0 392 else: 393 CRC = self.CRC 394 compress_size = self.compress_size 395 file_size = self.file_size 396 397 extra = self.extra 398 399 min_version = 0 400 if zip64 is None: 401 zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT 402 if zip64: 403 fmt = '<HHQQ' 404 extra = extra + struct.pack(fmt, 405 1, struct.calcsize(fmt)-4, file_size, compress_size) 406 if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT: 407 if not zip64: 408 raise LargeZipFile("Filesize would require ZIP64 extensions") 409 # File is larger than what fits into a 4 byte integer, 410 # fall back to the ZIP64 extension 411 file_size = 0xffffffff 412 compress_size = 0xffffffff 413 min_version = ZIP64_VERSION 414 415 if self.compress_type == ZIP_BZIP2: 416 min_version = max(BZIP2_VERSION, min_version) 417 elif self.compress_type == ZIP_LZMA: 418 min_version = max(LZMA_VERSION, min_version) 419 420 self.extract_version = max(min_version, self.extract_version) 421 self.create_version = max(min_version, self.create_version) 422 filename, flag_bits = 
self._encodeFilenameFlags() 423 header = struct.pack(structFileHeader, stringFileHeader, 424 self.extract_version, self.reserved, flag_bits, 425 self.compress_type, dostime, dosdate, CRC, 426 compress_size, file_size, 427 len(filename), len(extra)) 428 return header + filename + extra 429 430 def _encodeFilenameFlags(self): 431 try: 432 return self.filename.encode('ascii'), self.flag_bits 433 except UnicodeEncodeError: 434 return self.filename.encode('utf-8'), self.flag_bits | 0x800 435 436 def _decodeExtra(self): 437 # Try to decode the extra field. 438 extra = self.extra 439 unpack = struct.unpack 440 while len(extra) >= 4: 441 tp, ln = unpack('<HH', extra[:4]) 442 if tp == 1: 443 if ln >= 24: 444 counts = unpack('<QQQ', extra[4:28]) 445 elif ln == 16: 446 counts = unpack('<QQ', extra[4:20]) 447 elif ln == 8: 448 counts = unpack('<Q', extra[4:12]) 449 elif ln == 0: 450 counts = () 451 else: 452 raise RuntimeError("Corrupt extra field %s"%(ln,)) 453 454 idx = 0 455 456 # ZIP64 extension (large files and/or large archives) 457 if self.file_size in (0xffffffffffffffff, 0xffffffff): 458 self.file_size = counts[idx] 459 idx += 1 460 461 if self.compress_size == 0xFFFFFFFF: 462 self.compress_size = counts[idx] 463 idx += 1 464 465 if self.header_offset == 0xffffffff: 466 old = self.header_offset 467 self.header_offset = counts[idx] 468 idx+=1 469 470 extra = extra[ln+4:] 471 472 473 class _ZipDecrypter: 474 """Class to handle decryption of files stored within a ZIP archive. 475 476 ZIP supports a password-based form of encryption. Even though known 477 plaintext attacks have been found against it, it is still useful 478 to be able to get data out of such a file. 479 480 Usage: 481 zd = _ZipDecrypter(mypwd) 482 plain_char = zd(cypher_char) 483 plain_text = map(zd, cypher_text) 484 """ 485 486 def _GenerateCRCTable(): 487 """Generate a CRC-32 table. 488 489 ZIP encryption uses the CRC32 one-byte primitive for scrambling some 490 internal keys. We noticed that a direct implementation is faster than 491 relying on binascii.crc32(). 
492 """ 493 poly = 0xedb88320 494 table = [0] * 256 495 for i in range(256): 496 crc = i 497 for j in range(8): 498 if crc & 1: 499 crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly 500 else: 501 crc = ((crc >> 1) & 0x7FFFFFFF) 502 table[i] = crc 503 return table 504 crctable = None 505 506 def _crc32(self, ch, crc): 507 """Compute the CRC32 primitive on one byte.""" 508 return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ch) & 0xff] 509 510 def __init__(self, pwd): 511 if _ZipDecrypter.crctable is None: 512 _ZipDecrypter.crctable = _ZipDecrypter._GenerateCRCTable() 513 self.key0 = 305419896 514 self.key1 = 591751049 515 self.key2 = 878082192 516 for p in pwd: 517 self._UpdateKeys(p) 518 519 def _UpdateKeys(self, c): 520 self.key0 = self._crc32(c, self.key0) 521 self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295 522 self.key1 = (self.key1 * 134775813 + 1) & 4294967295 523 self.key2 = self._crc32((self.key1 >> 24) & 255, self.key2) 524 525 def __call__(self, c): 526 """Decrypt a single character.""" 527 assert isinstance(c, int) 528 k = self.key2 | 2 529 c = c ^ (((k * (k^1)) >> 8) & 255) 530 self._UpdateKeys(c) 531 return c 532 533 534 class LZMACompressor: 535 536 def __init__(self): 537 self._comp = None 538 539 def _init(self): 540 props = lzma._encode_filter_properties({'id': lzma.FILTER_LZMA1}) 541 self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[ 542 lzma._decode_filter_properties(lzma.FILTER_LZMA1, props) 543 ]) 544 return struct.pack('<BBH', 9, 4, len(props)) + props 545 546 def compress(self, data): 547 if self._comp is None: 548 return self._init() + self._comp.compress(data) 549 return self._comp.compress(data) 550 551 def flush(self): 552 if self._comp is None: 553 return self._init() + self._comp.flush() 554 return self._comp.flush() 555 556 557 class LZMADecompressor: 558 559 def __init__(self): 560 self._decomp = None 561 self._unconsumed = b'' 562 self.eof = False 563 564 def decompress(self, data): 565 if self._decomp is None: 566 self._unconsumed += data 567 if len(self._unconsumed) <= 4: 568 return b'' 569 psize, = struct.unpack('<H', self._unconsumed[2:4]) 570 if len(self._unconsumed) <= 4 + psize: 571 return b'' 572 573 self._decomp = lzma.LZMADecompressor(lzma.FORMAT_RAW, filters=[ 574 lzma._decode_filter_properties(lzma.FILTER_LZMA1, 575 self._unconsumed[4:4 + psize]) 576 ]) 577 data = self._unconsumed[4 + psize:] 578 del self._unconsumed 579 580 result = self._decomp.decompress(data) 581 self.eof = self._decomp.eof 582 return result 583 584 585 compressor_names = { 586 0: 'store', 587 1: 'shrink', 588 2: 'reduce', 589 3: 'reduce', 590 4: 'reduce', 591 5: 'reduce', 592 6: 'implode', 593 7: 'tokenize', 594 8: 'deflate', 595 9: 'deflate64', 596 10: 'implode', 597 12: 'bzip2', 598 14: 'lzma', 599 18: 'terse', 600 19: 'lz77', 601 97: 'wavpack', 602 98: 'ppmd', 603 } 604 605 def _check_compression(compression): 606 if compression == ZIP_STORED: 607 pass 608 elif compression == ZIP_DEFLATED: 609 if not zlib: 610 raise RuntimeError( 611 "Compression requires the (missing) zlib module") 612 elif compression == ZIP_BZIP2: 613 if not bz2: 614 raise RuntimeError( 615 "Compression requires the (missing) bz2 module") 616 elif compression == ZIP_LZMA: 617 if not lzma: 618 raise RuntimeError( 619 "Compression requires the (missing) lzma module") 620 else: 621 raise RuntimeError("That compression method is not supported") 622 623 624 def _get_compressor(compress_type): 625 if compress_type == ZIP_DEFLATED: 626 return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, 627 
zlib.DEFLATED, -15) 628 elif compress_type == ZIP_BZIP2: 629 return bz2.BZ2Compressor() 630 elif compress_type == ZIP_LZMA: 631 return LZMACompressor() 632 else: 633 return None 634 635 636 def _get_decompressor(compress_type): 637 if compress_type == ZIP_STORED: 638 return None 639 elif compress_type == ZIP_DEFLATED: 640 return zlib.decompressobj(-15) 641 elif compress_type == ZIP_BZIP2: 642 return bz2.BZ2Decompressor() 643 elif compress_type == ZIP_LZMA: 644 return LZMADecompressor() 645 else: 646 descr = compressor_names.get(compress_type) 647 if descr: 648 raise NotImplementedError("compression type %d (%s)" % (compress_type, descr)) 649 else: 650 raise NotImplementedError("compression type %d" % (compress_type,)) 651 652 653 class _SharedFile: 654 def __init__(self, file, pos, close, lock): 655 self._file = file 656 self._pos = pos 657 self._close = close 658 self._lock = lock 659 660 def read(self, n=-1): 661 with self._lock: 662 self._file.seek(self._pos) 663 data = self._file.read(n) 664 self._pos = self._file.tell() 665 return data 666 667 def close(self): 668 if self._file is not None: 669 fileobj = self._file 670 self._file = None 671 self._close(fileobj) 672 673 # Provide the tell method for unseekable stream 674 class _Tellable: 675 def __init__(self, fp): 676 self.fp = fp 677 self.offset = 0 678 679 def write(self, data): 680 n = self.fp.write(data) 681 self.offset += n 682 return n 683 684 def tell(self): 685 return self.offset 686 687 def flush(self): 688 self.fp.flush() 689 690 def close(self): 691 self.fp.close() 692 693 694 class ZipExtFile(io.BufferedIOBase): 695 """File-like object for reading an archive member. 696 Is returned by ZipFile.open(). 697 """ 698 699 # Max size supported by decompressor. 700 MAX_N = 1 << 31 - 1 701 702 # Read from compressed files in 4k blocks. 703 MIN_READ_SIZE = 4096 704 705 # Search for universal newlines or line chunks. 706 PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)') 707 708 def __init__(self, fileobj, mode, zipinfo, decrypter=None, 709 close_fileobj=False): 710 self._fileobj = fileobj 711 self._decrypter = decrypter 712 self._close_fileobj = close_fileobj 713 714 self._compress_type = zipinfo.compress_type 715 self._compress_left = zipinfo.compress_size 716 self._left = zipinfo.file_size 717 718 self._decompressor = _get_decompressor(self._compress_type) 719 720 self._eof = False 721 self._readbuffer = b'' 722 self._offset = 0 723 724 self._universal = 'U' in mode 725 self.newlines = None 726 727 # Adjust read size for encrypted files since the first 12 bytes 728 # are for the encryption/password information. 729 if self._decrypter is not None: 730 self._compress_left -= 12 731 732 self.mode = mode 733 self.name = zipinfo.filename 734 735 if hasattr(zipinfo, 'CRC'): 736 self._expected_crc = zipinfo.CRC 737 self._running_crc = crc32(b'') 738 else: 739 self._expected_crc = None 740 741 def __repr__(self): 742 result = ['<%s.%s' % (self.__class__.__module__, 743 self.__class__.__qualname__)] 744 if not self.closed: 745 result.append(' name=%r mode=%r' % (self.name, self.mode)) 746 if self._compress_type != ZIP_STORED: 747 result.append(' compress_type=%s' % 748 compressor_names.get(self._compress_type, 749 self._compress_type)) 750 else: 751 result.append(' [closed]') 752 result.append('>') 753 return ''.join(result) 754 755 def readline(self, limit=-1): 756 """Read and return a line from the stream. 757 758 If limit is specified, at most limit bytes will be read. 
759 """ 760 761 if not self._universal and limit < 0: 762 # Shortcut common case - newline found in buffer. 763 i = self._readbuffer.find(b'\n', self._offset) + 1 764 if i > 0: 765 line = self._readbuffer[self._offset: i] 766 self._offset = i 767 return line 768 769 if not self._universal: 770 return io.BufferedIOBase.readline(self, limit) 771 772 line = b'' 773 while limit < 0 or len(line) < limit: 774 readahead = self.peek(2) 775 if readahead == b'': 776 return line 777 778 # 779 # Search for universal newlines or line chunks. 780 # 781 # The pattern returns either a line chunk or a newline, but not 782 # both. Combined with peek(2), we are assured that the sequence 783 # '\r\n' is always retrieved completely and never split into 784 # separate newlines - '\r', '\n' due to coincidental readaheads. 785 # 786 match = self.PATTERN.search(readahead) 787 newline = match.group('newline') 788 if newline is not None: 789 if self.newlines is None: 790 self.newlines = [] 791 if newline not in self.newlines: 792 self.newlines.append(newline) 793 self._offset += len(newline) 794 return line + b'\n' 795 796 chunk = match.group('chunk') 797 if limit >= 0: 798 chunk = chunk[: limit - len(line)] 799 800 self._offset += len(chunk) 801 line += chunk 802 803 return line 804 805 def peek(self, n=1): 806 """Returns buffered bytes without advancing the position.""" 807 if n > len(self._readbuffer) - self._offset: 808 chunk = self.read(n) 809 if len(chunk) > self._offset: 810 self._readbuffer = chunk + self._readbuffer[self._offset:] 811 self._offset = 0 812 else: 813 self._offset -= len(chunk) 814 815 # Return up to 512 bytes to reduce allocation overhead for tight loops. 816 return self._readbuffer[self._offset: self._offset + 512] 817 818 def readable(self): 819 return True 820 821 def read(self, n=-1): 822 """Read and return up to n bytes. 823 If the argument is omitted, None, or negative, data is read and returned until EOF is reached.. 824 """ 825 if n is None or n < 0: 826 buf = self._readbuffer[self._offset:] 827 self._readbuffer = b'' 828 self._offset = 0 829 while not self._eof: 830 buf += self._read1(self.MAX_N) 831 return buf 832 833 end = n + self._offset 834 if end < len(self._readbuffer): 835 buf = self._readbuffer[self._offset:end] 836 self._offset = end 837 return buf 838 839 n = end - len(self._readbuffer) 840 buf = self._readbuffer[self._offset:] 841 self._readbuffer = b'' 842 self._offset = 0 843 while n > 0 and not self._eof: 844 data = self._read1(n) 845 if n < len(data): 846 self._readbuffer = data 847 self._offset = n 848 buf += data[:n] 849 break 850 buf += data 851 n -= len(data) 852 return buf 853 854 def _update_crc(self, newdata): 855 # Update the CRC using the given data. 
856 if self._expected_crc is None: 857 # No need to compute the CRC if we don't have a reference value 858 return 859 self._running_crc = crc32(newdata, self._running_crc) 860 # Check the CRC if we're at the end of the file 861 if self._eof and self._running_crc != self._expected_crc: 862 raise BadZipFile("Bad CRC-32 for file %r" % self.name) 863 864 def read1(self, n): 865 """Read up to n bytes with at most one read() system call.""" 866 867 if n is None or n < 0: 868 buf = self._readbuffer[self._offset:] 869 self._readbuffer = b'' 870 self._offset = 0 871 while not self._eof: 872 data = self._read1(self.MAX_N) 873 if data: 874 buf += data 875 break 876 return buf 877 878 end = n + self._offset 879 if end < len(self._readbuffer): 880 buf = self._readbuffer[self._offset:end] 881 self._offset = end 882 return buf 883 884 n = end - len(self._readbuffer) 885 buf = self._readbuffer[self._offset:] 886 self._readbuffer = b'' 887 self._offset = 0 888 if n > 0: 889 while not self._eof: 890 data = self._read1(n) 891 if n < len(data): 892 self._readbuffer = data 893 self._offset = n 894 buf += data[:n] 895 break 896 if data: 897 buf += data 898 break 899 return buf 900 901 def _read1(self, n): 902 # Read up to n compressed bytes with at most one read() system call, 903 # decrypt and decompress them. 904 if self._eof or n <= 0: 905 return b'' 906 907 # Read from file. 908 if self._compress_type == ZIP_DEFLATED: 909 ## Handle unconsumed data. 910 data = self._decompressor.unconsumed_tail 911 if n > len(data): 912 data += self._read2(n - len(data)) 913 else: 914 data = self._read2(n) 915 916 if self._compress_type == ZIP_STORED: 917 self._eof = self._compress_left <= 0 918 elif self._compress_type == ZIP_DEFLATED: 919 n = max(n, self.MIN_READ_SIZE) 920 data = self._decompressor.decompress(data, n) 921 self._eof = (self._decompressor.eof or 922 self._compress_left <= 0 and 923 not self._decompressor.unconsumed_tail) 924 if self._eof: 925 data += self._decompressor.flush() 926 else: 927 data = self._decompressor.decompress(data) 928 self._eof = self._decompressor.eof or self._compress_left <= 0 929 930 data = data[:self._left] 931 self._left -= len(data) 932 if self._left <= 0: 933 self._eof = True 934 self._update_crc(data) 935 return data 936 937 def _read2(self, n): 938 if self._compress_left <= 0: 939 return b'' 940 941 n = max(n, self.MIN_READ_SIZE) 942 n = min(n, self._compress_left) 943 944 data = self._fileobj.read(n) 945 self._compress_left -= len(data) 946 if not data: 947 raise EOFError 948 949 if self._decrypter is not None: 950 data = bytes(map(self._decrypter, data)) 951 return data 952 953 def close(self): 954 try: 955 if self._close_fileobj: 956 self._fileobj.close() 957 finally: 958 super().close() 959 960 961 class ZipFile: 962 """ Class with methods to open, read, write, close, list zip files. 963 964 z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True) 965 966 file: Either the path to the file, or a file-like object. 967 If it is a path, the file will be opened and closed by ZipFile. 968 mode: The mode can be either read 'r', write 'w', exclusive create 'x', 969 or append 'a'. 970 compression: ZIP_STORED (no compression), ZIP_DEFLATED (requires zlib), 971 ZIP_BZIP2 (requires bz2) or ZIP_LZMA (requires lzma). 972 allowZip64: if True ZipFile will create files with ZIP64 extensions when 973 needed, otherwise it will raise an exception when this would 974 be necessary. 
975 976 """ 977 978 fp = None # Set here since __del__ checks it 979 _windows_illegal_name_trans_table = None 980 981 def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True): 982 """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x', 983 or append 'a'.""" 984 if mode not in ('r', 'w', 'x', 'a'): 985 raise RuntimeError("ZipFile requires mode 'r', 'w', 'x', or 'a'") 986 987 _check_compression(compression) 988 989 self._allowZip64 = allowZip64 990 self._didModify = False 991 self.debug = 0 # Level of printing: 0 through 3 992 self.NameToInfo = {} # Find file info given name 993 self.filelist = [] # List of ZipInfo instances for archive 994 self.compression = compression # Method of compression 995 self.mode = mode 996 self.pwd = None 997 self._comment = b'' 998 999 # Check if we were passed a file-like object 1000 if isinstance(file, str): 1001 # No, it's a filename 1002 self._filePassed = 0 1003 self.filename = file 1004 modeDict = {'r' : 'rb', 'w': 'w+b', 'x': 'x+b', 'a' : 'r+b', 1005 'r+b': 'w+b', 'w+b': 'wb', 'x+b': 'xb'} 1006 filemode = modeDict[mode] 1007 while True: 1008 try: 1009 self.fp = io.open(file, filemode) 1010 except OSError: 1011 if filemode in modeDict: 1012 filemode = modeDict[filemode] 1013 continue 1014 raise 1015 break 1016 else: 1017 self._filePassed = 1 1018 self.fp = file 1019 self.filename = getattr(file, 'name', None) 1020 self._fileRefCnt = 1 1021 self._lock = threading.RLock() 1022 self._seekable = True 1023 1024 try: 1025 if mode == 'r': 1026 self._RealGetContents() 1027 elif mode in ('w', 'x'): 1028 # set the modified flag so central directory gets written 1029 # even if no files are added to the archive 1030 self._didModify = True 1031 try: 1032 self.start_dir = self.fp.tell() 1033 except (AttributeError, OSError): 1034 self.fp = _Tellable(self.fp) 1035 self.start_dir = 0 1036 self._seekable = False 1037 else: 1038 # Some file-like objects can provide tell() but not seek() 1039 try: 1040 self.fp.seek(self.start_dir) 1041 except (AttributeError, OSError): 1042 self._seekable = False 1043 elif mode == 'a': 1044 try: 1045 # See if file is a zip file 1046 self._RealGetContents() 1047 # seek to start of directory and overwrite 1048 self.fp.seek(self.start_dir) 1049 except BadZipFile: 1050 # file is not a zip file, just append 1051 self.fp.seek(0, 2) 1052 1053 # set the modified flag so central directory gets written 1054 # even if no files are added to the archive 1055 self._didModify = True 1056 self.start_dir = self.fp.tell() 1057 else: 1058 raise RuntimeError("Mode must be 'r', 'w', 'x', or 'a'") 1059 except: 1060 fp = self.fp 1061 self.fp = None 1062 self._fpclose(fp) 1063 raise 1064 1065 def __enter__(self): 1066 return self 1067 1068 def __exit__(self, type, value, traceback): 1069 self.close() 1070 1071 def __repr__(self): 1072 result = ['<%s.%s' % (self.__class__.__module__, 1073 self.__class__.__qualname__)] 1074 if self.fp is not None: 1075 if self._filePassed: 1076 result.append(' file=%r' % self.fp) 1077 elif self.filename is not None: 1078 result.append(' filename=%r' % self.filename) 1079 result.append(' mode=%r' % self.mode) 1080 else: 1081 result.append(' [closed]') 1082 result.append('>') 1083 return ''.join(result) 1084 1085 def _RealGetContents(self): 1086 """Read in the table of contents for the ZIP file.""" 1087 fp = self.fp 1088 try: 1089 endrec = _EndRecData(fp) 1090 except OSError: 1091 raise BadZipFile("File is not a zip file") 1092 if not endrec: 1093 raise BadZipFile("File is not a zip file") 1094 if 
self.debug > 1: 1095 print(endrec) 1096 size_cd = endrec[_ECD_SIZE] # bytes in central directory 1097 offset_cd = endrec[_ECD_OFFSET] # offset of central directory 1098 self._comment = endrec[_ECD_COMMENT] # archive comment 1099 1100 # "concat" is zero, unless zip was concatenated to another file 1101 concat = endrec[_ECD_LOCATION] - size_cd - offset_cd 1102 if endrec[_ECD_SIGNATURE] == stringEndArchive64: 1103 # If Zip64 extension structures are present, account for them 1104 concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) 1105 1106 if self.debug > 2: 1107 inferred = concat + offset_cd 1108 print("given, inferred, offset", offset_cd, inferred, concat) 1109 # self.start_dir: Position of start of central directory 1110 self.start_dir = offset_cd + concat 1111 fp.seek(self.start_dir, 0) 1112 data = fp.read(size_cd) 1113 fp = io.BytesIO(data) 1114 total = 0 1115 while total < size_cd: 1116 centdir = fp.read(sizeCentralDir) 1117 if len(centdir) != sizeCentralDir: 1118 raise BadZipFile("Truncated central directory") 1119 centdir = struct.unpack(structCentralDir, centdir) 1120 if centdir[_CD_SIGNATURE] != stringCentralDir: 1121 raise BadZipFile("Bad magic number for central directory") 1122 if self.debug > 2: 1123 print(centdir) 1124 filename = fp.read(centdir[_CD_FILENAME_LENGTH]) 1125 flags = centdir[5] 1126 if flags & 0x800: 1127 # UTF-8 file names extension 1128 filename = filename.decode('utf-8') 1129 else: 1130 # Historical ZIP filename encoding 1131 filename = filename.decode('cp437') 1132 # Create ZipInfo instance to store file information 1133 x = ZipInfo(filename) 1134 x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) 1135 x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) 1136 x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] 1137 (x.create_version, x.create_system, x.extract_version, x.reserved, 1138 x.flag_bits, x.compress_type, t, d, 1139 x.CRC, x.compress_size, x.file_size) = centdir[1:12] 1140 if x.extract_version > MAX_EXTRACT_VERSION: 1141 raise NotImplementedError("zip file version %.1f" % 1142 (x.extract_version / 10)) 1143 x.volume, x.internal_attr, x.external_attr = centdir[15:18] 1144 # Convert date/time code to (year, month, day, hour, min, sec) 1145 x._raw_time = t 1146 x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F, 1147 t>>11, (t>>5)&0x3F, (t&0x1F) * 2 ) 1148 1149 x._decodeExtra() 1150 x.header_offset = x.header_offset + concat 1151 self.filelist.append(x) 1152 self.NameToInfo[x.filename] = x 1153 1154 # update total bytes read from central directory 1155 total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH] 1156 + centdir[_CD_EXTRA_FIELD_LENGTH] 1157 + centdir[_CD_COMMENT_LENGTH]) 1158 1159 if self.debug > 2: 1160 print("total", total) 1161 1162 1163 def namelist(self): 1164 """Return a list of file names in the archive.""" 1165 return [data.filename for data in self.filelist] 1166 1167 def infolist(self): 1168 """Return a list of class ZipInfo instances for files in the 1169 archive.""" 1170 return self.filelist 1171 1172 def printdir(self, file=None): 1173 """Print a table of contents for the zip file.""" 1174 print("%-46s %19s %12s" % ("File Name", "Modified ", "Size"), 1175 file=file) 1176 for zinfo in self.filelist: 1177 date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6] 1178 print("%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size), 1179 file=file) 1180 1181 def testzip(self): 1182 """Read all the files and check the CRC.""" 1183 chunk_size = 2 ** 20 1184 for zinfo in self.filelist: 1185 try: 1186 # Read by chunks, to avoid an 
OverflowError or a 1187 # MemoryError with very large embedded files. 1188 with self.open(zinfo.filename, "r") as f: 1189 while f.read(chunk_size): # Check CRC-32 1190 pass 1191 except BadZipFile: 1192 return zinfo.filename 1193 1194 def getinfo(self, name): 1195 """Return the instance of ZipInfo given 'name'.""" 1196 info = self.NameToInfo.get(name) 1197 if info is None: 1198 raise KeyError( 1199 'There is no item named %r in the archive' % name) 1200 1201 return info 1202 1203 def setpassword(self, pwd): 1204 """Set default password for encrypted files.""" 1205 if pwd and not isinstance(pwd, bytes): 1206 raise TypeError("pwd: expected bytes, got %s" % type(pwd)) 1207 if pwd: 1208 self.pwd = pwd 1209 else: 1210 self.pwd = None 1211 1212 @property 1213 def comment(self): 1214 """The comment text associated with the ZIP file.""" 1215 return self._comment 1216 1217 @comment.setter 1218 def comment(self, comment): 1219 if not isinstance(comment, bytes): 1220 raise TypeError("comment: expected bytes, got %s" % type(comment)) 1221 # check for valid comment length 1222 if len(comment) > ZIP_MAX_COMMENT: 1223 import warnings 1224 warnings.warn('Archive comment is too long; truncating to %d bytes' 1225 % ZIP_MAX_COMMENT, stacklevel=2) 1226 comment = comment[:ZIP_MAX_COMMENT] 1227 self._comment = comment 1228 self._didModify = True 1229 1230 def read(self, name, pwd=None): 1231 """Return file bytes (as a string) for name.""" 1232 with self.open(name, "r", pwd) as fp: 1233 return fp.read() 1234 1235 def open(self, name, mode="r", pwd=None): 1236 """Return file-like object for 'name'.""" 1237 if mode not in ("r", "U", "rU"): 1238 raise RuntimeError('open() requires mode "r", "U", or "rU"') 1239 if 'U' in mode: 1240 import warnings 1241 warnings.warn("'U' mode is deprecated", 1242 DeprecationWarning, 2) 1243 if pwd and not isinstance(pwd, bytes): 1244 raise TypeError("pwd: expected bytes, got %s" % type(pwd)) 1245 if not self.fp: 1246 raise RuntimeError( 1247 "Attempt to read ZIP archive that was already closed") 1248 1249 # Make sure we have an info object 1250 if isinstance(name, ZipInfo): 1251 # 'name' is already an info object 1252 zinfo = name 1253 else: 1254 # Get info object for name 1255 zinfo = self.getinfo(name) 1256 1257 self._fileRefCnt += 1 1258 zef_file = _SharedFile(self.fp, zinfo.header_offset, self._fpclose, self._lock) 1259 try: 1260 # Skip the file header: 1261 fheader = zef_file.read(sizeFileHeader) 1262 if len(fheader) != sizeFileHeader: 1263 raise BadZipFile("Truncated file header") 1264 fheader = struct.unpack(structFileHeader, fheader) 1265 if fheader[_FH_SIGNATURE] != stringFileHeader: 1266 raise BadZipFile("Bad magic number for file header") 1267 1268 fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) 1269 if fheader[_FH_EXTRA_FIELD_LENGTH]: 1270 zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) 1271 1272 if zinfo.flag_bits & 0x20: 1273 # Zip 2.7: compressed patched data 1274 raise NotImplementedError("compressed patched data (flag bit 5)") 1275 1276 if zinfo.flag_bits & 0x40: 1277 # strong encryption 1278 raise NotImplementedError("strong encryption (flag bit 6)") 1279 1280 if zinfo.flag_bits & 0x800: 1281 # UTF-8 filename 1282 fname_str = fname.decode("utf-8") 1283 else: 1284 fname_str = fname.decode("cp437") 1285 1286 if fname_str != zinfo.orig_filename: 1287 raise BadZipFile( 1288 'File name in directory %r and header %r differ.' 
1289 % (zinfo.orig_filename, fname)) 1290 1291 # check for encrypted flag & handle password 1292 is_encrypted = zinfo.flag_bits & 0x1 1293 zd = None 1294 if is_encrypted: 1295 if not pwd: 1296 pwd = self.pwd 1297 if not pwd: 1298 raise RuntimeError("File %s is encrypted, password " 1299 "required for extraction" % name) 1300 1301 zd = _ZipDecrypter(pwd) 1302 # The first 12 bytes in the cypher stream is an encryption header 1303 # used to strengthen the algorithm. The first 11 bytes are 1304 # completely random, while the 12th contains the MSB of the CRC, 1305 # or the MSB of the file time depending on the header type 1306 # and is used to check the correctness of the password. 1307 header = zef_file.read(12) 1308 h = list(map(zd, header[0:12])) 1309 if zinfo.flag_bits & 0x8: 1310 # compare against the file type from extended local headers 1311 check_byte = (zinfo._raw_time >> 8) & 0xff 1312 else: 1313 # compare against the CRC otherwise 1314 check_byte = (zinfo.CRC >> 24) & 0xff 1315 if h[11] != check_byte: 1316 raise RuntimeError("Bad password for file", name) 1317 1318 return ZipExtFile(zef_file, mode, zinfo, zd, True) 1319 except: 1320 zef_file.close() 1321 raise 1322 1323 def extract(self, member, path=None, pwd=None): 1324 """Extract a member from the archive to the current working directory, 1325 using its full name. Its file information is extracted as accurately 1326 as possible. `member' may be a filename or a ZipInfo object. You can 1327 specify a different directory using `path'. 1328 """ 1329 if not isinstance(member, ZipInfo): 1330 member = self.getinfo(member) 1331 1332 if path is None: 1333 path = os.getcwd() 1334 1335 return self._extract_member(member, path, pwd) 1336 1337 def extractall(self, path=None, members=None, pwd=None): 1338 """Extract all members from the archive to the current working 1339 directory. `path' specifies a different directory to extract to. 1340 `members' is optional and must be a subset of the list returned 1341 by namelist(). 1342 """ 1343 if members is None: 1344 members = self.namelist() 1345 1346 for zipinfo in members: 1347 self.extract(zipinfo, path, pwd) 1348 1349 @classmethod 1350 def _sanitize_windows_name(cls, arcname, pathsep): 1351 """Replace bad characters and remove trailing dots from parts.""" 1352 table = cls._windows_illegal_name_trans_table 1353 if not table: 1354 illegal = ':<>|"?*' 1355 table = str.maketrans(illegal, '_' * len(illegal)) 1356 cls._windows_illegal_name_trans_table = table 1357 arcname = arcname.translate(table) 1358 # remove trailing dots 1359 arcname = (x.rstrip('.') for x in arcname.split(pathsep)) 1360 # rejoin, removing empty parts. 1361 arcname = pathsep.join(x for x in arcname if x) 1362 return arcname 1363 1364 def _extract_member(self, member, targetpath, pwd): 1365 """Extract the ZipInfo object 'member' to a physical 1366 file on the path targetpath. 1367 """ 1368 # build the destination pathname, replacing 1369 # forward slashes to platform specific separators. 1370 arcname = member.filename.replace('/', os.path.sep) 1371 1372 if os.path.altsep: 1373 arcname = arcname.replace(os.path.altsep, os.path.sep) 1374 # interpret absolute pathname as relative, remove drive letter or 1375 # UNC path, redundant separators, "." and ".." components. 
1376 arcname = os.path.splitdrive(arcname)[1] 1377 invalid_path_parts = ('', os.path.curdir, os.path.pardir) 1378 arcname = os.path.sep.join(x for x in arcname.split(os.path.sep) 1379 if x not in invalid_path_parts) 1380 if os.path.sep == '\\': 1381 # filter illegal characters on Windows 1382 arcname = self._sanitize_windows_name(arcname, os.path.sep) 1383 1384 targetpath = os.path.join(targetpath, arcname) 1385 targetpath = os.path.normpath(targetpath) 1386 1387 # Create all upper directories if necessary. 1388 upperdirs = os.path.dirname(targetpath) 1389 if upperdirs and not os.path.exists(upperdirs): 1390 os.makedirs(upperdirs) 1391 1392 if member.filename[-1] == '/': 1393 if not os.path.isdir(targetpath): 1394 os.mkdir(targetpath) 1395 return targetpath 1396 1397 with self.open(member, pwd=pwd) as source, \ 1398 open(targetpath, "wb") as target: 1399 shutil.copyfileobj(source, target) 1400 1401 return targetpath 1402 1403 def _writecheck(self, zinfo): 1404 """Check for errors before writing a file to the archive.""" 1405 if zinfo.filename in self.NameToInfo: 1406 import warnings 1407 warnings.warn('Duplicate name: %r' % zinfo.filename, stacklevel=3) 1408 if self.mode not in ('w', 'x', 'a'): 1409 raise RuntimeError("write() requires mode 'w', 'x', or 'a'") 1410 if not self.fp: 1411 raise RuntimeError( 1412 "Attempt to write ZIP archive that was already closed") 1413 _check_compression(zinfo.compress_type) 1414 if not self._allowZip64: 1415 requires_zip64 = None 1416 if len(self.filelist) >= ZIP_FILECOUNT_LIMIT: 1417 requires_zip64 = "Files count" 1418 elif zinfo.file_size > ZIP64_LIMIT: 1419 requires_zip64 = "Filesize" 1420 elif zinfo.header_offset > ZIP64_LIMIT: 1421 requires_zip64 = "Zipfile size" 1422 if requires_zip64: 1423 raise LargeZipFile(requires_zip64 + 1424 " would require ZIP64 extensions") 1425 1426 def write(self, filename, arcname=None, compress_type=None): 1427 """Put the bytes from filename into the archive under the name 1428 arcname.""" 1429 if not self.fp: 1430 raise RuntimeError( 1431 "Attempt to write to ZIP archive that was already closed") 1432 1433 st = os.stat(filename) 1434 isdir = stat.S_ISDIR(st.st_mode) 1435 mtime = time.localtime(st.st_mtime) 1436 date_time = mtime[0:6] 1437 # Create ZipInfo instance to store file information 1438 if arcname is None: 1439 arcname = filename 1440 arcname = os.path.normpath(os.path.splitdrive(arcname)[1]) 1441 while arcname[0] in (os.sep, os.altsep): 1442 arcname = arcname[1:] 1443 if isdir: 1444 arcname += '/' 1445 zinfo = ZipInfo(arcname, date_time) 1446 zinfo.external_attr = (st[0] & 0xFFFF) << 16 # Unix attributes 1447 if isdir: 1448 zinfo.compress_type = ZIP_STORED 1449 elif compress_type is None: 1450 zinfo.compress_type = self.compression 1451 else: 1452 zinfo.compress_type = compress_type 1453 1454 zinfo.file_size = st.st_size 1455 zinfo.flag_bits = 0x00 1456 with self._lock: 1457 if self._seekable: 1458 self.fp.seek(self.start_dir) 1459 zinfo.header_offset = self.fp.tell() # Start of header bytes 1460 if zinfo.compress_type == ZIP_LZMA: 1461 # Compressed data includes an end-of-stream (EOS) marker 1462 zinfo.flag_bits |= 0x02 1463 1464 self._writecheck(zinfo) 1465 self._didModify = True 1466 1467 if isdir: 1468 zinfo.file_size = 0 1469 zinfo.compress_size = 0 1470 zinfo.CRC = 0 1471 zinfo.external_attr |= 0x10 # MS-DOS directory flag 1472 self.filelist.append(zinfo) 1473 self.NameToInfo[zinfo.filename] = zinfo 1474 self.fp.write(zinfo.FileHeader(False)) 1475 self.start_dir = self.fp.tell() 1476 return 1477 1478 cmpr 
= _get_compressor(zinfo.compress_type) 1479 if not self._seekable: 1480 zinfo.flag_bits |= 0x08 1481 with open(filename, "rb") as fp: 1482 # Must overwrite CRC and sizes with correct data later 1483 zinfo.CRC = CRC = 0 1484 zinfo.compress_size = compress_size = 0 1485 # Compressed size can be larger than uncompressed size 1486 zip64 = self._allowZip64 and \ 1487 zinfo.file_size * 1.05 > ZIP64_LIMIT 1488 self.fp.write(zinfo.FileHeader(zip64)) 1489 file_size = 0 1490 while 1: 1491 buf = fp.read(1024 * 8) 1492 if not buf: 1493 break 1494 file_size = file_size + len(buf) 1495 CRC = crc32(buf, CRC) 1496 if cmpr: 1497 buf = cmpr.compress(buf) 1498 compress_size = compress_size + len(buf) 1499 self.fp.write(buf) 1500 if cmpr: 1501 buf = cmpr.flush() 1502 compress_size = compress_size + len(buf) 1503 self.fp.write(buf) 1504 zinfo.compress_size = compress_size 1505 else: 1506 zinfo.compress_size = file_size 1507 zinfo.CRC = CRC 1508 zinfo.file_size = file_size 1509 if zinfo.flag_bits & 0x08: 1510 # Write CRC and file sizes after the file data 1511 fmt = '<LQQ' if zip64 else '<LLL' 1512 self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size, 1513 zinfo.file_size)) 1514 self.start_dir = self.fp.tell() 1515 else: 1516 if not zip64 and self._allowZip64: 1517 if file_size > ZIP64_LIMIT: 1518 raise RuntimeError('File size has increased during compressing') 1519 if compress_size > ZIP64_LIMIT: 1520 raise RuntimeError('Compressed size larger than uncompressed size') 1521 # Seek backwards and write file header (which will now include 1522 # correct CRC and file sizes) 1523 self.start_dir = self.fp.tell() # Preserve current position in file 1524 self.fp.seek(zinfo.header_offset) 1525 self.fp.write(zinfo.FileHeader(zip64)) 1526 self.fp.seek(self.start_dir) 1527 self.filelist.append(zinfo) 1528 self.NameToInfo[zinfo.filename] = zinfo 1529 1530 def writestr(self, zinfo_or_arcname, data, compress_type=None): 1531 """Write a file into the archive. The contents is 'data', which 1532 may be either a 'str' or a 'bytes' instance; if it is a 'str', 1533 it is encoded as UTF-8 first. 
1534 'zinfo_or_arcname' is either a ZipInfo instance or 1535 the name of the file in the archive.""" 1536 if isinstance(data, str): 1537 data = data.encode("utf-8") 1538 if not isinstance(zinfo_or_arcname, ZipInfo): 1539 zinfo = ZipInfo(filename=zinfo_or_arcname, 1540 date_time=time.localtime(time.time())[:6]) 1541 zinfo.compress_type = self.compression 1542 if zinfo.filename[-1] == '/': 1543 zinfo.external_attr = 0o40775 << 16 # drwxrwxr-x 1544 zinfo.external_attr |= 0x10 # MS-DOS directory flag 1545 else: 1546 zinfo.external_attr = 0o600 << 16 # ?rw------- 1547 else: 1548 zinfo = zinfo_or_arcname 1549 1550 if not self.fp: 1551 raise RuntimeError( 1552 "Attempt to write to ZIP archive that was already closed") 1553 1554 zinfo.file_size = len(data) # Uncompressed size 1555 with self._lock: 1556 if self._seekable: 1557 self.fp.seek(self.start_dir) 1558 zinfo.header_offset = self.fp.tell() # Start of header data 1559 if compress_type is not None: 1560 zinfo.compress_type = compress_type 1561 zinfo.header_offset = self.fp.tell() # Start of header data 1562 if compress_type is not None: 1563 zinfo.compress_type = compress_type 1564 if zinfo.compress_type == ZIP_LZMA: 1565 # Compressed data includes an end-of-stream (EOS) marker 1566 zinfo.flag_bits |= 0x02 1567 1568 self._writecheck(zinfo) 1569 self._didModify = True 1570 zinfo.CRC = crc32(data) # CRC-32 checksum 1571 co = _get_compressor(zinfo.compress_type) 1572 if co: 1573 data = co.compress(data) + co.flush() 1574 zinfo.compress_size = len(data) # Compressed size 1575 else: 1576 zinfo.compress_size = zinfo.file_size 1577 zip64 = zinfo.file_size > ZIP64_LIMIT or \ 1578 zinfo.compress_size > ZIP64_LIMIT 1579 if zip64 and not self._allowZip64: 1580 raise LargeZipFile("Filesize would require ZIP64 extensions") 1581 self.fp.write(zinfo.FileHeader(zip64)) 1582 self.fp.write(data) 1583 if zinfo.flag_bits & 0x08: 1584 # Write CRC and file sizes after the file data 1585 fmt = '<LQQ' if zip64 else '<LLL' 1586 self.fp.write(struct.pack(fmt, zinfo.CRC, zinfo.compress_size, 1587 zinfo.file_size)) 1588 self.fp.flush() 1589 self.start_dir = self.fp.tell() 1590 self.filelist.append(zinfo) 1591 self.NameToInfo[zinfo.filename] = zinfo 1592 1593 def __del__(self): 1594 """Call the "close()" method in case the user forgot.""" 1595 self.close() 1596 1597 def close(self): 1598 """Close the file, and for mode 'w', 'x' and 'a' write the ending 1599 records.""" 1600 if self.fp is None: 1601 return 1602 1603 try: 1604 if self.mode in ('w', 'x', 'a') and self._didModify: # write ending records 1605 with self._lock: 1606 if self._seekable: 1607 self.fp.seek(self.start_dir) 1608 self._write_end_record() 1609 finally: 1610 fp = self.fp 1611 self.fp = None 1612 self._fpclose(fp) 1613 1614 def _write_end_record(self): 1615 for zinfo in self.filelist: # write central directory 1616 dt = zinfo.date_time 1617 dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] 1618 dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) 1619 extra = [] 1620 if zinfo.file_size > ZIP64_LIMIT \ 1621 or zinfo.compress_size > ZIP64_LIMIT: 1622 extra.append(zinfo.file_size) 1623 extra.append(zinfo.compress_size) 1624 file_size = 0xffffffff 1625 compress_size = 0xffffffff 1626 else: 1627 file_size = zinfo.file_size 1628 compress_size = zinfo.compress_size 1629 1630 if zinfo.header_offset > ZIP64_LIMIT: 1631 extra.append(zinfo.header_offset) 1632 header_offset = 0xffffffff 1633 else: 1634 header_offset = zinfo.header_offset 1635 1636 extra_data = zinfo.extra 1637 min_version = 0 1638 if extra: 1639 # 
Append a ZIP64 field to the extra's 1640 extra_data = struct.pack( 1641 '<HH' + 'Q'*len(extra), 1642 1, 8*len(extra), *extra) + extra_data 1643 1644 min_version = ZIP64_VERSION 1645 1646 if zinfo.compress_type == ZIP_BZIP2: 1647 min_version = max(BZIP2_VERSION, min_version) 1648 elif zinfo.compress_type == ZIP_LZMA: 1649 min_version = max(LZMA_VERSION, min_version) 1650 1651 extract_version = max(min_version, zinfo.extract_version) 1652 create_version = max(min_version, zinfo.create_version) 1653 try: 1654 filename, flag_bits = zinfo._encodeFilenameFlags() 1655 centdir = struct.pack(structCentralDir, 1656 stringCentralDir, create_version, 1657 zinfo.create_system, extract_version, zinfo.reserved, 1658 flag_bits, zinfo.compress_type, dostime, dosdate, 1659 zinfo.CRC, compress_size, file_size, 1660 len(filename), len(extra_data), len(zinfo.comment), 1661 0, zinfo.internal_attr, zinfo.external_attr, 1662 header_offset) 1663 except DeprecationWarning: 1664 print((structCentralDir, stringCentralDir, create_version, 1665 zinfo.create_system, extract_version, zinfo.reserved, 1666 zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, 1667 zinfo.CRC, compress_size, file_size, 1668 len(zinfo.filename), len(extra_data), len(zinfo.comment), 1669 0, zinfo.internal_attr, zinfo.external_attr, 1670 header_offset), file=sys.stderr) 1671 raise 1672 self.fp.write(centdir) 1673 self.fp.write(filename) 1674 self.fp.write(extra_data) 1675 self.fp.write(zinfo.comment) 1676 1677 pos2 = self.fp.tell() 1678 # Write end-of-zip-archive record 1679 centDirCount = len(self.filelist) 1680 centDirSize = pos2 - self.start_dir 1681 centDirOffset = self.start_dir 1682 requires_zip64 = None 1683 if centDirCount > ZIP_FILECOUNT_LIMIT: 1684 requires_zip64 = "Files count" 1685 elif centDirOffset > ZIP64_LIMIT: 1686 requires_zip64 = "Central directory offset" 1687 elif centDirSize > ZIP64_LIMIT: 1688 requires_zip64 = "Central directory size" 1689 if requires_zip64: 1690 # Need to write the ZIP64 end-of-archive records 1691 if not self._allowZip64: 1692 raise LargeZipFile(requires_zip64 + 1693 " would require ZIP64 extensions") 1694 zip64endrec = struct.pack( 1695 structEndArchive64, stringEndArchive64, 1696 44, 45, 45, 0, 0, centDirCount, centDirCount, 1697 centDirSize, centDirOffset) 1698 self.fp.write(zip64endrec) 1699 1700 zip64locrec = struct.pack( 1701 structEndArchive64Locator, 1702 stringEndArchive64Locator, 0, pos2, 1) 1703 self.fp.write(zip64locrec) 1704 centDirCount = min(centDirCount, 0xFFFF) 1705 centDirSize = min(centDirSize, 0xFFFFFFFF) 1706 centDirOffset = min(centDirOffset, 0xFFFFFFFF) 1707 1708 endrec = struct.pack(structEndArchive, stringEndArchive, 1709 0, 0, centDirCount, centDirCount, 1710 centDirSize, centDirOffset, len(self._comment)) 1711 self.fp.write(endrec) 1712 self.fp.write(self._comment) 1713 self.fp.flush() 1714 1715 def _fpclose(self, fp): 1716 assert self._fileRefCnt > 0 1717 self._fileRefCnt -= 1 1718 if not self._fileRefCnt and not self._filePassed: 1719 fp.close() 1720 1721 1722 class PyZipFile(ZipFile): 1723 """Class to create ZIP archives with Python library files and packages.""" 1724 1725 def __init__(self, file, mode="r", compression=ZIP_STORED, 1726 allowZip64=True, optimize=-1): 1727 ZipFile.__init__(self, file, mode=mode, compression=compression, 1728 allowZip64=allowZip64) 1729 self._optimize = optimize 1730 1731 def writepy(self, pathname, basename="", filterfunc=None): 1732 """Add all files from "pathname" to the ZIP archive. 
1733 1734 If pathname is a package directory, search the directory and 1735 all package subdirectories recursively for all *.py and enter 1736 the modules into the archive. If pathname is a plain 1737 directory, listdir *.py and enter all modules. Else, pathname 1738 must be a Python *.py file and the module will be put into the 1739 archive. Added modules are always module.pyc. 1740 This method will compile the module.py into module.pyc if 1741 necessary. 1742 If filterfunc(pathname) is given, it is called with every argument. 1743 When it is False, the file or directory is skipped. 1744 """ 1745 if filterfunc and not filterfunc(pathname): 1746 if self.debug: 1747 label = 'path' if os.path.isdir(pathname) else 'file' 1748 print('%s "%s" skipped by filterfunc' % (label, pathname)) 1749 return 1750 dir, name = os.path.split(pathname) 1751 if os.path.isdir(pathname): 1752 initname = os.path.join(pathname, "__init__.py") 1753 if os.path.isfile(initname): 1754 # This is a package directory, add it 1755 if basename: 1756 basename = "%s/%s" % (basename, name) 1757 else: 1758 basename = name 1759 if self.debug: 1760 print("Adding package in", pathname, "as", basename) 1761 fname, arcname = self._get_codename(initname[0:-3], basename) 1762 if self.debug: 1763 print("Adding", arcname) 1764 self.write(fname, arcname) 1765 dirlist = os.listdir(pathname) 1766 dirlist.remove("__init__.py") 1767 # Add all *.py files and package subdirectories 1768 for filename in dirlist: 1769 path = os.path.join(pathname, filename) 1770 root, ext = os.path.splitext(filename) 1771 if os.path.isdir(path): 1772 if os.path.isfile(os.path.join(path, "__init__.py")): 1773 # This is a package directory, add it 1774 self.writepy(path, basename, 1775 filterfunc=filterfunc) # Recursive call 1776 elif ext == ".py": 1777 if filterfunc and not filterfunc(path): 1778 if self.debug: 1779 print('file "%s" skipped by filterfunc' % path) 1780 continue 1781 fname, arcname = self._get_codename(path[0:-3], 1782 basename) 1783 if self.debug: 1784 print("Adding", arcname) 1785 self.write(fname, arcname) 1786 else: 1787 # This is NOT a package directory, add its files at top level 1788 if self.debug: 1789 print("Adding files from directory", pathname) 1790 for filename in os.listdir(pathname): 1791 path = os.path.join(pathname, filename) 1792 root, ext = os.path.splitext(filename) 1793 if ext == ".py": 1794 if filterfunc and not filterfunc(path): 1795 if self.debug: 1796 print('file "%s" skipped by filterfunc' % path) 1797 continue 1798 fname, arcname = self._get_codename(path[0:-3], 1799 basename) 1800 if self.debug: 1801 print("Adding", arcname) 1802 self.write(fname, arcname) 1803 else: 1804 if pathname[-3:] != ".py": 1805 raise RuntimeError( 1806 'Files added with writepy() must end with ".py"') 1807 fname, arcname = self._get_codename(pathname[0:-3], basename) 1808 if self.debug: 1809 print("Adding file", arcname) 1810 self.write(fname, arcname) 1811 1812 def _get_codename(self, pathname, basename): 1813 """Return (filename, archivename) for the path. 1814 1815 Given a module name path, return the correct file path and 1816 archive name, compiling if necessary. For example, given 1817 /python/lib/string, return (/python/lib/string.pyc, string). 
1818 """ 1819 def _compile(file, optimize=-1): 1820 import py_compile 1821 if self.debug: 1822 print("Compiling", file) 1823 try: 1824 py_compile.compile(file, doraise=True, optimize=optimize) 1825 except py_compile.PyCompileError as err: 1826 print(err.msg) 1827 return False 1828 return True 1829 1830 file_py = pathname + ".py" 1831 file_pyc = pathname + ".pyc" 1832 pycache_opt0 = importlib.util.cache_from_source(file_py, optimization='') 1833 pycache_opt1 = importlib.util.cache_from_source(file_py, optimization=1) 1834 pycache_opt2 = importlib.util.cache_from_source(file_py, optimization=2) 1835 if self._optimize == -1: 1836 # legacy mode: use whatever file is present 1837 if (os.path.isfile(file_pyc) and 1838 os.stat(file_pyc).st_mtime >= os.stat(file_py).st_mtime): 1839 # Use .pyc file. 1840 arcname = fname = file_pyc 1841 elif (os.path.isfile(pycache_opt0) and 1842 os.stat(pycache_opt0).st_mtime >= os.stat(file_py).st_mtime): 1843 # Use the __pycache__/*.pyc file, but write it to the legacy pyc 1844 # file name in the archive. 1845 fname = pycache_opt0 1846 arcname = file_pyc 1847 elif (os.path.isfile(pycache_opt1) and 1848 os.stat(pycache_opt1).st_mtime >= os.stat(file_py).st_mtime): 1849 # Use the __pycache__/*.pyc file, but write it to the legacy pyc 1850 # file name in the archive. 1851 fname = pycache_opt1 1852 arcname = file_pyc 1853 elif (os.path.isfile(pycache_opt2) and 1854 os.stat(pycache_opt2).st_mtime >= os.stat(file_py).st_mtime): 1855 # Use the __pycache__/*.pyc file, but write it to the legacy pyc 1856 # file name in the archive. 1857 fname = pycache_opt2 1858 arcname = file_pyc 1859 else: 1860 # Compile py into PEP 3147 pyc file. 1861 if _compile(file_py): 1862 if sys.flags.optimize == 0: 1863 fname = pycache_opt0 1864 elif sys.flags.optimize == 1: 1865 fname = pycache_opt1 1866 else: 1867 fname = pycache_opt2 1868 arcname = file_pyc 1869 else: 1870 fname = arcname = file_py 1871 else: 1872 # new mode: use given optimization level 1873 if self._optimize == 0: 1874 fname = pycache_opt0 1875 arcname = file_pyc 1876 else: 1877 arcname = file_pyc 1878 if self._optimize == 1: 1879 fname = pycache_opt1 1880 elif self._optimize == 2: 1881 fname = pycache_opt2 1882 else: 1883 msg = "invalid value for 'optimize': {!r}".format(self._optimize) 1884 raise ValueError(msg) 1885 if not (os.path.isfile(fname) and 1886 os.stat(fname).st_mtime >= os.stat(file_py).st_mtime): 1887 if not _compile(file_py, optimize=self._optimize): 1888 fname = arcname = file_py 1889 archivename = os.path.split(arcname)[1] 1890 if basename: 1891 archivename = "%s/%s" % (basename, archivename) 1892 return (fname, archivename) 1893 1894 1895 def main(args = None): 1896 import textwrap 1897 USAGE=textwrap.dedent("""\ 1898 Usage: 1899 zipfile.py -l zipfile.zip # Show listing of a zipfile 1900 zipfile.py -t zipfile.zip # Test if a zipfile is valid 1901 zipfile.py -e zipfile.zip target # Extract zipfile into target dir 1902 zipfile.py -c zipfile.zip src ... 
# Create zipfile from sources 1903 """) 1904 if args is None: 1905 args = sys.argv[1:] 1906 1907 if not args or args[0] not in ('-l', '-c', '-e', '-t'): 1908 print(USAGE) 1909 sys.exit(1) 1910 1911 if args[0] == '-l': 1912 if len(args) != 2: 1913 print(USAGE) 1914 sys.exit(1) 1915 with ZipFile(args[1], 'r') as zf: 1916 zf.printdir() 1917 1918 elif args[0] == '-t': 1919 if len(args) != 2: 1920 print(USAGE) 1921 sys.exit(1) 1922 with ZipFile(args[1], 'r') as zf: 1923 badfile = zf.testzip() 1924 if badfile: 1925 print("The following enclosed file is corrupted: {!r}".format(badfile)) 1926 print("Done testing") 1927 1928 elif args[0] == '-e': 1929 if len(args) != 3: 1930 print(USAGE) 1931 sys.exit(1) 1932 1933 with ZipFile(args[1], 'r') as zf: 1934 zf.extractall(args[2]) 1935 1936 elif args[0] == '-c': 1937 if len(args) < 3: 1938 print(USAGE) 1939 sys.exit(1) 1940 1941 def addToZip(zf, path, zippath): 1942 if os.path.isfile(path): 1943 zf.write(path, zippath, ZIP_DEFLATED) 1944 elif os.path.isdir(path): 1945 if zippath: 1946 zf.write(path, zippath) 1947 for nm in os.listdir(path): 1948 addToZip(zf, 1949 os.path.join(path, nm), os.path.join(zippath, nm)) 1950 # else: ignore 1951 1952 with ZipFile(args[1], 'w') as zf: 1953 for path in args[2:]: 1954 zippath = os.path.basename(path) 1955 if not zippath: 1956 zippath = os.path.basename(os.path.dirname(path)) 1957 if zippath in ('', os.curdir, os.pardir): 1958 zippath = '' 1959 addToZip(zf, path, zippath) 1960 1961 if __name__ == "__main__": 1962 main()
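Reading the implementation is instructive, but everyday use needs only a few calls. A minimal sketch (demo.zip, a.txt and the unpacked/ directory are assumed example names):

# !/usr/bin/env python
import zipfile

# Write: create an archive; ZIP_DEFLATED enables zlib compression.
with zipfile.ZipFile("demo.zip", "w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write("a.txt")                     # add an existing file from disk
    zf.writestr("note.txt", "hello zip")  # add in-memory data (str is UTF-8 encoded)

# Read: list, verify and extract.
with zipfile.ZipFile("demo.zip", "r") as zf:
    print(zf.namelist())     # ['a.txt', 'note.txt']
    print(zf.read("note.txt"))  # b'hello zip'
    print(zf.testzip())      # None if every CRC checks out
    zf.extractall("unpacked")   # extract everything into ./unpacked

The same operations are available from the shell through the command-line interface noted above, e.g. python -m zipfile -l demo.zip.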
b、tarfile
(Full source of CPython's Lib/tarfile.py omitted here for readability. After the license header it defines the tar format constants (BLOCKSIZE = 512, the ustar/GNU/pax magic strings and member type flags), helper functions for header fields (stn()/nts() for null-terminated strings, nti()/itn() for the octal and base-256 number encodings, calc_chksums()), the exception hierarchy rooted at TarError (ReadError, CompressionError, StreamError, HeaderError and friends), the internal stream machinery (_LowLevelFile, _Stream with transparent gzip/bzip2/xz support, _StreamProxy, _FileInFile, ExFileObject), the TarInfo class that models one archive member including GNU long-name, sparse and pax extended headers, and the TarFile class with the public open()/getmembers()/add()/extract() API. The readable source ships with every Python installation as Lib/tarfile.py.)
1436 if format is not None: 1437 self.format = format 1438 if tarinfo is not None: 1439 self.tarinfo = tarinfo 1440 if dereference is not None: 1441 self.dereference = dereference 1442 if ignore_zeros is not None: 1443 self.ignore_zeros = ignore_zeros 1444 if encoding is not None: 1445 self.encoding = encoding 1446 self.errors = errors 1447 1448 if pax_headers is not None and self.format == PAX_FORMAT: 1449 self.pax_headers = pax_headers 1450 else: 1451 self.pax_headers = {} 1452 1453 if debug is not None: 1454 self.debug = debug 1455 if errorlevel is not None: 1456 self.errorlevel = errorlevel 1457 1458 # Init datastructures. 1459 self.closed = False 1460 self.members = [] # list of members as TarInfo objects 1461 self._loaded = False # flag if all members have been read 1462 self.offset = self.fileobj.tell() 1463 # current position in the archive file 1464 self.inodes = {} # dictionary caching the inodes of 1465 # archive members already added 1466 1467 try: 1468 if self.mode == "r": 1469 self.firstmember = None 1470 self.firstmember = self.next() 1471 1472 if self.mode == "a": 1473 # Move to the end of the archive, 1474 # before the first empty block. 1475 while True: 1476 self.fileobj.seek(self.offset) 1477 try: 1478 tarinfo = self.tarinfo.fromtarfile(self) 1479 self.members.append(tarinfo) 1480 except EOFHeaderError: 1481 self.fileobj.seek(self.offset) 1482 break 1483 except HeaderError as e: 1484 raise ReadError(str(e)) 1485 1486 if self.mode in ("a", "w", "x"): 1487 self._loaded = True 1488 1489 if self.pax_headers: 1490 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) 1491 self.fileobj.write(buf) 1492 self.offset += len(buf) 1493 except: 1494 if not self._extfileobj: 1495 self.fileobj.close() 1496 self.closed = True 1497 raise 1498 1499 #-------------------------------------------------------------------------- 1500 # Below are the classmethods which act as alternate constructors to the 1501 # TarFile class. The open() method is the only one that is needed for 1502 # public use; it is the "super"-constructor and is able to select an 1503 # adequate "sub"-constructor for a particular compression using the mapping 1504 # from OPEN_METH. 1505 # 1506 # This concept allows one to subclass TarFile without losing the comfort of 1507 # the super-constructor. A sub-constructor is registered and made available 1508 # by adding it to the mapping in OPEN_METH. 1509 1510 @classmethod 1511 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): 1512 """Open a tar archive for reading, writing or appending. Return 1513 an appropriate TarFile class. 
1514 1515 mode: 1516 'r' or 'r:*' open for reading with transparent compression 1517 'r:' open for reading exclusively uncompressed 1518 'r:gz' open for reading with gzip compression 1519 'r:bz2' open for reading with bzip2 compression 1520 'r:xz' open for reading with lzma compression 1521 'a' or 'a:' open for appending, creating the file if necessary 1522 'w' or 'w:' open for writing without compression 1523 'w:gz' open for writing with gzip compression 1524 'w:bz2' open for writing with bzip2 compression 1525 'w:xz' open for writing with lzma compression 1526 1527 'x' or 'x:' create a tarfile exclusively without compression, raise 1528 an exception if the file is already created 1529 'x:gz' create a gzip compressed tarfile, raise an exception 1530 if the file is already created 1531 'x:bz2' create a bzip2 compressed tarfile, raise an exception 1532 if the file is already created 1533 'x:xz' create an lzma compressed tarfile, raise an exception 1534 if the file is already created 1535 1536 'r|*' open a stream of tar blocks with transparent compression 1537 'r|' open an uncompressed stream of tar blocks for reading 1538 'r|gz' open a gzip compressed stream of tar blocks 1539 'r|bz2' open a bzip2 compressed stream of tar blocks 1540 'r|xz' open an lzma compressed stream of tar blocks 1541 'w|' open an uncompressed stream for writing 1542 'w|gz' open a gzip compressed stream for writing 1543 'w|bz2' open a bzip2 compressed stream for writing 1544 'w|xz' open an lzma compressed stream for writing 1545 """ 1546 1547 if not name and not fileobj: 1548 raise ValueError("nothing to open") 1549 1550 if mode in ("r", "r:*"): 1551 # Find out which *open() is appropriate for opening the file. 1552 for comptype in cls.OPEN_METH: 1553 func = getattr(cls, cls.OPEN_METH[comptype]) 1554 if fileobj is not None: 1555 saved_pos = fileobj.tell() 1556 try: 1557 return func(name, "r", fileobj, **kwargs) 1558 except (ReadError, CompressionError) as e: 1559 if fileobj is not None: 1560 fileobj.seek(saved_pos) 1561 continue 1562 raise ReadError("file could not be opened successfully") 1563 1564 elif ":" in mode: 1565 filemode, comptype = mode.split(":", 1) 1566 filemode = filemode or "r" 1567 comptype = comptype or "tar" 1568 1569 # Select the *open() function according to 1570 # given compression. 1571 if comptype in cls.OPEN_METH: 1572 func = getattr(cls, cls.OPEN_METH[comptype]) 1573 else: 1574 raise CompressionError("unknown compression type %r" % comptype) 1575 return func(name, filemode, fileobj, **kwargs) 1576 1577 elif "|" in mode: 1578 filemode, comptype = mode.split("|", 1) 1579 filemode = filemode or "r" 1580 comptype = comptype or "tar" 1581 1582 if filemode not in ("r", "w"): 1583 raise ValueError("mode must be 'r' or 'w'") 1584 1585 stream = _Stream(name, filemode, comptype, fileobj, bufsize) 1586 try: 1587 t = cls(name, filemode, stream, **kwargs) 1588 except: 1589 stream.close() 1590 raise 1591 t._extfileobj = False 1592 return t 1593 1594 elif mode in ("a", "w", "x"): 1595 return cls.taropen(name, mode, fileobj, **kwargs) 1596 1597 raise ValueError("undiscernible mode") 1598 1599 @classmethod 1600 def taropen(cls, name, mode="r", fileobj=None, **kwargs): 1601 """Open uncompressed tar archive name for reading or writing. 
1602 """ 1603 if mode not in ("r", "a", "w", "x"): 1604 raise ValueError("mode must be 'r', 'a', 'w' or 'x'") 1605 return cls(name, mode, fileobj, **kwargs) 1606 1607 @classmethod 1608 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1609 """Open gzip compressed tar archive name for reading or writing. 1610 Appending is not allowed. 1611 """ 1612 if mode not in ("r", "w", "x"): 1613 raise ValueError("mode must be 'r', 'w' or 'x'") 1614 1615 try: 1616 import gzip 1617 gzip.GzipFile 1618 except (ImportError, AttributeError): 1619 raise CompressionError("gzip module is not available") 1620 1621 try: 1622 fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj) 1623 except OSError: 1624 if fileobj is not None and mode == 'r': 1625 raise ReadError("not a gzip file") 1626 raise 1627 1628 try: 1629 t = cls.taropen(name, mode, fileobj, **kwargs) 1630 except OSError: 1631 fileobj.close() 1632 if mode == 'r': 1633 raise ReadError("not a gzip file") 1634 raise 1635 except: 1636 fileobj.close() 1637 raise 1638 t._extfileobj = False 1639 return t 1640 1641 @classmethod 1642 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1643 """Open bzip2 compressed tar archive name for reading or writing. 1644 Appending is not allowed. 1645 """ 1646 if mode not in ("r", "w", "x"): 1647 raise ValueError("mode must be 'r', 'w' or 'x'") 1648 1649 try: 1650 import bz2 1651 except ImportError: 1652 raise CompressionError("bz2 module is not available") 1653 1654 fileobj = bz2.BZ2File(fileobj or name, mode, 1655 compresslevel=compresslevel) 1656 1657 try: 1658 t = cls.taropen(name, mode, fileobj, **kwargs) 1659 except (OSError, EOFError): 1660 fileobj.close() 1661 if mode == 'r': 1662 raise ReadError("not a bzip2 file") 1663 raise 1664 except: 1665 fileobj.close() 1666 raise 1667 t._extfileobj = False 1668 return t 1669 1670 @classmethod 1671 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs): 1672 """Open lzma compressed tar archive name for reading or writing. 1673 Appending is not allowed. 1674 """ 1675 if mode not in ("r", "w", "x"): 1676 raise ValueError("mode must be 'r', 'w' or 'x'") 1677 1678 try: 1679 import lzma 1680 except ImportError: 1681 raise CompressionError("lzma module is not available") 1682 1683 fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset) 1684 1685 try: 1686 t = cls.taropen(name, mode, fileobj, **kwargs) 1687 except (lzma.LZMAError, EOFError): 1688 fileobj.close() 1689 if mode == 'r': 1690 raise ReadError("not an lzma file") 1691 raise 1692 except: 1693 fileobj.close() 1694 raise 1695 t._extfileobj = False 1696 return t 1697 1698 # All *open() methods are registered here. 1699 OPEN_METH = { 1700 "tar": "taropen", # uncompressed tar 1701 "gz": "gzopen", # gzip compressed tar 1702 "bz2": "bz2open", # bzip2 compressed tar 1703 "xz": "xzopen" # lzma compressed tar 1704 } 1705 1706 #-------------------------------------------------------------------------- 1707 # The public methods which TarFile provides: 1708 1709 def close(self): 1710 """Close the TarFile. In write-mode, two finishing zero blocks are 1711 appended to the archive. 
1712 """ 1713 if self.closed: 1714 return 1715 1716 self.closed = True 1717 try: 1718 if self.mode in ("a", "w", "x"): 1719 self.fileobj.write(NUL * (BLOCKSIZE * 2)) 1720 self.offset += (BLOCKSIZE * 2) 1721 # fill up the end with zero-blocks 1722 # (like option -b20 for tar does) 1723 blocks, remainder = divmod(self.offset, RECORDSIZE) 1724 if remainder > 0: 1725 self.fileobj.write(NUL * (RECORDSIZE - remainder)) 1726 finally: 1727 if not self._extfileobj: 1728 self.fileobj.close() 1729 1730 def getmember(self, name): 1731 """Return a TarInfo object for member `name'. If `name' can not be 1732 found in the archive, KeyError is raised. If a member occurs more 1733 than once in the archive, its last occurrence is assumed to be the 1734 most up-to-date version. 1735 """ 1736 tarinfo = self._getmember(name) 1737 if tarinfo is None: 1738 raise KeyError("filename %r not found" % name) 1739 return tarinfo 1740 1741 def getmembers(self): 1742 """Return the members of the archive as a list of TarInfo objects. The 1743 list has the same order as the members in the archive. 1744 """ 1745 self._check() 1746 if not self._loaded: # if we want to obtain a list of 1747 self._load() # all members, we first have to 1748 # scan the whole archive. 1749 return self.members 1750 1751 def getnames(self): 1752 """Return the members of the archive as a list of their names. It has 1753 the same order as the list returned by getmembers(). 1754 """ 1755 return [tarinfo.name for tarinfo in self.getmembers()] 1756 1757 def gettarinfo(self, name=None, arcname=None, fileobj=None): 1758 """Create a TarInfo object from the result of os.stat or equivalent 1759 on an existing file. The file is either named by `name', or 1760 specified as a file object `fileobj' with a file descriptor. If 1761 given, `arcname' specifies an alternative name for the file in the 1762 archive, otherwise, the name is taken from the 'name' attribute of 1763 'fileobj', or the 'name' argument. The name should be a text 1764 string. 1765 """ 1766 self._check("awx") 1767 1768 # When fileobj is given, replace name by 1769 # fileobj's real name. 1770 if fileobj is not None: 1771 name = fileobj.name 1772 1773 # Building the name of the member in the archive. 1774 # Backward slashes are converted to forward slashes, 1775 # Absolute paths are turned to relative paths. 1776 if arcname is None: 1777 arcname = name 1778 drv, arcname = os.path.splitdrive(arcname) 1779 arcname = arcname.replace(os.sep, "/") 1780 arcname = arcname.lstrip("/") 1781 1782 # Now, fill the TarInfo object with 1783 # information specific for the file. 1784 tarinfo = self.tarinfo() 1785 tarinfo.tarfile = self # Not needed 1786 1787 # Use os.stat or os.lstat, depending on platform 1788 # and if symlinks shall be resolved. 1789 if fileobj is None: 1790 if hasattr(os, "lstat") and not self.dereference: 1791 statres = os.lstat(name) 1792 else: 1793 statres = os.stat(name) 1794 else: 1795 statres = os.fstat(fileobj.fileno()) 1796 linkname = "" 1797 1798 stmd = statres.st_mode 1799 if stat.S_ISREG(stmd): 1800 inode = (statres.st_ino, statres.st_dev) 1801 if not self.dereference and statres.st_nlink > 1 and \ 1802 inode in self.inodes and arcname != self.inodes[inode]: 1803 # Is it a hardlink to an already 1804 # archived file? 1805 type = LNKTYPE 1806 linkname = self.inodes[inode] 1807 else: 1808 # The inode is added only if its valid. 1809 # For win32 it is always 0. 
1810 type = REGTYPE 1811 if inode[0]: 1812 self.inodes[inode] = arcname 1813 elif stat.S_ISDIR(stmd): 1814 type = DIRTYPE 1815 elif stat.S_ISFIFO(stmd): 1816 type = FIFOTYPE 1817 elif stat.S_ISLNK(stmd): 1818 type = SYMTYPE 1819 linkname = os.readlink(name) 1820 elif stat.S_ISCHR(stmd): 1821 type = CHRTYPE 1822 elif stat.S_ISBLK(stmd): 1823 type = BLKTYPE 1824 else: 1825 return None 1826 1827 # Fill the TarInfo object with all 1828 # information we can get. 1829 tarinfo.name = arcname 1830 tarinfo.mode = stmd 1831 tarinfo.uid = statres.st_uid 1832 tarinfo.gid = statres.st_gid 1833 if type == REGTYPE: 1834 tarinfo.size = statres.st_size 1835 else: 1836 tarinfo.size = 0 1837 tarinfo.mtime = statres.st_mtime 1838 tarinfo.type = type 1839 tarinfo.linkname = linkname 1840 if pwd: 1841 try: 1842 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0] 1843 except KeyError: 1844 pass 1845 if grp: 1846 try: 1847 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0] 1848 except KeyError: 1849 pass 1850 1851 if type in (CHRTYPE, BLKTYPE): 1852 if hasattr(os, "major") and hasattr(os, "minor"): 1853 tarinfo.devmajor = os.major(statres.st_rdev) 1854 tarinfo.devminor = os.minor(statres.st_rdev) 1855 return tarinfo 1856 1857 def list(self, verbose=True, *, members=None): 1858 """Print a table of contents to sys.stdout. If `verbose' is False, only 1859 the names of the members are printed. If it is True, an `ls -l'-like 1860 output is produced. `members' is optional and must be a subset of the 1861 list returned by getmembers(). 1862 """ 1863 self._check() 1864 1865 if members is None: 1866 members = self 1867 for tarinfo in members: 1868 if verbose: 1869 _safe_print(stat.filemode(tarinfo.mode)) 1870 _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid, 1871 tarinfo.gname or tarinfo.gid)) 1872 if tarinfo.ischr() or tarinfo.isblk(): 1873 _safe_print("%10s" % 1874 ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor))) 1875 else: 1876 _safe_print("%10d" % tarinfo.size) 1877 _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ 1878 % time.localtime(tarinfo.mtime)[:6]) 1879 1880 _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else "")) 1881 1882 if verbose: 1883 if tarinfo.issym(): 1884 _safe_print("-> " + tarinfo.linkname) 1885 if tarinfo.islnk(): 1886 _safe_print("link to " + tarinfo.linkname) 1887 print() 1888 1889 def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None): 1890 """Add the file `name' to the archive. `name' may be any type of file 1891 (directory, fifo, symbolic link, etc.). If given, `arcname' 1892 specifies an alternative name for the file in the archive. 1893 Directories are added recursively by default. This can be avoided by 1894 setting `recursive' to False. `exclude' is a function that should 1895 return True for each filename to be excluded. `filter' is a function 1896 that expects a TarInfo object argument and returns the changed 1897 TarInfo object, if it returns None the TarInfo object will be 1898 excluded from the archive. 1899 """ 1900 self._check("awx") 1901 1902 if arcname is None: 1903 arcname = name 1904 1905 # Exclude pathnames. 1906 if exclude is not None: 1907 import warnings 1908 warnings.warn("use the filter argument instead", 1909 DeprecationWarning, 2) 1910 if exclude(name): 1911 self._dbg(2, "tarfile: Excluded %r" % name) 1912 return 1913 1914 # Skip if somebody tries to archive the archive... 
1915 if self.name is not None and os.path.abspath(name) == self.name: 1916 self._dbg(2, "tarfile: Skipped %r" % name) 1917 return 1918 1919 self._dbg(1, name) 1920 1921 # Create a TarInfo object from the file. 1922 tarinfo = self.gettarinfo(name, arcname) 1923 1924 if tarinfo is None: 1925 self._dbg(1, "tarfile: Unsupported type %r" % name) 1926 return 1927 1928 # Change or exclude the TarInfo object. 1929 if filter is not None: 1930 tarinfo = filter(tarinfo) 1931 if tarinfo is None: 1932 self._dbg(2, "tarfile: Excluded %r" % name) 1933 return 1934 1935 # Append the tar header and data to the archive. 1936 if tarinfo.isreg(): 1937 with bltn_open(name, "rb") as f: 1938 self.addfile(tarinfo, f) 1939 1940 elif tarinfo.isdir(): 1941 self.addfile(tarinfo) 1942 if recursive: 1943 for f in os.listdir(name): 1944 self.add(os.path.join(name, f), os.path.join(arcname, f), 1945 recursive, exclude, filter=filter) 1946 1947 else: 1948 self.addfile(tarinfo) 1949 1950 def addfile(self, tarinfo, fileobj=None): 1951 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is 1952 given, it should be a binary file, and tarinfo.size bytes are read 1953 from it and added to the archive. You can create TarInfo objects 1954 directly, or by using gettarinfo(). 1955 """ 1956 self._check("awx") 1957 1958 tarinfo = copy.copy(tarinfo) 1959 1960 buf = tarinfo.tobuf(self.format, self.encoding, self.errors) 1961 self.fileobj.write(buf) 1962 self.offset += len(buf) 1963 1964 # If there's data to follow, append it. 1965 if fileobj is not None: 1966 copyfileobj(fileobj, self.fileobj, tarinfo.size) 1967 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE) 1968 if remainder > 0: 1969 self.fileobj.write(NUL * (BLOCKSIZE - remainder)) 1970 blocks += 1 1971 self.offset += blocks * BLOCKSIZE 1972 1973 self.members.append(tarinfo) 1974 1975 def extractall(self, path=".", members=None, *, numeric_owner=False): 1976 """Extract all members from the archive to the current working 1977 directory and set owner, modification time and permissions on 1978 directories afterwards. `path' specifies a different directory 1979 to extract to. `members' is optional and must be a subset of the 1980 list returned by getmembers(). If `numeric_owner` is True, only 1981 the numbers for user/group names are used and not the names. 1982 """ 1983 directories = [] 1984 1985 if members is None: 1986 members = self 1987 1988 for tarinfo in members: 1989 if tarinfo.isdir(): 1990 # Extract directories with a safe mode. 1991 directories.append(tarinfo) 1992 tarinfo = copy.copy(tarinfo) 1993 tarinfo.mode = 0o700 1994 # Do not set_attrs directories, as we will do that further down 1995 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), 1996 numeric_owner=numeric_owner) 1997 1998 # Reverse sort directories. 1999 directories.sort(key=lambda a: a.name) 2000 directories.reverse() 2001 2002 # Set correct owner, mtime and filemode on directories. 2003 for tarinfo in directories: 2004 dirpath = os.path.join(path, tarinfo.name) 2005 try: 2006 self.chown(tarinfo, dirpath, numeric_owner=numeric_owner) 2007 self.utime(tarinfo, dirpath) 2008 self.chmod(tarinfo, dirpath) 2009 except ExtractError as e: 2010 if self.errorlevel > 1: 2011 raise 2012 else: 2013 self._dbg(1, "tarfile: %s" % e) 2014 2015 def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): 2016 """Extract a member from the archive to the current working directory, 2017 using its full name. Its file information is extracted as accurately 2018 as possible. 
`member' may be a filename or a TarInfo object. You can 2019 specify a different directory using `path'. File attributes (owner, 2020 mtime, mode) are set unless `set_attrs' is False. If `numeric_owner` 2021 is True, only the numbers for user/group names are used and not 2022 the names. 2023 """ 2024 self._check("r") 2025 2026 if isinstance(member, str): 2027 tarinfo = self.getmember(member) 2028 else: 2029 tarinfo = member 2030 2031 # Prepare the link target for makelink(). 2032 if tarinfo.islnk(): 2033 tarinfo._link_target = os.path.join(path, tarinfo.linkname) 2034 2035 try: 2036 self._extract_member(tarinfo, os.path.join(path, tarinfo.name), 2037 set_attrs=set_attrs, 2038 numeric_owner=numeric_owner) 2039 except OSError as e: 2040 if self.errorlevel > 0: 2041 raise 2042 else: 2043 if e.filename is None: 2044 self._dbg(1, "tarfile: %s" % e.strerror) 2045 else: 2046 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename)) 2047 except ExtractError as e: 2048 if self.errorlevel > 1: 2049 raise 2050 else: 2051 self._dbg(1, "tarfile: %s" % e) 2052 2053 def extractfile(self, member): 2054 """Extract a member from the archive as a file object. `member' may be 2055 a filename or a TarInfo object. If `member' is a regular file or a 2056 link, an io.BufferedReader object is returned. Otherwise, None is 2057 returned. 2058 """ 2059 self._check("r") 2060 2061 if isinstance(member, str): 2062 tarinfo = self.getmember(member) 2063 else: 2064 tarinfo = member 2065 2066 if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: 2067 # Members with unknown types are treated as regular files. 2068 return self.fileobject(self, tarinfo) 2069 2070 elif tarinfo.islnk() or tarinfo.issym(): 2071 if isinstance(self.fileobj, _Stream): 2072 # A small but ugly workaround for the case that someone tries 2073 # to extract a (sym)link as a file-object from a non-seekable 2074 # stream of tar blocks. 2075 raise StreamError("cannot extract (sym)link as file object") 2076 else: 2077 # A (sym)link's file object is its target's file object. 2078 return self.extractfile(self._find_link_target(tarinfo)) 2079 else: 2080 # If there's no data associated with the member (directory, chrdev, 2081 # blkdev, etc.), return None instead of a file object. 2082 return None 2083 2084 def _extract_member(self, tarinfo, targetpath, set_attrs=True, 2085 numeric_owner=False): 2086 """Extract the TarInfo object tarinfo to a physical 2087 file called targetpath. 2088 """ 2089 # Fetch the TarInfo object for the given name 2090 # and build the destination pathname, replacing 2091 # forward slashes to platform specific separators. 2092 targetpath = targetpath.rstrip("/") 2093 targetpath = targetpath.replace("/", os.sep) 2094 2095 # Create all upper directories. 2096 upperdirs = os.path.dirname(targetpath) 2097 if upperdirs and not os.path.exists(upperdirs): 2098 # Create directories that are not part of the archive with 2099 # default permissions. 
2100 os.makedirs(upperdirs) 2101 2102 if tarinfo.islnk() or tarinfo.issym(): 2103 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname)) 2104 else: 2105 self._dbg(1, tarinfo.name) 2106 2107 if tarinfo.isreg(): 2108 self.makefile(tarinfo, targetpath) 2109 elif tarinfo.isdir(): 2110 self.makedir(tarinfo, targetpath) 2111 elif tarinfo.isfifo(): 2112 self.makefifo(tarinfo, targetpath) 2113 elif tarinfo.ischr() or tarinfo.isblk(): 2114 self.makedev(tarinfo, targetpath) 2115 elif tarinfo.islnk() or tarinfo.issym(): 2116 self.makelink(tarinfo, targetpath) 2117 elif tarinfo.type not in SUPPORTED_TYPES: 2118 self.makeunknown(tarinfo, targetpath) 2119 else: 2120 self.makefile(tarinfo, targetpath) 2121 2122 if set_attrs: 2123 self.chown(tarinfo, targetpath, numeric_owner) 2124 if not tarinfo.issym(): 2125 self.chmod(tarinfo, targetpath) 2126 self.utime(tarinfo, targetpath) 2127 2128 #-------------------------------------------------------------------------- 2129 # Below are the different file methods. They are called via 2130 # _extract_member() when extract() is called. They can be replaced in a 2131 # subclass to implement other functionality. 2132 2133 def makedir(self, tarinfo, targetpath): 2134 """Make a directory called targetpath. 2135 """ 2136 try: 2137 # Use a safe mode for the directory, the real mode is set 2138 # later in _extract_member(). 2139 os.mkdir(targetpath, 0o700) 2140 except FileExistsError: 2141 pass 2142 2143 def makefile(self, tarinfo, targetpath): 2144 """Make a file called targetpath. 2145 """ 2146 source = self.fileobj 2147 source.seek(tarinfo.offset_data) 2148 with bltn_open(targetpath, "wb") as target: 2149 if tarinfo.sparse is not None: 2150 for offset, size in tarinfo.sparse: 2151 target.seek(offset) 2152 copyfileobj(source, target, size, ReadError) 2153 target.seek(tarinfo.size) 2154 target.truncate() 2155 else: 2156 copyfileobj(source, target, tarinfo.size, ReadError) 2157 2158 def makeunknown(self, tarinfo, targetpath): 2159 """Make a file from a TarInfo object with an unknown type 2160 at targetpath. 2161 """ 2162 self.makefile(tarinfo, targetpath) 2163 self._dbg(1, "tarfile: Unknown file type %r, " \ 2164 "extracted as regular file." % tarinfo.type) 2165 2166 def makefifo(self, tarinfo, targetpath): 2167 """Make a fifo called targetpath. 2168 """ 2169 if hasattr(os, "mkfifo"): 2170 os.mkfifo(targetpath) 2171 else: 2172 raise ExtractError("fifo not supported by system") 2173 2174 def makedev(self, tarinfo, targetpath): 2175 """Make a character or block device called targetpath. 2176 """ 2177 if not hasattr(os, "mknod") or not hasattr(os, "makedev"): 2178 raise ExtractError("special devices not supported by system") 2179 2180 mode = tarinfo.mode 2181 if tarinfo.isblk(): 2182 mode |= stat.S_IFBLK 2183 else: 2184 mode |= stat.S_IFCHR 2185 2186 os.mknod(targetpath, mode, 2187 os.makedev(tarinfo.devmajor, tarinfo.devminor)) 2188 2189 def makelink(self, tarinfo, targetpath): 2190 """Make a (symbolic) link called targetpath. If it cannot be created 2191 (platform limitation), we try to make a copy of the referenced file 2192 instead of a link. 2193 """ 2194 try: 2195 # For systems that support symbolic and hard links. 2196 if tarinfo.issym(): 2197 os.symlink(tarinfo.linkname, targetpath) 2198 else: 2199 # See extract(). 
2200 if os.path.exists(tarinfo._link_target): 2201 os.link(tarinfo._link_target, targetpath) 2202 else: 2203 self._extract_member(self._find_link_target(tarinfo), 2204 targetpath) 2205 except symlink_exception: 2206 try: 2207 self._extract_member(self._find_link_target(tarinfo), 2208 targetpath) 2209 except KeyError: 2210 raise ExtractError("unable to resolve link inside archive") 2211 2212 def chown(self, tarinfo, targetpath, numeric_owner): 2213 """Set owner of targetpath according to tarinfo. If numeric_owner 2214 is True, use .gid/.uid instead of .gname/.uname. 2215 """ 2216 if pwd and hasattr(os, "geteuid") and os.geteuid() == 0: 2217 # We have to be root to do so. 2218 if numeric_owner: 2219 g = tarinfo.gid 2220 u = tarinfo.uid 2221 else: 2222 try: 2223 g = grp.getgrnam(tarinfo.gname)[2] 2224 except KeyError: 2225 g = tarinfo.gid 2226 try: 2227 u = pwd.getpwnam(tarinfo.uname)[2] 2228 except KeyError: 2229 u = tarinfo.uid 2230 try: 2231 if tarinfo.issym() and hasattr(os, "lchown"): 2232 os.lchown(targetpath, u, g) 2233 else: 2234 os.chown(targetpath, u, g) 2235 except OSError as e: 2236 raise ExtractError("could not change owner") 2237 2238 def chmod(self, tarinfo, targetpath): 2239 """Set file permissions of targetpath according to tarinfo. 2240 """ 2241 if hasattr(os, 'chmod'): 2242 try: 2243 os.chmod(targetpath, tarinfo.mode) 2244 except OSError as e: 2245 raise ExtractError("could not change mode") 2246 2247 def utime(self, tarinfo, targetpath): 2248 """Set modification time of targetpath according to tarinfo. 2249 """ 2250 if not hasattr(os, 'utime'): 2251 return 2252 try: 2253 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) 2254 except OSError as e: 2255 raise ExtractError("could not change modification time") 2256 2257 #-------------------------------------------------------------------------- 2258 def next(self): 2259 """Return the next member of the archive as a TarInfo object, when 2260 TarFile is opened for reading. Return None if there is no more 2261 available. 2262 """ 2263 self._check("ra") 2264 if self.firstmember is not None: 2265 m = self.firstmember 2266 self.firstmember = None 2267 return m 2268 2269 # Advance the file pointer. 2270 if self.offset != self.fileobj.tell(): 2271 self.fileobj.seek(self.offset - 1) 2272 if not self.fileobj.read(1): 2273 raise ReadError("unexpected end of data") 2274 2275 # Read the next block. 2276 tarinfo = None 2277 while True: 2278 try: 2279 tarinfo = self.tarinfo.fromtarfile(self) 2280 except EOFHeaderError as e: 2281 if self.ignore_zeros: 2282 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2283 self.offset += BLOCKSIZE 2284 continue 2285 except InvalidHeaderError as e: 2286 if self.ignore_zeros: 2287 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2288 self.offset += BLOCKSIZE 2289 continue 2290 elif self.offset == 0: 2291 raise ReadError(str(e)) 2292 except EmptyHeaderError: 2293 if self.offset == 0: 2294 raise ReadError("empty file") 2295 except TruncatedHeaderError as e: 2296 if self.offset == 0: 2297 raise ReadError(str(e)) 2298 except SubsequentHeaderError as e: 2299 raise ReadError(str(e)) 2300 break 2301 2302 if tarinfo is not None: 2303 self.members.append(tarinfo) 2304 else: 2305 self._loaded = True 2306 2307 return tarinfo 2308 2309 #-------------------------------------------------------------------------- 2310 # Little helper methods: 2311 2312 def _getmember(self, name, tarinfo=None, normalize=False): 2313 """Find an archive member by name from bottom to top. 2314 If tarinfo is given, it is used as the starting point. 
2315 """ 2316 # Ensure that all members have been loaded. 2317 members = self.getmembers() 2318 2319 # Limit the member search list up to tarinfo. 2320 if tarinfo is not None: 2321 members = members[:members.index(tarinfo)] 2322 2323 if normalize: 2324 name = os.path.normpath(name) 2325 2326 for member in reversed(members): 2327 if normalize: 2328 member_name = os.path.normpath(member.name) 2329 else: 2330 member_name = member.name 2331 2332 if name == member_name: 2333 return member 2334 2335 def _load(self): 2336 """Read through the entire archive file and look for readable 2337 members. 2338 """ 2339 while True: 2340 tarinfo = self.next() 2341 if tarinfo is None: 2342 break 2343 self._loaded = True 2344 2345 def _check(self, mode=None): 2346 """Check if TarFile is still open, and if the operation's mode 2347 corresponds to TarFile's mode. 2348 """ 2349 if self.closed: 2350 raise OSError("%s is closed" % self.__class__.__name__) 2351 if mode is not None and self.mode not in mode: 2352 raise OSError("bad operation for mode %r" % self.mode) 2353 2354 def _find_link_target(self, tarinfo): 2355 """Find the target member of a symlink or hardlink member in the 2356 archive. 2357 """ 2358 if tarinfo.issym(): 2359 # Always search the entire archive. 2360 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname))) 2361 limit = None 2362 else: 2363 # Search the archive before the link, because a hard link is 2364 # just a reference to an already archived file. 2365 linkname = tarinfo.linkname 2366 limit = tarinfo 2367 2368 member = self._getmember(linkname, tarinfo=limit, normalize=True) 2369 if member is None: 2370 raise KeyError("linkname %r not found" % linkname) 2371 return member 2372 2373 def __iter__(self): 2374 """Provide an iterator object. 2375 """ 2376 if self._loaded: 2377 return iter(self.members) 2378 else: 2379 return TarIter(self) 2380 2381 def _dbg(self, level, msg): 2382 """Write debugging output to sys.stderr. 2383 """ 2384 if level <= self.debug: 2385 print(msg, file=sys.stderr) 2386 2387 def __enter__(self): 2388 self._check() 2389 return self 2390 2391 def __exit__(self, type, value, traceback): 2392 if type is None: 2393 self.close() 2394 else: 2395 # An exception occurred. We must not call close() because 2396 # it would try to write end-of-archive blocks and padding. 2397 if not self._extfileobj: 2398 self.fileobj.close() 2399 self.closed = True 2400 # class TarFile 2401 2402 class TarIter: 2403 """Iterator Class. 2404 2405 for tarinfo in TarFile(...): 2406 suite... 2407 """ 2408 2409 def __init__(self, tarfile): 2410 """Construct a TarIter object. 2411 """ 2412 self.tarfile = tarfile 2413 self.index = 0 2414 def __iter__(self): 2415 """Return iterator object. 2416 """ 2417 return self 2418 def __next__(self): 2419 """Return the next item using TarFile's next() method. 2420 When all members have been read, set TarFile as _loaded. 2421 """ 2422 # Fix for SF #1100429: Under rare circumstances it can 2423 # happen that getmembers() is called during iteration, 2424 # which will cause TarIter to stop prematurely. 
2425 2426 if self.index == 0 and self.tarfile.firstmember is not None: 2427 tarinfo = self.tarfile.next() 2428 elif self.index < len(self.tarfile.members): 2429 tarinfo = self.tarfile.members[self.index] 2430 elif not self.tarfile._loaded: 2431 tarinfo = self.tarfile.next() 2432 if not tarinfo: 2433 self.tarfile._loaded = True 2434 raise StopIteration 2435 else: 2436 raise StopIteration 2437 self.index += 1 2438 return tarinfo 2439 2440 #-------------------- 2441 # exported functions 2442 #-------------------- 2443 def is_tarfile(name): 2444 """Return True if name points to a tar archive that we 2445 are able to handle, else return False. 2446 """ 2447 try: 2448 t = open(name) 2449 t.close() 2450 return True 2451 except TarError: 2452 return False 2453 2454 open = TarFile.open 2455 2456 2457 def main(): 2458 import argparse 2459 2460 description = 'A simple command line interface for tarfile module.' 2461 parser = argparse.ArgumentParser(description=description) 2462 parser.add_argument('-v', '--verbose', action='store_true', default=False, 2463 help='Verbose output') 2464 group = parser.add_mutually_exclusive_group() 2465 group.add_argument('-l', '--list', metavar='<tarfile>', 2466 help='Show listing of a tarfile') 2467 group.add_argument('-e', '--extract', nargs='+', 2468 metavar=('<tarfile>', '<output_dir>'), 2469 help='Extract tarfile into target dir') 2470 group.add_argument('-c', '--create', nargs='+', 2471 metavar=('<name>', '<file>'), 2472 help='Create tarfile from sources') 2473 group.add_argument('-t', '--test', metavar='<tarfile>', 2474 help='Test if a tarfile is valid') 2475 args = parser.parse_args() 2476 2477 if args.test: 2478 src = args.test 2479 if is_tarfile(src): 2480 with open(src, 'r') as tar: 2481 tar.getmembers() 2482 print(tar.getmembers(), file=sys.stderr) 2483 if args.verbose: 2484 print('{!r} is a tar archive.'.format(src)) 2485 else: 2486 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2487 2488 elif args.list: 2489 src = args.list 2490 if is_tarfile(src): 2491 with TarFile.open(src, 'r:*') as tf: 2492 tf.list(verbose=args.verbose) 2493 else: 2494 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2495 2496 elif args.extract: 2497 if len(args.extract) == 1: 2498 src = args.extract[0] 2499 curdir = os.curdir 2500 elif len(args.extract) == 2: 2501 src, curdir = args.extract 2502 else: 2503 parser.exit(1, parser.format_help()) 2504 2505 if is_tarfile(src): 2506 with TarFile.open(src, 'r:*') as tf: 2507 tf.extractall(path=curdir) 2508 if args.verbose: 2509 if curdir == '.': 2510 msg = '{!r} file is extracted.'.format(src) 2511 else: 2512 msg = ('{!r} file is extracted ' 2513 'into {!r} directory.').format(src, curdir) 2514 print(msg) 2515 else: 2516 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2517 2518 elif args.create: 2519 tar_name = args.create.pop(0) 2520 _, ext = os.path.splitext(tar_name) 2521 compressions = { 2522 # gz 2523 '.gz': 'gz', 2524 '.tgz': 'gz', 2525 # xz 2526 '.xz': 'xz', 2527 '.txz': 'xz', 2528 # bz2 2529 '.bz2': 'bz2', 2530 '.tbz': 'bz2', 2531 '.tbz2': 'bz2', 2532 '.tb2': 'bz2', 2533 } 2534 tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w' 2535 tar_files = args.create 2536 2537 with TarFile.open(tar_name, tar_mode) as tf: 2538 for file_name in tar_files: 2539 tf.add(file_name) 2540 2541 if args.verbose: 2542 print('{!r} file created.'.format(tar_name)) 2543 2544 else: 2545 parser.exit(1, parser.format_help()) 2546 2547 if __name__ == '__main__': 2548 main()
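A minimal usage sketch built on that API; the archive and directory names (demo.tar.gz, mydir, unpacked) are illustrative placeholders:

import tarfile

# Pack a directory into a gzip-compressed archive; "w:gz" routes to gzopen().
with tarfile.open("demo.tar.gz", "w:gz") as tar:
    tar.add("mydir")  # directories are added recursively by default

# Read it back; "r:*" autodetects the compression.
with tarfile.open("demo.tar.gz", "r:*") as tar:
    print(tar.getnames())            # member names, in archive order
    tar.extractall(path="unpacked")  # owner/mtime/mode are restored afterwards

Note that extractall() trusts the member paths stored in the archive, so it should not be used on archives from untrusted sources.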
PyYAML module
Python can also handle the YAML document format quite easily; it just requires installing a third-party module. See the documentation: http://pyyaml.org/wiki/PyYAMLDocumentation.
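A minimal round-trip sketch, assuming PyYAML has been installed (e.g. pip install PyYAML); safe_load/safe_dump are preferred over load/dump because they refuse to construct arbitrary Python objects:

import yaml  # provided by the PyYAML package

doc = """
name: Allister
skills:
  - Python
  - Linux
"""

data = yaml.safe_load(doc)   # YAML text -> Python objects (here a dict)
print(data["skills"])        # ['Python', 'Linux']

# Python objects -> YAML text; block style instead of inline {}/[]
print(yaml.safe_dump(data, default_flow_style=False))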
re regular expressions
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : re_test.py
# @Author: Allister.Liu
# @Date  : 2018/1/22
# @Desc  : regular expressions

import re

r"""
Regular expressions:
'.'      matches any character except \n; with the DOTALL flag it also matches newlines
'^'      matches the start of the string; with re.MULTILINE it also matches right after each newline, e.g. re.search(r"^a", "\nabc\neee", flags=re.MULTILINE)
'$'      matches the end of the string; re.search("foo$", "bfoo\nsdfsf", flags=re.MULTILINE).group() matches as well
'*'      matches the preceding character 0 or more times; re.findall("ab*", "cabb3abcbbac") gives ['abb', 'ab', 'a']
'+'      matches the preceding character 1 or more times; re.findall("ab+", "ab+cd+abb+bba") gives ['ab', 'abb']
'?'      matches the preceding character 0 or 1 time
'{m}'    matches the preceding character exactly m times
'{n,m}'  matches the preceding character n to m times; re.findall("ab{1,3}", "abb abc abbcbbb") gives ['abb', 'ab', 'abb']
'|'      matches the pattern on either side; re.search("abc|ABC", "ABCBabcCD").group() gives 'ABC'
'(...)'  group matching; re.search("(abc){2}a(123|456)c", "abcabca456c").group() gives 'abcabca456c'
'\A'     matches only at the start of the string; re.search(r"\Aabc", "alexabc") finds nothing
'\Z'     matches the end of the string, same as '$'
'\d'     matches a digit 0-9
'\D'     matches a non-digit
'\w'     matches [A-Za-z0-9]
'\W'     matches anything not in [A-Za-z0-9]
'\s'     matches whitespace such as \t, \n, \r; re.search(r"\s+", "ab\tc1\n3").group() gives '\t'

re.match    match from the beginning of the string
re.search   search the whole string for the first match
re.findall  return every match as a list
re.split    split the string wherever the pattern matches
re.sub      replace matches with the given string

Match flags:
re.I: ignore case
re.M: multiline mode, changes the behaviour of '^' and '$'
re.S: dot-all mode, changes the behaviour of '.'
"""

# Starts with "A", then 1-7 word characters, one or more digits, and word characters ending in "n"
print(re.match(r"^A\w{1,7}\d+\w*n$", "Allister12365HaoSen"))

# 18-digit ID card: 17 digits followed by a digit, "x" or "X"
print(re.match(r"^\d{17}(\d|x|X){1}$", "42210319630213275X"))

# 15-digit ID card: 15 digits from the start
print(re.match(r"^\d{15}", "422103196302132"))

# Starts with "A", one or more letters, followed by an "r"
print(re.search(r"^A[a-zA-Z]+r", "Allister123Allister&ds"))

# '?' matches the preceding character 0 or 1 time
print(re.search(r"aaa?", "aaEEEEaaa"))  # aa
print(re.findall(r"abf?.", "abf%dafsgaabfterftw"))

# Split into a list on the matched delimiter
print(re.split(r"[0-9]+", "rf123Allister89ljp"))  # ['rf', 'Allister', 'ljp']

# sub(pattern, repl, string, count=0, flags=0): replace matches, optionally limiting the number of replacements
print(re.sub(r"[0-9]+", "|", "rf123Allister89ljp", 5))  # rf|Allister|ljp

"""
Split an ID card number into province, city, county, year, month and day.
"""
# {'province': '51', 'city': '09', 'county': '21', 'year': '1990', 'month': '08', 'day': '06'}
print(re.search(r"(?P<province>[0-9]{2})(?P<city>[0-9]{2})(?P<county>[0-9]{2})"
                r"(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})",
                "51092119900806181X").groupdict())

# re.I: ignore case
print(re.search(r"[a-z]+", "abcdEFg", re.I))  # abcdEFg
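Two points worth knowing beyond the calls above: re.compile() pre-compiles a pattern for reuse, and the re.M/re.S flags change how '^' and '.' behave. A small sketch (the phone-number pattern is just illustrative):

import re

pattern = re.compile(r"\d{3}-\d{4}")  # compile once, reuse many times
print(pattern.findall("call 555-1234 or 555-9876"))  # ['555-1234', '555-9876']

text = "first\nsecond"
print(re.findall(r"^\w+", text))         # ['first']: '^' matches only at the start
print(re.findall(r"^\w+", text, re.M))   # ['first', 'second']: '^' matches at each line
print(re.search(r"first.second", text))                # None: '.' does not match '\n'
print(re.search(r"first.second", text, re.S).group())  # matches across the newline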