问题
有时候我们遇到这样的压缩文件,手动用7zip解压到某个目录,打开目录后,里面还有不同压缩格式的其他文件。我们还要对它们分别手动解压,解压出来的文件目录里面可能还包含其他压缩文件。这样,我们可能需要层层解压才能获得自己想看的文件,琐碎且耗时。于是,用软件实现这项任务的自动化就挺有必要的。
设计
压缩文件内部的组织方式同样是目录结构,它与普通目录的区别在于:必须先解压,才能访问其中的内容。
而目录结构对应于树形数据结构。这样,问题可以转换为,如何遍历树形结构的每个节点(文件),解压文件并深入目录继续解压。
自然的,树形数据结构的节点遍历,采用DFS或者BFS算法即可,整体的时间复杂度与解压后文件数量及大小相关。
基于Python的一种实现样例
代码样例如下。类 `Decompressor` 提供 `decompress` 方法,该方法调用 `__decompress` 实现对目录和文件的 DFS 遍历。首先,判断文件是否是目录:是的话,就进入目录,列出文件并逐一查看每个文件,若是目录,则递归调用 `__decompress` 继续深入访问内部文件;否则,若文件名匹配 `remove_files_pattern`,则删除此文件,这通常是针对自己不关注的文件;否则,若想保留特定的文件而不做任何处理,可以设置 `ignore_files_pattern` 来匹配这些文件;否则,根据文件的后缀名,调用相应的解压缩模块解压。另外,如果想在解压后保留原来的压缩文件,将 `keep_origin_file` 设置为 `True` 即可。
import os
import re
import zipfile
import tarfile
import gzip
import lzma
def gz_decompress(infile, tofile):
    """Decompress the gzip file *infile* into the file *tofile*.

    The payload is copied as raw bytes in fixed-size chunks, so binary data
    and text in any encoding are reproduced exactly, and the whole file is
    never held in memory at once.

    Args:
        infile: Path of the gzip-compressed input file.
        tofile: Path of the output file; it is created or overwritten.

    Raises:
        OSError: If a file cannot be opened or *infile* is not valid gzip
            data (``gzip.BadGzipFile`` is a subclass of ``OSError``).
        EOFError: If the compressed stream is truncated.
    """
    with gzip.open(infile, 'rb') as inf, open(tofile, 'wb') as tof:
        # Copy bytes verbatim: decoding as UTF-8 would fail on binary content
        # and text-mode writing would rewrite line endings on some platforms.
        while chunk := inf.read(1 << 20):
            tof.write(chunk)
def xz_decompress(infile, tofile):
    """Decompress the xz/lzma file *infile* into the file *tofile*.

    The payload is copied as raw bytes in fixed-size chunks, so binary data
    and text in any encoding are reproduced exactly, and the whole file is
    never held in memory at once.

    Args:
        infile: Path of the xz-compressed input file.
        tofile: Path of the output file; it is created or overwritten.

    Raises:
        OSError: If a file cannot be opened.
        lzma.LZMAError: If *infile* is not valid xz/lzma data.
        EOFError: If the compressed stream is truncated.
    """
    with lzma.open(infile, 'rb') as inf, open(tofile, 'wb') as tof:
        # Copy bytes verbatim: decoding as UTF-8 would fail on binary content
        # and text-mode writing would rewrite line endings on some platforms.
        while chunk := inf.read(1 << 20):
            tof.write(chunk)
def match_patterns(file, pat_list):
    """Return True if *file* matches at least one pattern in *pat_list*.

    Args:
        file: The string to test (a file name or path).
        pat_list: Iterable of regular expressions. Each one is applied with
            ``re.search``, so a match anywhere inside *file* counts.

    Returns:
        True as soon as one pattern matches; False if none matches or
        *pat_list* is empty.
    """
    return any(re.search(pat, file) is not None for pat in pat_list)
class Decompressor:
    """Recursively unpack nested archives in place.

    Starting from a root file or directory, directories are walked
    depth-first and every file is handled by the first rule that applies
    (patterns are matched against the file's base name):

    1. name matches ``remove_files_pattern`` -> the file is deleted;
    2. name matches ``ignore_files_pattern`` -> the file is left untouched;
    3. tar suffix and the file really is a tar archive -> extracted into a
       sibling directory named after the part of the file name before its
       first dot, then that directory is visited;
    4. ``.zip`` -> extracted the same way, then the directory is visited;
    5. ``.xz`` / ``.gz`` -> decompressed next to the archive (suffix
       stripped) and the resulting file is visited again.

    After a successful extraction the archive is deleted unless
    ``keep_origin_file`` is true or the archive is the root file itself.
    Failures are reported with ``print`` and the traversal continues.

    NOTE: archives are extracted without member filtering, so a crafted
    archive may write outside its target directory. Only use this on trusted
    input (or add ``filter='data'`` to ``extractall`` where the running
    Python version supports it).

    Attributes:
        remove_files_pattern: Regexes of file names to delete.
        ignore_files_pattern: Regexes of file names to keep without
            decompressing them.
        keep_origin_file: If true, archives are kept after extraction.
        root_file: Path passed to the most recent ``decompress`` call.
    """

    # Suffixes that are first tried as (optionally compressed) tar archives.
    _TAR_SUFFIXES = ('.tar.gz', '.tgz', '.tar.xz', '.txz', '.tar.bz2', '.tar')

    def __init__(self, *, remove_files_pattern=None,
                 ignore_files_pattern=None, keep_origin_file=False) -> None:
        """Create a decompressor.

        Args:
            remove_files_pattern: Regexes of file names to delete; defaults
                to the sample pattern ``(example1|example2)``.
            ignore_files_pattern: Regexes of file names to leave where they
                are; defaults to the sample pattern ``(demo1)``.
            keep_origin_file: Keep archives after extracting them.
        """
        self.remove_files_pattern = (
            [r'(example1|example2)'] if remove_files_pattern is None
            else list(remove_files_pattern))
        self.ignore_files_pattern = (
            [r'(demo1)'] if ignore_files_pattern is None
            else list(ignore_files_pattern))
        self.keep_origin_file = keep_origin_file
        self.root_file = None

    def decompress(self, file):
        """Recursively decompress *file* (an archive or a directory)."""
        self.root_file = file
        self.__decompress(file)

    def __require_removal(self, file):
        """Return True if the name *file* matches a removal pattern."""
        return match_patterns(file, self.remove_files_pattern)

    def __need_ignore(self, file):
        """Return True if the name *file* matches an ignore pattern."""
        return match_patterns(file, self.ignore_files_pattern)

    def __remove_file(self, file):
        """Delete an extracted archive unless it must be kept.

        The root file is never deleted; paths are compared in full so that a
        nested file that merely shares the root's name is still removed.
        """
        if not self.keep_origin_file and file != self.root_file:
            os.remove(file)

    def __discard(self, file):
        """Delete an unwanted file, reporting (not raising) OS errors."""
        try:
            os.remove(file)
        except OSError as err:
            print("Remove file error: {0}, file {1}".format(
                err, os.path.abspath(file)))

    @staticmethod
    def __target_dir(file):
        """Return the directory an archive is extracted into.

        It lives next to the archive and is named after the part of the base
        name before the first dot; leading dots are skipped so hidden files
        such as ``.data.zip`` do not produce an empty name.
        """
        parent, name = os.path.split(file)
        stem = name.split('.')[0] or name.lstrip('.').split('.')[0]
        return os.path.join(parent, stem)

    def __visit_dir(self, directory):
        """Visit every entry of *directory* without changing the cwd."""
        try:
            entries = os.listdir(directory)
        except OSError as err:
            print("Directory visit error: {0}, file {1}".format(
                err, os.path.abspath(directory)))
            return
        # The listing is a snapshot: directories created by extraction are
        # visited explicitly by the extracting branch, not by this loop.
        for entry in entries:
            self.__decompress(os.path.join(directory, entry))

    def __extract_tar(self, file):
        """Extract *file* if it is a tar archive.

        Returns:
            False if the file is not a tar archive (the caller may then try
            another decompressor), True otherwise.
        """
        try:
            if not tarfile.is_tarfile(file):
                return False
        except Exception as err:
            print("Tarfile check error: {0}, file {1}".format(
                err, os.path.abspath(file)))
            return True
        next_dir = self.__target_dir(file)
        try:
            with tarfile.open(file) as f:
                f.extractall(next_dir)
            self.__remove_file(file)
        except Exception as err:
            print("Tarfile decompress error: {0}, file {1}".format(
                err, os.path.abspath(file)))
        # Visit whatever was extracted, even after a partial failure.
        if os.path.isdir(next_dir):
            self.__decompress(next_dir)
        return True

    def __extract_zip(self, file):
        """Extract a zip archive and visit the extracted directory."""
        target_dir = self.__target_dir(file)
        try:
            with zipfile.ZipFile(file, 'r') as zip_ref:
                # Created only once the archive has opened; an existing
                # directory is reused instead of aborting the traversal.
                os.makedirs(target_dir, exist_ok=True)
                zip_ref.extractall(target_dir)
            self.__remove_file(file)
        except Exception as err:
            print("zip file decompress error: {0}, file {1}".format(
                err, os.path.abspath(file)))
            return
        self.__decompress(target_dir)

    def __expand_stream(self, file, suffix, decompress_func, label):
        """Decompress a single-stream file (.gz/.xz) and visit the result."""
        output = file.removesuffix(suffix)
        try:
            decompress_func(file, output)
            self.__remove_file(file)
        except Exception as err:
            print("{0} decompress error: {1}, file {2}".format(
                label, err, os.path.abspath(file)))
            return
        # The result may itself be an archive (e.g. "data.zip.gz").
        self.__decompress(output)

    def __decompress(self, file):
        """Visit one node of the tree.

        A directory is walked entry by entry. A file is removed, ignored or
        decompressed according to its base name; anything produced by a
        decompression step is visited in turn.
        """
        if os.path.isdir(file):
            self.__visit_dir(file)
            return
        name = os.path.basename(file)
        if self.__require_removal(name):
            self.__discard(file)
            return
        if self.__need_ignore(name):
            return
        if name.endswith(self._TAR_SUFFIXES) and self.__extract_tar(file):
            return
        # Reached for non-tar files, including files with a tar suffix that
        # turned out not to be tar archives (plain gzip/xz fallback).
        if name.endswith('.zip'):
            self.__extract_zip(file)
        elif name.endswith('.xz'):
            self.__expand_stream(file, '.xz', xz_decompress, 'XZ')
        elif name.endswith('.gz'):
            self.__expand_stream(file, '.gz', gz_decompress, 'GZ')
# Demo entry point: build a Decompressor and unpack "test.zip", a path that is
# relative to the current working directory.
# NOTE(review): these statements run whenever the module is imported, not only
# when it is executed as a script; consider an `if __name__ == "__main__":` guard.
test = Decompressor()
test.decompress("test.zip")