Python实现复杂压缩文件的解压

问题

有时候我们遇到这样的压缩文件，手动用7zip解压到某个目录，打开目录后，里面还有不同压缩格式的其他文件。我们还要对它们分别手动解压，解压出来的文件目录里面可能还包含其他压缩文件。这样，我们可能需要层层解压才能获得自己想看的文件，琐碎且耗时。于是，用软件实现这项任务的自动化就挺有必要的。

设计

压缩文件内部同样是按目录结构组织的，它与普通目录的区别仅在于：必须先解压，才能访问其中的内容。
而目录结构对应于树形数据结构。这样，问题可以转换为，如何遍历树形结构的每个节点（文件），解压文件并深入目录继续解压。
自然的，树形数据结构的节点遍历，采用DFS或者BFS算法即可，整体的时间复杂度与解压后文件数量及大小相关。

基于Python的一种实现样例

代码样例如下，类Decompressor提供decompress方法，它调用__decompress实现对目录和文件的DFS遍历。首先，判断文件是否是目录，是的话，就进入目录，列出文件并逐一查看每个文件，若是目录，则递归调用__decompress继续深入访问内部文件；否则，若文件名匹配remove_files_pattern，则删除此文件，这通常是针对自己不关注的文件；否则，若想保留特定的文件而不做任何处理，可以设置ignore_files_pattern来匹配这些文件；否则，根据文件的后缀名，调用解压缩模块解压。另外，如果想解压后保留原来的压缩文件，设置keep_origin_file为True即可。

import os
import re
import zipfile
import tarfile
import gzip
import lzma

def gz_decompress(infile, tofile):
    """Decompress the gzip file ``infile`` into ``tofile``.

    The payload is copied as raw bytes, so binary content (a compressed
    tarball, an image, text in any encoding) is written out unchanged
    instead of being forced through a UTF-8 decode. Data is streamed in
    fixed-size chunks so large files are never held in memory at once.

    Args:
        infile: Path of the gzip-compressed file to read.
        tofile: Path of the output file; created or overwritten.

    Raises:
        OSError: If either file cannot be opened, or ``infile`` is not
            valid gzip data (``gzip.BadGzipFile`` is an ``OSError``).
        EOFError: If the compressed stream is truncated.
    """
    chunk_size = 1024 * 1024
    with gzip.open(infile, 'rb') as inf, open(tofile, 'wb') as tof:
        while True:
            chunk = inf.read(chunk_size)
            if not chunk:
                break
            tof.write(chunk)

def xz_decompress(infile, tofile):
    """Decompress the xz/lzma file ``infile`` into ``tofile``.

    The payload is copied as raw bytes, so binary or non-UTF-8 content is
    written out unchanged instead of failing in a UTF-8 decode. Data is
    streamed in fixed-size chunks to keep memory use bounded.

    Args:
        infile: Path of the xz-compressed file to read.
        tofile: Path of the output file; created or overwritten.

    Raises:
        OSError: If either file cannot be opened.
        lzma.LZMAError: If ``infile`` is not valid xz/lzma data.
        EOFError: If the compressed stream is truncated.
    """
    chunk_size = 1024 * 1024
    with lzma.open(infile, 'rb') as inf, open(tofile, 'wb') as tof:
        while True:
            chunk = inf.read(chunk_size)
            if not chunk:
                break
            tof.write(chunk)

def match_patterns(file, pat_list):
    """Tell whether ``file`` is matched by at least one regex in ``pat_list``.

    Each pattern is applied with ``search`` semantics, so it may match
    anywhere inside ``file``. Patterns are tried in order and evaluation
    stops at the first hit.

    Args:
        file: The string (typically a file name) to test.
        pat_list: Iterable of regular-expression source strings.

    Returns:
        True if any pattern matches, otherwise False (including when
        ``pat_list`` is empty).
    """
    return any(
        re.compile(expr).search(file) is not None
        for expr in pat_list
    )

class Decompressor:
    """Recursively unpack an archive, or a directory tree that contains archives.

    Starting from the path given to :meth:`decompress`, directories are
    walked depth-first. Every regular file is, in this order of precedence:

    1. deleted, if its name matches ``remove_files_pattern``;
    2. left untouched, if its name matches ``ignore_files_pattern``;
    3. unpacked according to its suffix (tar archives, ``.zip``, ``.xz``,
       ``.gz``); directories produced by tar/zip extraction are walked in
       turn so nested archives get unpacked as well.

    Patterns are matched against the bare file name for files found while
    walking, and against the path exactly as given for the root argument.

    The walk never changes the process working directory; all file access
    uses paths built from the root argument.
    """

    # Suffixes handed to the tarfile module, which detects the compression
    # itself. ``.tgz`` and ``.tar.xz`` are the common aliases of ``.tar.gz``
    # and ``.txz``.
    _TAR_SUFFIXES = ('.tar.gz', '.tgz', '.tar.xz', '.txz')

    def __init__(self) -> None:
        """Set up the default patterns and options."""
        # Regexes for files that should be deleted instead of unpacked.
        self.remove_files_pattern = [
            r'(example1|example2)']
        # Regexes for files that are neither unpacked nor deleted.
        self.ignore_files_pattern = [r'(demo1)']
        # When True, archives are kept after a successful extraction.
        self.keep_origin_file = False
        # Path passed to decompress(); it is never deleted after extraction.
        self.root_file = None

    def decompress(self, file):
        """Unpack ``file`` (an archive or a directory) and everything nested in it.

        Args:
            file: Path of the archive or directory to process.
        """
        self.root_file = file
        self.__decompress(file)

    def __require_removal(self, file):
        """Return True if ``file`` matches one of ``remove_files_pattern``."""
        return match_patterns(file, self.remove_files_pattern)

    def __need_ignore(self, file):
        """Return True if ``file`` matches one of ``ignore_files_pattern``."""
        return match_patterns(file, self.ignore_files_pattern)

    def __remove_file(self, file):
        """Delete an already-unpacked archive unless it must be kept.

        The archive is kept when ``keep_origin_file`` is set or when it is
        the root path originally passed to :meth:`decompress`.
        """
        if self.keep_origin_file or self.root_file == file:
            return
        os.remove(file)

    @staticmethod
    def __target_dir(file):
        """Return the directory an archive is extracted into.

        The directory sits next to the archive and is named after the part
        of the archive's file name before the first dot. Only the base name
        is split, so dots in parent directories do not truncate the path.
        """
        parent, base = os.path.split(file)
        stem = base.split('.')[0]
        if not stem:
            # Names starting with a dot (e.g. ".cache.zip") have an empty
            # first component; derive a usable, non-empty name instead.
            stem = base + '.extracted'
        return os.path.join(parent, stem)

    def __untar(self, file):
        """Extract a tar archive next to itself and walk the result."""
        try:
            is_tar = tarfile.is_tarfile(file)
        except Exception as err:
            print("Tarfile check error: {0}, file {1}".format(
                err, os.path.abspath(file)))
            return
        if not is_tar:
            return
        next_dir = self.__target_dir(file)
        try:
            with tarfile.open(file) as archive:
                if hasattr(tarfile, 'data_filter'):
                    # Reject members that would escape next_dir (absolute
                    # paths, "..", unsafe links) where the interpreter
                    # supports extraction filters.
                    archive.extractall(next_dir, filter='data')
                else:
                    archive.extractall(next_dir)
            self.__remove_file(file)
        except Exception as err:
            print("Tarfile decompress error: {0}, file {1}".format(
                err, os.path.abspath(file)))
        # Walk whatever was extracted, even after a partial failure.
        if os.path.isdir(next_dir):
            self.__decompress(next_dir)

    def __unzip(self, file):
        """Extract a zip archive next to itself and walk the result."""
        target_dir = self.__target_dir(file)
        try:
            # exist_ok: a directory left by an earlier run or by a sibling
            # archive with the same stem must not abort the whole walk.
            os.makedirs(target_dir, exist_ok=True)
            with zipfile.ZipFile(file, 'r') as zip_ref:
                zip_ref.extractall(target_dir)
            self.__remove_file(file)
        except Exception as err:
            print("zip file decompress error: {0}, file {1}".format(
                err, os.path.abspath(file)))
            return
        # Recurse outside the try block so failures of nested archives are
        # reported against the nested file, not against this one.
        self.__decompress(target_dir)

    def __expand_single(self, file, suffix, expand, label):
        """Decompress a single-file stream (``.gz`` / ``.xz``) in place.

        ``expand(src, dst)`` does the work; the output path is ``file``
        with ``suffix`` stripped.
        """
        try:
            expand(file, file.removesuffix(suffix))
            self.__remove_file(file)
        except Exception as err:
            print("{0} decompress error: {1}, file {2}".format(
                label, err, os.path.abspath(file)))

    def __decompress(self, file, name=None):
        """Process one node of the tree: a directory, an archive or a plain file.

        Args:
            file: Path used for all file-system access.
            name: String tested against the patterns and suffixes. Defaults
                to ``file`` itself, which is what happens for the root.
        """
        if name is None:
            name = file
        if os.path.isdir(file):
            try:
                children = os.listdir(file)
            except OSError as err:
                print("List directory error: {0}, file {1}".format(
                    err, os.path.abspath(file)))
                return
            for child in children:
                self.__decompress(os.path.join(file, child), child)
        elif self.__require_removal(name):
            try:
                os.remove(file)
            except OSError as err:
                print("Remove file error: {0}, file {1}".format(
                    err, os.path.abspath(file)))
        elif self.__need_ignore(name):
            return
        elif name.endswith(self._TAR_SUFFIXES):
            self.__untar(file)
        elif name.endswith('.zip'):
            self.__unzip(file)
        elif name.endswith('.xz'):
            self.__expand_single(file, '.xz', xz_decompress, 'XZ')
        elif name.endswith('.gz'):
            # Only plain gzip streams reach this branch; ".tar.gz" and
            # ".tgz" are claimed by the tar branch above.
            self.__expand_single(file, '.gz', gz_decompress, 'GZ')

if __name__ == "__main__":
    # Demo entry point: unpack "test.zip" from the current working directory.
    # Guarded so that importing this module does not touch the file system.
    test = Decompressor()
    test.decompress("test.zip")

posted on 2023-03-29 16:42 RayChenCode 阅读(251) 评论(0) 编辑收藏举报

刷新页面返回顶部

Ray

公告

问题

设计

基于Python的一种实现样例