使用 zlib 压缩爬虫采集到的网页源码并保存到 MongoDB,以减少存储空间

import zlib

import pymongo
  3 
  4 def compress_html(infile,dst,level=9):
  5     '''[summary]
  6     
  7     [压缩文件]
  8     Arguments:
  9         infile {[string]} -- [输入文件路径]
 10         dst {[string]} -- [输出文件路径]
 11     
 12     Keyword Arguments:
 13         level {number} -- [压缩比例,压缩级别是一个0-9的数字,0压缩速度最快(压缩的过程),9压缩速度最慢,压缩率最大,0不压缩数据] (default: {9})
 14     '''
 15     infile = open(infile,'rb')
 16     dst = open(dst,'wb')
 17     compress = zlib.compressobj(level)
 18     data = infile.read(1024)
 19     while data:
 20         dst.write(compress.compress(data))
 21         data = infile.read(1024)
 22     dst.write(compress.flush())
 23     infile.close()
 24     dst.close()
 25 
 26 def decompress(infile, dst):
 27     '''[summary]
 28     
 29     [解压文件]
 30     
 31     Arguments:
 32         infile {[string]} -- [输入文件路径]
 33         dst {[string]} -- [输出文件路径]
 34     '''
 35     infile = open(infile, 'rb')
 36     dst = open(dst, 'wb')
 37     decompress = zlib.decompressobj()
 38     data = infile.read(1024)
 39     while data:
 40       dst.write(decompress.decompress(data))
 41       data = infile.read(1024)
 42     dst.write(decompress.flush())
 43     infile.close()
 44     dst.close()
 45 
 46 
 47 def compress_str(instr):
 48     '''[summary]
 49     
 50     [压缩字符串]
 51     
 52     Arguments:
 53         instr {[string]} -- [待压缩的字符串]
 54     '''
 55     # MONGODB 主机名
 56     MONGODB_HOST = "192.168.0.67"
 57     # MONGODB 端口号
 58     MONGODB_PORT = 27017
 59     # 数据库名称
 60     MONGODB_DBNAME = "CompressHtml"
 61     # 存放数据的表名称
 62     MONGODB_SHEETNAME = "compress_html"
 63 
 64     # 创建MONGODB数据库链接
 65     client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
 66     # 指定数据库
 67     mydb = client[MONGODB_DBNAME]
 68     # 存放数据的数据库表名
 69     postdb = mydb[MONGODB_SHEETNAME]
 70     compress_str = zlib.compress(instr.encode(encoding='utf-8'),level=9)
 71     print(type(compress_str))
 72     print(compress_str)
 73     postdb.insert_one({"cas":"350-03-8","英文名称":"3-Acetylpyridine","英文同义词":"NSC 761;FEMA 3424;Imatinib-int A;3-ActylPyridine;3-ACETOPYRIDINE;3-acetyl-pyridin;3-Acetylpiridine;3-Acetalpyridine;FEMA NUMBER 3424;3-ACETYLPYRIDINE"
 74         ,"中文名称":"3-乙酰基吡啶","html":compress_str})
 75     client.close()
 76 
 77 def decompress_str(bytes_data=None):
 78     '''[summary]
 79     
 80     [将二进制html文件解压成str]
 81     
 82     Arguments:
 83         bytes_data {[bytes]} -- [待解压的html]
 84     '''
 85     # MONGODB 主机名
 86     MONGODB_HOST = "192.168.0.67"
 87     # MONGODB 端口号
 88     MONGODB_PORT = 27017
 89     # 数据库名称
 90     MONGODB_DBNAME = "CompressHtml"
 91     # 存放数据的表名称
 92     MONGODB_SHEETNAME = "compress_html"
 93 
 94     # 创建MONGODB数据库链接
 95     client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
 96     # 指定数据库
 97     mydb = client[MONGODB_DBNAME]
 98     # 存放数据的数据库表名
 99     postdb = mydb[MONGODB_SHEETNAME]
100     data = postdb.find_one()
101     bytes_html = data.get("html")
102     print(type(bytes_html))
103     html_source = zlib.decompress(bytes_html)
104     print(type(html_source))
105     html_str = html_source.decode('utf-8','ignore')
106     print(type(html_str))
107     client.close()
108 
109 
110 
111 
if __name__ == "__main__":
    # Usage examples, left disabled (uncomment to run).
    #
    # File-to-file compression:
    #     compress_html("3-乙酰基吡啶 _ 350-03-8.html", "350-03-8_compress.html")
    #     compress_html("3-乙酰基吡啶 _ 350-03-8_noheaderfooter.txt", "350-03-8_compress.txt")
    #
    # Read a page as text, compress it and store it in MongoDB:
    #     with open("3-乙酰基吡啶 _ 350-03-8.html", 'r', encoding='utf-8') as f:
    #         data = f.read()
    #         print(type(data))
    #         compress_str(data)

    # With no argument, bytes_data takes its default of None.
    decompress_str()

 


 


压缩效果非常不错:源文件为 138 KB,压缩后为 19 KB,体积约为原来的 1/7,大规模存储网页源码到 MongoDB 时可以节省大量磁盘空间。


 

 

 MongoDB 支持存储 Binary(二进制)类型的数据,因此 zlib 压缩后得到的 bytes 可以直接写入文档字段。

 

posted @ 2020-10-21 09:43  kakaok  阅读(268)  评论(0)  编辑  收藏  举报