使用 zlib 压缩爬虫采集到的网页源码后再保存到 MongoDB,以减少存储空间
import zlib

import pymongo
3
def compress_html(infile, dst, level=9):
    '''Compress a file with zlib, streaming it in 1024-byte chunks.

    The input is never loaded into memory as a whole: each chunk is fed to
    a zlib compression object and the produced bytes are written straight
    to the output file.

    Arguments:
        infile {[string]} -- [path of the file to read]
        dst {[string]} -- [path of the compressed file to write; an existing
            file is overwritten]

    Keyword Arguments:
        level {number} -- [zlib compression level from 0 to 9: 0 stores the
            data without compressing it, 1 is the fastest, 9 is the slowest
            and gives the best compression] (default: {9})
    '''
    chunk_size = 1024
    # Build the compressor first so an invalid level fails before the
    # destination file is created/truncated.
    compressor = zlib.compressobj(level)
    # The context managers close both files even when reading, compressing
    # or writing raises part-way through.
    with open(infile, 'rb') as src, open(dst, 'wb') as out:
        for chunk in iter(lambda: src.read(chunk_size), b''):
            out.write(compressor.compress(chunk))
        # flush() emits whatever the compressor is still buffering plus the
        # stream trailer; without it the output is truncated.
        out.write(compressor.flush())
25
def decompress(infile, dst):
    '''Decompress a zlib-compressed file, streaming it in 1024-byte chunks.

    Arguments:
        infile {[string]} -- [path of the zlib-compressed file to read]
        dst {[string]} -- [path of the decompressed file to write; an
            existing file is overwritten]

    Raises:
        zlib.error -- [if the input is not a valid zlib stream]
    '''
    chunk_size = 1024
    inflater = zlib.decompressobj()
    # The context managers close both files even when the input turns out
    # to be corrupt and zlib raises in the middle of the stream.
    with open(infile, 'rb') as src, open(dst, 'wb') as out:
        for chunk in iter(lambda: src.read(chunk_size), b''):
            out.write(inflater.decompress(chunk))
        # Write out any data the decompressor is still holding back.
        out.write(inflater.flush())
45
46
def compress_str(instr):
    '''Compress a string with zlib and store it in MongoDB.

    The string is UTF-8 encoded, compressed at level 9, printed (type and
    raw bytes) for inspection, and inserted under the "html" key of a
    fixed sample document describing 3-Acetylpyridine.

    Arguments:
        instr {[string]} -- [the string to compress, e.g. a page source]
    '''
    # MongoDB host name
    MONGODB_HOST = "192.168.0.67"
    # MongoDB port
    MONGODB_PORT = 27017
    # Database name
    MONGODB_DBNAME = "CompressHtml"
    # Collection that holds the records
    MONGODB_SHEETNAME = "compress_html"

    # Compress before touching the database so bad input fails without
    # leaving a client open.
    compressed_html = zlib.compress(instr.encode(encoding='utf-8'), 9)
    print(type(compressed_html))
    print(compressed_html)

    client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
    try:
        postdb = client[MONGODB_DBNAME][MONGODB_SHEETNAME]
        postdb.insert_one({
            "cas": "350-03-8",
            "英文名称": "3-Acetylpyridine",
            "英文同义词": "NSC 761;FEMA 3424;Imatinib-int A;3-ActylPyridine;3-ACETOPYRIDINE;3-acetyl-pyridin;3-Acetylpiridine;3-Acetalpyridine;FEMA NUMBER 3424;3-ACETYLPYRIDINE",
            "中文名称": "3-乙酰基吡啶",
            "html": compressed_html,
        })
    finally:
        # Release the connection even when the insert fails.
        client.close()
76
def decompress_str(bytes_data=None):
    '''Decompress zlib-compressed HTML into a str.

    Arguments:
        bytes_data {[bytes]} -- [the compressed HTML; when None, the "html"
            field of the first document returned by the MongoDB collection
            is used instead] (default: {None})

    Returns:
        [string] -- [the decompressed HTML decoded as UTF-8; bytes that are
            not valid UTF-8 are dropped]

    Raises:
        LookupError -- [if no data was passed and MongoDB holds no document
            with an "html" field]
        zlib.error -- [if the data is not a valid zlib stream]
    '''
    bytes_html = bytes_data
    if bytes_html is None:
        # MongoDB host name
        MONGODB_HOST = "192.168.0.67"
        # MongoDB port
        MONGODB_PORT = 27017
        # Database name
        MONGODB_DBNAME = "CompressHtml"
        # Collection that holds the records
        MONGODB_SHEETNAME = "compress_html"

        client = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        try:
            data = client[MONGODB_DBNAME][MONGODB_SHEETNAME].find_one()
        finally:
            # Release the connection even when the query fails.
            client.close()
        # find_one() returns None on an empty collection; fail with a
        # clear message instead of an AttributeError on None.
        if data is None or data.get("html") is None:
            raise LookupError(
                "no document with a compressed 'html' field found in MongoDB"
            )
        bytes_html = data.get("html")

    print(type(bytes_html))
    html_source = zlib.decompress(bytes_html)
    print(type(html_source))
    html_str = html_source.decode('utf-8', 'ignore')
    print(type(html_str))
    return html_str
108
109
110
111
if __name__ == "__main__":
    # Earlier experiments, kept disabled for reference.
    #
    # File-to-file compression:
    #   compress_html("3-乙酰基吡啶 _ 350-03-8.html", "350-03-8_compress.html")
    #   compress_html("3-乙酰基吡啶 _ 350-03-8_noheaderfooter.txt", "350-03-8_compress.txt")
    #
    # Read a saved page and store its compressed source in MongoDB:
    #   with open("3-乙酰基吡啶 _ 350-03-8.html", 'r', encoding='utf-8') as f:
    #       data = f.read()
    #       # print(data)
    #       print(type(data))
    #       compress_str(data)

    # Active demo: no data is passed in, so the function reads the record
    # from MongoDB itself.
    decompress_str(None)
压缩效果非常不错:源文件由 138 KB 压缩到 19 KB,体积约为原来的 1/7.2(压缩比约 7.2:1),为大规模向 MongoDB 存储数据节省了大量磁盘空间。
MongoDB 支持存储 Binary(二进制)类型的数据,因此压缩后的字节串可以直接作为字段值写入文档。