python requests 爬取百度贴吧
1 import requests
2 import os
3 import shutil
4 import time
5
6
class PostBarSpider(object):
    """Crawl the first N listing pages of a Baidu Tieba forum and save each
    page's raw HTML into its own ``.txt`` file under a per-forum directory.

    The output directory is wiped and recreated on every run, so each run
    starts from a clean slate.
    """

    def __init__(self, post_bar, page_number, file_dir):
        # Name of the forum (the "kw" query parameter) to crawl.
        self.post_bar = post_bar
        # How many listing pages to fetch, starting from the first.
        self.page = page_number
        # Base directory under which a per-forum subdirectory is created.
        self.file_dir = file_dir
        # Tieba paginates with pn = 50 * page_index (50 threads per page).
        self.url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        # A desktop UA so the server returns the normal desktop page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
        }

    def get_url_list(self):
        """Return the list of listing-page URLs for pages 0 .. self.page-1."""
        return [self.url.format(self.post_bar, i * 50) for i in range(self.page)]

    def set_dir(self):
        """Create a fresh per-forum output directory, removing any old one.

        On failure the error is reported (including the underlying exception,
        which the original code silently discarded) but not re-raised, to
        preserve the best-effort behavior.
        """
        self.file_dir += "/{}/".format(self.post_bar)
        print("保存路径-----{}".format(self.file_dir))
        try:
            # Wipe any previous run's output so stale pages don't linger.
            if os.path.exists(self.file_dir):
                shutil.rmtree(self.file_dir)
            os.makedirs(self.file_dir)
        except Exception as info:
            # Surface the actual error instead of swallowing it.
            print("创建或删除文件夹出现问题: {}".format(info))

    def run(self):
        """Fetch every listing page and write its HTML to '第N页.txt'."""
        # Prepare a clean output directory.
        self.set_dir()
        # Build the page URLs to download.
        url_list = self.get_url_list()

        for page_no, url in enumerate(url_list, start=1):
            # Timeout prevents a dead connection from hanging the crawl.
            response = requests.get(url, headers=self.headers, timeout=30)
            if response.status_code == 200:
                file_name = os.path.join(self.file_dir, "第{}页.txt".format(page_no))
                # Context manager guarantees the file is closed even if
                # the write raises.
                with open(file_name, "w", encoding="utf-8") as file:
                    file.write(response.content.decode())
                print("已写入第{}页".format(page_no))
58
59
def main():
    """Crawl 20 pages of the 李毅 forum and report the elapsed time."""
    # perf_counter() is monotonic and meant for measuring elapsed time;
    # time.time() is wall-clock and can jump (NTP, DST).
    start = time.perf_counter()
    my_spider = PostBarSpider("李毅", 20, "f:/post_bar_test")
    my_spider.run()
    end = time.perf_counter()
    print("耗时-----{}s".format(end - start))
66
67
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()