A crawler is, at its core, just a program that fetches data from a website or an API, filters it, and saves whatever parts you need.
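In the simplest case the whole pipeline is three steps: fetch, filter, save. A minimal single-threaded sketch of that pipeline, where the URL and the 'item' class name are stand-ins rather than a real target:

import requests
import bs4

# Fetch: download the raw page (URL is a placeholder).
resp = requests.get('https://example.com', timeout=10)
# Filter: keep only the elements you care about (class name is hypothetical).
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
items = soup.find_all('div', 'item')
# Save: write the kept fragments to disk.
with open('out.html', 'wb') as f:
    for item in items:
        f.write(item.encode('utf-8'))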
Below is a simple multithreaded crawler for Baidu search results, for reference only.
import os
import queue
import threading

import bs4
import requests


class GetWebAndHandle(threading.Thread):
    # Class attributes, shared by all threads: flipping 'work' to False
    # stops every worker.
    status = {
        'work': True,
        'path': 'your-path',
        'suffix': '.html'
    }
    headers = {
        'User-Agent': 'your-agent',
        'Host': 'www.baidu.com'
    }

    def __init__(self, kw, page_que):
        super(GetWebAndHandle, self).__init__()
        self.page_que = page_que
        self.url = 'https://www.baidu.com/s?wd=' + kw + '&pn='
        # One output file per thread, named after the thread (e.g. Thread-1).
        self.file_path_name = self.status['path'] + self.name + self.status['suffix']

    def run(self):
        # Start from a clean file on every run.
        if os.path.isfile(self.file_path_name):
            os.remove(self.file_path_name)
        while self.status['work']:
            try:
                # Non-blocking get: raises queue.Empty once no pages are left.
                url = self.url + self.page_que.get(False)
                response = requests.get(url, headers=self.headers, timeout=10)
                self.handle(response.text)
            except queue.Empty:
                self.status['work'] = False

    def handle(self, response):
        # Name the parser explicitly to avoid BeautifulSoup's
        # "no parser was explicitly specified" warning.
        soup = bs4.BeautifulSoup(response, 'html.parser')
        # Matches the exact class attribute of a Baidu result block
        # (the trailing space is part of the attribute value).
        sifted = soup.find_all('div', 'result c-container ')
        with open(self.file_path_name, mode='ab') as file:
            for sift in sifted:
                file.write(sift.encode('utf-8'))
                file.write(('-=-' * 30).encode('utf-8'))


if __name__ == '__main__':
    # ssl._create_default_https_context = ssl._create_unverified_context()
    kw = input('Keyword: ')
    t_num = min(max(int(input('Number of threads (1-4): ')), 1), 4)
    p_num = min(max(int(input('Number of pages (1-20): ')), 1), 20)

    # Baidu paginates with pn=0, 10, 20, ..., so enqueue multiples of ten.
    page_que = queue.Queue(p_num)
    for p in range(p_num):
        page_que.put(str(p) + '0')

    thread_list = []
    for i in range(t_num):
        spider_thread = GetWebAndHandle(kw, page_que)
        spider_thread.start()
        thread_list.append(spider_thread)

    # Wait for every worker to drain the queue before exiting.
    for spider_thread in thread_list:
        spider_thread.join()
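Note that status is a class attribute shared by all workers, so the first thread to find the queue empty stops every thread; the queue itself is what distributes the pages safely.

If you only want the titles and links rather than the raw HTML blocks, handle() can be tightened. A minimal sketch, assuming each result div carries its title inside an <a> tag; Baidu's markup changes over time, so treat the selector as an assumption:

    def handle(self, response):
        soup = bs4.BeautifulSoup(response, 'html.parser')
        with open(self.file_path_name, mode='ab') as file:
            # CSS-selector form is less fragile than matching the exact
            # class attribute string; 'div.result.c-container' is assumed
            # to still describe one search result.
            for sift in soup.select('div.result.c-container'):
                link = sift.find('a')  # first link, assumed to be the title
                if link is None:
                    continue
                line = link.get_text(strip=True) + '\t' + link.get('href', '') + '\n'
                file.write(line.encode('utf-8'))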