Multithreaded Crawler Example (Qiushibaike)
Scrape the text jokes on Qiushibaike and observe how the URL changes from page to page: page 2 is https://www.qiushibaike.com/text/page/2/ and page 3 is https://www.qiushibaike.com/text/page/3/. The pattern is simple: only the trailing number needs to be replaced with the desired page number.
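For example, the URLs for a range of pages can be generated from that template with a format string. This is a minimal sketch; the page range 1 to 5 is chosen arbitrarily for illustration:

```python
# Build page URLs by substituting the page number into the fixed template.
page_url = 'https://www.qiushibaike.com/text/page/{}/'
urls = [page_url.format(page) for page in range(1, 6)]  # pages 1 to 5, arbitrary example range
print(urls[1])  # https://www.qiushibaike.com/text/page/2/
```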
Notes:
- Use requests to fetch each page and BeautifulSoup4 to extract the data from it
- For every post, collect the user's avatar link, username, joke text, comment count, and vote count
- Save the results to a JSON file, one JSON object per line (see the sketch after this list)
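As a minimal sketch of that output step, each record can be serialized with json.dumps and appended as a single line; the field values below are placeholders, not real scraped data:

```python
import json

# Placeholder record with the fields listed above (illustrative values only).
info = {
    "username": "example_user",
    "avatar_link": "https://example.com/avatar.jpg",
    "text": "example joke text",
    "comment_num": 0,
    "vote_num": 0,
}

# Append one JSON object per line; ensure_ascii=False keeps Chinese text readable in the file.
with open('./qiushi.json', 'a', encoding='utf-8') as f:
    f.write(json.dumps(info, ensure_ascii=False) + '\n')
```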
Example:
```python
# coding:utf-8
import requests
import json
from bs4 import BeautifulSoup
from threading import Thread, Lock, current_thread
from queue import Queue, Empty


class QiushiSpider:
    def __init__(self, st_page, end_page):
        self.page_q = Queue()   # queue of page numbers to fetch
        self.data_q = Queue()   # queue of raw page HTML to parse
        self.lock = Lock()      # mutex protecting file writes
        self.f = open('./qiushi.json', 'a', encoding='utf-8')  # append results
        self.st_page = st_page
        self.end_page = end_page
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'
        }
        self.url = 'https://www.qiushibaike.com'
        self.page_url = 'https://www.qiushibaike.com/text/page/{}/'

    def work(self):
        '''Entry point: start the crawler.'''
        # Fill the page-number queue.
        for page in range(self.st_page, self.end_page + 1):
            self.page_q.put(page)
        # Create and start 3 collector threads.
        collect_thread_list = []
        for i in range(3):
            t = Thread(target=self.collect)
            collect_thread_list.append(t)
            t.start()
        # Block the main thread until all collectors finish.
        for t in collect_thread_list:
            t.join()
        # Create and start 5 parser threads.
        parse_thread_list = []
        for i in range(5):
            t = Thread(target=self.parse)
            parse_thread_list.append(t)
            t.start()
        # Block the main thread until all parsers finish.
        for t in parse_thread_list:
            t.join()
        self.f.close()  # close the output file

    def collect(self):
        '''Fetch raw page HTML and push it onto the data queue.'''
        print("Collector thread {} started".format(current_thread()))
        while True:
            try:
                # get_nowait avoids the race between empty() and get() across threads
                page = self.page_q.get_nowait()
            except Empty:
                break
            data = requests.get(self.page_url.format(page), headers=self.headers).text
            self.data_q.put(data)

    def parse(self):
        '''Parse page HTML and write one JSON record per post.'''
        print("Parser thread {} started".format(current_thread()))
        while True:
            try:
                html = self.data_q.get_nowait()
            except Empty:
                break
            soup = BeautifulSoup(html, 'lxml')
            for element in soup.select('div[class="article block untagged mb15 typs_hot"]'):
                user_element = element.select('a[rel="nofollow"] img')[0]
                avatar_link = 'https:' + user_element.get('src')  # avatar URL
                username = user_element.get('alt')                # username
                # 'contentHerf' is the class name as it appears in the page markup
                text_link = self.url + element.select('a[class="contentHerf"]')[0].get('href')
                text_html = requests.get(text_link, headers=self.headers).text
                text_soup = BeautifulSoup(text_html, 'lxml')
                text = text_soup.select('div[class="content"]')[0].get_text()  # full joke text
                comment_num_list = element.select('span[class="stats-comments"] i')
                comment_num = comment_num_list[0].get_text() if comment_num_list else 0  # comment count
                vote_num_list = element.select('span[class="stats-vote"] i')
                vote_num = vote_num_list[0].get_text() if vote_num_list else 0  # vote count
                info = {
                    "username": username,
                    "avatar_link": avatar_link,
                    "text": text,
                    "comment_num": comment_num,
                    "vote_num": vote_num
                }
                with self.lock:
                    self.f.write(json.dumps(info, ensure_ascii=False) + '\n')


def main():
    st_page = int(input("Enter the start page to crawl: "))
    end_page = int(input("Enter the end page: "))
    qiushi = QiushiSpider(st_page, end_page)
    qiushi.work()  # start the crawler


if __name__ == '__main__':
    main()
```
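To run it, save the script (for example as qiushi_spider.py, a hypothetical filename) and start it with python qiushi_spider.py, then enter the start and end pages when prompted. The fetch stage could also be written with the standard-library thread pool instead of hand-rolled Thread/Queue plumbing; the sketch below is an alternative, not part of the original example, and the simplified User-Agent is a placeholder:

```python
from concurrent.futures import ThreadPoolExecutor
import requests

PAGE_URL = 'https://www.qiushibaike.com/text/page/{}/'
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # simplified placeholder UA, for illustration only

def fetch_page(page):
    # Plays the role of collect(): fetch one listing page and return its HTML.
    return requests.get(PAGE_URL.format(page), headers=HEADERS).text

def crawl(st_page, end_page):
    # A pool of 3 workers mirrors the 3 collector threads in the class above.
    with ThreadPoolExecutor(max_workers=3) as pool:
        return list(pool.map(fetch_page, range(st_page, end_page + 1)))
```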