A Multithreaded Crawler Example (Qiushibaike)

We crawl the text posts (duanzi) on Qiushibaike. Looking at how the URL changes from page to page, page 2 is https://www.qiushibaike.com/text/page/2/ and page 3 is https://www.qiushibaike.com/text/page/3/, so the pattern is simple: just replace the trailing number with the desired page number.
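
For illustration, a minimal sketch of building those page URLs from the template string (the same template the crawler below stores as self.page_url):

page_url = 'https://www.qiushibaike.com/text/page/{}/'
for page in range(2, 4):
    print(page_url.format(page))
    # prints https://www.qiushibaike.com/text/page/2/ and https://www.qiushibaike.com/text/page/3/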

Notes:

  • Use requests to fetch each page and BeautifulSoup4 to extract data from it
  • For each post, collect the user's avatar URL, the username, the post text, the number of comments, and the number of votes
  • Save the results to a JSON file

Example:

# coding:utf-8
import requests
import json

from bs4 import BeautifulSoup
from threading import Thread, Lock, current_thread
from queue import Queue, Empty


class QiushiSpider:
    def __init__(self, st_page, end_page):
        self.page_q = Queue()  # queue of page numbers to crawl
        self.data_q = Queue()  # queue of raw page HTML
        self.lock = Lock()     # mutex protecting the output file
        self.f = open('./qiushi.json', 'a', encoding='utf-8')  # append records to the output file
        self.st_page = st_page
        self.end_page = end_page
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'
        }
        self.url = 'https://www.qiushibaike.com'
        self.page_url = 'https://www.qiushibaike.com/text/page/{}/'

    def work(self):
        '''Entry point that drives the whole crawl.'''
        # Put the page numbers to crawl into the page queue
        for page in range(self.st_page, self.end_page + 1):
            self.page_q.put(page)

        # Create and start 3 collector threads
        collect_thread_list = []
        for i in range(3):
            t = Thread(target=self.collect)
            collect_thread_list.append(t)
            t.start()

        # Block the main thread until all collector threads finish
        for t in collect_thread_list:
            t.join()

        # Create and start 5 parser threads
        parse_thread_list = []
        for i in range(5):
            t = Thread(target=self.parse)
            parse_thread_list.append(t)
            t.start()

        # Block the main thread until all parser threads finish
        for t in parse_thread_list:
            t.join()
        self.f.close()  # close the output file

    def collect(self):
        '''Fetch the raw HTML of each listing page.'''
        print("Collector thread {} started".format(current_thread().name))
        while True:
            # get_nowait() avoids the race between empty() and get() when
            # several threads drain the last items from the queue at once
            try:
                page = self.page_q.get_nowait()
            except Empty:
                break
            data = requests.get(self.page_url.format(page), headers=self.headers).text
            self.data_q.put(data)

    def parse(self):
        '''Parse the raw HTML and write one JSON record per post.'''
        print("Parser thread {} started".format(current_thread().name))
        while True:
            # collectors have already joined, so the data queue is fully populated;
            # get_nowait() lets each parser exit cleanly once the queue is drained
            try:
                html = self.data_q.get_nowait()
            except Empty:
                break
            soup = BeautifulSoup(html, 'lxml')
            for element in soup.select('div[class="article block untagged mb15 typs_hot"]'):
                user_element = element.select('a[rel="nofollow"] img')[0]
                avatar_link = 'https:' + user_element.get('src')  # avatar URL
                username = user_element.get('alt')  # username
                text_link = self.url + element.select('a[class="contentHerf"]')[0].get('href')
                text_html = requests.get(text_link, headers=self.headers).text
                text_soup = BeautifulSoup(text_html, 'lxml')
                text = text_soup.select('div[class="content"]')[0].get_text()  # full post text
                comment_num_list = element.select('span[class="stats-comments"] i')
                if not comment_num_list:
                    comment_num = 0
                else:
                    comment_num = comment_num_list[0].get_text()  # number of comments
                vote_num_list = element.select('span[class="stats-vote"] i')
                if not vote_num_list:
                    vote_num = 0
                else:
                    vote_num = vote_num_list[0].get_text()  # number of votes
                info = {
                    "username": username,
                    "avatar_link": avatar_link,
                    "text": text,
                    "comment_num": comment_num,
                    "vote_num": vote_num
                }
                with self.lock:
                    self.f.write(json.dumps(info, ensure_ascii=False) + '\n')

def main():
    st_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    qiushi = QiushiSpider(st_page, end_page)
    qiushi.work()  # start the crawler

if __name__ == '__main__':
    main()
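
Because each record is written as one JSON object per line (ensure_ascii=False keeps the Chinese text readable), the resulting qiushi.json can be read back line by line. A minimal sketch, assuming the file path used above:

import json

with open('./qiushi.json', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['username'], record['vote_num'])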
