Crawler Learning (17): A Multithreaded Scraping Example
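
This example wires a producer-consumer pipeline out of the standard threading and queue modules: five crawl threads pull page numbers from queue_url and fetch the corresponding pages from qiushibaike.com, while three parse threads pull the raw HTML from queue_html, extract each post with XPath, and append the records to a file as JSON.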

import json
from queue import Queue, Empty
from threading import Thread, Lock

import requests
from lxml import etree

# Page-number template for the target site.
url = 'https://www.qiushibaike.com/text/page/%d/'

# queue_url holds page numbers to fetch; queue_html holds (html, page) pairs.
queue_url = Queue(13)
queue_html = Queue(13)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9'}

# Flipped to True by the main thread once both queues are drained,
# telling the parse threads to stop polling.
exitFlag = False

# Serializes file writes from the parse threads.
write_lock = Lock()


class CrawlThread(Thread):
    """Producer: pulls page numbers from queue_url, fetches the HTML,
    and pushes (html, page) onto queue_html."""

    def __init__(self, queue, thread_id) -> None:
        super().__init__()
        self.queue = queue
        self.thread_id = thread_id

    def run(self) -> None:
        print('------------- Crawl thread %d started -------------' % self.thread_id)
        self.get_html()
        print('------------- Crawl thread %d finished -------------' % self.thread_id)

    def get_html(self):
        while True:
            # Exit once the page-number queue is drained.
            if self.queue.empty():
                break
            try:
                page = self.queue.get(block=False)
            except Empty:
                break
            try:
                response = requests.get(url=url % page, headers=headers)
                response.encoding = 'utf-8'
                queue_html.put((response.text, page))
                print('---------------- Crawl thread %d fetched page %d ----------------' % (self.thread_id, page))
            except Exception:
                print('---------------- Crawl thread %d failed on page %d ----------------' % (self.thread_id, page))
            finally:
                # Always mark the task done so queue_url.join() cannot hang
                # on a page whose request raised an exception.
                self.queue.task_done()


class ParseThread(Thread):
    """Consumer: pulls (html, page) pairs from queue_html, extracts each
    post with XPath, and appends one JSON object per line to fp."""

    def __init__(self, queue, thread_id, fp):
        super().__init__()
        self.queue = queue
        self.thread_id = thread_id
        self.fp = fp

    def run(self):
        print('------------- Parse thread %d started -------------' % self.thread_id)
        self.parse_html()
        print('------------- Parse thread %d finished -------------' % self.thread_id)

    def parse_html(self):
        while True:
            # The main thread flips exitFlag after both queues are drained.
            if exitFlag:
                break
            try:
                html, page = self.queue.get(block=False)
            except Empty:
                continue
            try:
                tree = etree.HTML(html)
                # Each post sits in a div whose id starts with "qiushi_tag_".
                divs = tree.xpath('//div[contains(@id,"qiushi_tag_")]')
                for div in divs:
                    try:
                        content = div.xpath('.//div[@class="content"]/span/text()')[0].strip()
                        # Upvote count
                        zan = div.xpath('.//span[@class="stats-vote"]/i/text()')[0].strip()
                        # Comment count
                        comment = div.xpath('.//span[@class="stats-comments"]//i/text()')[0].strip()
                        # Author
                        author = div.xpath('.//div[@class="author clearfix"]//h2/text()')[0].strip()
                        item = {'author': author, 'zan': zan, 'comment': comment, 'content': content}
                        # json.dump issues many small writes, so hold a lock to
                        # keep records from interleaving across threads, and end
                        # each record with a newline.
                        with write_lock:
                            json.dump(item, self.fp, ensure_ascii=False)
                            self.fp.write('\n')
                    except Exception:
                        print('---------- Parse error on page %d ----------' % page)
                print('------------ Parse thread %d finished page %d -------------' % (self.thread_id, page))
            finally:
                self.queue.task_done()


if __name__ == '__main__':
    # Seed the URL queue with page numbers 1-13.
    for i in range(13):
        queue_url.put(i + 1)

    # Start the crawl (producer) threads.
    for i in range(5):
        t = CrawlThread(queue_url, i)
        t.start()

    fp = open('./糗事百科.txt', mode='a', encoding='utf-8')
    # Start the parse (consumer) threads.
    for i in range(3):
        t = ParseThread(queue_html, i, fp)
        t.start()

    # join() blocks until every get() has been matched by a task_done(),
    # i.e. until all pages have been fetched and then all pages parsed.
    queue_url.join()
    queue_html.join()
    exitFlag = True

    fp.close()
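
Because each record is written on its own line (note the newline after json.dump above), the output file is effectively JSON Lines and can be read back record by record. A minimal sketch, assuming the same file name:

import json

with open('./糗事百科.txt', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        print(item['author'], item['zan'], item['comment'])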
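One rough edge worth noting: the exitFlag polling makes the parse threads spin on an empty queue. A common alternative is to push one sentinel ("poison pill") per consumer once the producers are done, so consumers block on get() and exit cleanly. A minimal sketch of that pattern, not the original code:

from queue import Queue
from threading import Thread

SENTINEL = None  # marker object chosen for this sketch

def consumer(q):
    while True:
        task = q.get()        # blocks instead of busy-waiting
        if task is SENTINEL:  # shutdown signal
            q.task_done()
            break
        # ... handle task here ...
        q.task_done()

q = Queue()
workers = [Thread(target=consumer, args=(q,)) for _ in range(3)]
for w in workers:
    w.start()
for task in range(10):
    q.put(task)
for _ in workers:
    q.put(SENTINEL)  # one sentinel per consumer
q.join()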