# 糗事百科爬虫_基于线程池 (Qiushibaike spider, thread-pool based)

import threading
import time
from multiprocessing.dummy import Pool
from queue import Empty, Queue

import requests
from lxml import etree


class QiuBaiSpider(object):
    """Crawl the "hot" listing pages of qiushibaike.com with a thread pool.

    URLs are placed on a queue; a fixed number of pool tasks each take one
    URL, download it and parse it, and every finished task schedules one
    follow-up task from its completion callback until the queue is empty.

    Counters (all plain ints):
        request  -- number of URLs queued so far
        response -- number of URLs fully handled (successfully or not)
        count    -- number of pages handed to the parser
        data     -- number of entries extracted and passed to write_file
    """

    def __init__(self, page_count=12, workers=4, timeout=10):
        """Set up the target URL template, headers, pool and counters.

        Args:
            page_count: how many listing pages to queue (pages 1..page_count).
            workers: size of the thread pool and number of initial tasks.
            timeout: seconds passed to ``requests.get`` so that a stalled
                connection cannot block a worker forever.
        """
        # 1. Target site and request headers.
        self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
        self.page_count = page_count
        self.workers = workers
        self.timeout = timeout
        self.data = 0
        self.pool = Pool(processes=workers)
        self.url_queue = Queue()
        self.count = 0
        self.request = 0
        self.response = 0
        self.is_finish = False
        # The counters are updated from several pool threads; ``+=`` is a
        # read-modify-write, so it is guarded by a lock.
        self._lock = threading.Lock()
        # Set once every queued URL has been handled.
        self._done = threading.Event()

    # 2. Build the list of URLs to crawl.
    def get_url_list(self):
        """Queue one URL per listing page and count each in ``request``."""
        for page in range(1, self.page_count + 1):
            self.url_queue.put(self.base_url.format(page))
            self.request += 1

    # 3. Send the request.
    def send_request(self, url):
        """Download ``url`` and return the ``requests`` response object."""
        print(url)
        return requests.get(url, headers=self.headers, timeout=self.timeout)

    # 4. Parse the data.
    def analysis_data(self, data):
        """Parse one downloaded page and forward each entry to write_file.

        Args:
            data: response object whose ``content`` attribute holds the
                raw HTML of a listing page.
        """
        with self._lock:
            self.count += 1
        html_data = etree.HTML(data.content)
        if html_data is None:
            # lxml returns None for an empty document; nothing to extract.
            return
        div_list = html_data.xpath("""//*[@id="content-left"]/div""")
        parsed = 0
        for div in div_list:
            titles = div.xpath('.//h2/text()')
            if not titles:
                # Entry without an <h2> text node: skip it instead of
                # failing the whole page with an IndexError.
                continue
            parsed += 1
            self.write_file(titles[0])
        with self._lock:
            self.data += parsed

    # 5. Store the result.
    def write_file(self, data):
        """Emit one extracted entry (currently just printed)."""
        print(data)

    def _start(self):
        """Pool task: handle one queued URL, if any is left.

        Never blocks on an empty queue and never lets an exception escape,
        so ``response`` is incremented for every URL taken from the queue
        and the completion callback always fires.
        """
        try:
            url = self.url_queue.get_nowait()
        except Empty:
            # Another task already took the last URL.
            return
        try:
            page = self.send_request(url)
            self.analysis_data(page)
        except Exception as exc:
            # Task boundary: report the failure and keep crawling; one bad
            # page must not stall the whole run.
            print('failed to crawl {}: {!r}'.format(url, exc))
        finally:
            with self._lock:
                self.response += 1
                finished = self.response >= self.request
            if finished:
                self._done.set()

    def _callback(self, temp):
        """Completion callback: schedule a follow-up task while URLs remain.

        Args:
            temp: return value of the finished task (unused).
        """
        if not self.url_queue.empty():
            self.pool.apply_async(self._start, callback=self._callback)

    def async_start(self):
        """Queue all URLs, start the initial tasks and wait until done."""
        self._done.clear()
        self.get_url_list()
        with self._lock:
            if self.response >= self.request:
                # Nothing to do (e.g. page_count == 0).
                self._done.set()
        for _ in range(self.workers):
            self.pool.apply_async(self._start, callback=self._callback)
        # Block on the event instead of polling the counters in a sleep loop.
        self._done.wait()
        self.is_finish = True

    def run(self):
        """Run a full crawl, then print the elapsed time and entry count."""
        start = time.time()
        self.async_start()
        end = time.time()
        print(end - start, "结束时间")
        print(self.data)


if __name__ == '__main__':
    # Script entry point: build the spider and run one full, timed crawl.
    spider = QiuBaiSpider()
    spider.run()

 

# posted @ 2019-08-01 18:38  Mr_Smith  阅读(138)  评论(0)  编辑  收藏  举报