# 糗事百科爬虫_基于线程池 (QiuShiBaiKe crawler based on a thread pool)
"""Thread-pool crawler for qiushibaike.com hot pages.

Seeds a 4-thread pool (``multiprocessing.dummy.Pool``) with workers; each
worker pulls one page URL off a queue, fetches it, extracts item titles via
XPath, and re-schedules itself through the pool callback until the queue
is drained.
"""
import threading
import time
from multiprocessing.dummy import Pool
from queue import Empty, Queue

import requests
from lxml import etree


class QiuBaiSpider(object):
    # 1. Crawl target and request headers.
    def __init__(self):
        # URL template; "{}" is filled with the page number (1..12).
        self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
        self.data = 0                  # number of item titles extracted
        self.pool = Pool(processes=4)  # 4 worker threads (dummy Pool = threads)
        self.url_queue = Queue()       # pending page URLs
        self.count = 0                 # number of pages parsed
        self.request = 0               # number of pages enqueued
        self.response = 0              # number of pages fully processed
        self.is_finish = False
        # Guards the counters above: they are mutated from 4 pool threads,
        # and "x += 1" is not atomic in CPython.
        self._lock = threading.Lock()

    # 2. Build the list of page URLs to crawl.
    def get_url_list(self):
        """Enqueue hot-page URLs 1..12 and count each as a pending request."""
        for page in range(1, 13):
            self.url_queue.put(self.base_url.format(page))
            self.request += 1  # still single-threaded here; no lock needed

    # 3. Send the HTTP request.
    def send_request(self, url):
        """Fetch one page; returns the requests.Response.

        A timeout keeps a dead server from hanging a worker thread forever.
        """
        print(url)
        return requests.get(url, headers=self.headers, timeout=10)

    # 4. Parse the response.
    def analysis_data(self, data):
        """Extract every item title from one page and store each one.

        :param data: a requests.Response for one hot page.
        """
        html_data = etree.HTML(data.content)
        with self._lock:
            self.count += 1
        for div in html_data.xpath('//*[@id="content-left"]/div'):
            titles = div.xpath('.//h2/text()')
            if not titles:
                # Some cards carry no <h2>; indexing [0] blindly would
                # raise IndexError and kill the whole page.
                continue
            with self._lock:
                self.data += 1
            self.write_file(titles[0])

    # 5. Persist one extracted item (stdout for now).
    def write_file(self, data):
        print(data)

    def _start(self):
        """Worker body: take one URL off the queue, fetch and parse it."""
        try:
            # Non-blocking get: a blocking get() would park pool threads
            # forever once the queue is drained.
            url = self.url_queue.get_nowait()
        except Empty:
            return  # no work left; let the callback chain die out
        try:
            self.analysis_data(self.send_request(url))
        except Exception as exc:
            # apply_async would otherwise swallow this silently AND the
            # page would never be counted, so async_start would spin forever.
            print('request failed:', url, exc)
        finally:
            with self._lock:
                # Count the page even on failure so run() can terminate.
                self.response += 1

    def _callback(self, temp):
        """Chain the next unit of work onto the pool after one completes."""
        if not self.url_queue.empty():
            self.pool.apply_async(self._start, callback=self._callback)

    def async_start(self):
        """Seed 4 workers, then spin until every enqueued page is handled."""
        self.get_url_list()
        for _ in range(4):
            self.pool.apply_async(self._start, callback=self._callback)
        while True:
            time.sleep(0.0001)
            if self.response >= self.request:
                self.is_finish = True
                break

    def run(self):
        """Entry point: time the crawl and report the item count."""
        start = time.time()
        self.async_start()
        end = time.time()
        print(end - start, "结束时间")
        print(self.data)


if __name__ == '__main__':
    qiu_bai = QiuBaiSpider()
    qiu_bai.run()