【爬虫】多线程爬取表情包
"""Multi-threaded meme-image crawler (producer/consumer pattern).

Producer threads fetch and parse listing pages from ``url_queue`` and put
``(filename, image_url)`` pairs on ``img_queue``; Consumer threads download
the images into ``./image/``. Target site: https://fabiaoqing.com
(the original comment mentioned bbsnet.com, but the code crawls fabiaoqing).
"""
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

# Browser-like UA so the site does not reject the requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
}

# Sentinel object placed on img_queue (once per consumer) to signal shutdown.
_STOP = object()


def _sanitize_title(title, img_url):
    """Strip characters that are awkward/illegal in filenames from *title*
    and append the file extension taken from *img_url*.

    ``title`` may be None (some <img> tags lack the attribute); falls back
    to 'untitled' instead of crashing in re.sub.
    """
    cleaned = re.sub(r'[\-+*.?。,!?、/()“”">::]*', '', title or 'untitled')
    # os.path.splitext splits off the extension (e.g. '.gif') from the URL.
    return cleaned + os.path.splitext(img_url)[1]


class Producer(threading.Thread):
    """Requests and parses listing pages; enqueues download jobs.

    Pulls page URLs from ``url_queue`` until it is exhausted, then exits.
    """

    def __init__(self, url_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            # get_nowait + Empty avoids the check-then-get race of the
            # original empty()/get() pair: with several producers, a thread
            # could pass the empty() check and then block forever on get().
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except Exception as e:
                # One bad page must not kill the whole worker thread.
                print(f"解析失败 {url}: {e}")

    def parse_page(self, url):
        """Download one listing page and enqueue every image found on it."""
        response = requests.get(url, headers=HEADERS)
        # apparent_encoding guesses the real charset so Chinese titles decode.
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        for img in html.xpath('//div[@class="tagbqppdiv"]//img'):
            img_url = img.get('data-original')
            if not img_url:
                continue  # skip placeholder <img> tags without a real URL
            new_title = _sanitize_title(img.get('title'), img_url)
            self.img_queue.put((new_title, img_url))


class Consumer(threading.Thread):
    """Downloads queued images to ./image/ until a stop sentinel arrives."""

    def __init__(self, url_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue  # kept for interface compatibility
        self.img_queue = img_queue

    def run(self):
        while True:
            # Blocking get + sentinel replaces the original racy
            # empty()-check, which could make consumers exit while
            # producers were still parsing the last pages.
            item = self.img_queue.get()
            if item is _STOP:
                break
            new_title, img_url = item
            try:
                request.urlretrieve(img_url, os.path.join("./image", new_title))
                print(new_title + " 下载完成!")
            except Exception as e:
                # A single failed download must not kill the worker.
                print(f"下载失败 {img_url}: {e}")


def main():
    # urlretrieve cannot create directories; the original crashed on the
    # very first download when ./image did not exist.
    os.makedirs("./image", exist_ok=True)

    url_queue = Queue(100)
    img_queue = Queue(500)
    for page in range(1, 101):
        url_queue.put(f"https://fabiaoqing.com/biaoqing/lists/page/{page}.html")

    producers = [Producer(url_queue, img_queue) for _ in range(5)]
    consumers = [Consumer(url_queue, img_queue) for _ in range(5)]
    for t in producers + consumers:
        t.start()

    # Orderly shutdown: once every producer has drained url_queue and
    # finished parsing, hand each consumer a stop sentinel and wait.
    for t in producers:
        t.join()
    for _ in consumers:
        img_queue.put(_STOP)
    for t in consumers:
        t.join()


if __name__ == '__main__':
    main()