【爬虫】多线程爬取表情包

'''
利用多线程、队列爬取表情包
URL:https://fabiaoqing.com/biaoqing/lists/page/1.html
'''

import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree


class Producer(threading.Thread):
    """Fetch listing pages and queue the images found on them.

    Each producer repeatedly takes a page URL from ``url_queue``, downloads
    and parses that page, and puts one ``(file_name, image_url)`` tuple on
    ``img_queue`` for every usable ``<img>`` element it finds.
    """

    # Browser-like User-Agent sent with every page request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
    }

    # Seconds to wait on a page request, so a stalled server cannot hang
    # the thread indefinitely.
    REQUEST_TIMEOUT = 10

    # Punctuation stripped from image titles before they become file names.
    # Compiled once here instead of on every image.
    _TITLE_PUNCTUATION = re.compile(r'[\-+*.?。,!?、/()“”">::]+')

    def __init__(self, url_queue, img_queue, *args, **kwargs):
        """Store the two queues; remaining arguments go to ``Thread``.

        Args:
            url_queue: queue of page URLs still to be fetched.
            img_queue: queue receiving ``(file_name, image_url)`` tuples.
        """
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.img_queue = img_queue

    def run(self):
        """Process page URLs until ``url_queue`` is empty."""
        while True:
            # A separate empty() check followed by a blocking get() can hang
            # when another producer takes the last URL in between, so take
            # the item and detect emptiness in a single step.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break

            try:
                self.parse_page(url)
            except requests.RequestException as exc:
                # One unreachable page must not stop this producer from
                # handling the remaining pages.
                print("{} request failed: {}".format(url, exc))

    def parse_page(self, url):
        """Download one listing page and queue every image on it.

        Args:
            url: address of the listing page.

        Raises:
            requests.RequestException: if the page cannot be fetched or the
                server answers with an error status.
        """
        response = requests.get(
            url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        response.encoding = response.apparent_encoding

        html = etree.HTML(response.text)
        if html is None:
            # Nothing parseable came back, so there are no images to queue.
            return

        for img in html.xpath('//div[@class="tagbqppdiv"]//img'):
            new_title = self._build_file_name(
                img.get('title'), img.get('data-original')
            )
            if new_title is None:
                continue
            # Hand the file name and the image URL over to the consumers.
            self.img_queue.put((new_title, img.get('data-original')))

    @classmethod
    def _build_file_name(cls, title, img_url):
        """Return the local file name for an image, or ``None`` to skip it.

        The name is the title with punctuation removed, followed by the
        extension taken from the image URL. ``None`` is returned when the
        title attribute is missing or the image URL is missing or empty.
        """
        if title is None or not img_url:
            return None
        cleaned = cls._TITLE_PUNCTUATION.sub('', title)
        # os.path.splitext splits the URL into (root, extension).
        return cleaned + os.path.splitext(img_url)[1]

class Consumer(threading.Thread):
    """Download queued images into the local image directory.

    Each consumer repeatedly takes a ``(file_name, image_url)`` tuple from
    ``img_queue`` and saves the image under ``IMAGE_DIR``.
    """

    # Directory (with trailing slash) that downloaded images are written to.
    IMAGE_DIR = "./image/"

    # Seconds to wait for a new image before checking whether work is done.
    # A longer wait narrows the window in which a consumer could exit while
    # a producer is still parsing its final page.
    IDLE_TIMEOUT = 10

    def __init__(self, url_queue, img_queue, *args, **kwargs):
        """Store the two queues; remaining arguments go to ``Thread``.

        Args:
            url_queue: queue of page URLs still to be fetched; only used to
                decide when no more images can arrive.
            img_queue: queue of ``(file_name, image_url)`` tuples to download.
        """
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until both queues have been drained."""
        # urlretrieve does not create missing directories.
        os.makedirs(self.IMAGE_DIR, exist_ok=True)

        while True:
            # A blocking get() after a separate empty() check can hang
            # forever once the queue drains, so wait with a timeout.
            try:
                new_title, img_url = self.img_queue.get(
                    timeout=self.IDLE_TIMEOUT
                )
            except Empty:
                if self.url_queue.empty():
                    # No image arrived for a while and no pages are left.
                    break
                continue

            # Download the image.
            try:
                request.urlretrieve(img_url, self.IMAGE_DIR + new_title)
            except (OSError, ValueError) as exc:
                # Network and file errors are OSError subclasses; a malformed
                # URL raises ValueError. Skip this image and keep going.
                print("{} download failed: {}".format(new_title, exc))
                continue
            print(new_title + " 下载完成!")

def main(page_count=100, producer_count=5, consumer_count=5):
    """Crawl the listing pages and download every image found.

    Args:
        page_count: number of listing pages to crawl, starting at page 1.
        producer_count: number of page-parsing threads to start.
        consumer_count: number of image-downloading threads to start.
    """
    # The consumers save into ./image, which urlretrieve will not create.
    os.makedirs("./image", exist_ok=True)

    # Sized to hold every page URL, so filling it below never blocks.
    url_queue = Queue(page_count)
    img_queue = Queue(500)

    url = "https://fabiaoqing.com/biaoqing/lists/page/{}.html"
    for page in range(1, page_count + 1):
        url_queue.put(url.format(page))

    workers = [Producer(url_queue, img_queue) for _ in range(producer_count)]
    workers += [Consumer(url_queue, img_queue) for _ in range(consumer_count)]

    for worker in workers:
        worker.start()

    # Wait for every thread so main() only returns once the crawl is over.
    for worker in workers:
        worker.join()


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

posted @ 2019-02-21 09:53  st--st  阅读(171)  评论(0)  编辑  收藏  举报