Python Multithreading in Practice


The project's source code is on GitHub.

Creating the Module

Step 1: wrap up a module of our own to use for multithreaded crawling.

from threading import Thread  # subclass Thread and override its run method


class Spider(Thread):  # subclasses also override the thread's run method

    def __init__(self, url, target) -> "initialize variables":
        super().__init__(target=target, daemon=True)  # daemon thread; target is the function the thread runs
        self.target = target

        # # Instantiate the redis client; to use it, install the package and uncomment
        # import redis
        # self.redis = redis.Redis()  # used later for deduplication, to avoid downloading the same file twice

        # Build an IP pool so the IP does not get banned for requesting too quickly
        self.file = open("../content/ip.txt")  # a file holding a large number of proxy IPs
        self.ipList = self.file.readlines()
        self.file.close()
        from random import choice
        self.ip = choice(self.ipList).strip()

        # # Instantiate the MongoDB client
        # import pymongo
        # self.mongo = pymongo.MongoClient()
        # self.clo = self.mongo["python"]["default"]  # also specify the database and collection to use

        # Parameters passed to requests
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"  # spoofed UA
        }
        self.url = url
        self.proxy = {
            "http": f"http://{self.ip}"  # proxy IP
        }
        self.cookies = None

    def crawl(self) -> "send the request":
        """Send the request"""
        import requests
        try:  # in case someone has not set up a proxy IP
            if self.cookies:
                resp = requests.get(url=self.url, headers=self.headers, proxies=self.proxy, cookies=self.cookies)
            else:
                resp = requests.get(url=self.url, headers=self.headers, proxies=self.proxy)
        except Exception as e:
            print(e)
            if self.cookies:
                resp = requests.get(url=self.url, headers=self.headers, cookies=self.cookies)
            else:
                resp = requests.get(url=self.url, headers=self.headers)
        if resp.status_code == 200:
            resp.encoding = "utf-8"
            return resp
        else:
            print("Requests Error")

    # def spider(self) -> "business logic":
    #     """Business logic"""
    #     pass
    #
    # def save(self, data=None, name=None, mode="file") -> "persistent storage":
    #     """Persistent storage"""
    #     if mode == "mongo":  # with mode "mongo", save into the MongoDB collection
    #         if isinstance(data, dict):
    #             self.clo.insert_one(data)
    #         else:
    #             print("Store Error")
    #     elif mode == "file":  # save into a text file
    #         with open(f"{self.path}", "a+", encoding="utf-8") as file:
    #             file.write(data)
    #     elif mode == "img" and name:  # save as an image
    #         with open(f"./{name}", "wb") as f:
    #             f.write(data)
    #     else:
    #         raise TypeError("mode can only be 'file', 'img' or 'mongo'")
    #
    # def parse(self) -> "data parsing":
    #     """Data parsing"""
    #     pass
    #
    # def run(self) -> "run the program":
    #     """Run the program"""
    #     from loguru import logger
    #     logger.info("Starting the crawler")
    #     self.spider()
Note that the module written above can also be reused in later crawlers, and you can adapt it to your own preferences; a minimal usage sketch follows.
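
As a quick illustration, a new crawler only needs to subclass Spider, implement its own spider logic, and hand its run method to the thread as the target. The snippet below is a minimal sketch, not part of the original project: DemoSpider and the example.com URL are placeholders, and it assumes spiderModule.py and ../content/ip.txt from above already exist.

from spiderModule import Spider  # the module created above


class DemoSpider(Spider):  # placeholder subclass, just for illustration

    def __init__(self, url):
        Spider.__init__(self, url, target=self.run)

    def spider(self):
        resp = self.crawl()  # send the request via the base class
        if resp:
            print(resp.url, len(resp.text))  # put parsing / storage here

    def run(self):
        self.spider()


if __name__ == '__main__':
    th = DemoSpider("https://example.com")  # placeholder URL
    th.start()
    th.join()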

Writing the Code

Step 2: write the code that collects the src of every image.

from spiderModule import Spider  # import the module we just created
from lxml import etree  # parse the page source with XPath
from threading import BoundedSemaphore  # a bounded semaphore to limit the number of concurrent threads
import re, os  # re for regex matching and joining URLs; os for creating the files we need
from loguru import logger  # logging


class SpiderEveryUrl(Spider):  # inherit from Spider and override its methods

    def __init__(self, url):
        """Initialize variables"""
        Spider.__init__(self, url, target=self.run)
        self.urlAdd = "https://www.umei.cc/meinvtupian/"  # base used to join the page URLs

    def spider(self):
        """Business logic"""
        html = etree.HTML(super().crawl().text)
        pageUrlList = html.xpath("/html/body/div[2]/div[8]/ul/li/a/@href")
        for i in pageUrlList:
            pageUrl = f"{self.urlAdd}{i.split('/meinvtupian/')[1]}"
            urListAll.append(pageUrl)

    def run(self):
        """启动程序"""
        self.spider()


class SpiderPicUrl(Spider):

    def __init__(self, url):
        Spider.__init__(self, url, target=self.run)
        self.add = "https://www.umei.cc"

    def spider(self):
        """处理业务"""
        html = etree.HTML(Spider.crawl(self).text)
        nuUrl = html.xpath("/html/body/div[2]/div[12]/a/@href")  
        try:
            if nuUrl:
                nuUrl = nuUrl[-1]
                maxIndex, headersNum, headersAlph = re.search(obj1, nuUrl).group().split("_")[1], re.search(obj2, nuUrl).group(), re.search(obj3, nuUrl).group()
                for i in range(1, int(maxIndex) + 1):
                    if i == 1:
                        eveUrl = f"{self.add}{headersAlph}{headersNum.split('_')[0]}.htm"
                    else:
                        eveUrl = f"{self.add}{headersAlph}{headersNum}{str(i)}.htm"
                    preUrl.append(eveUrl)
            else:
                unRun.append(self.url)
        except Exception as e:
            print(e)

    def run(self):
        """运行程序"""
        with pool_sema:
            self.spider()


class SpiderPicSrc(Spider):

    def __init__(self, url):
        """初始化变量"""
        Spider.__init__(self, url, target=self.run)  # 继承Spider模块

    def spider(self):
        """处理业务"""
        html = etree.HTML(super(SpiderPicSrc, self).crawl().text)  # 调用模块中封装的方法
        src = html.xpath("//*[@id='ArticleId{dede:field.reid/}']/p/a/img/@src")  # 得到图片的src
        file = open("../content/PicSrc.txt", "a+")  # 将src写入文件
        file.write(f"{src[0]}\n")
        file.close()
        # try:  # if redis is installed, it can be used for deduplication
        #     if src:
        #         if self.redis.sadd("src", src[0]):  # deduplicate with redis
        #             print(f"Saving image src: {src[0]}")
        #             self.file.write(f"{src[0]}\n")
        #         else:
        #             logger.info(f'{src[0]} already saved')
        # except Exception as e:
        #     with open("./log.log", 'a+') as file:
        #         file.write(f"{e}\n{src}")
        #     print(e)

    def run(self):
        """运行程序"""
        with pool_sema:
            self.spider()


"""线程的使用方法——实例"""
# def Many_Thread(target, *args) -> "示范""其为使用的方式":
#     th = []
#     for i in range(25):  # 开25个线程
#         t = threading.Thread(target=target, args=args)
#         th.append(t)
#         t.setDaemon(True)  # 添加守护线程,即防止进程进度与线程进度不一样
#     for i in th:  # 循环启动25个线程
#         i.start()
#     for i in th:
#         i.join()  # 阻塞线程


if __name__ == '__main__':
    while True:
        start, end = input("Enter which pages of the site to download, e.g. '1 3' downloads pages 1 to 3 (540 pages at most): ").split()
        try:
            if isinstance(eval(start), int) and isinstance(eval(end), int) and int(start) <= int(end):
                break
            else:
                continue
        except Exception as e:
            print(e)
            print("请按要求输入!!!")
    max_connections = 100  # maximum number of threads
    pool_sema = BoundedSemaphore(max_connections)  # or use Semaphore; wrapping work in "with pool_sema" limits how many threads run at once
    urListAll, threads, preUrl, unRun = [], [], [], []  # lists for the collected URLs and srcs
    obj1, obj2, obj3 = re.compile(r"_\d+"), re.compile(r"\d+_"), re.compile(r"\D+")  # compile the regexes once here to avoid rebuilding them in every thread
    for i in range(int(start), int(end)+1):
        if i == 1:
            url = "https://www.umei.cc/meinvtupian/"
        else:
            url = f"https://www.umei.cc/meinvtupian/index_{i}.htm"
        logger.info(f"{url}")
        th = SpiderEveryUrl(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    with open("../content/EveryUrl.txt", "w") as f:  # 将提取到的url保存到EveryUrl中,防止因为意外原因,使得数据丢失
        f.write(str(urListAll))
    print(f"urListAll提取完成")
    threads.clear()

    f = open("../content/EveryUrl.txt", "r")  # 提取文件中的url
    urList = eval(f.read())
    f.close()
    for url in urListAll: 
        logger.info(url)
        th = SpiderPicUrl(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    with open("../content/PicUrl.txt", "w") as f:  # 将提取到的url保存到EveryUrl中,防止因为意外原因,使得数据丢失
        f.write(str(preUrl))
    print(f"preUrl提取完成\n错误的有:{unRun}" if not unRun else "preUrl提取完成")  # 三目运算
    threads.clear()

    f = open("../content/PicUrl.txt", "r")  # 提取文件中的url
    urList = eval(f.read())
    f.close()
    for url in preUrl:
        logger.info(f"{url}_src")
        th = SpiderPicSrc(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()

    print("all over")

Downloading the Images

Step 3: request each collected src and download all of the images.

from spiderModule import Spider
from loguru import logger
import os, sys
from threading import BoundedSemaphore


class SpiderDown(Spider):

    def __init__(self, url):
        super().__init__(url, target=self.run)

    def spider(self):
        """处理业务"""
        data = Spider.crawl(self).content
        name = self.url.split("/")[-1]  # 给保存的图片命名
        logger.info(f"正在下载{name}")
        with open(f"../img/{name}", "wb") as f:
            f.write(data)
        # if self.redis.sadd("imgName", name):  # redis去重
        #     logger.info(f"正在下载{name}")
        #     Spider.save(self, data=data, name=name, mode="img")
        # else:
        #     logger.info(f"{name}已经下载")

    def run(self):
        """运行程序"""
        with pool_sema:  # limits the number of concurrent threads
            self.spider()


if __name__ == '__main__':
    max_connections = 100  # maximum number of threads
    pool_sema = BoundedSemaphore(max_connections)  # or use Semaphore; wrapping work in "with pool_sema" limits how many threads run at once
    if not os.path.exists("../img"):
        os.mkdir("../img")
    threads = []
    with open("../content/PicSrc.txt", "r") as file:
        urls = file.readlines()
    for url in urls:
        th = SpiderDown(url.strip())
        threads.append(th)
        th.daemon = False  # non-daemon, so the process waits for every download thread to finish
    for i in threads:
        i.start()
    
    os.remove("../content/PicSrc.txt")  # remove the file that stored the srcs

Viewing the Images

Step 4: open the img folder; the downloaded images are saved there.
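
If you prefer to check from Python rather than the file manager, a small sketch like the following (assuming the ../img folder created in step 3) shows how many images were saved:

import os

files = os.listdir("../img")  # folder created in step 3
print(f"{len(files)} images saved")
print(files[:5])  # peek at the first few file names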
