Python Multithreading in Practice
The source code for this project is on GitHub.
Creating the Module
Step 1: wrap a module of our own to handle the multi-threaded crawling.
```python
from threading import Thread  # inherit from Thread and override its run method


class Spider(Thread):  # the subclasses will override the thread's run function
    def __init__(self, url, target) -> "initialize variables":
        super().__init__(target=target, daemon=True)  # daemon thread; target is the function to run
        self.target = target
        # # Instantiate the redis client; to use it, install the redis package and uncomment.
        # import redis
        # self.redis = redis.Redis()  # used later for deduplication, to avoid downloading twice
        # Build an IP pool so the IP does not get banned for requesting too fast
        self.file = open("../content/ip.txt")  # a file holding a large number of proxy IPs
        self.ipList = self.file.readlines()
        self.file.close()
        from random import choice
        self.ip = choice(self.ipList).strip()
        # # Instantiate the MongoDB client
        # import pymongo
        # self.mongo = pymongo.MongoClient()
        # self.clo = self.mongo["python"]["default"]  # the collection MongoDB should use
        # Parameters needed by requests
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"  # UA spoofing
        }
        self.url = url
        self.proxy = {
            "http": f"http://{self.ip}"  # proxy IP
        }
        self.cookies = None

    def crawl(self) -> "send the request":
        """Send the request"""
        import requests
        try:  # fall back to a plain request in case the proxy IP does not work
            if self.cookies:
                resp = requests.get(url=self.url, headers=self.headers, proxies=self.proxy, cookies=self.cookies)
            else:
                resp = requests.get(url=self.url, headers=self.headers, proxies=self.proxy)
        except Exception as e:
            print(e)
            if self.cookies:
                resp = requests.get(url=self.url, headers=self.headers, cookies=self.cookies)
            else:
                resp = requests.get(url=self.url, headers=self.headers)
        if resp.status_code == 200:
            resp.encoding = "utf-8"
            return resp
        else:
            print("Requests Error")

    # def spider(self) -> "business logic":
    #     """Business logic"""
    #     pass
    #
    # def save(self, data=None, name=None, mode="file") -> "persistent storage":
    #     """Persistent storage"""
    #     if mode == "mongo":  # save into MongoDB
    #         if isinstance(data, dict):
    #             self.clo.insert_one(data)
    #         else:
    #             print("Store Error")
    #     elif mode == "file":  # append to a text file
    #         with open(f"{self.path}", "a+", encoding="utf-8") as file:
    #             file.write(data)
    #     elif mode == "img" and name:  # save as an image
    #         with open(f"./{name}", "wb") as f:
    #             f.write(data)
    #     else:
    #         raise TypeError("mode can only be 'file', 'img' or 'mongo'")
    #
    # def parse(self) -> "data analysis":
    #     """Data analysis"""
    #     pass
    #
    # def run(self) -> "run the program":
    #     """Run the program"""
    #     from loguru import logger
    #     logger.info("Starting the spider")
    #     self.spider()
```
- Note that this Python module can be reused in later crawlers as well; you can also adapt it to your own needs (see the reuse sketch below).
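For instance, reusing the base class on a different page only takes subclassing `Spider`, implementing `spider()`, and calling `start()`. The sketch below is a minimal, hypothetical example: the URL and XPath are placeholders, and it assumes `spiderModule.py` and the `../content/ip.txt` proxy list that the base class reads both exist.

```python
# Minimal, hypothetical reuse of the Spider base class (URL and XPath are placeholders).
from spiderModule import Spider
from lxml import etree


class TitleSpider(Spider):
    def __init__(self, url):
        Spider.__init__(self, url, target=self.run)

    def spider(self):
        """Business logic: print the page title"""
        resp = self.crawl()              # crawl() comes from the base class
        if resp is not None:             # crawl() returns None on a non-200 response
            html = etree.HTML(resp.text)
            print(html.xpath("//title/text()"))

    def run(self):
        self.spider()


if __name__ == "__main__":
    threads = [TitleSpider("https://www.example.com") for _ in range(3)]
    for t in threads:
        t.start()                        # starts the thread, which calls the overridden run()
    for t in threads:
        t.join()                         # wait, since the base class marks the threads as daemon
```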
Writing the Code
Step 2: write the code that collects the src of every image.
```python
from spiderModule import Spider          # the module we just created
from lxml import etree                   # XPath parsing of the page source
from threading import BoundedSemaphore   # a semaphore to limit the number of threads
import re, os                            # regex matching, URL joining; os for file handling
from loguru import logger                # logging


class SpiderEveryUrl(Spider):  # inherit from Spider and override its methods
    def __init__(self, url):
        """Initialize variables"""
        Spider.__init__(self, url, target=self.run)
        self.urlAdd = "https://www.umei.cc/meinvtupian/"  # used for URL joining

    def spider(self):
        """Business logic"""
        html = etree.HTML(super().crawl().text)
        pageUrlList = html.xpath("/html/body/div[2]/div[8]/ul/li/a/@href")
        for i in pageUrlList:
            pageUrl = f"{self.urlAdd}{i.split('/meinvtupian/')[1]}"
            urListAll.append(pageUrl)

    def run(self):
        """Start the program"""
        self.spider()


class SpiderPicUrl(Spider):
    def __init__(self, url):
        Spider.__init__(self, url, target=self.run)
        self.add = "https://www.umei.cc"

    def spider(self):
        """Business logic"""
        html = etree.HTML(Spider.crawl(self).text)
        nuUrl = html.xpath("/html/body/div[2]/div[12]/a/@href")
        try:
            if nuUrl:
                nuUrl = nuUrl[-1]
                maxIndex = re.search(obj1, nuUrl).group().split("_")[1]
                headersNum = re.search(obj2, nuUrl).group()
                headersAlph = re.search(obj3, nuUrl).group()
                for i in range(1, int(maxIndex) + 1):
                    if i == 1:
                        eveUrl = f"{self.add}{headersAlph}{headersNum.split('_')[0]}.htm"
                    else:
                        eveUrl = f"{self.add}{headersAlph}{headersNum}{str(i)}.htm"
                    preUrl.append(eveUrl)
            else:
                unRun.append(self.url)
        except Exception as e:
            print(e)

    def run(self):
        """Run the program"""
        with pool_sema:
            self.spider()


class SpiderPicSrc(Spider):
    def __init__(self, url):
        """Initialize variables"""
        Spider.__init__(self, url, target=self.run)  # reuse the Spider module

    def spider(self):
        """Business logic"""
        html = etree.HTML(super(SpiderPicSrc, self).crawl().text)  # call the method wrapped in the module
        src = html.xpath("//*[@id='ArticleId{dede:field.reid/}']/p/a/img/@src")  # get the image src
        file = open("../content/PicSrc.txt", "a+")  # write the src into a file
        file.write(f"{src[0]}\n")
        file.close()
        # try:  # if redis is installed, it can be used for deduplication
        #     if src:
        #         if self.redis.sadd("src", src[0]):  # deduplicate with redis
        #             print(f"Saving image src: {src[0]}")
        #             self.file.write(f"{src[0]}\n")
        #         else:
        #             logger.info(f"{src[0]} already saved")
        # except Exception as e:
        #     with open("./log.log", 'a+') as file:
        #         file.write(f"{e}\n{src}")
        #     print(e)

    def run(self):
        """Run the program"""
        with pool_sema:
            self.spider()


"""How to use threads: an example"""
# def Many_Thread(target, *args) -> "shows how threads are used":
#     th = []
#     for i in range(25):  # create 25 threads
#         t = threading.Thread(target=target, args=args)
#         th.append(t)
#         t.setDaemon(True)  # daemon thread, so the process does not finish out of step with its threads
#     for i in th:  # start the 25 threads
#         i.start()
#     for i in th:
#         i.join()  # block until the threads finish


if __name__ == '__main__':
    while True:
        start, end = input("Enter which pages of the site to download, e.g. '1 3' downloads pages 1 to 3 (540 pages at most): ").split()
        try:
            if isinstance(eval(start), int) and isinstance(eval(end), int) and int(start) <= int(end):
                break
            else:
                continue
        except Exception as e:
            print(e)
            print("Please enter the input in the required format!")
    max_connections = 100  # maximum number of threads
    pool_sema = BoundedSemaphore(max_connections)  # or use Semaphore; "with pool_sema" in the workers limits the thread count
    urListAll, threads, preUrl, unRun = [], [], [], []  # lists for the collected urls and srcs
    obj1, obj2, obj3 = re.compile(r"_\d+"), re.compile(r"\d+_"), re.compile(r"\D+")  # compile the regexes once to reduce overhead

    for i in range(int(start), int(end) + 1):
        if i == 1:
            url = "https://www.umei.cc/meinvtupian/"
        else:
            url = f"https://www.umei.cc/meinvtupian/index_{i}.htm"
        logger.info(f"{url}")
        th = SpiderEveryUrl(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    with open("../content/EveryUrl.txt", "w") as f:  # save the collected urls to EveryUrl.txt so they are not lost if something goes wrong
        f.write(str(urListAll))
    print("urListAll collected")
    threads.clear()

    f = open("../content/EveryUrl.txt", "r")  # read the urls back from the file
    urList = eval(f.read())
    f.close()
    for url in urListAll:
        logger.info(url)
        th = SpiderPicUrl(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    with open("../content/PicUrl.txt", "w") as f:  # save the collected page urls to PicUrl.txt, again as a safety copy
        f.write(str(preUrl))
    print(f"preUrl collected\nfailed pages: {unRun}" if unRun else "preUrl collected")  # conditional expression
    threads.clear()

    f = open("../content/PicUrl.txt", "r")  # read the urls back from the file
    urList = eval(f.read())
    f.close()
    for url in preUrl:
        logger.info(f"{url}_src")
        th = SpiderPicSrc(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    print("all over")
```
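The thread cap in the script above comes from each worker's `run()` wrapping its work in `with pool_sema`: only `max_connections` threads can hold the semaphore at the same time, and every other thread blocks at the `with` statement until a slot frees up. Here is a minimal standalone sketch of just that mechanism (the `sleep` stands in for a real request):

```python
# Minimal sketch of how BoundedSemaphore caps concurrent workers (sleep stands in for a request).
import time
from threading import BoundedSemaphore, Thread, active_count

pool_sema = BoundedSemaphore(3)   # at most 3 workers run the critical section at once


def worker(n):
    with pool_sema:               # blocks here until one of the 3 slots is free
        print(f"worker {n} running, ~{active_count() - 1} threads alive")
        time.sleep(0.5)           # placeholder for the real network request


threads = [Thread(target=worker, args=(i,)) for i in range(10)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```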
Downloading the Images
Step 3: request every collected src and download all the images.
```python
from spiderModule import Spider
from loguru import logger
import os, sys
from threading import BoundedSemaphore


class SpiderDown(Spider):
    def __init__(self, url):
        super().__init__(url, target=self.run)

    def spider(self):
        """Business logic"""
        data = Spider.crawl(self).content
        name = self.url.split("/")[-1]  # name the saved image after the last part of its url
        logger.info(f"Downloading {name}")
        with open(f"../img/{name}", "wb") as f:
            f.write(data)
        # if self.redis.sadd("imgName", name):  # redis deduplication
        #     logger.info(f"Downloading {name}")
        #     Spider.save(self, data=data, name=name, mode="img")
        # else:
        #     logger.info(f"{name} already downloaded")

    def run(self):
        """Run the program"""
        with pool_sema:  # limit the number of threads
            self.spider()


if __name__ == '__main__':
    max_connections = 100  # maximum number of threads
    pool_sema = BoundedSemaphore(max_connections)  # or use Semaphore; "with pool_sema" limits the thread count
    if not os.path.exists("../img"):
        os.mkdir("../img")
    threads = []
    with open("../content/PicSrc.txt", "r") as file:
        urls = file.readlines()
    for url in urls:
        th = SpiderDown(url.strip())
        threads.append(th)
        th.setDaemon(False)
    for i in threads:
        i.start()
    os.remove("../content/PicSrc.txt")  # remove the file that stored the srcs
```
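Note that `setDaemon(False)` makes the download threads non-daemon, so the interpreter will not exit before they finish, while `os.remove()` runs as soon as the threads are started; that is only safe because the file has already been read into `urls`. If you would rather delete the file only once every download has completed, a small variant of the tail of the script (my own sketch, not part of the original) could join the threads first:

```python
# Sketch: join the download threads before deleting the src file.
for i in threads:
    i.start()
for i in threads:
    i.join()                            # block until every download thread has finished
os.remove("../content/PicSrc.txt")      # only then remove the file that stored the srcs
```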
Viewing the Images
Step 4: open the img folder; the downloaded images are saved there.
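If you want to check the result without opening a file browser, a quick hedged snippet (it assumes the same `../img` path used by the download script) can count what was saved:

```python
# Quick check of the download results (assumes the ../img path used above).
import os

files = os.listdir("../img")
print(f"{len(files)} images downloaded, e.g. {files[:3]}")
```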
This article is from cnblogs (博客园), author: Kenny_LZK. Please cite the original link when reposting: https://www.cnblogs.com/liuzhongkun/p/15768416.html