Python Multithreading in Practice
The full source code for this project is available on GitHub.
Creating the module
Step 1: wrap a small module of our own that the multithreaded crawlers can build on.
from threading import Thread  # we inherit from Thread and override its run method


class Spider(Thread):  # every crawler below overrides the thread's run method
    def __init__(self, url, target) -> "initialize attributes":
        super().__init__(target=target, daemon=True)  # daemon thread; target is the function the thread runs
        self.target = target
        # # Instantiate a redis client. To use it, install the redis package and uncomment these lines.
        # import redis
        # self.redis = redis.Redis()  # used later for deduplication, so the same picture is not downloaded twice
        # Build an IP pool, so the site does not ban our address for requesting too quickly
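        # A sketch of what ../content/ip.txt is assumed to look like -- the post does not
        # show the file itself, only that each line holds one "host:port" proxy, e.g.:
        #     110.243.28.111:9999
        #     223.241.78.43:8010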
        self.file = open("../content/ip.txt")  # a file holding a large number of proxy IPs
        self.ipList = self.file.readlines()
        self.file.close()
        from random import choice
        self.ip = choice(self.ipList).strip()
        # # Instantiate a MongoDB client
        # import pymongo
        # self.mongo = pymongo.MongoClient()
        # self.clo = self.mongo["python"]["default"]  # the database and collection MongoDB should use
        # Arguments that requests will need
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"  # spoof the User-Agent
        }
        self.url = url
        self.proxy = {
            "http": f"http://{self.ip}"  # proxy IP
        }
        self.cookies = None
    def crawl(self) -> "send the request":
        """Send the request"""
        import requests
        try:  # fall back gracefully in case the proxy IP is not usable
            if self.cookies:
                resp = requests.get(url=self.url, headers=self.headers, proxies=self.proxy, cookies=self.cookies)
            else:
                resp = requests.get(url=self.url, headers=self.headers, proxies=self.proxy)
        except Exception as e:
            print(e)
            if self.cookies:  # retry once without the proxy
                resp = requests.get(url=self.url, headers=self.headers, cookies=self.cookies)
            else:
                resp = requests.get(url=self.url, headers=self.headers)
        if resp.status_code == 200:
            resp.encoding = "utf-8"
            return resp
        else:
            print("Requests Error")
    # def spider(self) -> "business logic":
    #     """Business logic"""
    #     pass
    #
    # def save(self, data=None, name=None, mode="file") -> "persist the data":
    #     """Persist the data"""
    #     if mode == "mongo":  # with "mongo", store the record in MongoDB
    #         if isinstance(data, dict):
    #             self.clo.insert_one(data)
    #         else:
    #             print("Store Error")
    #     elif mode == "file":  # append to a text file
    #         with open(f"{self.path}", "a+", encoding="utf-8") as file:
    #             file.write(data)
    #     elif mode == "img" and name:  # write out an image
    #         with open(f"./{name}", "wb") as f:
    #             f.write(data)
    #     else:
    #         raise TypeError("mode can only be file, img or mongo")
    #
    # def parse(self) -> "parse the data":
    #     """Parse the data"""
    #     pass
    #
    # def run(self) -> "run the crawler":
    #     """Run the crawler"""
    #     from loguru import logger
    #     logger.info("starting the crawler")
    #     self.spider()
- Note that this module can be reused by later crawlers as well; feel free to adapt it to your own taste.
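As a quick illustration of how the module is meant to be subclassed (this demo class is mine, not part of the original project), each crawler passes its own run method as target, implements its business logic in spider, and is started like any other Thread; it assumes spiderModule.py is on the path and ../content/ip.txt exists:

from spiderModule import Spider  # the module defined above


class DemoSpider(Spider):  # hypothetical example class
    def __init__(self, url):
        Spider.__init__(self, url, target=self.run)

    def spider(self):
        resp = self.crawl()  # reuse the request logic from the base class
        if resp:
            print(resp.url, len(resp.text))

    def run(self):
        self.spider()


if __name__ == "__main__":
    demo = DemoSpider("https://www.example.com")
    demo.start()  # start() runs run() in a new thread
    demo.join()   # wait for it, since the base class marks threads as daemon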
Writing the code
Step 2: write the code that collects the src of every picture.
from spiderModule import Spider  # import the module we just created
from lxml import etree  # parse the page source with XPath
from threading import BoundedSemaphore  # a semaphore to cap the number of threads
import re, os  # re for pattern matching and stitching URLs together; os for creating the files we need
from loguru import logger  # logging
class SpiderEveryUrl(Spider):  # inherit from Spider and override run
    def __init__(self, url):
        """Initialize attributes"""
        Spider.__init__(self, url, target=self.run)
        self.urlAdd = "https://www.umei.cc/meinvtupian/"  # prefix used to rebuild full URLs

    def spider(self):
        """Business logic"""
        html = etree.HTML(super().crawl().text)
        pageUrlList = html.xpath("/html/body/div[2]/div[8]/ul/li/a/@href")
        for i in pageUrlList:
            pageUrl = f"{self.urlAdd}{i.split('/meinvtupian/')[1]}"
            urListAll.append(pageUrl)

    def run(self):
        """Start the crawler"""
        self.spider()
class SpiderPicUrl(Spider):
    def __init__(self, url):
        Spider.__init__(self, url, target=self.run)
        self.add = "https://www.umei.cc"

    def spider(self):
        """Business logic"""
        html = etree.HTML(Spider.crawl(self).text)
        nuUrl = html.xpath("/html/body/div[2]/div[12]/a/@href")
        try:
            if nuUrl:
                nuUrl = nuUrl[-1]
                maxIndex, headersNum, headersAlph = re.search(obj1, nuUrl).group().split("_")[1], re.search(obj2, nuUrl).group(), re.search(obj3, nuUrl).group()
                for i in range(1, int(maxIndex) + 1):
                    if i == 1:
                        eveUrl = f"{self.add}{headersAlph}{headersNum.split('_')[0]}.htm"
                    else:
                        eveUrl = f"{self.add}{headersAlph}{headersNum}{str(i)}.htm"
                    preUrl.append(eveUrl)
            else:
                unRun.append(self.url)
        except Exception as e:
            print(e)

    def run(self):
        """Run the crawler"""
        with pool_sema:
            self.spider()
class SpiderPicSrc(Spider):
    def __init__(self, url):
        """Initialize attributes"""
        Spider.__init__(self, url, target=self.run)  # reuse the Spider base class

    def spider(self):
        """Business logic"""
        html = etree.HTML(super(SpiderPicSrc, self).crawl().text)  # call the method wrapped in the module
        src = html.xpath("//*[@id='ArticleId{dede:field.reid/}']/p/a/img/@src")  # extract the picture's src
        if src:  # guard against pages where no picture is found
            file = open("../content/PicSrc.txt", "a+")  # append the src to a file
            file.write(f"{src[0]}\n")
            file.close()
        # try:  # if redis is installed, it can be used for deduplication
        #     if src:
        #         if self.redis.sadd("src", src[0]):  # dedupe via a redis set
        #             print(f"saving picture src: {src[0]}")
        #             self.file.write(f"{src[0]}\n")
        #         else:
        #             logger.info(f"{src[0]} already saved")
        # except Exception as e:
        #     with open("./log.log", 'a+') as file:
        #         file.write(f"{e}\n{src}")
        #     print(e)

    def run(self):
        """Run the crawler"""
        with pool_sema:
            self.spider()
"""线程的使用方法——实例"""
# def Many_Thread(target, *args) -> "示范""其为使用的方式":
# th = []
# for i in range(25): # 开25个线程
# t = threading.Thread(target=target, args=args)
# th.append(t)
# t.setDaemon(True) # 添加守护线程,即防止进程进度与线程进度不一样
# for i in th: # 循环启动25个线程
# i.start()
# for i in th:
# i.join() # 阻塞线程
if __name__ == '__main__':
    while True:
        start, end = input("Which pages of the site do you want to download? e.g. '1 3' downloads pages 1 to 3 (540 pages at most): ").split()
        try:
            if isinstance(eval(start), int) and isinstance(eval(end), int) and int(start) <= int(end):
                break
            else:
                continue
        except Exception as e:
            print(e)
            print("Please follow the input format!")
    max_connections = 100  # maximum number of threads
    pool_sema = BoundedSemaphore(max_connections)  # or use Semaphore; entering "with pool_sema" limits how many threads run at once
    urListAll, threads, preUrl, unRun = [], [], [], []  # lists holding the collected urls and srcs
    obj1, obj2, obj3 = re.compile(r"_\d+"), re.compile(r"\d+_"), re.compile(r"\D+")  # compile the regular expressions once up front instead of in every thread
    for i in range(int(start), int(end) + 1):
        if i == 1:
            url = "https://www.umei.cc/meinvtupian/"
        else:
            url = f"https://www.umei.cc/meinvtupian/index_{i}.htm"
        logger.info(f"{url}")
        th = SpiderEveryUrl(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    with open("../content/EveryUrl.txt", "w") as f:  # save the collected urls to EveryUrl.txt, so nothing is lost if something goes wrong later
        f.write(str(urListAll))
    print("urListAll extraction finished")
    threads.clear()
    f = open("../content/EveryUrl.txt", "r")  # read the urls back from the file
    urList = eval(f.read())
    f.close()
    for url in urList:  # iterate over the urls read back from EveryUrl.txt
        logger.info(url)
        th = SpiderPicUrl(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    with open("../content/PicUrl.txt", "w") as f:  # save the collected urls to PicUrl.txt, so nothing is lost if something goes wrong later
        f.write(str(preUrl))
    print(f"preUrl extraction finished\nfailed pages: {unRun}" if unRun else "preUrl extraction finished")  # conditional expression
    threads.clear()
    f = open("../content/PicUrl.txt", "r")  # read the urls back from the file
    urList = eval(f.read())
    f.close()
    for url in urList:
        logger.info(f"{url}_src")
        th = SpiderPicSrc(url)
        threads.append(th)
    for i in threads:
        i.start()
    for i in threads:
        i.join()
    print("all over")
Downloading the images
Step 3: request every collected src and download all of the pictures.
from spiderModule import Spider
from loguru import logger
import os, sys
from threading import BoundedSemaphore


class SpiderDown(Spider):
    def __init__(self, url):
        super().__init__(url, target=self.run)

    def spider(self):
        """Business logic"""
        data = Spider.crawl(self).content
        name = self.url.split("/")[-1]  # name the saved picture after the last part of its url
        logger.info(f"downloading {name}")
        with open(f"../img/{name}", "wb") as f:
            f.write(data)
        # if self.redis.sadd("imgName", name):  # redis deduplication
        #     logger.info(f"downloading {name}")
        #     Spider.save(self, data=data, name=name, mode="img")
        # else:
        #     logger.info(f"{name} already downloaded")

    def run(self):
        """Run the crawler"""
        with pool_sema:  # limit the number of concurrent threads
            self.spider()
if __name__ == '__main__':
    max_connections = 100  # maximum number of threads
    pool_sema = BoundedSemaphore(max_connections)  # or use Semaphore; "with pool_sema" limits how many threads run at once
    if not os.path.exists("../img"):
        os.mkdir("../img")
    threads = []
    with open("../content/PicSrc.txt", "r") as file:
        urls = file.readlines()
    for url in urls:
        th = SpiderDown(url.strip())
        threads.append(th)
        th.daemon = False  # non-daemon, so the interpreter waits for every download to finish
    for i in threads:
        i.start()
    os.remove("../content/PicSrc.txt")  # remove the file that stored the srcs
Viewing the images
Step 4: open the img folder; all of the downloaded pictures are saved inside it.
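If you would rather check from the console than the file manager, a tiny snippet like this (assuming the same ../img directory as above) shows how many pictures actually arrived:

import os

files = os.listdir("../img")
print(f"{len(files)} pictures downloaded")
print(files[:5])  # peek at the first few file names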
This post originally appeared on 博客园 (cnblogs). Author: Kenny_LZK. Please credit the original link when reposting: https://www.cnblogs.com/liuzhongkun/p/15768416.html