【python】王者荣耀全英雄头像爬虫(多线程)
成果展示
目标网址:https://pvp.qq.com/web201605/herolist.shtml
依赖模块
beautifulsoup4:用于解析 HTML 文档并定位网页元素。
pip install beautifulsoup4
lxml:用于网页文档高速解析。
pip install lxml
requests:用于发送和接收网络请求。
pip install requests
完整代码
import concurrent.futures as cf
from bs4 import BeautifulSoup
import os, time, requests
class HeroDownloader(object):
    """Download every hero avatar listed on the hero-list page.

    The list page is fetched once, every avatar ``<img>`` is located with a
    CSS selector, and the images are downloaded concurrently on a thread
    pool while a one-line progress bar is redrawn on stdout.
    """

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block a worker thread forever.
    TIMEOUT = 10

    def __init__(self):
        """Create the output folder (if missing) and remember the list URL."""
        self.root = './heros/'
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.root, exist_ok=True)
        self.site = 'https://pvp.qq.com/web201605/herolist.shtml'

    def down(self, name, url):
        """Download a single avatar.

        Args:
            name: Path of the file the image bytes are written to.
            url: Absolute URL of the image.

        Failures are printed rather than raised: this runs on a worker
        thread and one bad avatar should not abort the whole batch.
        """
        try:
            res = requests.get(url, timeout=self.TIMEOUT)
            # Do not save an HTTP error page as if it were an image.
            res.raise_for_status()
            with open(name, 'wb') as f:
                f.write(res.content)
        except Exception as e:  # thread boundary: report and carry on
            print(e)

    def show(self, num, _sum, runTime):
        """Redraw the progress line in place (terminated by a carriage return).

        Args:
            num: Number of finished downloads.
            _sum: Total number of downloads.
            runTime: Seconds elapsed since the downloads started.

        The remaining time is extrapolated linearly from the elapsed time;
        while nothing has finished yet it is unknown and printed as ``?``.
        """
        barLen = 20  # width of the bar in characters
        # Guard the divisions: an empty job list or a call before the first
        # completion must not raise ZeroDivisionError.
        perFin = num / _sum if _sum else 0.0
        numFin = round(barLen * perFin)
        numNon = barLen - numFin
        if perFin:
            eta = f"{(1 - perFin) * (runTime / perFin):.0f}"
        else:
            eta = "?"
        print(
            f"{num:0>{len(str(_sum))}}/{_sum}",
            f"|{'█' * numFin}{' ' * numNon}|",
            f"PROCESS: {perFin * 100:.0f}%",
            f"RUN: {runTime:.0f}S",
            f"ETA: {eta}S",
            end='\r'
        )

    def main(self):
        """Fetch the hero list and download all avatars concurrently.

        Raises:
            requests.RequestException: If the hero-list page cannot be
                fetched or answers with an HTTP error status.
        """
        from urllib.parse import urljoin

        resp = requests.get(self.site, timeout=self.TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, 'lxml')
        hero = soup.select('div.herolist-content li a img')  # avatar <img> tags
        # Total number of avatars (as of 2020-12-09 the live server listed
        # 93 heroes).
        total = len(hero)
        count = 0
        with cf.ThreadPoolExecutor() as tp:
            futures = []
            # A monotonic clock is immune to wall-clock adjustments.
            t1 = time.monotonic()
            for item in hero:
                alt = item['alt']
                src = item['src']
                name = os.path.join(self.root, alt + '.jpg')  # image file name
                # urljoin resolves protocol-relative ("//host/x.jpg"),
                # absolute and page-relative sources alike.
                url = urljoin(self.site, src)
                futures.append(tp.submit(self.down, name, url))
            for future in cf.as_completed(futures):
                count += 1
                self.show(count, total, time.monotonic() - t1)
        # Move past the progress line, which ended with '\r'.
        print()
# Script entry point: build the downloader, then run the whole download job.
if __name__ == "__main__":
    downloader = HeroDownloader()
    downloader.main()
进度打印
93/93 |████████████████████| PROCESS: 100% RUN: 2S ETA: 0S