爬百度图片

#!/usr/bin/env python
# _*_ coding: utf-8 _*_
# @Time : 2022/9/8 14:31
# @Author : AndyXi
# @Version:V 0.1
# @File : 爬img.py
# @desc :

import json
import os
import time
from datetime import datetime

import requests
from tqdm import tqdm

def get_filename() -> str:
    """Return the current local time formatted as a file-name stem.

    The format is ``YYYY-MM-DD-HH-MM-SS-ffffff`` (microsecond resolution),
    so two calls made at different instants yield different names.

    Returns:
        The formatted timestamp, without any file extension.
    """
    return datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")


# Search endpoint queried for image metadata.
SEARCH_URL = "https://image.baidu.com/search/acjson"

# Directory the images are written to. The spelling is kept exactly as the
# script has always used it so existing downloads stay in the same place.
DOWNLOAD_DIR = "downlod_img"

# Seconds to wait for a server response before giving up on a request.
REQUEST_TIMEOUT = 10


def _search_images(word, page_size, headers):
    """Query the search endpoint and return the list of result entries.

    Args:
        word: Search keyword entered by the user.
        page_size: Number of results requested (sent as ``rn``).
        headers: HTTP headers to send with the request.

    Returns:
        The ``data`` list from the JSON response, or an empty list when the
        response carries no ``data`` entry.

    Raises:
        requests.RequestException: On network failure or a non-2xx status.
        ValueError: If the response body is not valid JSON.
    """
    # Passing the query as ``params`` lets requests percent-encode the
    # keyword, so characters such as ``&`` or spaces cannot corrupt the URL.
    params = {
        "tn": "resultjson_com",
        "logid": "8057700054872665483",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "fr": "",
        "word": word,
        "cg": "star",
        "queryWord": word,
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "-1",
        "z": "",
        "ic": "0",
        "hd": "",
        "latest": "",
        "copyright": "",
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "0",
        "istype": "2",
        "qc": "",
        "nc": "1",
        "expermode": "",
        "nojc": "",
        "isAsync": "",
        # NOTE(review): ``pn`` is presumably the result offset; it is kept at
        # the value the script has always sent -- confirm whether starting
        # at 60 is intended.
        "pn": "60",
        "rn": str(page_size),
        "gsm": "3c",
        "1662621074446": "",
    }
    res_img = requests.get(
        SEARCH_URL, params=params, headers=headers, timeout=REQUEST_TIMEOUT
    )
    res_img.raise_for_status()
    res_dic = json.loads(res_img.text)
    return res_dic.get("data") or []


def _download_image(img_url, dest_dir, headers):
    """Stream one image to ``dest_dir`` while showing a progress bar.

    The body is written chunk by chunk instead of being loaded fully into
    memory, which keeps memory use flat regardless of image size.

    Args:
        img_url: URL of the image to fetch.
        dest_dir: Existing directory to write the file into.
        headers: HTTP headers to send with the request.

    Raises:
        requests.RequestException: On network failure or a non-2xx status.
    """
    # Compute the name once so the progress-bar label matches the file that
    # is actually written.
    img_name = get_filename() + ".jpg"
    with requests.get(
        img_url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT
    ) as img_data:
        img_data.raise_for_status()
        # Size in bytes when the server reports it; otherwise the bar simply
        # counts bytes without a known total.
        size_header = img_data.headers.get("content-length")
        content_size = int(size_header) if size_header is not None else None
        with open(os.path.join(dest_dir, img_name), "wb") as f, tqdm(
            desc=img_name, total=content_size, unit="B", unit_scale=True
        ) as bar:
            for chunk in img_data.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    bar.update(len(chunk))


def main():
    """Prompt for a keyword and a count, then download the matching thumbnails."""
    word = input("请输入要爬取的关键字: ")
    page_size = int(input("请输入要爬取的张数: "))

    # Browser-like User-Agent, presumably so the endpoint treats the script
    # like a regular browser request.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
    }

    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    for item in _search_images(word, page_size, header):
        img_url = item.get("thumbURL", "")
        if not img_url:
            # Entries without a thumbnail URL cannot be fetched.
            continue
        try:
            _download_image(img_url, DOWNLOAD_DIR, header)
        except requests.RequestException as exc:
            # One failed image should not abort the remaining downloads.
            print(f"下载失败，已跳过: {img_url} ({exc})")


if __name__ == "__main__":
    main()
posted @ 2022-09-08 16:51  青空如璃  阅读(18)  评论(0)  编辑  收藏  举报