# 多线程爬取

# 导入一个请求的模块
import json
import os
import threading
import time
from concurrent.futures.thread import ThreadPoolExecutor
from urllib.parse import urlencode

import requests
# Sequence number used as the file name of the next image to be saved.
# Shared by every worker thread, so it is only touched while holding _num_lock.
num = 1
# Serialises "pick a file name, write the file, advance the counter" so two
# threads can never write to the same ./img/<num>.png.
_num_lock = threading.Lock()
# Upper bound (seconds) for any single HTTP request, so a stalled connection
# cannot block a worker thread forever.
_REQUEST_TIMEOUT = 10
# Directory the images are written to (same relative path the file names use).
_IMG_DIR = './img'


def spider_girl(n):
    """Download the thumbnails of one Baidu image-search result page.

    Requests the ``acjson`` search endpoint with ``pn=n``, then fetches the
    ``thumbURL`` of every entry in the response's ``data`` list and stores
    each one as ``./img/<num>.png``, where ``num`` is the module-level
    counter shared by all threads. After each saved image the function
    sleeps for two seconds.

    Entries without a ``thumbURL``, failed image downloads and failed file
    writes are reported on stdout and skipped; they do not abort the page.

    Args:
        n: Value sent as the ``pn`` query parameter (presumably the offset of
            the first result; the caller passes multiples of 30 and ``rn``
            is "30" -- confirm against the API). Must be an int because it is
            also passed to ``hex``.

    Raises:
        requests.RequestException: If the search request itself fails or
            returns an HTTP error status.
        ValueError: If the search response body is not valid JSON.
        KeyError: If the decoded response has no ``data`` key.
    """
    global num

    # Search endpoint; the query string is appended below.
    url = 'https://image.baidu.com/search/acjson?'
    # Headers that make the request look like it comes from a regular browser.
    headers = {
        # Browser / operating-system identification
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        # Page the request claims to have been issued from
        'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D0%A1%BD%E3%BD%E3&fr=ala&ala=1&alatpl=normal&pos=0&dyTabStr=MCwzLDIsMSw2LDQsNSw3LDgsOQ%3D%3D',
        # Target host name
        'Host': 'image.baidu.com',
        # Session cookie copied from a browser session
        # NOTE(review): hard-coded credentials-like data; presumably expires -- confirm.
        'Cookie': 'BDqhfp=%E5%B0%8F%E5%A7%90%E5%A7%90%26%26NaN-1undefined%26%26612%26%262; BIDUPSID=EAFCD464609FC69883B57D4510934815; PSTM=1657891990; BAIDUID=EAFCD464609FC6988BE34F438F7E731E:FG=1; indexPageSugList=%5B%22%E6%98%9F%E7%A9%BA%E5%9B%BE%E7%89%87%22%2C%22%E6%98%9F%E7%A9%BA%22%5D; BAIDUID_BFESS=EAFCD464609FC6988BE34F438F7E731E:FG=1; ZFY=:AwYy:AhVoS6yTQoOv5ELUaLbbsRdPWLaUq6:BfFUsovNI:C; BDRCVFR[0-iYRofrloc]=-48_i3v-l4_uhN8uvFLQhP8; H_PS_PSSID=26350; BA_HECTOR=ak012184000000242l2jq0bv1hg4ekp16; delPer=0; PSINO=6; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; userFrom=ala; ab_sr=1.0.1_YmYwYWQ1OTZmOGM4OThkZDQxZjg2MWNhZDZjZjc1MGQwMDgyOTQ2NzUxMDA1MDI4NDFiNGMyMDYwZTI5NTA4ODljNmY3Njc0YjE2YWVjY2ZmOWRkNzE2MzMzOGFjNmExYWYxYTU2OTY3MzNhZDA3YmQyOTc2ZmZhMjRhNmVjOGY1NDY5ZGYyOWUxYWZkZGE0NmU4ZGUyODY3ZmQ2ODQyNA==; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm'
    }
    # Query-string parameters of the search request.
    params = {
        "tn": "resultjson_com",
        "logid": "11459031553873409564",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "fr": "ala",
        "word": "小姐姐",
        "queryWord": "小姐姐",
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "",
        "z": "",
        "ic": "",
        "hd": "",
        "latest": "",
        "copyright": "",
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "",
        "istype": "",
        "qc": "",
        "nc": "",
        "expermode": "",
        "nojc": "",
        "isAsync": "",
        "pn": n,
        "rn": "30",
        # Same value as pn, written in hexadecimal without the "0x" prefix.
        "gsm": hex(n)[2:],
        # NOTE(review): looks like a millisecond timestamp captured from a
        # browser request and sent as an empty-valued key -- confirm it is needed.
        "1661090568310": "",
    }
    # Build the final request URL.
    url = url + urlencode(params)
    # Send the search request; fail loudly on HTTP errors instead of trying
    # to parse an error page as JSON.
    res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    res.raise_for_status()

    # Decode the JSON body into Python objects.
    data_list = json.loads(res.text)

    # Without the target directory every open() below would fail.
    os.makedirs(_IMG_DIR, exist_ok=True)

    for data in data_list['data']:
        # Some entries carry no thumbnail link; there is nothing to download for them.
        img_url = data.get('thumbURL') if isinstance(data, dict) else None
        if not img_url:
            continue

        # Fetch the image bytes; a single bad image must not stop the page.
        try:
            resp = requests.get(img_url, timeout=_REQUEST_TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as exc:
            print('图片下载失败: {} ({!r})'.format(img_url, exc))
            continue

        # Store the image. The lock is held for the whole name-write-increment
        # sequence so the counter only advances after a successful write and
        # no two threads ever use the same number.
        try:
            with _num_lock:
                with open('./img/{}.png'.format(num), 'wb') as f1:
                    print('正在写入第{}张图片'.format(num))
                    # Write the raw image bytes.
                    f1.write(resp.content)
                    print('第{}张图片写入成功!'.format(num))
                num += 1
        except OSError as exc:
            print('图片写入失败: {} ({!r})'.format(img_url, exc))
            continue

        # Pause between downloads (outside the lock so other threads keep working).
        time.sleep(2)


if __name__ == '__main__':
    # Run the page downloads on a pool of 30 worker threads. Using the
    # executor as a context manager guarantees shutdown (and waits for every
    # submitted task) even if submitting a task raises.
    with ThreadPoolExecutor(30) as pool:
        # One task per pn value 30, 60, ..., 300; keep the futures so that
        # failures inside the workers can be reported afterwards.
        futures = {pool.submit(spider_girl, i * 30): i * 30 for i in range(1, 11)}

    # All tasks have finished here. An exception raised inside a worker is
    # stored on its future and would otherwise be lost silently.
    for future, offset in futures.items():
        error = future.exception()
        if error is not None:
            print('pn={} 的任务执行失败: {!r}'.format(offset, error))

    print('执行结束')
# posted @ 2022-08-21 23:04  冬天不下雨  阅读(39)  评论(0)  编辑  收藏  举报