程序简介
百度图片爬虫的封装接口,2018年实现,现在还能用……不错,谢谢百度的不封之恩。现将其贡献给所有热爱技术的开发者。
输入:关键词、下载数量、重定尺寸(可省)
输出:自动创建以关键词命名的文件夹,并下载对应数量的百度图片,图片以其md5值命名
程序/数据集下载
代码分析
导入模块
import numpy as np
import hashlib
import requests
import json
import cv2
import os
evalMd5函数用来计算图片md5,好进行命名和过滤相同图片
def evalMd5(sentence, charset='utf8'):
    '''
    Compute the MD5 hex digest of a piece of text or binary data.

    :param sentence: text (``str``) or binary data (``bytes``, ``bytearray``,
        ``memoryview``) to hash
    :param charset: encoding used to convert text to bytes; ignored when
        ``sentence`` is already binary
    :return: the MD5 digest as a 32-character hexadecimal string
    '''
    # Binary input is hashed as-is; everything else is encoded first.
    # isinstance (rather than an exact type comparison) lets bytearray,
    # memoryview and bytes subclasses through without calling .encode on them.
    if not isinstance(sentence, (bytes, bytearray, memoryview)):
        sentence = sentence.encode(charset)
    return hashlib.md5(sentence).hexdigest()
resizeImg函数用来重定图片尺寸
def resizeImg(oldPath, size, newPath):
    '''
    Resize an image file and save the result.

    The file is read with ``np.fromfile`` and written with ``ndarray.tofile``;
    OpenCV is only used for the in-memory decode, resize and encode steps.
    If the image cannot be decoded, resized or encoded, the source file
    ``oldPath`` is deleted.

    :param oldPath: path of the image to read
    :param size: target size, passed unchanged to ``cv2.resize``
    :param newPath: path to write the resized image to; its extension selects
        the output encoding
    :return: None
    '''
    oldPath = oldPath.replace('\\', '/')
    newPath = newPath.replace('\\', '/')
    # Reading stays outside the try block so that a missing/unreadable source
    # file is still reported to the caller instead of being "cleaned up".
    raw = np.fromfile(oldPath, dtype=np.uint8)
    try:
        oldImg = cv2.imdecode(raw, -1)
        if oldImg is None:
            raise ValueError('cannot decode image: %s' % oldPath)
        newImg = cv2.resize(oldImg, size)
        # Encode in memory and write the buffer ourselves. A single write is
        # enough; the extra cv2.imwrite call that used to precede this wrote
        # the same image to the same path.
        ok, encoded = cv2.imencode(os.path.splitext(newPath)[1], newImg)
        if not ok:
            raise ValueError('cannot encode image: %s' % newPath)
        encoded.tofile(newPath)
    except Exception:
        # Not a usable image (bad format, unsupported extension, ...):
        # drop the source file. Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        os.remove(oldPath)
核心函数download会调用上面的函数进行批量图片下载
def _imgSuffix(imgUrl, default='jpg'):
    '''Return a file extension (without the dot) that is safe to use in a file name.'''
    # Ignore the query string / fragment and look only at the last path segment,
    # so characters such as '/', '?' or '&' can never end up in the file name.
    path = imgUrl.split('?', 1)[0].split('#', 1)[0]
    ext = os.path.splitext(path.rsplit('/', 1)[-1])[1][1:]
    if ext.isalnum() and len(ext) <= 5:
        return ext
    return default


def download(keyWord, imgNumber, imgSize=None, timeout=10):
    '''
    Download Baidu image-search results into a folder named after the keyword.

    Each image is saved as ``<md5 of content>.<extension>``, so identical
    images overwrite each other instead of piling up. The function stops when
    the folder holds ``imgNumber`` files, or after 20 failed requests,
    20 results without a link, or 20 downloads that added no new file.

    :param keyWord: search keyword; also used as the output folder name
    :param imgNumber: number of files the folder should contain when done
    :param imgSize: optional size passed to ``resizeImg``; falsy means no resize
    :param timeout: per-request timeout in seconds
    :return: None
    '''
    # Create the keyword folder
    dirname = keyWord
    os.makedirs(dirname, exist_ok=True)
    # Start crawling
    url = 'https://image.baidu.com/search/acjson'  # search endpoint
    same = 0  # downloads that did not add a new file
    error = 0  # failed requests / unparsable responses
    passNum = 0  # results without an image link
    # NOTE(review): 'pn' starts at 30, so if it is the result offset the first
    # 30 results are never requested - confirm whether that is intended.
    for i in range(30, 30 * 10000 + 30, 30):
        param = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyWord,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyWord,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': i,
            'rn': 30,
            'gsm': '1e',
            '1488942260214': '',
        }
        # Fetch one page of results
        try:
            text = requests.get(url, params=param, timeout=timeout).text
            # Backslashes are doubled before parsing, as the response may
            # contain escape sequences that are not valid JSON.
            data = json.loads(text.replace('\\', '\\\\'))['data']
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network failure or invalid JSON: skip this page
            error += 1
            if error >= 20:
                return None
            continue
        for item in data or []:
            imgUrl = item.get("middleURL")  # image address
            if passNum >= 20:
                return None
            if imgUrl is None:
                passNum += 1
                continue
            try:
                resp = requests.get(imgUrl, timeout=timeout)
                # Do not save an HTTP error page as if it were an image
                resp.raise_for_status()
            except requests.RequestException:
                error += 1
                if error >= 20:
                    return None
                continue
            imgContent = resp.content  # image bytes
            if not imgContent:
                # An empty body is not an image; treat it like a failed request
                error += 1
                if error >= 20:
                    return None
                continue
            imgMd5 = evalMd5(imgContent)  # content hash used as the file name
            imgPath = os.path.join(dirname, '%s.%s' % (imgMd5, _imgSuffix(imgUrl)))
            oldFinish = len(os.listdir(dirname))
            with open(imgPath, 'wb') as imgFile:
                imgFile.write(imgContent)
            # Optional resize (in place)
            if imgSize:
                resizeImg(imgPath, imgSize, imgPath)
            newFinish = len(os.listdir(dirname))
            print('key:%s goal:%d finish:%d' % (keyWord, imgNumber, newFinish))
            # Enough images collected: done
            if newFinish >= imgNumber:
                return None
            # The file count did not grow (duplicate image, or the file was
            # removed by resizeImg). After 20 such downloads assume there is
            # nothing new left to fetch.
            if newFinish == oldFinish:
                same += 1
                if same >= 20:
                    return None
    return None
来测试一下看看效果吧~
# Demo run: download a few images for each keyword.
keys = ['电子琴', '苹果']
imgNumber = 10  # number of images wanted per keyword
imgSize = None  # no resize size is passed to download()
for keyWord in keys:
    download(keyWord, imgNumber, imgSize=imgSize)
key:电子琴 goal:10 finish:1
key:电子琴 goal:10 finish:2
key:电子琴 goal:10 finish:3
key:电子琴 goal:10 finish:4
key:电子琴 goal:10 finish:5
key:电子琴 goal:10 finish:6
key:电子琴 goal:10 finish:7
key:电子琴 goal:10 finish:8
key:电子琴 goal:10 finish:9
key:电子琴 goal:10 finish:10
key:苹果 goal:10 finish:1
key:苹果 goal:10 finish:2
key:苹果 goal:10 finish:3
key:苹果 goal:10 finish:4
key:苹果 goal:10 finish:5
key:苹果 goal:10 finish:6
key:苹果 goal:10 finish:7
key:苹果 goal:10 finish:8
key:苹果 goal:10 finish:9
key:苹果 goal:10 finish:10