Downloading Baidu Images with a Python Crawler
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# file: {NAME}.py
# @author: jory.d
# @contact: dangxusheng163@163.com
# @time: 2021/06/29 22:09
# @desc: download images from the Baidu image-search API

import json
import os
import os.path as osp
import re
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
    'Host': 'image.baidu.com',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept': 'text/plain, */*; q=0.01',
}

SAVE_ROOT_PATH = './image_face'


def download_img_from_baidu(keyword='手机正面'):
    url = 'https://image.baidu.com/search/acjson'
    per_page_num = 30
    params = {
        "tn": "resultjson_com",
        "logid": "8357122664305590518",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "queryWord": keyword,
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "-1",
        "z": "",
        "ic": "0",
        "hd": "",
        "latest": "",
        "copyright": "",
        "word": keyword,
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "0",
        "istype": "2",
        "qc": "",
        "nc": "1",
        "fr": "",
        "expermode": "",
        "nojc": "",
        "pn": 30,
        "rn": f'{per_page_num}',
        "gsm": "1e",
        "1624980770280": ""
    }

    page_num = 30  # placeholder; recomputed from 'displayNum' after the first response
    idx = 0
    page_idx = 0
    failed_list = []
    while page_idx < page_num:
        # 'pn' is the result offset, 'rn' the page size
        params['pn'] = per_page_num * (page_idx + 1)
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            cont = r.content.decode('utf-8')
            try:
                print(f'page_idx: {page_idx + 1}/{page_num}')
                # json.loads() no longer accepts an `encoding` argument in Python 3
                json_data = json.loads(cont)
                # 'displayNum' is the total hit count; derive the page count from it
                total = json_data['displayNum']
                page_num = (total + per_page_num - 1) // per_page_num
                for d in json_data.get('data', []):
                    if 'middleURL' not in d:
                        continue  # skip entries that carry no image URL
                    idx += 1
                    img_url = d['middleURL']
                    save_filepath = f'{SAVE_ROOT_PATH}/{keyword}/{idx}.jpg'
                    os.makedirs(osp.dirname(save_filepath), exist_ok=True)
                    img_content = requests.get(img_url, timeout=10).content
                    with open(save_filepath, 'wb') as wf:
                        wf.write(img_content)
                    # files under 10 KB are assumed broken and queued for retry
                    if osp.getsize(save_filepath) > 10 * 1024:
                        print(f'{save_filepath} downloaded successfully!')
                        time.sleep(2)
                    else:
                        failed_list.append(img_url)
            except json.JSONDecodeError:
                print(f'json.loads() exception. page_idx: {page_idx + 1}')
        # always advance the page, otherwise a bad page would loop forever
        page_idx += 1

    # retry the URLs that failed the first time; never mutate a list while
    # iterating over it, so collect the remaining failures separately
    still_failed = []
    for img_url in failed_list:
        img_content = requests.get(img_url, timeout=10).content
        idx += 1
        save_filepath = f'{SAVE_ROOT_PATH}/{keyword}/{idx}.jpg'
        with open(save_filepath, 'wb') as wf:
            wf.write(img_content)
        if osp.getsize(save_filepath) > 5 * 1024:
            print(f'{save_filepath} downloaded successfully!')
            time.sleep(2)
        else:
            still_failed.append(img_url)
    print(f'fail: {still_failed}')
    print('done.')


def decode_json_download():
    """Extract 'objURL' links from a saved search-response file and download them."""
    html_path = './phone_1.json'
    keyword = 'phone'
    with open(html_path, 'r', encoding='utf-8') as rf:
        html = rf.read()

    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    print(pic_urls)
    print(len(pic_urls))
    for i, each in enumerate(pic_urls):
        print(f'Downloading image {i}, url: {each}')
        try:
            pic = requests.get(each, timeout=10)
        except requests.exceptions.ConnectionError:
            print('[Error] this image could not be downloaded')
            continue
        save_path = f'./images/{keyword}_{i}.jpg'
        os.makedirs(osp.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as fp:
            fp.write(pic.content)


if __name__ == '__main__':
    download_img_from_baidu(keyword='中国男明星高清人脸')
    # decode_json_download()
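Both functions fetch each image with a single requests.get() call, so one unreachable host still aborts that image on the first pass and pushes it into the retry list. A minimal sketch of a more defensive per-image downloader, assuming the same requests dependency; the fetch_image name, retry count, and backoff schedule are illustrative and not part of the original script:

import time
import requests


def fetch_image(img_url, save_filepath, retries=3, timeout=10, min_bytes=10 * 1024):
    """Download one image with a timeout and exponential backoff.

    Returns True once a response larger than `min_bytes` has been written,
    mirroring the file-size success heuristic used in the script above.
    """
    for attempt in range(retries):
        try:
            resp = requests.get(img_url, timeout=timeout)
            resp.raise_for_status()
            if len(resp.content) > min_bytes:
                with open(save_filepath, 'wb') as wf:
                    wf.write(resp.content)
                return True
        except requests.RequestException:
            pass  # network or HTTP error: fall through to the backoff
        time.sleep(2 ** attempt)  # wait 1s, 2s, 4s, ... between attempts
    return False

Dropping such a helper into download_img_from_baidu() would replace both the inline requests.get(img_url, timeout=10).content call and the separate failed_list retry pass with a single code path.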