Python 爬虫下载百度图片

#!/usr/bin/env python
# -*- coding:utf-8-*-
# file: {NAME}.py
# @author: jory.d
# @contact: dangxusheng163@163.com
# @time: 2021/06/29 22:09
# @desc:


import os, os.path as osp
import requests
import json
import time
import re

# HTTP headers sent with the image-search query request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.89 Safari/537.36'
    ),
    'Host': 'image.baidu.com',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept': 'text/plain, */*; q=0.01',
}

# Root directory under which one sub-directory per keyword is created.
SAVE_ROOT_PATH = './image_face'


def _fetch_result_page(url, params, timeout):
    """Request one page of search results and return it as a dict.

    Returns ``None`` (after printing the reason) when the request fails, the
    status code is not 200, or the body is not a JSON object.
    """
    try:
        r = requests.get(url, params, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        print(f'page request failed: {exc}')
        return None
    if r.status_code != 200:
        print(f'page request failed: HTTP {r.status_code}')
        return None
    try:
        # json.loads() no longer accepts an ``encoding`` argument (removed in
        # Python 3.9), so decode the bytes explicitly and pass the text.
        json_data = json.loads(r.content.decode('utf-8'))
    except ValueError as exc:
        # UnicodeDecodeError and json.JSONDecodeError both subclass ValueError.
        print(f'json.loads() exception: {exc}')
        return None
    if not isinstance(json_data, dict):
        print('page response is not a JSON object')
        return None
    return json_data


def _save_image(img_url, save_filepath, min_bytes, timeout):
    """Download ``img_url`` to ``save_filepath``.

    Returns ``True`` when the file was written and is larger than
    ``min_bytes``; ``False`` when the request failed or the file is too small.
    A too-small file is left on disk.
    """
    try:
        img_content = requests.get(img_url, timeout=timeout).content
    except requests.exceptions.RequestException as exc:
        print(f'{img_url} download failed: {exc}')
        return False
    os.makedirs(osp.dirname(save_filepath), exist_ok=True)
    with open(save_filepath, 'wb') as wf:
        wf.write(img_content)
    return osp.getsize(save_filepath) > min_bytes


def download_img_from_baidu(keyword='手机正面'):
    """Download the images Baidu image search returns for ``keyword``.

    Pages through the ``acjson`` endpoint 30 results at a time and saves the
    ``middleURL`` of every entry as ``{SAVE_ROOT_PATH}/{keyword}/{n}.jpg``,
    where ``n`` is a running counter. A download counts as successful when
    the saved file is larger than 10 KiB; URLs that fail are retried once at
    the end with a 5 KiB threshold. URLs that still fail are printed.

    Args:
        keyword: Search phrase; also used as the output sub-directory name.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    url = 'https://image.baidu.com/search/acjson'
    per_page_num = 30
    request_timeout = 10  # seconds, applied to every HTTP request
    max_page_attempts = 3  # attempts per result page before skipping it
    params = {
        "tn": "resultjson_com",
        "logid": "8357122664305590518",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "queryWord": keyword,
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "-1",
        "z": "",
        "ic": "0",
        "hd": "",
        "latest": "",
        "copyright": "",
        "word": keyword,
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "0",
        "istype": "2",
        "qc": "",
        "nc": "1",
        "fr": "",
        "expermode": "",
        "nojc": "",
        "pn": 0,
        "rn": f'{per_page_num}',
        "gsm": "1e",
        "1624980770280": ""
    }

    save_dir = f'{SAVE_ROOT_PATH}/{keyword}'
    page_num = 30  # provisional; replaced once a response reports the total
    idx = 0
    page_idx = 0
    attempts = 0
    failed_list = []
    while page_idx < page_num:
        # ``pn`` is the result offset, so the first page starts at 0.
        params['pn'] = per_page_num * page_idx
        json_data = _fetch_result_page(url, params, request_timeout)
        if json_data is None:
            # Retry a bounded number of times, then move on, so a page that
            # keeps failing cannot stall the loop forever.
            attempts += 1
            if attempts >= max_page_attempts:
                print(f'skip page_idx: {page_idx + 1} after {attempts} attempts')
                page_idx += 1
                attempts = 0
            else:
                time.sleep(1)
            continue
        attempts = 0

        print(f'page_idx: {page_idx+1}/{page_num}')
        total = json_data.get('displayNum')
        if isinstance(total, int) and total >= 0:
            # displayNum counts results, not pages: round up to whole pages.
            page_num = -(-total // per_page_num)

        # Entries without a 'middleURL' (e.g. empty placeholder objects) are
        # skipped instead of raising KeyError.
        entries = [
            d for d in (json_data.get('data') or [])
            if isinstance(d, dict) and d.get('middleURL')
        ]
        if not entries:
            # Nothing usable at this offset; stop instead of requesting
            # further pages.
            print(f'no results on page_idx: {page_idx + 1}, stop paging.')
            break

        for d in entries:
            idx += 1
            img_url = d['middleURL']
            save_filepath = f'{save_dir}/{idx}.jpg'
            if _save_image(img_url, save_filepath, 10 * 1024, request_timeout):
                print(f'{save_filepath} 下载成功!')
                time.sleep(2)
            else:
                failed_list.append(img_url)
        page_idx += 1

    # Retry the failed downloads once. Results go into a new list because
    # mutating failed_list while iterating it drops the wrong items and can
    # loop forever on a URL that never succeeds.
    still_failed = []
    for img_url in failed_list:
        idx += 1
        save_filepath = f'{save_dir}/{idx}.jpg'
        if _save_image(img_url, save_filepath, 5 * 1024, request_timeout):
            print(f'{save_filepath} 下载成功!')
            time.sleep(2)
        else:
            still_failed.append(img_url)
    failed_list = still_failed

    print(f'fail: {failed_list}')
    print('done.')


def decode_json_downoad():
    """Download every ``objURL`` found in a saved search response.

    Reads ``./phone_1.json`` as text, extracts each ``"objURL":"..."`` value
    with a regular expression, and saves the response body of each URL as
    ``./images/phone_{i}.jpg``, where ``i`` is the URL's position in the file.
    URLs whose request fails are reported and skipped.

    Returns:
        None.

    Raises:
        OSError: If ``./phone_1.json`` cannot be read or an image file cannot
            be written.
    """
    html_path = './phone_1.json'
    keyword = 'phone'
    with open(html_path, 'r', encoding='utf-8') as rf:
        html = rf.read()

    pic_url = re.findall(r'"objURL":"(.*?)",', html, re.S)
    print(pic_url)
    print(len(pic_url))
    for i, each in enumerate(pic_url):
        print(f'正在下载第{i}张图片,图片地址:{each}')
        try:
            pic = requests.get(each, timeout=10)
        except requests.exceptions.RequestException:
            # Covers timeouts and invalid URLs as well as connection errors,
            # so one bad URL cannot abort the whole run.
            print('【错误】当前图片无法下载')
            continue

        save_path = f'./images/{keyword}_{i}.jpg'
        os.makedirs(osp.dirname(save_path), exist_ok=True)
        # The context manager closes the file even if the write fails.
        with open(save_path, 'wb') as fp:
            fp.write(pic.content)


if __name__ == "__main__":
    # Script entry point: crawl Baidu image search for the keyword below.
    # To download from a saved response file instead, call
    # decode_json_downoad() here.
    download_img_from_baidu('中国男明星高清人脸')

 

posted @ 2022-02-13 21:33  dangxusheng  阅读(176)  评论(0)  编辑  收藏  举报