Python 爬取陈都灵百度图片

Python 爬取陈都灵百度图片

标签(空格分隔): 随笔


今天意外发现了自己以前写的一个爬虫脚本,爬取的是我的女神陈都灵的百度图片,尝试运行了一下,发现居然还能用,故把脚本贴出来分享一下。

import requests
import os
import json
#import random

#firsturl='https://image.baidu.com/search/acjson?'
#header={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
def get_chenduling(le, max_pages=None):
    """Download Baidu image-search thumbnails for the query '陈都灵'.

    Requests result pages from the Baidu ``acjson`` endpoint one after
    another. For every entry that carries a ``middleURL`` the image is
    fetched with ``savechen`` and written to disk with ``resave``.

    Paging is done in a loop rather than by recursion, so a long crawl can
    no longer end in ``RecursionError``. The crawl stops when a response
    cannot be parsed, contains no ``data``, or yields no image URL at all.

    Args:
        le: Initial value sent as the ``gsm`` query parameter. Later requests
            send the ``gsm`` value returned by the previous response.
        max_pages: Optional upper bound on the number of result pages to
            request. ``None`` (the default) means "until a stop condition
            is hit".
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    url = 'https://image.baidu.com/search/acjson?'
    page_size = 30
    data = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '陈都灵',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '',
        'z': '',
        'ic': '',
        'word': '陈都灵',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '',
        'istype': '',
        'qc': '',
        'nc': '',
        'fr': '',
        'cg': 'star',
        'pn': str(page_size),
        'rn': str(page_size),
        'gsm': le,
    }

    # NOTE(review): 'pn' is presumably the result offset and 'rn' the page
    # size; the original never changed 'pn', so every request asked for the
    # same page. Confirm against the endpoint's actual behaviour.
    offset = page_size
    pages_done = 0
    while max_pages is None or pages_done < max_pages:
        data['pn'] = str(offset)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.get(url, params=data, headers=header, timeout=10)
        try:
            chen = json.loads(response.text)
        except ValueError as err:
            # json.JSONDecodeError is a ValueError subclass.
            print('could not parse result page at offset {0}: {1}'.format(offset, err))
            break

        items = chen.get('data') if isinstance(chen, dict) else None
        if not items:
            break

        found = 0
        for item in items:
            newurl = item.get('middleURL') if isinstance(item, dict) else None
            if not newurl:
                continue
            found += 1
            dd = savechen(newurl, header)
            # savechen yields None when the download failed; skip the write.
            if dd is not None:
                resave(newurl, dd)

        if not found:
            # A page without a single image URL means there is nothing left.
            break

        # Keep the previous token if the response did not supply a new one.
        data['gsm'] = chen.get('gsm') or data['gsm']
        offset += page_size
        pages_done += 1
def savechen(item, header):
    """Fetch the resource at URL ``item`` and return its body as bytes.

    Args:
        item: URL of the image to download.
        header: Dict of HTTP headers sent with the request.

    Returns:
        The response body (``bytes``) on success, or ``None`` when the
        request failed or the server answered with an error status.
    """
    try:
        # The timeout prevents one stalled download from blocking the crawl.
        dudu = requests.get(item, headers=header, timeout=10)
        dudu.raise_for_status()
    except requests.RequestException as err:
        # Only request-level failures are treated as "skip this image";
        # KeyboardInterrupt and programming errors are no longer swallowed.
        print('有毛病。。。。', item, err)
        return None
    return dudu.content
def resave(item, html):
    """Write downloaded image bytes to a file derived from the image URL.

    The file name is the fixed prefix ``d:\\chenduling`` followed by ``.``
    and the part of ``item`` after its last comma (the whole URL when it
    contains no comma). Characters that are not allowed in Windows file
    names are replaced by ``_``. An existing file is left untouched.

    Args:
        item: URL the image was downloaded from.
        html: Image content as bytes, or ``None`` if the download failed.
    """
    if html is None:
        # Nothing to write. Opening the file first would leave an empty file
        # behind, and the exists check below would then block any retry.
        return

    invalid_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    tail = item.split(',')[-1].translate(invalid_chars)
    # Raw string: same runtime value as before, without the invalid '\c' escape.
    fpath = '{0}.{1}'.format(r'd:\chenduling', tail)
    if os.path.exists(fpath):
        return

    with open(fpath, 'wb') as ff:
        print('downloading.....{0}'.format(item))
        ff.write(html)


def main():
    """Start the crawl, passing the literal string 'le' as the initial argument."""
    get_chenduling('le')


if __name__ == '__main__':
    main()

运行了一下,一点问题都没有,图片都存放到D盘了,拓展到其他图片估计也没问题,至于le这个参数干嘛的,我也记不清了。隐约记得有一个请求头部包含一串数字,但是这串数字并没有卵用。

posted @ 2019-07-16 19:20  夜空守望者Z  阅读(403)  评论(0)  编辑  收藏  举报