# Ajax 请求分析抓取百度图片 (scraping Baidu image search by analysing its Ajax requests)

import json
import os
import re
from hashlib import md5
from multiprocessing import Pool  # run the page fetches in several processes
from urllib.parse import urlencode

import requests
from requests.exceptions import RequestException
def page_get(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the calling worker.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``
        (timeouts included).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('请求失败')
        return None
    if response.status_code == 200:
        return response.text
    return None

def page_html(pn, keyword='清晰图片'):
    """Fetch one page of Baidu image-search results and print every item.

    Builds the ``acjson`` query URL, downloads it with ``page_get`` and
    prints each record produced by ``page_re``.

    Args:
        pn: Value sent as the ``pn`` query parameter (the result offset;
            ``rn`` is fixed at 30 results per request).
        keyword: Search phrase sent as both ``queryWord`` and ``word``.

    Returns:
        None. Nothing is printed when the page could not be fetched.
    """
    data = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': keyword,
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': 0,
        'ic': 0,
        'hd': 0,
        'latest': 0,
        'copyright': 0,
        'word': keyword,
        's': '',
        'se': '',
        'tab': '',
        'width': 1920,
        'height': 1080,
        'face': '',
        'istype': '',
        'qc': '',
        'nc': 1,
        'fr': '',
        'expermode': '',
        'force': '',
        'pn': pn,
        'rn': 30,
        'gsm': '1e',
        # NOTE(review): looks like a millisecond timestamp copied from a
        # captured browser request, presumably a cache-buster - confirm.
        '1561179768452': '',
    }
    url = 'https://image.baidu.com/search/acjson?' + urlencode(data)
    html = page_get(url)
    if html is None:
        # The request failed or returned a non-200 status; passing None on
        # to the regex parser would raise TypeError and abort the whole run.
        return
    for item in page_re(html):
        print(item)
def page_re(html):
    """Yield the title and thumbnail URL of each image entry found in ``html``.

    For every regex match the thumbnail is first downloaded through
    ``wrire_to`` and then a record describing it is yielded.

    Args:
        html: Raw response text to scan. ``None`` or an empty string yields
            nothing instead of raising ``TypeError`` inside ``re``.

    Yields:
        dict: ``{'名称': <fromPageTitle capture>, 'img': <thumbURL capture>}``.
    """
    if not html:
        return
    pattern = re.compile(
        r'.*?fromPageTitle":"(.*?)",'
        r'.*?thumbURL":"(.*?)",'
        r'.*?middleURL":"(.*?)",'
        r'.*?hoverURL":"(.*?)",',
        re.S,
    )
    # The middle/hover URLs are captured by the pattern but not used.
    for title, thumb_url, _middle_url, _hover_url in pattern.findall(html):
        wrire_to(thumb_url)
        yield {
            '名称': title,
            'img': thumb_url,
        }
def wrire_to(url, timeout=10):
    """Download the image at ``url`` and hand its bytes to ``asve_img``.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled download cannot block the worker forever.

    Returns:
        None. The image is saved only when the status code is 200; request
        errors (timeouts included) are reported and swallowed.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('请求失败')
        return None
    if response.status_code == 200:
        asve_img(response.content)
    return None
def asve_img(content, directory=r'C:\Users\Administrator\Desktop\img'):
    """Write ``content`` to ``<directory>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the bytes, so saving identical
    content twice rewrites the same file rather than creating a duplicate.

    Args:
        content: Raw image bytes to store.
        directory: Folder that receives the file. It is created (including
            parents) when missing; previously a missing folder made
            ``open`` fail with ``FileNotFoundError``.
    """
    # exist_ok=True keeps this safe when several worker processes race to
    # create the same folder.
    os.makedirs(directory, exist_ok=True)
    file_path = '{0}/{1}.{2}'.format(directory, md5(content).hexdigest(), 'jpg')
    with open(file_path, 'wb') as f:
        f.write(content)
def main():
    """Scrape ten result pages (offsets 0, 30, ..., 270) in worker processes."""
    offsets = [i * 30 for i in range(10)]
    # The context manager shuts the pool down on exit, so worker processes
    # are not leaked. map() blocks until every offset has been processed,
    # so no work is cut short by that shutdown.
    with Pool() as pool:
        pool.map(page_html, offsets)

# Script entry point: run the scraper only when this file is executed
# directly. The guard also keeps main() from running again when
# multiprocessing re-imports this module in a child process (spawn start
# method, the default on Windows).
if __name__ == '__main__':
    main()

 

# posted on 2019-06-26 15:42  ||子义  阅读(395)  评论(0)  编辑  收藏  举报

# 导航