# Scrape Baidu image search results by analyzing its Ajax (acjson) API.
#
# Flow: main() fans page offsets out to a process pool -> page_html() builds
# the Ajax URL and fetches it -> page_re() regex-parses the JSON-ish payload
# and yields {name, img} records, downloading each thumbnail -> save_img()
# writes the bytes under an md5-of-content filename (natural de-duplication).
import os
import re
import json  # NOTE(review): unused here — the Ajax payload is JSON and could be parsed with json.loads instead of regex
from hashlib import md5
from multiprocessing import Pool  # parallelise page fetches across processes
from urllib.parse import urlencode

import requests
from requests.exceptions import RequestException


def page_get(url):
    """GET *url* and return the response text, or None on any failure.

    A timeout is set so a stalled connection cannot hang a worker forever.
    """
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException:
        print('请求失败')
        return None


def page_html(pn):
    """Fetch one Ajax result page at offset *pn* and print each parsed item.

    The query parameters mirror the request Baidu's own frontend issues;
    'pn' is the result offset and 'rn' the page size (30 items per page).
    """
    data = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': 201326592,
        'is': '',
        'fp': 'result',
        'queryWord': '清晰图片',
        'cl': 2,
        'lm': -1,
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': -1,
        'z': 0,
        'ic': 0,
        'hd': 0,
        'latest': 0,
        'copyright': 0,
        'word': '清晰图片',
        's': '',
        'se': '',
        'tab': '',
        'width': 1920,
        'height': 1080,
        'face': '',
        'istype': '',
        'qc': '',
        'nc': 1,
        'fr': '',
        'expermode': '',
        'force': '',
        'pn': pn,
        'rn': 30,
        'gsm': '1e',
        '1561179768452': ''  # presumably a cache-busting timestamp captured from the browser — TODO confirm
    }
    url = 'https://image.baidu.com/search/acjson?' + urlencode(data)
    html = page_get(url)
    # Bug fix: the original passed None straight into the regex parser when
    # the request failed, raising TypeError inside re.findall.
    if html is None:
        return
    for item in page_re(html):
        print(item)


# Compiled once at module level instead of on every page.
_ITEM_PATTERN = re.compile(
    '.*?fromPageTitle":"(.*?)",.*?thumbURL":"(.*?)",.*?middleURL":"(.*?)",.*?hoverURL":"(.*?)",',
    re.S)


def page_re(html):
    """Parse the raw Ajax response *html*.

    Yields one dict per image ({'名称': title, 'img': thumb URL}) and, as a
    side effect, downloads each thumbnail via write_to().
    """
    for item in _ITEM_PATTERN.findall(html):
        write_to(item[1])
        yield {
            '名称': item[0],
            'img': item[1]
        }


def write_to(url):
    """Download the image at *url* and hand its bytes to save_img().

    (Renamed from the original's misspelled 'wrire_to'.)
    """
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            save_img(resp.content)
        return None
    except RequestException:
        print('请求失败')
        return None


def save_img(content):
    """Write image bytes *content* to disk, named by their md5 digest.

    The content-hash filename makes duplicate downloads map to the same
    path, so the existence check below skips rewriting identical images.
    (Renamed from the original's misspelled 'asve_img'.)
    """
    file_path = '{0}/{1}.{2}'.format(r'C:\Users\Administrator\Desktop\img',
                                     md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main():
    """Scrape the first 10 result pages (offsets 0, 30, ..., 270) in parallel."""
    pool = Pool()
    try:
        pool.map(page_html, [i * 30 for i in range(10)])
    finally:
        # Bug fix: the original leaked the pool's worker processes.
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()