# 分析 Ajax 爬取今日头条街拍美图
import os
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests

# Pages are requested at offsets GROUP_START * 20 .. GROUP_END * 20 inclusive
# (each request asks for 'count': '20' results).
GROUP_START = 1
GROUP_END = 5

# Seconds to wait on the server before giving up, so that one stalled
# connection cannot hang a pool worker indefinitely.
REQUEST_TIMEOUT = 10


def get_page(offset):
    """Fetch one page of search results for the keyword '街拍'.

    Args:
        offset: Result offset sent to the search endpoint.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, any request failure, or a body
        that is not valid JSON).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # RequestException covers ConnectionError as well as timeouts.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error or anti-bot page).
        return None


def get_images(json):
    """Yield one record per image found in a search-result payload.

    Args:
        json: The decoded response returned by ``get_page``; may be ``None``
            when the page could not be fetched.

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <title>}``, one for
        every entry of every item's ``image_list``.
    """
    if not isinstance(json, dict):
        # get_page() returns None on failure; there is nothing to iterate.
        return
    for item in json.get('data') or []:
        title = item.get('title')
        # Some items carry no 'image_list' at all; treat that as empty.
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }


def save_image(item):
    """Download one image into a directory named after its title.

    The file is stored as ``<title>/<md5 of content>.jpg``; an image whose
    content hash already exists on disk is not written again.

    Args:
        item: A record as produced by ``get_images`` with the keys
            ``'image'`` and ``'title'``.
    """
    title = item.get('title')
    image_url = item.get('image')
    if not title or not image_url:
        print('Skipping item without title or image url')
        return

    try:
        # exist_ok avoids the check-then-create race between pool workers
        # that save images belonging to the same title.
        os.makedirs(title, exist_ok=True)
    except OSError:
        print('Failed to create directory', title)
        return

    # Swap the thumbnail path segment for the full-size one.
    large_url = image_url.replace('list', 'large')
    if not large_url.startswith(('http://', 'https://')):
        # URLs without a scheme get one prepended, as before; absolute URLs
        # are left untouched.
        large_url = 'http:' + large_url

    try:
        response = requests.get(large_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('Failed to save image')
        return
    if response.status_code != 200:
        return

    content = response.content
    file_path = '{0}/{1}.{2}'.format(title, md5(content).hexdigest(), 'jpg')
    if os.path.exists(file_path):
        print('Already Downloaded', file_path)
        return
    with open(file_path, 'wb') as f:
        f.write(content)


def main(offset):
    """Fetch the result page at ``offset`` and save every image it lists."""
    page = get_page(offset)
    for item in get_images(page):
        print(item)
        save_image(item)


if __name__ == '__main__':
    pool = Pool()
    try:
        groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
        pool.map(main, groups)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()