Python爬虫学习笔记之爬今日头条的街拍图片
代码:
import os
import re
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests

# Inclusive range of result pages to crawl; each page index is multiplied
# by 20 (the page size requested via ``count``) to obtain the offset.
GROUP_START = 1
GROUP_END = 5

# Seconds to wait on a single HTTP request before giving up, so a stalled
# connection cannot hang a worker process forever.
REQUEST_TIMEOUT = 10

# Characters that cannot (or should not) appear in a single directory name.
_UNSAFE_DIR_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def get_page(offset):
    """Fetch one page of search results and return it as decoded JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the
        status code is not 200, or the body is not valid JSON.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        # NOTE(review): the key is spelled 'form'; confirm the endpoint
        # does not actually expect 'from'.
        'form': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # Covers connection errors as well as timeouts.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The body was not JSON (e.g. an HTML error or anti-bot page).
        return None


def get_images(json):
    """Yield one ``{'image', 'title'}`` dict per image found in a page.

    Args:
        json: Decoded page as returned by ``get_page``; may be ``None``
            when the page could not be fetched, in which case nothing is
            yielded.

    Yields:
        A dict holding the image's ``url`` value under ``'image'`` and the
        title of the item it belongs to under ``'title'``.
    """
    if not json:
        return
    for item in json.get('data') or []:
        title = item.get('title')
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }


def _safe_dirname(title):
    """Turn an item title into a name usable as a single directory."""
    cleaned = _UNSAFE_DIR_CHARS.sub('_', title or '').strip(' .')
    # An empty or dots-only title would otherwise resolve to the current
    # or parent directory.
    return cleaned or 'untitled'


def save_image(item):
    """Download one image into a directory named after its title.

    The file name is the MD5 hex digest of the downloaded bytes, so an
    image whose content is already on disk is not written a second time.

    Args:
        item: A dict produced by ``get_images`` with ``'image'`` and
            ``'title'`` keys.
    """
    image_url = item.get('image')
    if not image_url:
        print('Failed to save image')
        return

    folder = _safe_dirname(item.get('title'))
    # Presumably swaps the thumbnail path segment for the full-size one;
    # note that every occurrence of 'list' in the URL is replaced.
    large_url = image_url.replace('list', 'large')
    if large_url.startswith('//'):
        # Protocol-relative URL: supply the scheme.
        large_url = 'http:' + large_url

    try:
        # exist_ok avoids a race when two worker processes create the same
        # directory at the same time.
        os.makedirs(folder, exist_ok=True)
        response = requests.get(large_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return
        digest = md5(response.content).hexdigest()
        file_path = os.path.join(folder, '{0}.{1}'.format(digest, 'jpg'))
        if os.path.exists(file_path):
            print('Already Download', file_path)
            return
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as exc:
        print('Failed to save image', exc)


def main(offset):
    """Fetch the page at ``offset`` and download every image it lists."""
    page = get_page(offset)
    for item in get_images(page):
        print(item)
        save_image(item)


if __name__ == '__main__':
    pool = Pool()  # create the process pool
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # map() calls main() once per offset, spreading the calls over workers.
    pool.map(main, groups)
    pool.close()  # stop accepting new tasks
    pool.join()  # wait for the worker processes to finish
结果:
此时可以看到文件夹里:
随便打开一个:
Successful!