[Python爬虫]头条图集爬取
import os
import re
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests

# Seconds to wait for the server before giving up on a request. Without a
# timeout a single stalled connection would block its pool worker forever.
REQUEST_TIMEOUT = 10

# Characters that cannot appear in a file or directory name on common
# platforms (path separators, Windows-reserved punctuation, control chars).
_UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def get_page(offset):
    """Fetch one page of search results for the keyword '街拍'.

    Args:
        offset: Result offset sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON payload on success; the string ``'No response'``
        when the connection fails or times out; ``None`` when the server
        answers with a non-200 status or a body that is not valid JSON.
    """
    # NOTE(review): the cookie below is a hard-coded browser session. It holds
    # credentials and will expire; consider loading it from configuration.
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/77.0.3865.75 Safari/537.36'
        ),
        'cookie': (
            'tt_webid=6724223385113069069; '
            'WEATHER_CITY=%E5%8C%97%E4%BA%AC; '
            'tt_webid=6724223385113069069; '
            'csrftoken=9e9d6c3be6aabc313dce0c4f1a116047; '
            'sso_uid_tt=27219b1c2d00b8a6021444d85d83dc38; '
            'toutiao_sso_user=7562e682c093b193cce298f25dd396ba; '
            'login_flag=8391d980bfc8a8908e7c6c80596a016c; '
            '__tea_sdk__ssid=undefined; '
            '_ga=GA1.2.931504366.1565662966; '
            'sid_guard=7562e682c093b193cce298f25dd396ba%7C1565663040%7C5126263%7CFri%2C+11-Oct-2019+10%3A21%3A43+GMT; '
            'uid_tt=27219b1c2d00b8a6021444d85d83dc38; '
            'sid_tt=7562e682c093b193cce298f25dd396ba; '
            'sessionid=7562e682c093b193cce298f25dd396ba; '
            'uuid="w:443dcb551552404fbfde212f1054c781"; '
            '__tasessionId=i5j7qcydf1569292028372; '
            's_v_web_id=1e7e3b52d7bc46698bb26079c99fd83d'
        ),
        'pragma': 'no-cache',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'x-requested-with': 'XMLHttpRequest',
    }
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        # Disabled in the original script: 'from': 'search_tab',
        # 'pd': 'synthesis'.
    }
    query = urlencode(params)
    print(query)
    url = 'https://www.toutiao.com/api/search/content/?' + query

    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except (requests.ConnectionError, requests.Timeout):
        # Read timeouts are not ConnectionError subclasses, so both are named.
        return 'No response'

    if response.status_code != 200:
        return None

    try:
        # Decode once; the payload is both printed and returned.
        payload = response.json()
    except ValueError:
        # The server answered 200 with a body that is not JSON.
        return None

    print(payload)
    return payload


def get_image(json):
    """Yield one ``{'image': url, 'title': title}`` dict per gallery image.

    Args:
        json: The value returned by ``get_page``. Anything that is not a dict
            with a non-empty ``'data'`` entry (including ``None`` and the
            ``'No response'`` marker) prints ``'Not parse'`` and yields
            nothing.

    Yields:
        Dicts with keys ``'image'`` (the image URL) and ``'title'`` (the title
        of the item the image belongs to). Items without a string title or
        without images, and images without a URL, are skipped.
    """
    if not isinstance(json, dict) or not json.get('data'):
        print('Not parse')
        return

    for item in json['data']:
        if not isinstance(item, dict):
            continue
        title = item.get('title')
        images = item.get('image_list')
        # Truthiness also rejects a null image_list, not only an empty list.
        if not isinstance(title, str) or not images:
            continue
        for image in images:
            url = image.get('url') if isinstance(image, dict) else None
            if not url:
                continue
            print(title)
            print(image)
            yield {
                'image': url,
                'title': title,
            }


def _safe_dirname(title):
    """Turn an article title into a name usable as a single directory."""
    cleaned = _UNSAFE_PATH_CHARS.sub('_', str(title or '')).strip()
    if cleaned in ('', '.', '..'):
        return 'untitled'
    return cleaned


def save_image(item):
    """Download ``item['image']`` into a directory named after ``item['title']``.

    The file is named ``<md5 of the content>.jpg``, so a picture that was
    already stored is detected and reported instead of being written again.

    Args:
        item: A dict with keys ``'image'`` (URL) and ``'title'``, as yielded
            by ``get_image``.
    """
    try:
        response = requests.get(item.get('image'), timeout=REQUEST_TIMEOUT)
    except (requests.ConnectionError, requests.Timeout):
        print('Failed to Save Image')
        return

    if response.status_code != 200:
        return

    directory = _safe_dirname(item.get('title'))
    # exist_ok avoids a FileExistsError when two pool workers handle the same
    # title at the same time.
    os.makedirs(directory, exist_ok=True)

    file_name = '{0}.{1}'.format(md5(response.content).hexdigest(), 'jpg')
    file_path = os.path.join(directory, file_name)
    if os.path.exists(file_path):
        print('Already Downloaded', file_path)
        return

    with open(file_path, 'wb') as f:
        f.write(response.content)


def main(offset):
    """Fetch the result page at ``offset`` and save every image it lists."""
    payload = get_page(offset)
    for item in get_image(payload):
        print(item)
        save_image(item)


# Inclusive range of result pages to crawl; page x is requested at offset x*20.
GROUP_START = 1
GROUP_END = 1

if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    try:
        pool.map(main, groups)
    finally:
        # Always release the worker processes, even if a page raised.
        pool.close()
        pool.join()