利用Python爬取新浪微博营销案例库并下载到本地
from bs4 import BeautifulSoup  # NOTE(review): unused in this script; kept since the file may be partial
import requests
import urllib.request
import urllib.parse
import urllib.error  # fix: was only available implicitly via urllib.request
import json
import time
import os  # NOTE(review): unused in this script; kept since the file may be partial

# Request headers: mobile UA plus a logged-in session cookie required by the API.
headers = {'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
'Cookie':'SINAGLOBAL=115.171.224.117_1478818430.840585; UOR=www.baidu.com,www.sina.com.cn,; SGUID=1479602257214_22629370; U_TRS1=000000ca.e4817e03.5830f3d9.0954d478; vjuids=8b9ebf053.1588e9bbe9b.0.a7d3c9f0da2d8; lxlrtst=1480138279_o; vjlast=1479861321.1480207111.11; lxlrttp=1480138279; SCF=AvqGheyBOzJit9zuitL3eGB1w7DgNLfZqC_FT1HI_O6vrMhl4NJAJ8QKegO6Qz5961-unIGKeJj59-0w1ioamqc.; Apache=115.171.186.136_1481426939.303674; SUB=_2A251SKFNDeRxGeVM6lIU8izEwjyIHXVWP5WFrDV_PUNbm9ANLXj4kW-ZXh1EJqzVqCfCs2tJhJUwl2nPfA..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhQMFyM94ynlSl9JBZenkS15JpX5KzhUgL.FoeEeK5feozR1K52dJLoI7D8MJLoIEfeKX4G; ALF=1512965277; U_TRS2=00000088.95c945f5.584cd14e.d3ef2984; WEB2_APACHE2_YF=53ce2a867ebeada0edd63e211478fed5; WEB2_APACHE2_JA=4e81a2dfe1afdcedfb634ba45827a3fb; ULV=1481429361019:7:1:1:115.171.186.136_1481426939.303674:1480134833882; appClose=true; NTKF_T2D_CLIENTID=guestAE2E8836-1881-93C9-A9BE-EC1265A9B9B5; nTalk_CACHE_DATA={uid:kf_9378_ISME9754_3210522890,tid:1481429378473190}'}

folder_path = 'D:/'


def collect_download_links(pages=46):
    """Fetch the paged case-list API and return the attachment URLs.

    Args:
        pages: number of result pages to fetch (the site had 46 at the time).

    Returns:
        A list of attachment URL strings (may contain None if the API does).
    """
    links = []
    for page in range(pages):
        url = 'http://all.vic.sina.com.cn/weibo_alk/hiddphp.php?page={}&act=jplist_ajax'.format(page)
        resp = requests.get(url, headers=headers)
        time.sleep(1)  # throttle: be polite to the server
        resp.encoding = 'utf-8'
        payload = json.loads(resp.text)
        for entry in payload:
            # If the payload is a dict, `entry` is a short key string and the
            # record is payload[entry]; otherwise `entry` is the record itself.
            # `len(entry) < 5` is how the original distinguished the two —
            # TODO(review): confirm against the actual API response shape.
            if len(entry) < 5:
                record = payload[entry]
            else:
                record = entry
            links.append(record['attachmentArr'][0]['url'])
    return links


def download_all(links, dest=folder_path):
    """Download every non-None URL in *links* into the *dest* folder.

    File names are taken from the (percent-decoded) last path segment of
    each URL. HTTP 404s and malformed URLs are reported and skipped.
    """
    for item in links:
        if item is None:  # fix: was `item != None` (PEP 8: use `is None`)
            continue
        # Re-quote non-ASCII characters so urlretrieve gets a valid URL,
        # keeping the URL's structural characters intact.
        safe_url = urllib.parse.quote(item, safe='/:?=@$&')
        time.sleep(1)  # throttle between downloads
        name = urllib.parse.unquote(item.split('/')[-1])
        try:
            urllib.request.urlretrieve(safe_url, dest + name)
            print(name + '下载成功')
        except urllib.error.HTTPError:
            print('页面不存在')
        except ValueError:
            print('未知')


if __name__ == '__main__':
    download_all(collect_download_links())