Python获取Origin官网视频
程序说明:最近在学习Origin,看到官网有入门视频(http://www.originlab.com/index.aspx?go=SUPPORT/VideoTutorials),数量挺多的,就用Python写了个简单的爬虫程序,把这些视频都下载了下来。程序利用requests获取网页、bs4解析HTML,再用re.match方法匹配出相关链接,最后逐个下载。程序代码没有进行进一步整理,看着有些乱。
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""
Created on Sun Dec 12 14:21:15 2015

Notes: Download tutorial videos from the Origin support site.

@author: zhigang
"""
import os
import re
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup as bs

# Root that the hrefs found on the index page are resolved against.
_SITE_ROOT = 'http://www.originlab.com/'
# hrefs on the index page that are followed to look for video files.
_TUTORIAL_HREF = re.compile('.*VideoTutorials&pid.*')
# hrefs on a followed page that are treated as downloadable videos.
_VIDEO_HREF = re.compile('.*mp4')
# Characters that are not allowed in a Windows file name, plus line
# breaks and tabs that may appear inside link text.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
# Seconds to wait for a page before giving up instead of hanging forever.
_HTTP_TIMEOUT = 30


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed with the lxml parser."""
    response = requests.get(page_url, timeout=_HTTP_TIMEOUT)
    return bs(response.text, "lxml")


def _file_name_for(anchor, video_url):
    """Return a ``.mp4`` file name derived from the text of ``anchor``.

    Falls back to the last path component of ``video_url`` (and finally to
    ``'video'``) when the anchor has no usable text.
    """
    title = anchor.string
    if title is None:
        # bs4 yields None for .string when the anchor does not hold a
        # single string; get_text() always returns a str.
        title = anchor.get_text()
    stem = _INVALID_FILENAME_CHARS.sub('_', title).strip()
    if not stem:
        url_path = urllib.parse.urlsplit(video_url).path
        stem = os.path.splitext(url_path.rsplit('/', 1)[-1])[0] or 'video'
    return stem + '.mp4'


def _find_videos(url):
    """Return ``(video_url, file_name)`` pairs reachable from index ``url``."""
    videos = []
    for anchor in _fetch_soup(url).find_all('a', href=True):
        if not _TUTORIAL_HREF.match(anchor['href']):
            continue
        # urljoin gives the same result as plain concatenation for simple
        # relative hrefs, and also copes with absolute/root-relative ones.
        page_url = urllib.parse.urljoin(_SITE_ROOT, anchor['href'])
        for video_anchor in _fetch_soup(page_url).find_all('a', href=True):
            if not _VIDEO_HREF.match(video_anchor['href']):
                continue
            video_url = urllib.parse.urljoin(page_url, video_anchor['href'])
            videos.append((video_url, _file_name_for(anchor, video_url)))
    return videos


def download_url(url, outdir):
    """Download every video linked from the tutorial index page.

    Args:
        url: Address of the index page listing the video tutorials.
        outdir: Directory the ``.mp4`` files are written to; it is created
            (including missing parents) when it does not exist yet.

    Each file is named after the text of the index-page link that led to
    it. Progress is reported on stdout. Network and file-system errors
    propagate to the caller.
    """
    print('Resolving.....from '+url+'....')
    os.makedirs(outdir, exist_ok=True)

    videos = _find_videos(url)

    # start downloads
    print(str(len(videos))+' tasks found. Started downloading...')
    for i, (link, name) in enumerate(videos):
        print(str(i)+' : '+name+'\t url: '+link)
        urllib.request.urlretrieve(link, os.path.join(outdir, name))
        print(str(i)+' : '+link+'\t completed...')
    print('All tasks completed.')


if __name__ == '__main__':
    url = 'http://www.originlab.com/index.aspx?go=SUPPORT/VideoTutorials'
    outdir = r'D:\Origin_turtorials'
    download_url(url, outdir)
posted on 2015-12-20 18:14 未济的Lakers 阅读(1034) 评论(0) 编辑 收藏 举报