Python获取Origin官网视频

程序说明:最近学习Origin,看到官网有入门视频(http://www.originlab.com/index.aspx?go=SUPPORT/VideoTutorials),数量挺多的,就用Python写了个简单的爬虫程序,把这些视频都下载了下来。程序利用requests获取网页、bs4解析html,再用re.match方法匹配出相关链接,最后逐个下载。代码没有进行进一步整理,看着有些乱。

#!/usr/bin/python
# -*- coding:utf-8 -*-

"""
Created on Sun Dec 12 14:21:15 2015

Notes: Downloading tutorial videos from Origin support.

@author: zhigang
"""

import requests
from bs4 import BeautifulSoup as bs
import re
import urllib
def _video_file_stem(title, fallback):
    """Return a filesystem-safe file name stem (no extension) for a video.

    Characters that are not allowed in Windows file names are replaced by
    underscores and runs of whitespace are collapsed. ``fallback`` is
    returned when ``title`` is ``None`` or nothing usable remains.
    """
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title or '')
    cleaned = ' '.join(cleaned.split()).strip(' .')
    return cleaned or fallback


def download_url(url, outdir):
    """Download every .mp4 video reachable from a tutorial index page.

    The page at ``url`` is scanned for links whose href contains
    ``VideoTutorials&pid``. Each such tutorial page is fetched and scanned
    for links whose href contains ``mp4``; every video found is saved into
    ``outdir`` under the (sanitised) text of the index link that led to it.

    Args:
        url: Address of the tutorial index page.
        outdir: Directory to save the videos in; created if it is missing.

    Raises:
        requests.RequestException: If the index page cannot be fetched.
            Tutorial pages that fail to load are reported and skipped.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import os
    import urllib.request
    from urllib.parse import urljoin

    print('Resolving.....from '+url+'....')
    os.makedirs(outdir, exist_ok=True)
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = bs(response.text, "lxml")

    tasks = []       # (video_url, file_name) pairs, in discovery order
    assigned = {}    # file_name -> video_url, used to avoid overwriting

    # Collect the video links and the file names to save them under.
    for anchor in soup.find_all("a", href=True):
        if not re.match('.*VideoTutorials&pid.*', anchor['href']):
            continue
        page_url = urljoin(url, anchor['href'])
        try:
            page = requests.get(page_url, timeout=30)
            page.raise_for_status()
        except requests.RequestException as err:
            print('Skipping '+page_url+' : '+str(err))
            continue
        # get_text() also works for anchors with nested markup, where
        # ``.string`` would be None.
        title = anchor.get_text(strip=True)
        for video_anchor in bs(page.text, "lxml").find_all('a', href=True):
            if not re.match('.*mp4', video_anchor['href']):
                continue
            # Video hrefs may be relative to the tutorial page.
            link = urljoin(page_url, video_anchor['href'])
            stem = _video_file_stem(title, 'video_%d' % len(tasks))
            name = stem + '.mp4'
            suffix = 2
            # A different video that maps to an already used name gets a
            # numeric suffix instead of overwriting the earlier file.
            while name in assigned and assigned[name] != link:
                name = '%s_%d.mp4' % (stem, suffix)
                suffix += 1
            if name in assigned:
                continue  # exact duplicate of an already queued task
            assigned[name] = link
            tasks.append((link, name))

    # start downloads
    print(str(len(tasks))+' tasks found. Started downloading...')
    for i, (link, name) in enumerate(tasks):
        print(str(i)+' : '+name+'\t url: '+link)
        urllib.request.urlretrieve(link, os.path.join(outdir, name))
        print(str(i)+' : '+link+'\t completed...')
    print('All tasks completed.')
    
if __name__ == '__main__':
    # Script entry point: fetch all Origin tutorial videos from the official
    # index page into a fixed local directory.
    url, outdir = (
        'http://www.originlab.com/index.aspx?go=SUPPORT/VideoTutorials',
        r'D:\Origin_turtorials',
    )
    download_url(url, outdir)

posted on 2015-12-20 18:14  未济的Lakers  阅读(1026)  评论(0)  编辑  收藏  举报

导航