抓取网易公开课并下载

原文地址:http://bbs.csdn.net/topics/390361293

View Code
 1 # -*- coding: utf-8 -*-
 2 #抓取网易公开课下载链接
 3 #By  : hnicypb@126.com
 4 #Ver :1.0
 5 #Time:2013-12-30
 6 #Python 2.7 + BeautifulSoup 3.03(用最新版本4.1乱码没搞定,退回3.03)
 7 #eg: python 抓取网易公开课.py http://v.163.com/special/opencourse/paradigms.html
 8 
 9 from BeautifulSoup import BeautifulSoup
10 import re
11 import sys,os
12 import urllib
13 
def rpb(blocknum, blocksize, totalsize):
    """Print the download progress; used as the ``urlretrieve`` report hook.

    Args:
        blocknum: Number of blocks transferred so far.
        blocksize: Size of one block, in bytes.
        totalsize: Total size of the file in bytes. ``urlretrieve`` may pass
            a non-positive value (e.g. -1) when the server does not report
            the size.

    Prints the percentage completed (capped at 100) with two decimals, or
    the number of bytes received so far when the total size is unknown.
    """
    received = blocknum * blocksize
    if totalsize <= 0:
        # Without a usable total a percentage is meaningless: dividing by 0
        # would raise ZeroDivisionError and -1 would yield a negative value.
        print("%d bytes" % received)
        return
    # The last block usually overshoots the real size, so clamp to 100.
    percent = min(100.0, 100.0 * received / totalsize)
    print("%.2f%%" % percent)
19     
def downlaod(url):
    """Scrape a course page and download every lecture video it lists.

    Fetches ``url``, prints the course title and up to the first three
    paragraphs found inside the ``div.m-cdes`` element, then walks the
    ``tr.u-even`` rows. For each row it prints the lecture name and its
    download link and saves the video as ``<lecture name>.mp4`` in the
    current working directory, reporting progress through ``rpb``.

    Args:
        url: Address of the course page to scrape.

    Rows that lack a title link or a download link are skipped, and a file
    that already exists on disk is not downloaded again.
    """
    # Fetch the page and load it into BeautifulSoup.
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html)

    # Course information: title, introduction, etc.
    title = soup.find('div', {"class": "m-cdes"})
    if title is not None:
        if title.h2 is not None:
            print(title.h2.string)
        # Slice instead of indexing so a page with fewer than three
        # paragraphs does not raise IndexError.
        for paragraph in title.findAll('p')[:3]:
            print(paragraph.string)

    # Detailed lecture list.
    # NOTE(review): only rows with class "u-even" are selected; if the page
    # marks alternating rows with another class those lectures are never
    # downloaded -- confirm against the page markup.
    for row in soup.findAll('tr', {"class": "u-even"}):
        # Lecture name cell and download link cell.
        name = row.find('td', {"class": "u-ctitle"})
        downInfo = row.find('td', {"class": "u-cdown"})
        if name is None or name.a is None:
            continue
        if downInfo is None or downInfo.a is None:
            continue
        downLink = downInfo.a.get('href')
        if not downLink:
            continue

        # File name = leading text of the cell + link text, each trimmed of
        # surrounding whitespace and trailing commas.
        fileName = (name.contents[0].strip().rstrip(',')
                    + name.a.string.strip().rstrip(','))

        print(fileName)
        print(downLink)

        # Check the path that is actually written (with the ".mp4" suffix);
        # testing the bare name never matched, so files were always
        # downloaded again.
        target = fileName + ".mp4"
        if not os.path.exists(target):
            urllib.urlretrieve(downLink, target, rpb)
48     
def main(argv):
    """Command-line entry point.

    Args:
        argv: Argument list in ``sys.argv`` form; ``argv[1]`` is the URL of
            the course page to download. Any further arguments are ignored.

    When no URL is supplied a usage line is written to stderr and nothing is
    downloaded.
    """
    if len(argv) < 2:
        # Give feedback instead of silently doing nothing.
        prog = argv[0] if argv else "script.py"
        sys.stderr.write("usage: python %s <course-page-url>\n" % prog)
        return
    downlaod(argv[1])


if __name__ == "__main__":
    main(sys.argv)

 

posted @ 2013-02-16 17:41  catmelo  阅读(325)  评论(0)  编辑  收藏  举报