【python爬虫】coursera抓取
1 # -*- coding: utf-8 -*-” 2 #!/usr/bin/env python 3 4 """ 5 用于抓取coursera网站的下载链接 6 """ 7 8 import sys 9 import string 10 import re,random 11 import urllib,urllib2 12 import cookielib 13 import getpass 14 15 16 class Coursera(object): 17 """Coursera类定义 18 19 实现模拟登陆,抓取网页代码和正则匹配,保存连接到文件 20 21 Attributes: 22 login_url:保存真正的登陆页面URL 23 url:保存用于爬取下载连接的URL 24 user_name:存储用户登陆Email 25 password:存储用户登陆密码 26 """ 27 28 def __init__(self,url,user_name,password): 29 self.login_url = "https://accounts.coursera.org/api/v1/login" 30 self.url = url 31 if user_name == "" or password == "": 32 raise UserOrPwdNone("the username or password can't empty string") 33 sys.exit(2) 34 else : 35 self.user_name=user_name 36 self.password = password 37 38 def simulation_login(self): 39 """ 40 模拟登录函数 41 """ 42 43 cookie = cookielib.CookieJar() 44 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie)) 45 urllib2.install_opener(opener) 46 form_data,request_header = self.structure_headers() 47 req = urllib2.Request(self.login_url,data = form_data,headers=request_header) 48 try: 49 result = urllib2.urlopen(req) 50 except urllib2.URLError,e: 51 if hasattr(e, "code"): 52 print "The server couldn't fulfill the request.Please check your url and read the Reason" 53 print "Error code: %s" % e.code 54 elif hasattr(e, "reason"): 55 print "We failed to reach a server. Please check your url and read the Reason" 56 print "Reason: %s" % e.reason 57 sys.exit(2) 58 if result.getcode()==200: 59 print "登录成功..." 
60 61 def structure_headers(self): 62 """ 63 头部构造函数 64 """ 65 #模拟表单数据,这个参数不是字典 66 form_data = urllib.urlencode({ 67 "email":self.user_name, 68 "password":self.password, 69 "webrequest":"true" 70 }) 71 user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " 72 "AppleWebKit/537.36 (KHTML, like Gecko) " 73 "Chrome/38.0.2125.111 Safari/537.36") 74 XCSRF2Cookie = 'csrf2_token_%s' % ''.join(self.random_string(8)) 75 XCSRF2Token = ''.join(self.random_string(24)) 76 XCSRFToken = ''.join(self.random_string(24)) 77 cookie = "csrftoken=%s; %s=%s" % (XCSRFToken, XCSRF2Cookie, XCSRF2Token) 78 79 request_header = { 80 "Referer": "https://accounts.coursera.org/signin", #对付防盗链设置, 为跳转来源的url 81 "User-Agent": user_agent, #伪装成浏览器访问 82 "X-Requested-With": "XMLHttpRequest", 83 "X-CSRF2-Cookie": XCSRF2Cookie, 84 "X-CSRF2-Token": XCSRF2Token, 85 "X-CSRFToken": XCSRFToken, 86 "Cookie": cookie 87 } 88 89 return form_data,request_header 90 91 def random_string(self,length): 92 """ 93 随机生成指定长度的字母和数字序列 94 """ 95 return ''.join(random.choice(string.letters + string.digits) for i in xrange(length)) 96 97 def get_links(self): 98 """ 99 爬取页面代码,获取下载MP4和PDF连接 100 """ 101 102 try: 103 result = urllib2.urlopen(self.url) 104 except urllib2.URLError,e: 105 if hasattr(e, "code"): 106 print "The server couldn't fulfill the request." 107 print "Error code: %s" % e.code 108 elif hasattr(e, "reason"): 109 print "We failed to reach a server. Please check your url and read the Reason" 110 print "Reason: %s" % e.reason 111 sys.exit(2) 112 content = result.read().decode("utf-8") 113 print "读取网页成功..." 114 down_links = re.findall(r'<a.*?href="(.*?mp4.*?)"', content) 115 down_pdfs = re.findall(r'<a.*?href="(.*?pdf)"', content) 116 print "正则匹配结束..." 
117 return down_links,down_pdfs 118 119 def start_spider(self): 120 """运行爬虫,将爬取链接写入不同文件 121 """ 122 self.simulation_login() 123 down_links,down_pdfs = self.get_links() 124 with open("coursera.html","w+") as my_file: 125 print "下载链接的长度",len(down_links) 126 for link in down_links: 127 print link 128 try: 129 my_file.write(link+"\n") 130 except UnicodeEncodeError: 131 sys.exit(2) 132 with open("coursera.pdf", "w+") as my_file : 133 print "下载pdf的长度", len(down_pdfs) 134 for pdf in down_pdfs : 135 try : 136 my_file.write(pdf + "\n") 137 except UnicodeEncodeError : 138 sys.exit(2) 139 print "抓取Coursera课程下载链接和pdf链接成功" 140 141 142 class UserOrPwdNone(BaseException): 143 """ 144 Raised if the username or password is empty string 145 """ 146 147 def main(): 148 """ 149 if len(sys.argv) != 2: 150 print "Please Input what course you want to download.." 151 sys.exit(2) 152 """ 153 154 """ 155 user_name = raw_input("Input your Email > ") 156 password = getpass.getpass("Input your Password > ") 157 """ 158 url = "https://class.coursera.org/{course}/lecture" 159 user_name = "15258691200@163.com" 160 password = "xxxxxxx" 161 spider = Coursera(url.format(course = "python"),user_name,password) 162 spider.start_spider() 163 164 if __name__ == '__main__': 165 main()
通过谷歌浏览器的network工具分析http请求头中的内容,然后自己定义,模拟登陆。
对比发现:请求头中的X-CSRF2-Token和X-CSRFToken是完全随机生成的,而X-CSRF2-Cookie只有后8位是随机生成的(由字母和数字组成)。
于是就有了这样的请求头代码:
def structure_headers(self):
    """Build the urlencoded login form body and the request header dict.

    Returns:
        (form_data, request_header): POST body string and header dict.
    """
    # The POST body is a urlencoded string, not a dict.
    form_data = urllib.urlencode({
        "email": self.user_name,
        "password": self.password,
        "webrequest": "true"
    })
    browser_ua = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/38.0.2125.111 Safari/537.36"
    )
    # CSRF tokens are made up client-side, just like the signin page's
    # own JavaScript: a random cookie-name suffix plus two random tokens.
    XCSRF2Cookie = 'csrf2_token_%s' % ''.join(self.random_string(8))
    XCSRF2Token = ''.join(self.random_string(24))
    XCSRFToken = ''.join(self.random_string(24))
    cookie = "csrftoken=%s; %s=%s" % (XCSRFToken, XCSRF2Cookie, XCSRF2Token)

    request_header = {
        "Referer": "https://accounts.coursera.org/signin",  # defeat hotlink checks: pretend we came from signin
        "User-Agent": browser_ua,  # masquerade as a real browser
        "X-Requested-With": "XMLHttpRequest",
        "X-CSRF2-Cookie": XCSRF2Cookie,
        "X-CSRF2-Token": XCSRF2Token,
        "X-CSRFToken": XCSRFToken,
        "Cookie": cookie
    }
    return form_data, request_header

def random_string(self, length):
    """Return `length` random characters drawn from letters and digits."""
    alphabet = string.letters + string.digits
    picked = [random.choice(alphabet) for _ in xrange(length)]
    return ''.join(picked)
最后的运行结果:
因为输入的请求下载链接不正确,所以下载的长度都是0