Scrapy基础(十四)——Scrapy实现知乎模拟登录
模拟登录的大体思路见此博文,本篇文章只是将登录过程在scrapy中实现而已。
之前介绍过通过requests的session会话模拟登录:必须使用session,因为涉及验证码和xsrf
写入cookie验证的问题。在scrapy中不需担心此问题,因为Request会保证这是同一个会话,并且自动传递cookies。
原理相通;因为验证码识别的问题,这里先使用cookie模拟登录。
# -*- coding: utf-8 -*-

import json
import re

import scrapy


class ZhihuSpider(scrapy.Spider):
    """Spider that logs in to Zhihu using saved browser cookies.

    Request flow:

    1. ``start_requests`` fetches the sign-in page, sending ``cookies``.
    2. ``login`` extracts the ``_xsrf`` token from that page and posts the
       login form.
    3. ``check_login`` inspects the JSON reply and, when login succeeded,
       re-requests every URL in ``start_urls`` so the responses reach
       ``parse`` (the default callback).
    """

    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = ['http://www.zhihu.com/']

    # Headers sent with every request issued by this spider.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
    }

    # Cookie values copied from an already logged-in browser session.
    # The values are placeholders and must be filled in before running.
    cookies = {
        "d_c0": "",
        "l_cap_id": "",
        "r_cap_id": "",
        "cap_id": "",
        "_zap": "",
        "__utmc": "",
        "__utmb": "",
        "__utmv": "",
        "__utma": "",
        "__utmz": "5",
        "q_c1": "",
    }

    def start_requests(self):
        """Return the single initial request for the sign-in page.

        The sign-in page only needs to be fetched once, so a one-element
        list is returned instead of a generator (the return value only has
        to be iterable). The saved cookies are attached here; the response,
        which carries the ``_xsrf`` token, is handed to ``login``.
        """
        return [
            scrapy.Request(
                url="https://www.zhihu.com/#signin",
                cookies=self.cookies,
                headers=self.headers,
                callback=self.login,
            )
        ]

    def login(self, response):
        """Extract the ``_xsrf`` token and submit the login form.

        Returns a one-element list holding the login ``FormRequest``, or an
        empty list when the token cannot be found in the page (previously
        that case crashed on an unassigned ``xsrf`` variable).
        """
        match_obj = re.match(
            '.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL
        )
        if not match_obj:
            self.logger.error(
                "_xsrf token not found in %s; login aborted", response.url
            )
            return []
        xsrf = match_obj.group(1)

        url = "https://www.zhihu.com/login/phone_num"
        # Credentials are placeholders and must be filled in before running.
        data = {
            "_xsrf": xsrf,
            'remember_me': 'true',
            "password": "",
            "phone_num": "",
        }

        # Record the token in the cookie dict as well.
        self.cookies["_xsrf"] = xsrf

        # Submit the form; the reply is passed to check_login.
        return [
            scrapy.FormRequest(
                url=url,
                headers=self.headers,
                formdata=data,
                callback=self.check_login,
            )
        ]

    def check_login(self, response):
        """Check the login reply and start crawling on success.

        Yields one request per entry of ``start_urls`` when the reply's
        ``msg`` field equals the success message. These requests set no
        callback, so they are handled by ``parse``. Nothing is yielded when
        the reply is not valid JSON or does not report success.
        """
        try:
            # json.loads parses a string; json.load expects a file object
            # and fails on response.text.
            text_json = json.loads(response.text)
        except ValueError:
            self.logger.error("Login reply from %s is not valid JSON", response.url)
            return

        # The escaped literal is the Chinese text for "login succeeded".
        if isinstance(text_json, dict) and text_json.get("msg") == "\u767b\u5f55\u6210\u529f":
            for url in self.start_urls:
                # dont_filter=True keeps the duplicate filter from dropping
                # these requests.
                yield scrapy.Request(url=url, dont_filter=True, headers=self.headers)
        else:
            self.logger.warning("Login was not confirmed by %s", response.url)

    def parse(self, response):
        """Handle pages fetched after a successful login (not implemented)."""
        pass