模拟知乎登录(requests和scrapy)
1. request
登录知乎需要向服务器提交的信息有:
①headers
②_xsrf
③captcha
需要通过解析页面获得_xsrf和captcha(验证码)
而captcha(验证码)则必须通过session的方式获取, 目的是为了使_xsrf和验证码信息保持一致
(因为session中可以保存cookie, 保证数据的一致性)代码如下:
# -*- coding: utf-8 -*-
"""Simulated Zhihu login with requests.

A single requests session is used for every request so the _xsrf token
and the captcha stay consistent (the session carries the cookies).
After a successful login the cookies are saved to disk so later runs
can skip the account/password step entirely.
"""
import re
import time
import os.path
import requests

try:
    import cookielib  # Python 2
except ImportError:
    import http.cookiejar as cookielib  # Python 3

from PIL import Image

session = requests.session()
# Persist cookies to the "cookies" file after a successful login so that
# subsequent runs can reload them instead of logging in again.
session.cookies = cookielib.LWPCookieJar(filename="cookies")
try:
    session.cookies.load(ignore_discard=True)
except Exception:
    print("cookies未能加载")

agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
# agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'
# agent = "Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/57.0"
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": agent,
}


def get_xsrf():
    """Fetch the home page and extract the hidden _xsrf form token.

    Returns:
        The token string, or None when it cannot be found.
    """
    response = session.get("https://www.zhihu.com/", headers=headers)
    # Non-greedy group so the match stops at the token's closing quote.
    match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text)
    if match_obj:
        return match_obj.group(1)
    print("error")
    return None


def get_captcha():
    """Download the captcha image through the shared session and ask the
    user to type it in.

    Fetching through the session is what keeps the captcha consistent
    with the _xsrf token (same cookies).
    """
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        # BUG FIX: the original mixed a %-style "% s" placeholder with
        # str.format(), so the path was never substituted into the message.
        print('请到 {0}找到captcha.jpg手动输入'.format(os.path.abspath('captcha.jpg')))
    captcha = input("please input the captcha\n")
    return captcha


def is_login():
    """Return True when the saved session is still authenticated.

    The personal settings page requires login; with redirects disabled a
    logged-out session receives a non-200 (redirect) response.
    """
    check_url = "https://www.zhihu.com/settings/profile"
    response = session.get(check_url, headers=headers, allow_redirects=False)
    return response.status_code == 200


def login(account, password):
    """Log in to Zhihu with an email address or a Chinese mobile number.

    Tries without a captcha first; when the server rejects that attempt
    (r == 1), downloads a captcha and retries.  Cookies are saved at the
    end so is_login() succeeds on the next run.
    """
    _xsrf = get_xsrf()
    if '@' in account:
        print("邮箱登陆")
        post_url = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": _xsrf,
            "password": password,
            "email": account,
        }
    elif re.match(r'^1\d{10}', account):
        print("手机登陆")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            # BUG FIX: the original called get_xsrf() a second time here,
            # inconsistent with the email branch; reuse the token above.
            "_xsrf": _xsrf,
            "password": password,
            "phone_num": account,
        }
    else:
        # BUG FIX: the original left post_url/post_data unbound for an
        # unrecognized account format and crashed with NameError below.
        print("账号格式不正确")
        return

    # First attempt without a captcha.
    # BUG FIX: the original posted with the undefined name `header`.
    response = session.post(post_url, data=post_data, headers=headers)
    login_code = response.json()

    if login_code['r'] == 1:
        print("不输入验证码登陆失败")
        # The captcha-less attempt failed: fetch a captcha and retry.
        post_data["captcha"] = get_captcha()
        response = session.post(post_url, data=post_data, headers=headers)
        login_code = response.json()
        print(login_code['msg'])

    session.cookies.save()


if __name__ == '__main__':
    if is_login():
        print("已经登陆!")
    else:
        # BUG FIX: the original called login(account, password) with
        # undefined names; prompt for the credentials instead.
        account = input("account: ")
        password = input("password: ")
        login(account, password)
2. scrapy
如果在scrapy中直接调用上文中的get_captcha()函数来获得验证码, 然后提交是无法登陆成功的, 原因是数据不一致,也就是说获取的_xsrf和验证码一起提交到服务器是不匹配的.
scrapy机制是默认保存cookie的,所以可以通过两个request请求来将得到的信息保存在默认的cookie中,代码如下:
# -*- coding: utf-8 -*-
"""Zhihu login spider.

Scrapy keeps cookies between requests by default, so the _xsrf token
and the captcha are fetched through a chain of requests and therefore
remain consistent when the login form is finally posted — unlike
calling a standalone get_captcha() helper, which would use a different
cookie jar and produce mismatched data.
"""
import re
import json
import datetime
import time

try:
    import urlparse as parse  # Python 2
except ImportError:
    from urllib import parse  # Python 3

import scrapy


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    headers = {
        "HOST": "www.zhihu.com",
        # BUG FIX: the original sent "https://www.zhizhu.com" (typo'd
        # domain) — the wrong Referer for requests to zhihu.com.
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def start_requests(self):
        """Entry point: fetch the sign-in page so login() can scrape
        the _xsrf token out of it."""
        return [scrapy.Request('https://www.zhihu.com/#signin',
                               headers=self.headers, callback=self.login)]

    def login(self, response):
        """Extract the _xsrf token, then request the captcha image.

        The partially-filled post data travels in request.meta so the
        captcha callback can complete and submit it.
        """
        match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        if not match_obj:
            return
        xsrf = match_obj.group(1)
        if not xsrf:
            return

        post_data = {
            "_xsrf": xsrf,
            "phone_num": "",
            "password": "",
            "captcha": ""
        }
        # Cache-busting timestamp, same scheme the site's own JS uses.
        t = str(int(time.time() * 1000))
        captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
        yield scrapy.Request(captcha_url, headers=self.headers,
                             meta={"post_data": post_data},
                             callback=self.login_after_captcha)

    def login_after_captcha(self, response):
        """Save and display the captcha image, ask the user to type it
        in, and submit the completed login form."""
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)

        from PIL import Image
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except Exception:
            # Best effort only — the file is on disk for manual viewing.
            pass

        captcha = input("输入验证码\n>")

        post_data = response.meta.get("post_data", {})
        post_data["captcha"] = captcha
        return [scrapy.FormRequest(
            url="https://www.zhihu.com/login/phone_num",
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]

    def check_login(self, response):
        """Inspect the server's JSON reply; on success kick off the
        real crawl over start_urls."""
        text_json = json.loads(response.text)
        if text_json.get("msg") == "登录成功":
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)