模拟知乎登录(requests和scrapy)
1. request
登录知乎需要向服务器提交的信息有:
①headers
②_xsrf
③captcha
需要通过解析页面获得_xsrf和captcha(验证码)
而captcha(验证码)则必须通过session的方式获取, 目的是为了使_xsrf和验证码信息保持一致
(因为session中可以保存cookie, 保证数据的一致性)代码如下:
# -*- coding: utf-8 -*-
"""Simulated Zhihu login with requests.

A single requests session is used for every request so the _xsrf token
and the captcha stay consistent (the session carries the cookies).
After a successful login the cookies are saved to disk so later runs
can skip the account/password step entirely.
"""
import re
import time
import os.path
import requests

try:
    import cookielib  # Python 2
except ImportError:
    import http.cookiejar as cookielib  # Python 3

from PIL import Image

session = requests.session()
# Persist cookies to the "cookies" file after a successful login so that
# subsequent runs can reload them instead of logging in again.
session.cookies = cookielib.LWPCookieJar(filename="cookies")
try:
    session.cookies.load(ignore_discard=True)
except Exception:
    print("cookies未能加载")

agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'
# agent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Mobile Safari/537.36'
# agent = "Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/57.0"
headers = {
    "Host": "www.zhihu.com",
    "Referer": "https://www.zhihu.com/",
    "User-Agent": agent,
}


def get_xsrf():
    """Fetch the home page and extract the hidden _xsrf form token.

    Returns:
        The token string, or None when it cannot be found.
    """
    response = session.get("https://www.zhihu.com/", headers=headers)
    # Non-greedy group so the match stops at the token's closing quote.
    match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text)
    if match_obj:
        return match_obj.group(1)
    print("error")
    return None


def get_captcha():
    """Download the captcha image through the shared session and ask the
    user to type it in.

    Fetching through the session is what keeps the captcha consistent
    with the _xsrf token (same cookies).
    """
    t = str(int(time.time() * 1000))
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=' + t + "&type=login"
    r = session.get(captcha_url, headers=headers)
    with open('captcha.jpg', 'wb') as f:
        f.write(r.content)
    try:
        im = Image.open('captcha.jpg')
        im.show()
        im.close()
    except Exception:
        # BUG FIX: the original mixed a %-style "% s" placeholder with
        # str.format(), so the path was never substituted into the message.
        print('请到 {0}找到captcha.jpg手动输入'.format(os.path.abspath('captcha.jpg')))
    captcha = input("please input the captcha\n")
    return captcha


def is_login():
    """Return True when the saved session is still authenticated.

    The personal settings page requires login; with redirects disabled a
    logged-out session receives a non-200 (redirect) response.
    """
    check_url = "https://www.zhihu.com/settings/profile"
    response = session.get(check_url, headers=headers, allow_redirects=False)
    return response.status_code == 200


def login(account, password):
    """Log in to Zhihu with an email address or a Chinese mobile number.

    Tries without a captcha first; when the server rejects that attempt
    (r == 1), downloads a captcha and retries.  Cookies are saved at the
    end so is_login() succeeds on the next run.
    """
    _xsrf = get_xsrf()
    if '@' in account:
        print("邮箱登陆")
        post_url = "https://www.zhihu.com/login/email"
        post_data = {
            "_xsrf": _xsrf,
            "password": password,
            "email": account,
        }
    elif re.match(r'^1\d{10}', account):
        print("手机登陆")
        post_url = "https://www.zhihu.com/login/phone_num"
        post_data = {
            # BUG FIX: the original called get_xsrf() a second time here,
            # inconsistent with the email branch; reuse the token above.
            "_xsrf": _xsrf,
            "password": password,
            "phone_num": account,
        }
    else:
        # BUG FIX: the original left post_url/post_data unbound for an
        # unrecognized account format and crashed with NameError below.
        print("账号格式不正确")
        return

    # First attempt without a captcha.
    # BUG FIX: the original posted with the undefined name `header`.
    response = session.post(post_url, data=post_data, headers=headers)
    login_code = response.json()

    if login_code['r'] == 1:
        print("不输入验证码登陆失败")
        # The captcha-less attempt failed: fetch a captcha and retry.
        post_data["captcha"] = get_captcha()
        response = session.post(post_url, data=post_data, headers=headers)
        login_code = response.json()
        print(login_code['msg'])

    session.cookies.save()


if __name__ == '__main__':
    if is_login():
        print("已经登陆!")
    else:
        # BUG FIX: the original called login(account, password) with
        # undefined names; prompt for the credentials instead.
        account = input("account: ")
        password = input("password: ")
        login(account, password)
2. scrapy
如果在scrapy中直接调用上文中的get_captcha()函数来获得验证码, 然后提交是无法登陆成功的, 原因是数据不一致,也就是说获取的_xsrf和验证码一起提交到服务器是不匹配的.
scrapy机制是默认保存cookie的,所以可以通过两个request请求来将得到的信息保存在默认的cookie中,代码如下:
# -*- coding: utf-8 -*-
"""Zhihu login spider.

Scrapy keeps cookies between requests by default, so the _xsrf token
and the captcha are fetched through a chain of requests and therefore
remain consistent when the login form is finally posted — unlike
calling a standalone get_captcha() helper, which would use a different
cookie jar and produce mismatched data.
"""
import re
import json
import datetime
import time

try:
    import urlparse as parse  # Python 2
except ImportError:
    from urllib import parse  # Python 3

import scrapy


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ['https://www.zhihu.com/']

    headers = {
        "HOST": "www.zhihu.com",
        # BUG FIX: the original sent "https://www.zhizhu.com" (typo'd
        # domain) — the wrong Referer for requests to zhihu.com.
        "Referer": "https://www.zhihu.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"
    }

    def start_requests(self):
        """Entry point: fetch the sign-in page so login() can scrape
        the _xsrf token out of it."""
        return [scrapy.Request('https://www.zhihu.com/#signin',
                               headers=self.headers, callback=self.login)]

    def login(self, response):
        """Extract the _xsrf token, then request the captcha image.

        The partially-filled post data travels in request.meta so the
        captcha callback can complete and submit it.
        """
        match_obj = re.search(r'name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        if not match_obj:
            return
        xsrf = match_obj.group(1)
        if not xsrf:
            return

        post_data = {
            "_xsrf": xsrf,
            "phone_num": "",
            "password": "",
            "captcha": ""
        }
        # Cache-busting timestamp, same scheme the site's own JS uses.
        t = str(int(time.time() * 1000))
        captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
        yield scrapy.Request(captcha_url, headers=self.headers,
                             meta={"post_data": post_data},
                             callback=self.login_after_captcha)

    def login_after_captcha(self, response):
        """Save and display the captcha image, ask the user to type it
        in, and submit the completed login form."""
        with open("captcha.jpg", "wb") as f:
            f.write(response.body)

        from PIL import Image
        try:
            im = Image.open('captcha.jpg')
            im.show()
            im.close()
        except Exception:
            # Best effort only — the file is on disk for manual viewing.
            pass

        captcha = input("输入验证码\n>")

        post_data = response.meta.get("post_data", {})
        post_data["captcha"] = captcha
        return [scrapy.FormRequest(
            url="https://www.zhihu.com/login/phone_num",
            formdata=post_data,
            headers=self.headers,
            callback=self.check_login
        )]

    def check_login(self, response):
        """Inspect the server's JSON reply; on success kick off the
        real crawl over start_urls."""
        text_json = json.loads(response.text)
        if text_json.get("msg") == "登录成功":
            for url in self.start_urls:
                yield scrapy.Request(url, dont_filter=True, headers=self.headers)