python爬虫:使用账号、密码和验证码登录知乎网页
先上代码,后分析出现的问题:
#coding:utf-8
"""Log in to zhihu.com with a phone number, a password and a hand-typed captcha.

Flow: fetch the home page to obtain the ``_xsrf`` token and session cookies,
download the captcha image to ``1.gif`` so a human can read it, then POST the
login form and report the JSON result.
"""
import getpass
import gzip
import http.cookiejar
import json
import re  # noqa: F401  (kept from the original script; handy for a regex-based _xsrf lookup)
import ssl
import time
import urllib.parse
import urllib.request
import zlib

HOME_URL = "https://www.zhihu.com/"
LOGIN_URL = "https://www.zhihu.com/login/phone_num"
# The ``r`` query parameter is a millisecond timestamp filled in per request.
CAPTCHA_URL_TEMPLATE = "https://www.zhihu.com/captcha.gif?r=%d&type=login"
CAPTCHA_FILE = "1.gif"


def get_opener(heads):
    """Build a cookie-aware URL opener that sends *heads* with every request.

    Args:
        heads: mapping of HTTP header name to header value.

    Returns:
        A ``urllib.request.OpenerDirector`` backed by a fresh in-memory
        ``CookieJar``, so cookies set by one response are replayed on later
        requests made through the same opener.
    """
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro)
    opener.addheaders = list(heads.items())
    return opener


def ungzip(data):
    """Return *data* gunzipped, or unchanged when it is not valid gzip.

    Args:
        data: raw response body as ``bytes``.

    Returns:
        The decompressed bytes if *data* is a gzip stream, otherwise *data*
        itself (best effort: the server may or may not compress a response).
    """
    try:
        print("正在解压....")
        data = gzip.decompress(data)
        print("解压完成")
    except (OSError, EOFError, zlib.error):
        # OSError covers gzip.BadGzipFile ("not a gzipped file"); EOFError and
        # zlib.error cover truncated or corrupt streams.  Anything else is a
        # real bug and must propagate instead of being silently swallowed.
        print("无需解压")
    return data


def _fetch(opener, url, data=None):
    """Open *url* (POST when *data* is given) and return the gunzipped body."""
    with opener.open(url, data) as response:
        return ungzip(response.read())


def _extract_xsrf(html):
    """Return the ``_xsrf`` token embedded in the home page *html*.

    Raises:
        RuntimeError: if no usable hidden input is present in the page.
    """
    # Imported here so the helpers above stay importable without bs4 installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "html.parser")
    field = soup.find("input", {"name": "_xsrf"})
    if field is None:
        # Fall back to the original heuristic: the first hidden <input>.
        field = soup.find("input", {"type": "hidden"})
    token = field.get("value") if field is not None else None
    if not token:
        raise RuntimeError("could not find the _xsrf token in the home page")
    return token


def main():
    """Run the interactive login flow."""
    # WARNING: this turns off TLS certificate verification for the whole
    # process.  It works around CERTIFICATE_VERIFY_FAILED but makes the
    # connection vulnerable to interception; use it for debugging only.
    ssl._create_default_https_context = ssl._create_unverified_context
    heads = {
        "Accept": "text/html, application/xhtml+xml, */*",
        "Accept-Language": "zh-CN",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "DNT": "1",
        "Connection": "Keep-Alive",
    }
    opener = get_opener(heads)

    # The first request yields both the session cookies and the _xsrf token.
    data1 = _fetch(opener, HOME_URL).decode("utf-8")
    _xsrf = _extract_xsrf(data1)

    # Credentials are asked for at run time rather than stored in the source.
    phone_num = input("phone_num:")
    password = getpass.getpass("password:")

    # The captcha must be fetched through the SAME opener (same cookies);
    # otherwise the server reports the captcha session as invalid.
    captcha_url = CAPTCHA_URL_TEMPLATE % (time.time() * 1000)
    with opener.open(captcha_url) as response:
        captchadata = response.read()
    with open(CAPTCHA_FILE, "wb") as image_file:
        image_file.write(captchadata)
    yanzhengma = input("captcha:")

    postdata = {
        "_xsrf": _xsrf,
        "password": password,
        # Per the original author, the form must NOT carry a "captcha_type" field.
        "phone_num": phone_num,
        "captcha": yanzhengma,
    }
    body = urllib.parse.urlencode(postdata).encode()
    data = _fetch(opener, LOGIN_URL, body).decode("utf-8")
    print(data)

    # The reply is JSON; json.loads is safe on server-controlled text and
    # understands true/false/null, unlike eval().
    result = json.loads(data)
    if result.get("r") == 0:
        print("登录成功")
    else:
        print("登录失败:", result.get("msg"))


if __name__ == "__main__":
    main()
1、出现“SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)”:
从 Python 2.7.9(以及 Python 3.4.3)开始引入了一个新特性(PEP 476):
使用 urllib 打开 https 地址时,默认会校验服务器的 SSL 证书。
当目标站点使用自签名证书(或证书无法通过校验)时,就会抛出
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)> 的错误,
处理方法(注意:这会在整个进程内关闭证书校验,仅适合调试,不要用于生产环境):
import ssl

# Replace the default HTTPS context factory with one that skips certificate
# verification.  This silences CERTIFICATE_VERIFY_FAILED for every HTTPS
# request made by urllib in this process, at the cost of no longer
# authenticating the server -- use for debugging only.
ssl._create_default_https_context = ssl._create_unverified_context
2、出现验证码错误,服务器返回“验证码会话无效”:{ "r": 1, "errcode": 1991829, "data": {"captcha":"验证码回话无效 :(","name":"ERR_VERIFY_CAPTCHA_SESSION_INVALID"}, "msg": "验证码回话无效 :(" }。可能的原因和解决办法如下:
- 发给服务器的 POST 数据里没有带验证码字段 "captcha"。解决办法:在 postdata 中加入该字段:
postdata={
"_xsrf":_xsrf,
"password":password,
#"captcha_type":captcha_type,#不能带有这个字段
"phone_num":phone_num,
"captcha":yanzhengma
}
- 验证码过期。解决办法:先从 url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000) 下载验证码图片并保存到本地(必须使用与登录请求相同的 opener,以保持同一会话),然后人工识别,手动输入验证码:
# Download the login captcha through the existing cookie-aware `opener`
# (created earlier by the login script), save it to 1.gif, and let a human
# type in the characters shown in the image.
captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
with opener.open(captcha_url) as response:  # close the HTTP response when done
    captchadata = response.read()
with open("1.gif", "wb") as image_file:
    image_file.write(captchadata)
yanzhengma = input("captcha:")