07 验证码处理
引入:
- 相关的门户网站在进行登录的时候,如果用户连续登录的次数超过3次或者5次的时候,就会在登录页中动态生成验证码。通过验证码达到分流和反爬的效果。
今日概要:
- 使用云打码平台识别验证码
云打码官网地址: http://www.yundama.com/
云打码使用流程:
注册:普通用户和开发者用户(两个都要注册)
登录:
— 普通用户:查询余额
— 开发者用户:
- 创建一个软件:我的软件 --> 添加一个软件
- 下载示例代码:点击开发文档 -> 调用示例及最新的DLL -> 点击PythonHttp示例下载,即可下载
开发者用户下载PythonHttp示例使用:
1. 解压下载好的PythonHttp调用示例,其中包含三个文件:验证码示例图片、YDMHTTPDemo2.x.py(Python 2 版本)和 YDMHTTPDemo3.x.py(Python 3 版本)
2. 使用时,将验证码图片和 YDMHTTPDemo3.x.py 中的代码导入到项目中即可使用
import http.client
import json
import mimetypes
import time
import urllib
from contextlib import ExitStack

import requests

######################################################################


class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every API call is a POST to ``apiurl`` carrying the account
    credentials, the software id/key and a ``method`` name. Methods return
    the value extracted from the JSON reply, or a negative error code
    (the ``ret`` field of the reply, or -9001 when the reply is empty).
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account credentials and the software id/key."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every API call, plus *extra*."""
        data = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        data.update(extra)
        return data

    @staticmethod
    def _field_or_error(response, key):
        """Return ``response[key]``, the negative ``ret`` code, or -9001."""
        if not response:
            return -9001
        ret = response['ret']
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """POST *fields* (and optional *files*) and return the decoded JSON.

        ``files`` maps form-field names to local file paths. It defaults to
        ``None`` instead of a shared mutable list.
        """
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, or a negative error code."""
        response = self.request(self._payload('balance'))
        return self._field_or_error(response, 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        response = self.request(self._payload('login'))
        return self._field_or_error(response, 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at *filename*; return its cid or an error code."""
        data = self._payload(
            'upload', codetype=str(codetype), timeout=str(timeout))
        response = self.request(data, {'file': filename})
        return self._field_or_error(response, 'cid')

    def result(self, cid):
        """Return the recognised text for *cid*, or '' if not available."""
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        return response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll once per second for its result.

        Returns ``(cid, text)`` on success, ``(error_code, '')`` when the
        upload fails, and ``(-3003, '')`` when no text was returned after
        *timeout* polls.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            result = self.result(cid)
            if result != '':
                return cid, result
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a report for *cid* with flag '0'; return ``ret`` or -9001."""
        response = self.request(
            self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST *fields* to *url*, attaching the files named in *files*.

        ``files`` maps form-field names to file paths. Each file is opened
        for the duration of the request and closed afterwards; the caller's
        mapping is left untouched.
        """
        with ExitStack() as stack:
            opened = {
                key: stack.enter_context(open(path, 'rb'))
                for key, path in (files or {}).items()
            }
            res = requests.post(url, files=opened, data=fields)
        return res.text


######################################################################

# User name (the credentials of the regular user account)
username = 'username'

# Password
password = 'password'

# Software ID, required for the developer revenue share.
# Found under "My software" in the developer console.
appid = 1

# Software key, required for the developer revenue share.
# Found under "My software" in the developer console.
appkey = '22cc5376925e9387a23cf797cb9ba745'

# Image file
filename = 'getimage.jpg'

# Captcha type, e.g. 1004 means 4 alphanumeric characters. Types are priced
# differently and a wrong type lowers the recognition rate.
# All types: http://www.yundama.com/price.html
codetype = 1004

# Timeout in seconds
timeout = 60

# Refuse to run until the placeholder credentials have been replaced
if username == 'username':
    print('请设置好相关参数再测试')
else:
    # Initialise the client
    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the balance
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Recognise: image path, captcha type id, timeout (seconds)
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))

######################################################################
云打码平台处理验证码的实现流程:
代码展示:
爬取人人网登录后的页面,需要处理验证码
import http.client
import json
import mimetypes
import time
import urllib
from contextlib import ExitStack
from urllib import request  # noqa: F401 -- kept from the original listing; no longer used below

import requests
from lxml import etree

######################################################################


class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every API call is a POST to ``apiurl`` carrying the account
    credentials, the software id/key and a ``method`` name. Methods return
    the value extracted from the JSON reply, or a negative error code
    (the ``ret`` field of the reply, or -9001 when the reply is empty).
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account credentials and the software id/key."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every API call, plus *extra*."""
        data = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        data.update(extra)
        return data

    @staticmethod
    def _field_or_error(response, key):
        """Return ``response[key]``, the negative ``ret`` code, or -9001."""
        if not response:
            return -9001
        ret = response['ret']
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """POST *fields* (and optional *files*) and return the decoded JSON.

        ``files`` maps form-field names to local file paths. It defaults to
        ``None`` instead of a shared mutable list.
        """
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, or a negative error code."""
        response = self.request(self._payload('balance'))
        return self._field_or_error(response, 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        response = self.request(self._payload('login'))
        return self._field_or_error(response, 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at *filename*; return its cid or an error code."""
        data = self._payload(
            'upload', codetype=str(codetype), timeout=str(timeout))
        response = self.request(data, {'file': filename})
        return self._field_or_error(response, 'cid')

    def result(self, cid):
        """Return the recognised text for *cid*, or '' if not available."""
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        return response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll once per second for its result.

        Returns ``(cid, text)`` on success, ``(error_code, '')`` when the
        upload fails, and ``(-3003, '')`` when no text was returned after
        *timeout* polls.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            result = self.result(cid)
            if result != '':
                return cid, result
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a report for *cid* with flag '0'; return ``ret`` or -9001."""
        response = self.request(
            self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST *fields* to *url*, attaching the files named in *files*.

        ``files`` maps form-field names to file paths. Each file is opened
        for the duration of the request and closed afterwards; the caller's
        mapping is left untouched.
        """
        with ExitStack() as stack:
            opened = {
                key: stack.enter_context(open(path, 'rb'))
                for key, path in (files or {}).items()
            }
            res = requests.post(url, files=opened, data=fields)
        return res.text


def get_code_text(codeType, imgPath):
    """Recognise the captcha image at *imgPath* through Yundama.

    Args:
        codeType: Yundama captcha type id (e.g. 1004 = 4 alphanumerics);
            see http://www.yundama.com/price.html for all types.
        imgPath: path of the captcha image on disk.

    Returns:
        The recognised text, or '' when recognition fails or the
        placeholder credentials have not been replaced.
    """
    # NOTE(review): credentials are hard-coded in this listing; consider
    # loading them from the environment or a config file instead.
    # User name (the credentials of the regular user account)
    username = 'mwhshare'
    # Password
    password = 'mwh@4598105'
    # Software ID, from "My software" in the developer console
    appid = 6596
    # Software key, from "My software" in the developer console
    appkey = '515bcabfb89e3a824619a1d1c8b25f36'
    # Timeout in seconds
    timeout = 20

    # Refuse to run with the placeholder credentials. Return '' so the
    # caller never hits an unbound ``result``.
    if username == 'username':
        print('请设置好相关参数再测试')
        return ''

    # Initialise the client
    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the balance
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Recognise: image path, captcha type id, timeout (seconds)
    cid, result = yundama.decode(imgPath, codeType, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result


# Create a session object. A session sends requests the same way the
# top-level ``requests`` functions do, but any cookies produced by a
# response are stored on the session and sent with later requests.
session = requests.session()

# 1. Fetch the Renren captcha image
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = 'http://www.renren.com/'
# Use the session (not bare ``requests``) so cookies set while loading the
# page and the captcha are carried into the login request.
page_text = session.get(url=url, headers=headers).text

# Parse the captcha image URL out of the page and persist the image
tree = etree.HTML(page_text)
code_img_srcs = tree.xpath('//*[@id="verifyPic_login"]/@src')
if code_img_srcs:
    # URL of the captcha image currently shown by Renren
    code_img_src = code_img_srcs[0]
    # Download through the same session and save into the current directory
    img_data = session.get(url=code_img_src, headers=headers).content
    with open("./code.jpg", "wb") as fp:
        fp.write(img_data)
    code = get_code_text(2004, './code.jpg')
else:
    # No captcha image on the page: nothing to recognise
    code_img_src = None
    code = ''
# print(code)

# 2. Simulate the login
# URL the login form posts to (trailing space from the original removed)
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019031446506'
# NOTE(review): account credentials are hard-coded in this listing.
data = {
    "email": "1696755793@qq.com",
    "icod": code,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "4f1d552d7dc5ba646e93e653da9b06e5a24dceda905323a830e19f6352ae8bc0",
    "rkey": "9e75e8dc3457b14c55a74627fa64fb43",
    "f": "",
}
# Log in; cookies returned on success are stored on the session
response = session.post(url=login_url, headers=headers, data=data)

# 3. Request the logged-in user's profile page. This GET goes through the
# session, so it carries the cookies obtained above.
detail_url = 'http://www.renren.com/969393866/profile'
page_text = session.get(url=detail_url, headers=headers).text

with open("./renren.html", "w", encoding="utf-8") as fp:
    fp.write(page_text)
    print("下载完毕")