一、手动输入
"""Log in to gushiwen.org with a manually typed captcha.

Workflow: log in once in a browser and capture the traffic to find the
login endpoint and its form fields; download the captcha image; type the
captcha in and add it to the form; collect the remaining hidden form
parameters; send the login request with the form.

Note: all requests must share one session so the captcha stays valid.
"""
import os

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/73.0.3683.86 Safari/537.36'}


def download_code(s):
    """Fetch the login page, save the captcha image and return the form tokens.

    Args:
        s: the ``requests.Session`` shared with the later login request.

    Returns:
        A ``(__VIEWSTATE, __VIEWSTATEGENERATOR)`` tuple read from the hidden
        inputs of the login form.

    Raises:
        requests.HTTPError: if the login page or the captcha image request
            returns an error status.
    """
    # Request the login page.
    url = 'https://so.gushiwen.org/user/login.aspx?from='
    r = s.get(url, headers=headers)
    r.raise_for_status()

    # Parse the page to find the captcha image link.
    soup = BeautifulSoup(r.text, 'lxml')
    img_src = soup.find('img', id='imgCode')['src']
    img_url = 'https://so.gushiwen.org' + img_src
    print(img_url)

    # Download the captcha image through the same session.
    r_img = s.get(img_url, headers=headers)
    r_img.raise_for_status()
    # The target directory may not exist on a fresh checkout.
    os.makedirs('img', exist_ok=True)
    with open('img/code.png', 'wb') as fp:
        fp.write(r_img.content)

    # Look up the two hidden parameters the login form requires.
    viewstate = soup.find('input', id='__VIEWSTATE')['value']
    viewstate_generator = soup.find('input', id='__VIEWSTATEGENERATOR')['value']

    return viewstate, viewstate_generator


# NOTE(review): the default credentials below are hard-coded in the source;
# prefer passing `email` / `pwd` explicitly from a secure location.
def login(s, VIEW, VIEWG, email='18404904721', pwd='gjp625262'):
    """Prompt for the captcha, post the login form and save the response.

    Args:
        s: the session that was used to download the captcha.
        VIEW: value of the ``__VIEWSTATE`` hidden input.
        VIEWG: value of the ``__VIEWSTATEGENERATOR`` hidden input.
        email: account name sent in the ``email`` form field.
        pwd: password sent in the ``pwd`` form field.

    The response body is written to ``gushi.html`` in the working directory.
    """
    post_url = 'https://so.gushiwen.org/user/login.aspx?from='

    code = input('输入验证码:')

    form_data = {'__VIEWSTATE': VIEW,
                 '__VIEWSTATEGENERATOR': VIEWG,
                 'code': code,
                 'denglu': '登录',
                 'email': email,
                 'from': '',
                 'pwd': pwd}

    r = s.post(url=post_url, headers=headers, data=form_data)

    with open('gushi.html', 'w', encoding='utf8') as fp:
        fp.write(r.text)


def main():
    """Run the manual-captcha login flow end to end."""
    # Create the session.
    s = requests.Session()

    # Download the captcha.
    VIEW, VIEWG = download_code(s)

    # Log in.
    login(s, VIEW, VIEWG)


if __name__ == '__main__':
    main()
二、tesseract光学识别
from PIL import Image
import pytesseract

# Setup: download tesseract, then `pip install pytesseract`.

# Open the captcha image.
img = Image.open(r'img/code.png')

# Convert to greyscale.
img = img.convert('L')

# Binarise: pixels below the threshold map to 0, all others to 1.
threshold = 140
table = [0 if i < threshold else 1 for i in range(256)]

out = img.point(table, '1')
out.show()

# Recognise the binarised image. The original code passed the un-thresholded
# greyscale image here, so the preprocessing above never reached the OCR step.
img = out.convert('RGB')
print(pytesseract.image_to_string(img))
三、打码平台(云打码)
from YDMHTTPDemo3 import YDMHttp

# Captcha-solving platforms: Yundama, Damatu.

######################################################################

# Account user name.
username = 'mianxiang_mei'

# Account password.
password = 'gjp625262'

# Software ID, required for the developer revenue share.
# Found under "My software" in the developer console.
appid = 8212

# Software key, required for the developer revenue share.
# Found under "My software" in the developer console.
appkey = 'dbd2645a635701a0a9f19fd0072d82c3'

# Image file to recognise.
filename = 'img/code.png'

# Captcha type, e.g. 1004 means 4 alphanumeric characters. Types are priced
# differently; fill it in accurately or recognition suffers.
# All types are listed at http://www.yundama.com/price.html
codetype = 1004

# Timeout in seconds.
timeout = 60

# Only run once the placeholder user name has been replaced.
if username != 'username':
    # Initialise the client.
    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the account balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Recognise: image path, captcha type ID, timeout (seconds) -> cid, result.
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
else:
    print('请设置好相关参数再测试')

######################################################################
附:云打码调用的类
import http.client
import mimetypes
import urllib
import json
import time

import requests

######################################################################


class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every call posts the account and software credentials together with a
    ``method`` name to ``apiurl`` and decodes the JSON reply. Methods return
    the server's negative ``ret`` code on an API error and ``-9001`` when the
    decoded reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the credentials; ``appid`` is kept as a string."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _fields(self, method, **extra):
        """Build the form fields shared by every API call, plus ``extra``."""
        data = {'method': method,
                'username': self.username,
                'password': self.password,
                'appid': self.appid,
                'appkey': self.appkey}
        data.update(extra)
        return data

    @staticmethod
    def _value_or_error(response, key):
        """Return ``response[key]``, the negative ``ret`` code, or -9001."""
        if not response:
            return -9001
        ret = response['ret']
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """Post ``fields`` (and optional ``files``) and return the parsed JSON.

        Args:
            fields: form fields to send.
            files: optional mapping of form field name to file path.
        """
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the account balance, or a negative error code."""
        response = self.request(self._fields('balance'))
        return self._value_or_error(response, 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        response = self.request(self._fields('login'))
        return self._value_or_error(response, 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload a captcha image and return its ``cid``, or an error code.

        Args:
            filename: path of the image file to upload.
            codetype: captcha type id understood by the service.
            timeout: recognition timeout in seconds, forwarded to the service.
        """
        data = self._fields('upload',
                            codetype=str(codetype),
                            timeout=str(timeout))
        response = self.request(data, {'file': filename})
        return self._value_or_error(response, 'cid')

    def result(self, cid):
        """Return the recognised text for ``cid``, or ``''`` if unavailable."""
        response = self.request(self._fields('result', cid=str(cid)))
        if response:
            return response['text'] or ''
        return ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll once per second for the recognised text.

        Returns:
            ``(cid, text)`` on success, ``(-3003, '')`` if no result arrived
            within ``timeout`` polls, or ``(error_code, '')`` if the upload
            failed.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''

        for _ in range(0, timeout):
            result = self.result(cid)
            if result != '':
                return cid, result
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a ``report`` call for ``cid`` with flag ``'0'``; return ``ret`` or -9001."""
        data = self._fields('report', cid=str(cid), flag='0')
        response = self.request(data)
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST ``fields`` and the files named in ``files``; return the body text.

        Args:
            url: endpoint to post to.
            fields: form fields to send.
            files: optional mapping of form field name to file path. The
                mapping is not modified; each file is opened for the duration
                of the request and closed afterwards.
        """
        opened = {}
        try:
            for key, path in (files or {}).items():
                opened[key] = open(path, 'rb')
            res = requests.post(url, files=opened, data=fields)
            return res.text
        finally:
            # Close every handle, including when the request raises.
            for handle in opened.values():
                handle.close()