验证码识别
输入式验证码
可以使用Python的第三方库tesserocr(基于Tesseract-OCR)直接识别,但准确率非常受限。例如当图片背景有很多线条时,识别准确率比较低。
解决方法:
对图片转灰度再进行二值化处理,以此提高识别率
# Load the captcha and preview the untouched original.
image = Image.open('./1.png')
image.show()

# Collapse the picture to 8-bit greyscale so one cut-off value can be applied.
image = image.convert('L')

# Lookup table for binarization: grey levels below the threshold map to 0,
# every other level maps to 1.
threshold = 127
table = [0 if level < threshold else 1 for level in range(256)]

# Apply the table, producing a 1-bit image, and preview the result.
image = image.point(table, '1')
image.show()

# Run OCR on the binarized picture and print the recognised text.
result = tesserocr.image_to_text(image)
print(result)
但是这个办法也有限制.
当背景纹理和字符的灰度值都大于127,或者都小于127时(即两者亮度接近时),二值化后它们会被映射成同一种颜色而无法区分,准确率会很低。
深度学习比较火,用深度学习训练个模型,这样的识别率就会高很多。
滑动式验证码
B站的登录界面:
解决思路:
保存三张图片,分别是完整的背景图、带缺口的背景图和滑块(拼图块)图。
首先识别缺口在图中的位置,然后计算滑动的距离和轨迹。最后用selenium进行模拟操作。
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
import time
import random
from PIL import Image

web = 'http://literallycanvas.com/'


def init():
    """Create the browser session and the module-level settings shared by every step.

    Sets the globals ``url``, ``browser``, ``username``, ``password`` and
    ``wait`` (an explicit wait bound to the browser with a 20 second timeout).
    """
    global url, browser, username, password, wait
    url = 'https://passport.bilibili.com/login'
    browser = webdriver.Chrome()
    username = '************'
    password = '************'
    wait = WebDriverWait(browser, 20)


def login():
    """Open the login page, fill in the credentials and click the login button."""
    browser.get(url)
    user = wait.until(EC.presence_of_element_located((By.ID, 'login-username')))
    passwd = wait.until(EC.presence_of_element_located((By.ID, 'login-passwd')))
    user.send_keys(username)
    passwd.send_keys(password)
    # Alternative way to submit: press Enter in the password field, i.e.
    # passwd.send_keys(Keys.ENTER)
    login_btn = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'a.btn.btn-login')))
    # Wait a random 0-3 seconds before clicking.
    time.sleep(random.random() * 3)
    login_btn.click()


def show_element(element):
    """Replace the element's inline style with ``display: block;`` so it is captured."""
    browser.execute_script("arguments[0].style = arguments[1]", element, "display: block;")


def hide_element(element):
    """Replace the element's inline style with ``display: none;`` so it is not captured."""
    browser.execute_script("arguments[0].style = arguments[1]", element, "display: none;")


def save_pic(obj, name):
    """Screenshot the page and save the region occupied by ``obj``.

    Args:
        obj: Element whose ``location`` and ``size`` define the crop box.
        name: Suffix of the output file; the crop is written to
            ``'bili' + name + '.png'``.

    Failures are reported on stdout instead of being raised (best effort).
    """
    try:
        browser.save_screenshot('.\\bilibili.png')
        # Crop box of the element inside the full-page screenshot.
        left = obj.location['x']
        top = obj.location['y']
        right = left + obj.size['width']
        bottom = top + obj.size['height']
        file_name = 'bili' + name + '.png'
        # Context manager closes the screenshot file once the crop is saved.
        with Image.open('.\\bilibili.png') as screenshot:
            screenshot.crop((left, top, right, bottom)).save(file_name)
    except Exception as msg:
        # Exception, not BaseException: Ctrl-C / SystemExit must still propagate.
        print("截图失败:%s" % msg)


def cut():
    """Save the three captcha layers: background with gap, slider piece, full background.

    The files are written as ``bili_back.png``, ``bili_slice.png`` and
    ``bili_full.png``; ``slide()`` reads the first and the last of them.
    """
    c_background = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_bg.geetest_absolute')))
    c_slice = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_slice.geetest_absolute')))
    c_full_bg = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'canvas.geetest_canvas_fullbg.geetest_fade.geetest_absolute')))
    # Hide the slider piece so only the background with the gap is captured.
    hide_element(c_slice)
    save_pic(c_background, '_back')
    show_element(c_slice)
    save_pic(c_slice, '_slice')
    show_element(c_full_bg)
    save_pic(c_full_bg, '_full')


def is_pixel_equal(bg_image, fullbg_image, x, y):
    """Tell whether the pixel at ``(x, y)`` is roughly the same in both images.

    Args:
        bg_image: Image that contains the gap.
        fullbg_image: Complete image without the gap.
        x: Pixel column.
        y: Pixel row.

    Returns:
        True when each of the first three channels differs by less than 60.
    """
    bg_pixel = bg_image.load()[x, y]
    fullbg_pixel = fullbg_image.load()[x, y]
    threshold = 60
    # abs() must wrap the difference only; wrapping the whole comparison
    # would treat every negative difference as "equal".
    return all(
        abs(bg_pixel[channel] - fullbg_pixel[channel]) < threshold
        for channel in range(3)
    )


def get_distance(bg_image, fullbg_image):
    """Return the first column where the two images differ.

    The scan starts at column 57 (presumably to skip the slider piece in its
    starting position - confirm) and goes column by column, top to bottom.

    Returns:
        The x coordinate of the first differing pixel, or None when the
        images match everywhere in the scanned area.
    """
    distance = 57
    for i in range(distance, fullbg_image.size[0]):
        for j in range(fullbg_image.size[1]):
            if not is_pixel_equal(fullbg_image, bg_image, i, j):
                return i
    return None


def get_trace(distance):
    """Build the per-step horizontal offsets used to drag the slider.

    The motion accelerates (a = 1.5) over the first 4/5 of ``distance`` and
    decelerates (a = -3) afterwards, in time steps of 0.2.

    Args:
        distance: Distance between the slider and the gap.

    Returns:
        List of integer offsets. Each step is rounded separately and the loop
        stops once the unrounded position reaches ``distance``, so the offsets
        only approximately add up to ``distance``.
    """
    trace = []
    faster_distance = distance * (4 / 5)
    start, v0, t = 0, 0, 0.2
    while start < distance:
        if start < faster_distance:
            a = 1.5
        else:
            a = -3
        move = v0 * t + 1 / 2 * a * t * t
        v = v0 + a * t
        v0 = v
        start += move
        trace.append(round(move))
    return trace


def move_to_gap(trace):
    """Press the slider button, replay ``trace`` step by step, then release."""
    slider = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, 'div.geetest_slider_button')))
    # Press and hold the mouse button on the slider.
    ActionChains(browser).click_and_hold(slider).perform()
    for x in trace:
        # Each offset is performed as its own action so the drag is stepwise.
        ActionChains(browser).move_by_offset(xoffset=x, yoffset=0).perform()
    time.sleep(0.5)
    ActionChains(browser).release().perform()


def slide():
    """Locate the gap from the saved screenshots and drag the slider onto it.

    Raises:
        RuntimeError: If the two screenshots show no differing pixel, i.e. no
            gap could be located.
    """
    distance = get_distance(Image.open('.\\bili_back.png'), Image.open('.\\bili_full.png'))
    if distance is None:
        raise RuntimeError('no gap found: the two captcha screenshots do not differ')
    # NOTE(review): the 5 px correction looks like an empirical offset - confirm.
    trace = get_trace(distance - 5)
    move_to_gap(trace)
    time.sleep(3)


init()
login()
cut()
slide()
点击式的图文验证和图标选择
常见的点击式验证码有12306、简书.
简书的解决思路:
获取点击式图片的信息——调用第三方识别库——获取第三方返回的坐标——用selenium模拟用户点击。
(第三方识别是超级鹰,这是一个付费的软件,但是注册后关注公众号有免费的测试额度)
import time
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By


def crack():
    """Solve the click captcha currently shown, retrying until it is accepted.

    Screenshots the page, crops the captcha widget, sends the crop to the
    Chaojiying service, clicks the returned coordinates and presses the
    confirm button. When the page reports a failure the picture is reported
    back to the service and the function calls itself again.

    Relies on the module-level ``browser`` and on a ``Chaojiying`` class being
    available in the module namespace.
    """
    # Save a screenshot of the page.
    browser.save_screenshot('222.jpg')
    # Confirm button of the captcha panel.
    button = browser.find_element(By.XPATH, '//div[@class="geetest_panel"]/a/div')
    # Position of the captcha widget on the page.
    img1 = browser.find_element(By.XPATH, '//div[@class="geetest_widget"]')
    location = img1.location
    size = img1.size
    top, bottom = location['y'], location['y'] + size['height']
    left, right = location['x'], location['x'] + size['width']
    print('图片的宽:', img1.size['width'])
    print(top, bottom, left, right)
    # Crop the captcha picture out of the page screenshot and save it.
    # NOTE(review): the 54 px trimmed from the bottom presumably removes the
    # button row below the picture - confirm.
    with Image.open('222.jpg') as img_1:
        capcha1 = img_1.crop((left, top, right, bottom - 54))
        capcha1.save('tu1-1.png')
    # Send the picture to Chaojiying; the response is handled as a dict.
    cjy = Chaojiying('*********', '************', '900751')
    with open('tu1-1.png', 'rb') as picture:
        im = picture.read()
    content = cjy.post_pic(im, 9004)
    print(content)
    # 'pic_str' is parsed as "x,y|x,y|...": one coordinate pair per click.
    positions = content.get('pic_str').split('|')
    locations = [[int(number) for number in group.split(",")] for group in positions]
    print(positions)
    print(locations)
    # Click each returned coordinate on the captcha picture.
    # NOTE(review): Selenium 4.3+ measures this offset from the element's
    # centre instead of its top-left corner - confirm against the installed
    # Selenium version.
    for location1 in locations:
        print(location1)
        ActionChains(browser).move_to_element_with_offset(
            img1, location1[0], location1[1]).click().perform()
        time.sleep(1)
    button.click()
    time.sleep(1)
    # Read the result text to decide whether a retry is needed.
    lower = browser.find_element(By.XPATH, '//div[@class="geetest_table_box"]/div[2]').text
    print('判断', lower)
    if lower != '验证失败 请按提示重新操作' and lower is not None:
        print('登录成功')
        time.sleep(3)
    else:
        time.sleep(3)
        print('登录失败')
        # Report the wrong answer so the service does not charge for it,
        # then try again with a fresh screenshot.
        pic_id = content.get('pic_id')
        print('图片id为:', pic_id)
        cjy.report_error(pic_id)
        crack()


if __name__ == '__main__':
    patn = 'chromedriver.exe'
    browser = webdriver.Chrome(patn)
    browser.get('https://www.jianshu.com/sign_in')
    browser.save_screenshot('lodin.png')
    # Fill in the sign-in form and submit it so the captcha appears.
    login = browser.find_element(By.ID, 'sign-in-form-submit-btn')
    username = browser.find_element(By.ID, 'session_email_or_mobile_number')
    password = browser.find_element(By.ID, 'session_password')
    username.send_keys('***********')
    password.send_keys('***********')
    login.click()
    time.sleep(5)
    crack()
import requests
from hashlib import md5


class Chaojiying(object):
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Store the account data sent with every request.

        The password is kept only as the hex MD5 digest of its UTF-8 bytes.
        """
        self.username = username
        self.password = md5(password.encode('utf-8')).hexdigest()
        self.soft_id = soft_id
        # Fields included in every request body.
        self.base_params = dict(
            user=self.username,
            pass2=self.password,
            softid=self.soft_id,
        )
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def post_pic(self, im, codetype):
        """Upload a picture for recognition and return the decoded JSON reply.

        im: picture bytes
        codetype: question type, see http://www.chaojiying.com/price.html
        """
        # 'codetype' goes first; the shared account fields are merged after it.
        form = {'codetype': codetype, **self.base_params}
        upload = {'userfile': ('ccc.jpg', im)}
        response = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=form,
            files=upload,
            headers=self.headers,
        )
        return response.json()

    # Call this when an answer was rejected, so the service does not charge
    # for that recognition.
    def report_error(self, im_id):
        """Report a wrongly recognised picture and return the decoded JSON reply.

        im_id: ID of the picture whose answer was wrong
        """
        form = {'id': im_id, **self.base_params}
        response = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=form,
            headers=self.headers,
        )
        return response.json()
宫格验证码
原文链接:https://blog.csdn.net/m0_37872090/article/details/97392185