python识别验证码-PIL+pytesseract
直接上代码
验证码识别目标:http://op2c.ikang.com/login
比如:
步骤:
1.图片二值化变色
2.剪裁图片,去掉多余的影响识别的部分,只保留验证码主体
3.去除噪点
4.识别验证码
import base64
import os
from time import localtime, strftime

import pytesseract
import requests
from PIL import Image, ImageDraw

# Timestamp used in the file names of both the raw and the processed captcha,
# so the two images written by one run can be paired up afterwards.
pic_time = strftime('%Y_%m_%d_%H_%M_%S', localtime())


def get_pic(pic_time, url='http://op2c.ikang.com/admin/captcha/image'):
    """Download a captcha image, store it on disk and return its path.

    The endpoint is expected to answer with JSON in which
    ``['data']['captcha']`` holds the image as a base64 string.

    Args:
        pic_time: Text used as the base name of the saved file.
        url: Captcha endpoint to query.

    Returns:
        Path of the saved raw captcha image.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    # A timeout keeps the script from hanging forever on a dead server, and
    # raise_for_status surfaces HTTP errors instead of a confusing JSON error.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    img = base64.b64decode(response.json()['data']['captcha'])

    pic_path = 'C:/Users/Administrator/Desktop/验证码/{}.jpg'.format(pic_time)
    os.makedirs(os.path.dirname(pic_path), exist_ok=True)

    # The context manager closes the handle even if the write fails.
    with open(pic_path, 'wb') as file:
        file.write(img)
    print(u'验证码已存储-0')
    return pic_path


def oper_pic(path, pic_time):
    """Crop the captcha to its text area, binarize it and save the result.

    Args:
        path: Path of the raw captcha image.
        pic_time: Text used as the base name of the processed file.

    Returns:
        The processed ``PIL.Image.Image`` (RGBA mode).
    """
    box = (55, 15, 195, 60)
    # Crop first so that only the region containing the characters is kept.
    with Image.open(path) as source:
        img = source.crop(box).convert("RGBA")
    pixdata = img.load()

    # Binarize in a single pass. Per pixel this is equivalent to three
    # successive full-image passes (red < 90 -> black, green < 136 -> black,
    # blue > 0 -> white): a pixel turned black has blue == 0, so it is never
    # whitened afterwards. Pixels matching none of the rules stay untouched.
    width, height = img.size
    for y in range(height):
        for x in range(width):
            red, green, blue = pixdata[x, y][:3]
            if red < 90 or green < 136:
                pixdata[x, y] = (0, 0, 0, 255)
            elif blue > 0:
                pixdata[x, y] = (255, 255, 255, 255)

    pic_path = 'C:/Users/Administrator/Desktop/验证码/{}_1.png'.format(pic_time)
    os.makedirs(os.path.dirname(pic_path), exist_ok=True)

    img.save(pic_path)
    print(u'验证码已存储-1')
    return img


def recognize_captcha(im):
    """Run OCR on the image and return the recognized text without whitespace.

    Args:
        im: Image object (or anything ``pytesseract.image_to_string`` accepts).

    Returns:
        The recognized characters with every whitespace character removed.
    """
    num = pytesseract.image_to_string(im)
    # OCR output usually carries a trailing newline / form feed in addition to
    # stray spaces; split() + join drops all of them, not just ' '.
    return ''.join(num.split())


if __name__ == '__main__':
    res = recognize_captcha(oper_pic(get_pic(pic_time), pic_time))
    print(res)
目前这玩意针对上边这种类型的验证码识别准确率在80%左右,勉强能用,先记录下,以后再优化。