爬虫--使用百度OCR来识别图片验证码
通过使用百度AI的OCR来自动识别图片中的文字,效果很不错。
在这里跟大家简单分享一下如何使用。在识别图片前,最好先对图片进行灰度化、二值化、降噪等预处理。直接上代码:
# coding:utf-8
import requests
import base64
# import jsonpath
from io import BytesIO
from PIL import Image
def get_access_token():
    """Request an OAuth access token from the Baidu AI token endpoint.

    Returns:
        The ``access_token`` value from the endpoint's JSON reply, or
        ``None`` when the reply contains no such field.
    """
    # Log in to the Baidu AI console and create an application to obtain
    # these two values, then replace the placeholders below.
    API_KEY = '官网获取的AK'
    SECRET_KEY = '官网获取的SK'
    host = 'https://aip.baidubce.com/oauth/2.0/token'
    # client_id is the AK and client_secret is the SK issued by the console.
    query = {
        'grant_type': 'client_credentials',
        'client_id': API_KEY,
        'client_secret': SECRET_KEY,
    }
    # Passing the query as ``params`` lets requests URL-encode the
    # credentials; the timeout keeps a stalled connection from hanging forever.
    response = requests.get(host, params=query, timeout=10)
    return response.json().get('access_token')
def deal_img(img_name, threshold=140):
    """Preprocess an image for OCR and return it as PNG-encoded bytes.

    The image is converted to greyscale, binarised, and every black pixel
    that has no black neighbour is painted white before encoding.

    Args:
        img_name: Image file to open (passed straight to ``Image.open``).
        threshold: Grey level above which a pixel becomes white (1); pixels
            at or below it become black (0). Defaults to 140.

    Returns:
        The processed image encoded as PNG, as ``bytes``.
    """
    # The context manager closes the source file once the greyscale copy
    # has been made; the copy no longer depends on the open file.
    with Image.open(img_name) as img:
        gray = img.convert('L')

    # Binarise to 0 (black) / 1 (white). The noise check below and
    # sum_black_point both treat a pixel value of 0 as black.
    binary = gray.point(lambda level: 1 if level > threshold else 0, '1')

    # Collect every isolated black pixel first and repaint afterwards, so
    # that whitening one pixel cannot change the neighbour count of another.
    noisy_points = [
        (x, y)
        for x in range(binary.width)
        for y in range(binary.height)
        if binary.getpixel((x, y)) == 0 and sum_black_point(binary, x, y) < 1
    ]
    for position in noisy_points:
        binary.putpixel(position, 1)

    buf = BytesIO()
    binary.save(buf, 'png')
    return buf.getvalue()
def sum_black_point(img, x, y):
    """Count the black pixels adjacent to the pixel at ``(x, y)``.

    Up to eight surrounding positions are inspected; positions that fall
    outside the image are skipped, so corners, edges and images that are
    only one pixel wide or tall are handled without out-of-range reads.

    Args:
        img: Object exposing ``width``, ``height`` and ``getpixel((x, y))``.
        x: Column of the centre pixel.
        y: Row of the centre pixel.

    Returns:
        The number of neighbouring pixels whose value is 0 (black). The
        centre pixel itself is never counted.
    """
    # Clamp the 3x3 window to the image bounds instead of enumerating each
    # corner/edge case by hand.
    first_col, last_col = max(x - 1, 0), min(x + 1, img.width - 1)
    first_row, last_row = max(y - 1, 0), min(y + 1, img.height - 1)

    black = 0
    for ny in range(first_row, last_row + 1):
        for nx in range(first_col, last_col + 1):
            if (nx, ny) == (x, y):
                continue  # skip the centre pixel
            if img.getpixel((nx, ny)) == 0:
                black += 1
    return black
def get_text():
    """Prompt for an image file, send it to Baidu OCR and return the text.

    The image is preprocessed with ``deal_img``, base64-encoded and posted
    to the high-accuracy OCR endpoint.

    Returns:
        The first recognised entry with spaces removed (captcha use case),
        or ``'未能成功识别'`` when the reply holds no recognised words.

    Raises:
        RuntimeError: If no access token could be obtained.
    """
    img_name = input("请输入需要识别的图片文件名:")
    # To skip preprocessing, read the file's raw bytes in 'rb' mode instead.
    img_data = deal_img(img_name)

    # High-accuracy endpoint; the standard one is
    # https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"

    access_token = get_access_token()
    if not access_token:
        # Fail with a clear message instead of a TypeError from building
        # the request with a missing token.
        raise RuntimeError("failed to obtain a Baidu OCR access token")

    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(
        request_url,
        params={"access_token": access_token},
        data={"image": base64.b64encode(img_data)},
        headers=headers,
        timeout=10,
    )

    # Parse the body once; ``.get`` treats a reply without "words_result"
    # as "nothing recognised" rather than raising KeyError.
    words_result = response.json().get("words_result")
    if not words_result:
        return '未能成功识别'
    # Captcha case: only the first entry matters. For ordinary images,
    # join the 'words' field of every entry instead.
    return words_result[0]['words'].replace(' ', '')
if __name__ == '__main__':
    # Script entry point: run one recognition round and show the result.
    captcha_text = get_text()
    print('验证码:%s' % captcha_text)