python之基于libsvm识别数字验证码

1. 参考

2.图片预处理和手动分类

(1)分析图片

from PIL import Image
# Load the sample captcha image from disk.
img = Image.open('nums/ttt.png')
# Derive the 8-bit grayscale ('L' mode) version of the image.
gray = img.convert('L')
# Open the original (colour) image in the default viewer for inspection.
img.show()

Windows图片查看器可以放大到像素级别：从左到右、从上到下依次为原图、灰度图、阈值为100的二值图、分割图。

# 输出为(count,(R,G,B,A))   alpha透明度一般为255
In [366]: sorted(img.getcolors())
(22, (251, 0, 0, 255)),
(24, (251, 184, 245, 255)),
(41, (192, 192, 192, 255)),   #没有交叉的灰色干扰线
(102, (255, 0, 0, 255)),      #红色数字
(490, (245, 245, 245, 255))]  #背景色白色

# img.convert帮助显示 L = R * 299/1000 + G * 587/1000 + B * 114/1000 
# 所以可以确定干扰线灰色RGB 192 192 192的灰度为192
In [367]: sorted(gray.getcolors())
(24, 210),
(41, 75),
(41, 192),
(102, 76),
(505, 245)]

# 按照灰度排序，基本确定阈值为100以下全黑
In [369]: sorted(gray.getcolors(),key=lambda x:x[1])
[(1, 70),
 (2, 73),
 (41, 75),
 (102, 76),
 (2, 82),
 (11, 83),
 (10, 88),
 (5, 98),
 
# getdata也可以查看数据
In [371]: list(img.getdata())
Out[371]:
[(245, 245, 245, 255),

(2)批量下载图片

# 批量下载100张验证码
# urllib.urlretrieve(url,'nums/ttt.png')  #也行，不支持https
# 如 urls='https://bytebucket.org/wswp/code/raw/9e6b82b47087c2ada0e9fdf4f5e037e151975f0f/chapter07/samples/sample1.png'
# http://blog.csdn.net/zyz511919766/article/details/25049365
# python内置的urllib模块不支持https协议的解决办法 
# 编译安装python之前没有编译安装类似于openssl这样的SSL库,以至于python不支持SSL
# [Errno socket error] [SSL: UNKNOWN_PROTOCOL] unknown protocol (_ssl.c:590)
# Appending arbitrary digits to the query string is enough to get a captcha.
url='http://jbywcg.lnypcg.com.cn/CommonPage/Code.aspx?0.10330188674268'
for i in range(100):
    # Download first and release the connection, so that a failed request
    # never leaves an empty or truncated PNG behind.
    response = urllib2.urlopen(url+str(i))
    try:
        data = response.read()
    finally:
        response.close()
    with open('nums/%s.png'%i,'wb') as f:
        f.write(data)

# Create one folder per digit class (0-9); folders that already exist are
# skipped so the snippet can be re-run without raising OSError.
for i in range(10):
    digit_dir = 'nums/%s'%i
    if not os.path.isdir(digit_dir):
        os.mkdir(digit_dir)

(3)对100张验证码进行预处理，数字分割，然后手动分类并保存到相应文件夹

import time
for index in range(100):
    img = Image.open('nums/%s.png'%index)
    gray_array = np.array(img.convert('L'))
    # Pixels darker than the threshold (100) are treated as digit strokes.
    is_dark = gray_array < 100

    # Per-column count of dark pixels; 0 means the column is pure background.
    vertical = is_dark.sum(0)
    # Captcha layout: blank margins left and right, 3 px blank on top and
    # 4 px at the bottom, each digit occupies 8x13 px, four digits in total.
    # Scanning left to right, two empty columns followed by a non-empty one
    # mark the left edge of a digit.
    left_list = []
    for i in range(len(vertical)-2):
        if vertical[i] == 0 and vertical[i+1] == 0 and vertical[i+2] != 0:
            left_list.append(i+2)
        if len(left_list) == 4:
            break

    # Build a human-readable black/white image. The cast to uint8 matters:
    # np.where with Python ints yields the platform default integer (int64
    # on most 64-bit systems), which Image.fromarray cannot handle.
    bilevel = Image.fromarray(np.where(is_dark,0,255).astype(np.uint8))
    children = [bilevel.crop((left,3,left+8,img.height-4)) for left in left_list]
    for pos,child in enumerate(children):
        child.show()
        result = raw_input(':')
        # The captcha index and digit position keep the name unique even when
        # two tiles get the same label within the same second.
        child.save('nums/%s/%s_%s_%s_%s.png'%(result,result,time.strftime('%H%M%S'),index,pos))
    print(index)

(4)确认分类结果

# 分割图片尺寸太小w8h13，windows看图软件显示为小黑块，img.show()则正常
# 将所有分割图片按行排列合并一图
import os
# List every digit folder once; the longest listing fixes the canvas width.
tiles_by_digit = [os.listdir('nums/%s'%digit) for digit in range(10)]
count_max = max(len(names) for names in tiles_by_digit)
# One row per digit class; each tile is pasted on an 8x13 px grid cell.
img_merge = Image.new('1',(8*count_max,13*10))
for h,names in enumerate(tiles_by_digit):
    for w,f in enumerate(names):
        tile = Image.open('nums/%s/%s'%(h,f))
        img_merge.paste(tile,(w*8,h*13))
img_merge.show()

3.libsvm训练

# 1.官网页面搜索download下载压缩包 http://www.csie.ntu.edu.tw/~cjlin/libsvm/#download 
# 2.将压缩包所有文件解压到 Lib\site-packages\libsvm 
# 3.将 libsvm\windows 的 libsvm.dll 复制到 C:\WINDOWS\system32\ http://blog.csdn.net/yearningseeker/article/details/49018015
# 4.在 libsvm 根目录和 libsvm\python 子目录下分别新建名为__init__.py的空文件即可 http://www.cnblogs.com/Finley/p/5329417.html

def get_feature(num, picpath):
    """Build the feature vector of one segmented digit image.

    Args:
        num: Label stored alongside the features (the digit class when
            training, any placeholder integer when predicting).
        picpath: Path of a pure black/white image whose pixels are 0 or 255.

    Returns:
        Tuple ``(num, features)``. ``features`` holds the number of black (0)
        pixels in every column, followed by the number of black pixels in
        every row; for the 8x13 digit tiles that is 8 + 13 = 21 values.
    """
    img = Image.open(picpath)
    # Map 0/255 pixels to 0/1. Floor division keeps the values integral on
    # every Python version; plain "/" yields floats under true division.
    img_array = np.array(img)//255
    # sum(0) adds up each column, so (row count - column sum) is the number
    # of zero pixels in that column; sum(1) does the same per row.
    zeros_per_column = img_array.shape[0]-img_array.sum(0)
    zeros_per_row = img_array.shape[1]-img_array.sum(1)
    return (num, list(zeros_per_column) + list(zeros_per_row))

def write_features(feature_list, filepath='nums/result_temp.txt'):
    """Write labelled feature vectors to *filepath* in LIBSVM sample format.

    Each sample becomes one line ``<label> <index1>:<value1> <index2>:<value2> ...``
    with indices starting at 1, e.g. ``1 1:1 2:2 3:2 4:3``.

    Args:
        feature_list: Iterable of ``(label, values)`` pairs.
        filepath: Destination file; it is overwritten.
    """
    with open(filepath,'w') as fp:
        for num, data in feature_list:
            fields = [str(num)]
            for position, value in enumerate(data, 1):
                fields.append(str(position)+':'+str(value))
            fp.write(' '.join(fields)+'\n')

# Gather the feature vector of every segmented digit found in the ten
# folders nums/0 .. nums/9 and write them all to nums/features.txt.
feature_list = [
    get_feature(num, 'nums/%s/%s'%(num,filename))
    for num in range(10)
    for filename in os.listdir('nums/%s'%num)
]
write_features(feature_list, 'nums/features.txt')


from libsvm.python.svmutil import *
from libsvm.python.svm import *

# 训练得到分类模型model文件
def train_svm_model():
    """Train an SVM on nums/features.txt and save the model to nums/model."""
    labels, samples = svm_read_problem('nums/features.txt')
    svm_save_model('nums/model', svm_train(labels, samples))

4.libsvm测试

重复之前批量下载和手动分类得到features_test.txt，测试正确率。

def svm_model_test(filepath='nums/features_test.txt'):
    """Classify the samples in *filepath* with the model saved at nums/model.

    Args:
        filepath: Feature file in LIBSVM sample format.

    Returns:
        The predicted labels, each converted to an int and concatenated into
        one string (one character per sample for single-digit labels).
    """
    labels, samples = svm_read_problem(filepath)
    classifier = svm_load_model('nums/model')
    # Only the first element of the result (the predicted labels) is used.
    predicted, _accuracy, _values = svm_predict(labels, samples, classifier)
    digits = [str(int(label)) for label in predicted]
    return ''.join(digits)

5.完整应用

#!/usr/bin/env python
#coding: UTF-8
import os, time
import urllib, urllib2
from PIL import Image
import numpy as np
from libsvm.python.svmutil import *
from libsvm.python.svm import *

def get_image(url=''):
    """Download one captcha image into nums/temp and return its path.

    Args:
        url: Captcha endpoint. The current HHMMSS timestamp is appended to it
            before the request. When empty, the built-in captcha URL is used.

    Returns:
        Path of the saved image, ``nums/temp/<HHMMSS>.png``.
    """
    # Fall back to the built-in endpoint only when the caller gave no URL
    # (previously the argument was always overwritten).
    if not url:
        url='http://jbywcg.lnypcg.com.cn/CommonPage/Code.aspx?0.10330188674268'
    temp = time.strftime('%H%M%S')
    picpath = 'nums/temp/%s.png'%(temp)
    # Download before opening the file and always release the connection, so
    # a failed request does not leave an empty image behind.
    response = urllib2.urlopen(url+str(temp))
    try:
        data = response.read()
    finally:
        response.close()
    with open(picpath,'wb') as f:
        f.write(data)
    return picpath

def split_image(filepath):
    """Cut a captcha image into per-digit tiles saved under nums/temp.

    The image is converted to grayscale and thresholded at 100. Scanning the
    columns left to right, two empty columns followed by a non-empty one mark
    the left edge of a digit; at most four edges are collected. Each digit is
    cropped 8 px wide, from row 3 down to 4 px above the bottom edge.

    Args:
        filepath: Path of the captcha image to split.

    Returns:
        List of the saved tile paths, ``nums/temp/<HHMMSS>_<n>.png`` with
        ``n`` counting from 1, in left-to-right order.
    """
    img = Image.open(filepath)
    gray_array = np.array(img.convert('L'))
    # Pixels darker than the threshold are treated as digit strokes.
    is_dark = gray_array < 100
    # Per-column count of dark pixels; 0 means the column is pure background.
    vertical = is_dark.sum(0)
    left_list = []
    for i in range(len(vertical)-2):
        if vertical[i] == 0 and vertical[i+1] == 0 and vertical[i+2] != 0:
            left_list.append(i+2)
        if len(left_list) == 4:
            break
    # Cast to uint8: np.where with Python ints yields the platform default
    # integer (int64 on most 64-bit systems), which Image.fromarray rejects.
    bilevel = Image.fromarray(np.where(is_dark,0,255).astype(np.uint8))
    children = [bilevel.crop((left,3,left+8,img.height-4)) for left in left_list]
    filepath_list = []
    for i,child in enumerate(children):
        tile_path = 'nums/temp/%s_%s.png'%(time.strftime('%H%M%S'),i+1)
        filepath_list.append(tile_path)
        child.save(tile_path)
    return filepath_list


def get_feature(num, picpath):
    """Build the feature vector of one segmented digit image.

    Args:
        num: Label stored alongside the features (any placeholder integer
            when the sample is only going to be predicted).
        picpath: Path of a pure black/white image whose pixels are 0 or 255.

    Returns:
        Tuple ``(num, features)``. ``features`` holds the number of black (0)
        pixels in every column, followed by the number of black pixels in
        every row.
    """
    img = Image.open(picpath)
    # Map 0/255 pixels to 0/1. Floor division keeps the values integral on
    # every Python version; plain "/" yields floats under true division.
    img_array = np.array(img)//255
    # Columns first, then rows: (row count - column sum) is the number of
    # zero pixels in a column, (column count - row sum) the same for a row.
    zeros_per_column = img_array.shape[0]-img_array.sum(0)
    zeros_per_row = img_array.shape[1]-img_array.sum(1)
    return (num, list(zeros_per_column) + list(zeros_per_row))

def write_features(feature_list, filepath='nums/features_test.txt'):
    """Write labelled feature vectors to *filepath* in LIBSVM sample format.

    Each sample becomes one line ``<label> <index1>:<value1> <index2>:<value2> ...``
    with indices starting at 1.

    Args:
        feature_list: Iterable of ``(label, values)`` pairs.
        filepath: Destination file; it is overwritten.
    """
    with open(filepath,'w') as fp:
        for num, data in feature_list:
            pairs = [str(index)+':'+str(value) for index, value in enumerate(data, 1)]
            line = ' '.join([str(num)] + pairs)
            fp.write(line+'\n')
            
def svm_model_test(filepath='nums/features_test.txt'):
    """Classify the samples in *filepath* with the model saved at nums/model.

    Args:
        filepath: Feature file in LIBSVM sample format.

    Returns:
        The predicted labels, each converted to an int and concatenated into
        one string (one character per sample for single-digit labels).
    """
    labels, samples = svm_read_problem(filepath)
    classifier = svm_load_model('nums/model')
    # Only the first element of the result (the predicted labels) is used.
    predicted, _accuracy, _values = svm_predict(labels, samples, classifier)
    digits = [str(int(label)) for label in predicted]
    return ''.join(digits)

def main():
    """Fetch, recognise and rename captcha images in an endless loop.

    Each round downloads one captcha, splits it into digit tiles, extracts
    the tile features (deleting the tiles afterwards), runs the SVM model on
    them, prints the recognised text and renames the downloaded image after
    it. The loop has no exit condition and runs until interrupted.
    """
    while True:
        picpath = get_image()
        splitpath_list = split_image(picpath)
        feature_list = []
        for splitpath in splitpath_list:
            # The label is not used for prediction; 1 is an arbitrary placeholder.
            feature_list.append(get_feature(1, splitpath))
            os.remove(splitpath)
        write_features(feature_list)
        result = svm_model_test()
        print(result)
        dirname = os.path.dirname(picpath)
        extension = os.path.splitext(picpath)[1]
        try:
            os.rename(picpath, os.path.join(dirname,result+extension))
        except OSError:
            # The rename failed (e.g. on Windows the target name already
            # exists); retry with a timestamp suffix. Catching only OSError
            # keeps Ctrl-C and programming errors visible.
            os.rename(picpath, os.path.join(dirname,result+'_'+time.strftime('%H%M%S')+extension))

if __name__ == '__main__':
    main()

6.运行结果

posted @ 2017-07-06 15:18 my8100 阅读(1399) 评论(0) 收藏举报

刷新页面返回顶部

my8100

python之基于libsvm识别数字验证码

1. 参考

2.图片预处理和手动分类

(1)分析图片

(2)批量下载图片

(3)对100张验证码进行预处理，数字分割，然后手动分类并保存到相应文件夹

(4)确认分类结果

3.libsvm训练

4.libsvm测试

5.完整应用

6.运行结果

公告