Python 批量下载验证码,用于验证码识别处理

最近刚学到爬虫中的验证码识别,所以自己写了一个用于批量获取验证码图片的类,感兴趣的道友可以看看,代码如下:

 

 

import requests
import time
import os
import re

class Pictures:
    """Download images (for example captchas) from a URL and save them to disk.

    Every successful download is written into ``image_dir`` under a
    timestamp-based file name whose extension is derived from the response's
    ``Content-Type`` header.

    Args:
        url: Address the image is fetched from.
        request: Optional session-like object exposing
            ``get(url=..., headers=..., timeout=...)``. A new ``requests``
            session is created when omitted.
        file_dir: Directory the images are written to. Defaults to
            ``'./image/'``.
        headers: Optional HTTP request headers. Browser-like defaults are
            used when omitted.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the download forever.
    """

    # Browser-like request headers used when the caller supplies none.
    DEFAULT_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
    }

    def __init__(self, url, request=None, file_dir=None, headers=None,
                 timeout=10):
        self.url = url
        # The module is only consulted when no session was injected.
        self.requests = request if request else requests.session()
        self.image_dir = file_dir if file_dir else './image/'
        # Copy the defaults so one instance cannot mutate another's headers.
        self.headers = headers if headers else dict(self.DEFAULT_HEADERS)
        self.timeout = timeout

    def save_image(self, url=None, max_attempts=10):
        """Fetch the image and write it to disk, retrying on empty bodies.

        Args:
            url: Optional new address. When given it replaces ``self.url``
                for this call and all later ones.
            max_attempts: How many times the image is requested before
                giving up when the server keeps answering with an empty body.

        Returns:
            The path of the saved file, or ``False`` when no URL is set, the
            response is not an image, or every attempt came back empty.

        Raises:
            Any exception raised by the session's ``get`` call propagates
            unchanged (for a ``requests`` session these derive from
            ``requests.exceptions.RequestException``).
        """
        if url is not None:
            self.url = url
        if not self.url:
            return False

        for _ in range(max_attempts):
            # No try/except here: network errors are the caller's to handle.
            response = self.requests.get(
                url=self.url, headers=self.headers, timeout=self.timeout
            )
            # Anything that is not an image is skipped outright.
            if not self.check_image(response.headers.get('Content-Type')):
                return False
            payload = response.content
            if not payload:
                # Empty body: ask again rather than saving a 0-byte file.
                continue
            file_path = self.image_path(response.headers)
            with open(file_path, 'wb') as f:
                f.write(payload)
            return file_path
        return False

    def image_path(self, header):
        """Return the destination path for an image, creating the directory.

        Args:
            header: Response headers; only ``Content-Type`` is consulted, to
                pick the file extension.

        Returns:
            ``image_dir`` joined with a name built from the current
            timestamp (decimal point removed) plus the extension.
        """
        # exist_ok avoids the race between an existence check and creation.
        os.makedirs(self.image_dir, exist_ok=True)
        file_name = str(time.time()).replace('.', '')
        # os.path.join copes with image_dir lacking a trailing separator.
        return os.path.join(self.image_dir, file_name + self.img_type(header))

    def img_type(self, header):
        """Derive a file extension (with leading dot) from ``Content-Type``.

        Falls back to ``'.png'`` when the header is missing or names no
        ``image/<letters>`` subtype; any subtype containing ``jpeg`` maps to
        ``'.jpg'``. The extension is always lower case.
        """
        content_type = header.get('Content-Type') or ''
        found = re.search(r'image/([a-zA-Z]+)', content_type, re.IGNORECASE)
        suffix = found.group(1).lower() if found else 'png'
        if 'jpeg' in suffix:
            suffix = 'jpg'
        return '.' + suffix

    def check_image(self, content_type):
        """Return ``True`` when ``content_type`` mentions an image type.

        The comparison ignores case, matching ``img_type``; a missing or
        empty value counts as "not an image".
        """
        if not content_type:
            return False
        return 'image' in content_type.lower()

if __name__ == '__main__':
    # Script entry point: request the captcha endpoint 50 times, saving
    # each image through the downloader's default settings.
    captcha_url = 'http://my.cnki.net/elibregister/CheckCode.aspx'
    downloader = Pictures(captcha_url)
    remaining = 50
    while remaining > 0:
        downloader.save_image()
        remaining -= 1

 

posted @ 2019-06-18 09:39  大道至简,小而蕴真  阅读(1742)  评论(0)  编辑  收藏  举报