Python: Regular expressions

'''
引用类库
import re
import string 
'''
@staticmethod
def strSplit(textSource: str, patterns: str) -> list:
    """
    分割字符串
    :param textSource: 需要进行分割的字符串
    :param patterns: 指定指符串分割
    :return: list
    """
    ls = re.split(r'[' + patterns + '\s]', textSource)
    return ls

@staticmethod
def strFunSplit(textSource: str) -> list:
    """
    分割字符串 用字符串函数
    :param textSource: 需要进行分割的字符串
    :return: list
    """
    ls = textSource.split(" ")
    return ls

@staticmethod
def strRegSplit(textSource: str) -> list:
    """
    分割字符串
    :param textSource: 需要进行分割的字符串
    :return: list
    """
    ls = re.findall(r"\S+", textSource)
    return ls



@staticmethod
def strForSplit(textSource: str) -> list:
    """

    :param textSource:
    :return:
    """
    lenght=len(textSource)  #字符串长度
    firstname=""
    lastname=""
    space=0   #所用于分割字符串的索引值 index
    patterns=" "
    print(patterns)
    for i in range(0,lenght):
        if textSource[i]==patterns:
            space=i
    firstname=textSource[0:space]
    lastname=textSource[space+1:lenght]
    print("firstname:",firstname,"lastname:",lastname)
    return [firstname,lastname]


@staticmethod
def strForSplitTwo(textSource: str,patterns:str) -> list:
    """

    :param textSource:
    :param patterns:
    :return:
    """
    lenght=len(textSource)  #字符串长度
    firstname=""
    lastname=""
    space=0   #所用于分割字符串的索引值 index
    print(patterns)
    for i in range(0,lenght):
        if textSource[i]==patterns:
            space=i
    firstname=textSource[0:space]
    lastname=textSource[space+1:lenght]
    print("firstname:",firstname,"lastname:",lastname)
    return [firstname,lastname]

    # NOTE(review): these statements are indented under the function above and
    # come after its `return`, so as placed they never run; they read like a
    # stand-alone demo script.
    # Uses the `re` module (regular expressions):
    # 1. re.compile() builds a regex object describing what to match.
    # 2. re.findall() searches a given string for every match.

    #str1 = 'lukfook8-hongkong+90shenzhen-4hh h7+8facai-shanghai geovindu'
    fullname = input("please enter full name:")
    # Split on '+', '-', '.' or any whitespace. The pattern is one raw string
    # so that the whitespace escape is not an invalid string escape.
    ls = re.split(r'[-+.\s]', fullname)
    #print(re.split(r'[-+.\s]', str1))
    print(type(ls))
    for s in ls:
        print(s)  # print each piece of the split
    firstname = ls[0]
    # A single-word input has no second piece; fall back to an empty string.
    lastname = ls[1] if len(ls) > 1 else ""
    print("firstname", firstname, ",lastname:", lastname)

# encoding: utf-8
# 版权所有 2024 涂聚文有限公司
# 许可信息查看：
# 描述：
# Author    : geovindu,Geovin Du 涂聚文.
# IDE       : PyCharm 2023.1 python 3.11
# Datetime  : 2024/4/28 20:01
# User      : geovindu
# Product   : PyCharm
# Project   : pyBaiduAi
# File      : DuString.py
# explain   : 学习

import re
import os
import sys
import string


class DuString(object):
    """
    Regular-expression helpers for splitting, extracting and validating text.

    Every helper is a static method, so it is called as
    ``DuString.name(...)`` without creating an instance.
    """

    @staticmethod
    def strSplitUper(textSource: str, patterns: str) -> list:
        """
        Find words that start with an uppercase ASCII letter.

        Each match is one uppercase letter, optional lowercase letters and
        then one non-whitespace character, so a capital letter followed
        directly by whitespace or by the end of the text is not matched.

        :param textSource: the text to search
        :param patterns: unused; kept so existing callers keep working
        :return: list of matched substrings
        """
        return re.findall(r'[A-Z][a-z]*\S', textSource)

    @staticmethod
    def strSplitLow(textSource: str, patterns: str) -> list:
        """
        Find words that start with an uppercase or a lowercase ASCII letter.

        A match is either an uppercase letter followed by lowercase letters,
        or a run of lowercase letters followed by one non-whitespace
        character.

        :param textSource: the text to search
        :param patterns: unused; kept so existing callers keep working
        :return: list of matched substrings
        """
        return re.findall(r'[A-Z][a-z]*|[a-z]*\S', textSource)

    @staticmethod
    def strSplit(textSource: str, patterns: str) -> list:
        """
        Split a string on whitespace or on any of the given delimiter characters.

        ``patterns`` is inserted verbatim into a regex character class, so
        characters that are special inside a character class keep their
        regex meaning.

        :param textSource: the string to split
        :param patterns: delimiter characters placed inside the character class
        :return: list of the pieces; adjacent delimiters yield empty strings
        """
        # The whitespace escape must be in a raw string; in a plain literal
        # backslash-s is an invalid string escape.
        return re.split(r'[' + patterns + r'\s]', textSource)

    @staticmethod
    def strFunSplit(textSource: str) -> list:
        """
        Split a string on single space characters using ``str.split``.

        :param textSource: the string to split
        :return: list of the pieces; consecutive spaces produce empty strings
        """
        return textSource.split(" ")

    @staticmethod
    def strRegSplit(textSource: str) -> list:
        """
        Collect every run of non-whitespace characters in a string.

        :param textSource: the string to tokenize
        :return: list of the non-whitespace runs, in order of appearance
        """
        return re.findall(r"\S+", textSource)

    @staticmethod
    def getdit(textSrource: str) -> list:
        """
        Extract every number (integer or decimal) from a text.

        For example a sentence mentioning 8 apples, 12 bananas and a total
        of 12.85 yields ``['8', '12', '12.85']``. The result (or a
        "not found" message) is also printed.

        :param textSrource: the text to search
        :return: list of the numbers found, as strings
        """
        # The decimal alternative comes first so "12.85" is not split in two.
        numbers = re.findall(r'\d+\.\d+|\d+', textSrource)
        if numbers:
            print(numbers)  # ['8', '12', '12.85']
        else:
            print("未找到数值")
        return numbers

    @staticmethod
    def getint(textSource: str) -> list:
        """
        Extract every run of digits from a text.

        A decimal such as ``12.85`` contributes two entries, ``'12'`` and
        ``'85'``. The result (or a "not found" message) is also printed.

        :param textSource: the text to search
        :return: list of the digit runs found, as strings
        """
        # Matches runs of digits (integers), not floating-point numbers.
        digits = re.findall(r'\d+', textSource)
        if digits:
            print(digits)
        else:
            print("未找到数值")
        return digits

    @staticmethod
    def getfloat(textSource: str) -> "re.Match | None":
        """
        Find the first decimal number (digits, a dot, digits) in a text.

        The number is printed as a float, or a "not found" message is
        printed. Unlike ``getdit`` and ``getint`` this returns the match
        object of the first hit, not a list.

        :param textSource: the text to search
        :return: the ``re.Match`` for the first decimal number, or ``None``
        """
        found = re.search(r"\d+\.\d+", textSource)
        if found:
            print(float(found.group()))
        else:
            print("未找到数值")
        return found

    @staticmethod
    def getDate(textSource: str) -> str:
        """
        Extract the first ``YYYY-M-D`` style date from a text.

        The year must have four digits; month and day may have one or two.
        Month and day values are not range-checked.

        :param textSource: the text to search
        :return: the first date found, or ``""`` when there is none or the
            argument is not a string
        """
        if not isinstance(textSource, str):
            return ""
        found = re.search(r"\d{4}-\d{1,2}-\d{1,2}", textSource)
        return found.group() if found else ""

    @staticmethod
    def getTime(textSource: str) -> "list | None":
        """
        Extract every ``H:M:S`` time from a text.

        :param textSource: the text to search
        :return: list of ``(hour, minute, second)`` string tuples, or
            ``None`` (after printing a message) when nothing matches
        """
        regexRule = r'(0?[0-9]|1[0-9]|2[0-3]):([0-5][0-9]|0?[0-9]):([1-5][0-9]|0?[0-9])'
        times = re.findall(regexRule, textSource)
        if times:
            return times
        print('没有匹配成功.')
        return None

    @staticmethod
    def getUrl(textSource: str) -> list:
        """
        Extract http/https URLs from a text.

        The pattern accepts only word characters, ``-``, ``.`` and percent
        escapes after the scheme, so each match ends before the first ``/``
        or ``?``.

        :param textSource: the text to search
        :return: list of the matched URL prefixes
        """
        pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
        return re.findall(pattern, textSource)

    @staticmethod
    def getMainIs(textSource: str) -> bool:
        """
        Check whether a string looks like an e-mail address.

        The verdict is also printed.

        :param textSource: the candidate address
        :return: ``True`` when the whole string matches, else ``False``
        """
        # fullmatch, unlike match with a trailing "$", rejects a trailing
        # newline after an otherwise valid value.
        if re.fullmatch(r'[\w\.-]+@[\w\.-]+\.\w+', textSource):
            print("有效的邮件地址")
            return True
        print("无效的邮件地址")
        return False

    @staticmethod
    def getIPIs(textSource: str) -> bool:
        """
        Check whether a string is an IPv4 address or a full-form IPv6 address.

        Only the uncompressed eight-group IPv6 form is recognised. The
        verdict is also printed.

        :param textSource: the candidate address
        :return: ``True`` for a recognised address, else ``False``
        """
        # Four dot-separated numbers, each in the range 0-255.
        ipv4Pattern = r'((25[0-5]|2[0-4]\d|[01]?\d{1,2})\.){3}(25[0-5]|2[0-4]\d|[01]?\d{1,2})'
        # Eight colon-separated groups of one to four hex digits.
        ipv6Pattern = r'([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}'
        # fullmatch rejects a trailing newline that "$" would let through.
        if re.fullmatch(ipv4Pattern, textSource):
            print("IPv4 地址")
            return True
        if re.fullmatch(ipv6Pattern, textSource):
            print("IPv6 地址")
            return True
        print("无效的 IP 地址")
        return False

    @staticmethod
    def getChinaMobileIs(textSource: str) -> bool:
        """
        Check whether a string is a mainland-China mobile number.

        A valid number has 11 digits: a leading 1, then a digit from 3 to 9,
        then nine more digits. The verdict for each number is also printed.

        :param textSource: the candidate number; an iterable of candidate
            strings is also accepted, in which case every one must be valid
        :return: ``True`` when the number (or every number) is valid
        """
        pattern = r'1[3456789]\d{9}'
        # A plain string is ONE number; looping over it directly would test
        # its characters one at a time and never succeed.
        numbers = [textSource] if isinstance(textSource, str) else list(textSource)
        if not numbers:
            return False
        allValid = True
        for number in numbers:
            if re.fullmatch(pattern, number):
                print(f'{number} 是有效的手机号码')
            else:
                print(f'{number} 不是有效的手机号码')
                allValid = False
        return allValid

    @staticmethod
    def getPostCodeIs(textSource: str) -> bool:
        """
        Check whether a string is a six-digit postal code.

        The verdict is also printed.

        :param textSource: the candidate postal code
        :return: ``True`` when the string is exactly six digits
        """
        # fullmatch rejects a trailing newline that "$" would let through.
        if re.fullmatch(r'\d{6}', textSource):
            print("邮政编码有效！")
            return True
        print("邮政编码无效！")
        return False

    @staticmethod
    def getICDIs(textSource: str) -> bool:
        """
        Check whether a string has the shape of an 18-character ID number.

        The pattern requires six digits not starting with 0, a year starting
        with 18, 19 or 20, a month 01-12, a day 01-31, three digits and a
        final digit or X. The check digit itself is not verified. The
        verdict is also printed.

        :param textSource: the candidate ID number
        :return: ``True`` when the whole string matches the pattern
        """
        pattern = r'[1-9]\d{5}(18|19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2]\d|3[0-1])\d{3}[0-9Xx]'
        # fullmatch rejects a trailing newline that "$" would let through.
        if re.fullmatch(pattern, textSource):
            print("身份证号码合法！")
            return True
        print("身份证号码不合法！")
        return False

    @staticmethod
    def extractHtmltags(textSource: str) -> list:
        """
        Extract the text found between ``<`` and ``>`` for every tag.

        The captured text is the tag name together with any attributes, and
        closing tags are included (for example ``/p``).

        :param textSource: the HTML text to search
        :return: list of the tag contents, without the angle brackets
        """
        return re.findall(r"<([^>]+)>", textSource)

    @staticmethod
    def getStock(textSource: str) -> tuple:
        """
        Extract company short names and six-digit security codes from a text.

        Names are runs of CJK characters; codes are runs of six digits.
        Both results are also printed.

        :param textSource: text such as ``"工商银行（600886）"``
        :return: ``(names, codes)``; each is a list of strings, or ``None``
            when nothing of that kind was found
        """
        # text = "工商银行（600886）\n\t 贵州茅台（000123）"
        # Company short names: runs of CJK unified ideographs.
        companyNames = re.findall(r"[\u4e00-\u9fff]+", textSource) or None
        # Security codes: six digits.
        stockCodes = re.findall(r"\d{6}", textSource) or None
        print("公司简称:", companyNames)  # 公司简称: ['工商银行', '贵州茅台']
        print("证券代码:", stockCodes)  # 证券代码: ['600886', '000123']
        return companyNames, stockCodes


def _demo() -> None:
    """
    Print the results of several sample ``re`` calls, then prompt for a full
    name and print its first two whitespace-separated parts.
    """
    print(string.whitespace)

    # Named `text` rather than `str` so the built-in type is not shadowed.
    text = "Geovin Du"
    d = text.rstrip(' ')
    print(d)
    du = re.findall(r" (.+?)", text, re.S)
    print(du)
    pattern = r'[?|&]'  # delimiter characters
    url = 'http://www.baidu.com/login.jsp?username="wei"&pwd="123"'  # string to split
    result = re.split(pattern, url)  # split on the delimiters; maxsplit=0, flags=0
    print(result)
    newstr = re.findall(r"[a-z]*\S", text, re.M)
    print(newstr)
    result = re.search(pattern='.+ ', string=text, flags=re.S)
    print(result.group())
    result = re.findall(pattern='.+ ', string=text, flags=re.S)
    print("firstname:", result[0].rstrip())
    result = re.findall(pattern=' .*', string=text, flags=re.S)
    print("lastname:", result[0].lstrip())
    re_obj = re.compile(r'^\S+')
    result = re_obj.search(string=text)
    print(result)
    str_list = text.split(" ")  # filter(None, text.split(" "))
    print(str_list)
    print(re.findall(r"\S+", text))
    dustr = 'Janice is 22 and Theon is 33 Gabriel is 44 and Joey is 21'
    names = re.findall(r'[A-Z][a-z]*', dustr)
    print(names)
    names = re.findall(r'[A-Z][a-z]*\S', text)
    print(names)
    lstr = 'geovin du'
    names = re.findall(r'[A-Z][a-z]*|[a-z]*\S', lstr)
    print(names)

    regex = re.compile(" ")
    randstr = regex.sub(":", text)  # replace each space with a colon
    print(randstr)

    # Unrelated OCR / PDF snippet kept from the original notes:
    # ocr=BLL.ImageOCR.ImageOCR()
    # ocr.imageurl=r'6.png'
    # ocr.texturl=r'ocr.txt'
    # word=ocr.imagetowords(r'6.jpg')
    # ocr.savewords(r'ocr.txt',word)
    # pdf2docx.parse("CreateTwoColumnPDF.pdf","1.docx");

    # Regular expressions with the `re` module:
    # https://docs.python.org/zh-cn/3/library/re.html
    # 1. re.compile() builds a regex object describing what to match.
    # 2. re.findall() searches a given string for every match.
    #str1 = 'lukfook8-hongkong+90shenzhen-4hh h7+8facai-shanghai geovindu'
    fullname = input("please enter full name:")
    ls = DuString.strRegSplit(fullname)
    # Alternatives:
    #ls = DuString.strSplit(fullname, ' ')   # via the helper
    #ls = re.split(r'[-+.\s]', fullname)     # split on '+', '-', '.', whitespace
    #ls = re.split(r'[\s]', fullname)        # split on whitespace only
    # Blank or single-word input has fewer than two pieces; fall back to "".
    firstname = ls[0] if ls else ""
    lastname = ls[1] if len(ls) > 1 else ""
    print("firstname", firstname, ",lastname:", lastname)


# Run the demo only when executed as a script, so that importing the module
# (or merely defining the class) neither prints nor waits for keyboard input.
if __name__ == "__main__":
    _demo()

字符	含义	举例	备注	符合条件
.	一个任意字符	a..b	a开头b结尾，中间两个任意字符	a|2b
\w	一个字母/数字/下划线	\w...	字母/数字/下划线开头	o8js
\W	非字母/数字/下划线	\Wabc		#abc
\s	一个空白字符	a\sb		a\nb
\S	一个非空白字符	\S...	非空白字符开头，后跟三个任意字符	2jkh
\d	数字字符	\d\d\d		675
\D	非数字字符	\D\w\w\w		#h7_
[]	括号中任意一个字符[1-9]数字1到9 [a-z]小写 [A-Z]大写	[abc]aaa	a/b/c开头	caaa
[^字符集]	一个不在字符集中的任意字符	[^abc]...	非a/b/c开头	898i
^	字符串开头	^\d\d\d		866
$	字符串结尾	abc$		abc
\b	(检测边界)	abc\b\saaa	abc 后是单词边界，再接一个空白字符和 aaa	abc aaa
*	匹配≥0次	\d*	数字出现0次或多次	12312
+	匹配≥1次	\d+abc	1个或很多个数字开头	99abc
?	匹配0/1次	a?123	有a或者无a	a123
{N}	匹配N次
{M,N}	匹配M到N次
{M,}	至少匹配M次
{,N}	最多匹配N次

https://docs.python.org/zh-cn/3.11/howto/regex.html

match()和search()都只匹配出一个符合条件的字符串，若想要所有，可以使用re.findall()

语法	释义
|	或者
()	组合（将括号中的内容作为一个整体进行操作）；捕获——使用带括号的正则表达式匹配成功后，只获取括号中的内容；重复——在正则表达式中可以通过 \数字 来重复前面 () 中匹配到的结果，数字代表前面第几个分组
\	转义符号：在特殊的符号前加 \，让特殊的符号失去特殊含义，不管在哪儿都需要转义；- 在 [] 外面没有特殊功能，在 [] 中要表示 - 本身，就不要放在两个字符之间；() 需要转义
compile	将正则表达式字符串转换成正则表达式对象
fullmatch/match	匹配对象或者None
string	获取被匹配的原字符串
findall	获取字符串中满足正则表达式的所有的子串，返回一个列表
finditer	查找所有满足正则条件的子串，返回值是迭代器，迭代器中的元素是匹配对象
split	将字符串按照满足正则表达式条件的子串进行分割
sub(正则,repl,字符串)	字符串中满足正则表达式条件的子串替换成repl。返回替换后的字符串

posted @ 2024-04-28 11:57 ®Geovin Du Dream Park™ 阅读(10) 评论(0) 编辑收藏举报

®Geovin Du Dream Park™

Why do we only hear about haves and have-nots, but never about doers and doer-nots? 人生是一种心境,生活是一种艺术,成功是一种心态,幸福是一种感觉,竞争是一种建构,情感是一种融合.学习是一种成长.

Python: Regular expressions

公告