正则表达式
re功能函数
-
findall,获取匹配到的所有数据
import re

text = "dsf130429191912015219k13042919591219521Xkk"
# findall returns every non-overlapping match as a list of strings.
# The pattern is 17 digits followed by a digit or "X" (presumably an
# 18-character ID number); a raw string keeps the \d escapes intact.
res = re.findall(r'\d{6}\d{4}\d{2}\d{2}\d{3}[\dX]', text)
print(res)  # ['130429191912015219', '13042919591219521X']
-
match,从起始位置开始匹配,匹配成功返回一个对象,未匹配成功返回None
import re

text = "大小逗2B最逗3B欢乐"
# re.match only tries the pattern at position 0; the text starts with "大",
# so nothing matches and None is returned.
res = re.match(r'逗\dB', text)
print(res)  # None
import re

text = "逗2B最逗3B欢乐"
# The text starts with the pattern, so re.match returns a Match object.
res = re.match(r'逗\dB', text)
print(res)  # <re.Match object; span=(0, 3), match='逗2B'>
print(res.group())  # 逗2B
-
search,浏览整个字符串去匹配第一个,未匹配成功返回None
import re

text = "大小逗2B最逗3B欢乐"
# re.search scans the whole string and returns the first match (or None).
res = re.search(r"逗\dB", text)
print(res)  # <re.Match object; span=(2, 5), match='逗2B'>
print(res.group())  # 逗2B
-
sub,替换匹配成功的位置
import re

text = "逗2B最逗3B欢乐逗5B"
# re.sub replaces every match of the pattern with the replacement text.
res = re.sub(r"逗\dB", "alex", text)
print(res)  # alex最alex欢乐alex
import re

text = "逗2B最逗3B欢乐逗5B"
# count limits the replacement to the first N matches (here the first 2).
# It is passed by keyword: positional count is deprecated since Python 3.13.
res = re.sub(r"逗\dB", "john", text, count=2)
print(res)  # john最john欢乐逗5B
-
split,根据匹配成功的位置分割
import re

text = "逗2B最逗3B欢乐"
# re.split cuts the string at every place the pattern matches.
res = re.split(r"\dB", text)
print(res)  # ['逗', '最逗', '欢乐']
import re

text = "逗2B最逗3B欢乐"
# maxsplit limits the number of splits (here only the first match is used).
# It is passed by keyword: positional maxsplit is deprecated since Python 3.13.
res = re.split(r"\dB", text, maxsplit=1)
print(res)  # ['逗', '最逗3B欢乐']
-
finditer
import re

text = "dsf130429191912015219k13042919591219521Xkk"
# finditer yields one Match object per match instead of building a list.
# Note: this pattern covers only 15 characters (6 + 4 + 2 + 2 + 1), so each
# match is a prefix of the longer digit run in the text.
res = re.finditer(r'\d{6}(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})[\dX]', text)
for item in res:
    print(item.group())
# Output:
# 130429191912015
# 130429195912195
import re

text = "dsf130429191912015219k13042919591219521Xkk"
# Named groups (?P<name>...) can be read back as a dict with groupdict().
res = re.finditer(r'\d{6}(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})[\dX]', text)
for item in res:
    print(item.groupdict())
# Output:
# {'year': '1919', 'month': '12', 'day': '01'}
# {'year': '1959', 'month': '12', 'day': '19'}
3.2 正则表达式相关
1. 字符相关
-
alex
# Match every occurrence of the literal text "alex".
import re

text = '你好alex,阿斯顿发alexhaha 阿士大夫能接受的alexff'
res = re.findall("alex", text)
print(res)  # ['alex', 'alex', 'alex']
-
[abc]
# [abc] matches a single character that is a, b, or c.
# Here the set is a, e, x, so every such character is returned on its own.
import re

text = '你好alex,阿斯顿发alexhaha 阿士大夫能接受的alexff'
res = re.findall('[aex]', text)
print(res)  # ['a', 'e', 'x', 'a', 'e', 'x', 'a', 'a', 'a', 'e', 'x']
import re

text = '你好alex,阿斯顿发alexhaha 阿士大夫能接受的alexab'
# "a" followed by exactly one character from the set: l or b.
res = re.findall('a[lb]', text)
print(res)  # ['al', 'al', 'al', 'ab']
-
[^abc]
# [^abc] matches any single character other than a, b, or c.
# Here every character except a, l, e, x, h, b is returned.
import re

# The snippet needs its own text; this is the one that yields the output below.
text = '你好alex,阿斯顿发alexhaha 阿士大夫能接受的alexab'
res = re.findall('[^alexhb]', text)
print(res)  # ['你', '好', ',', '阿', '斯', '顿', '发', ' ', '阿', '士', '大', '夫', '能', '接', '受', '的']
-
[a-z]
# [a-z] matches any single character from a to z ([0-9] works the same way).
import re

text = "alexrootrootadmin"
res = re.findall("[a-z]", text)
print(res)  # ['a', 'l', 'e', 'x', 'r', 'o', 'o', 't', 'r', 'o', 'o', 't', 'a', 'd', 'm', 'i', 'n']
-
.
# . stands for any single character except a newline.
import re

text = "alexraotrootadmin"
res = re.findall('r.o', text)
print(res)  # ['rao', 'roo']
import re

text = "alexraotrootadmin"
# . matches any character except a newline; + means "at least once".
# + is greedy, so the match runs from the first "r" to the last reachable "o".
res = re.findall('r.+o', text)
print(res)  # ['raotroo']
import re

text = "alexraotrootadmin"
# . matches any character except a newline; + means "at least once".
# The trailing ? makes + non-greedy: it matches as few characters as
# possible instead of as many as possible.
res = re.findall('r.+?o', text)
print(res)  # ['rao', 'roo']
-
\w
# \w stands for a letter, digit, or underscore (Chinese characters match too).
import re

text = "北京johnalex齐北 京johnalex6"
# With groups in the pattern, findall returns one tuple of groups per match:
# the outer group is the whole "john..." run, the inner one its last character.
res = re.findall(r'(john\w+(x|6))', text)
print(res)  # [('johnalex', 'x'), ('johnalex6', '6')]
-
\d
# \d stands for a single digit.
import re

text = "root-ad32min-add3-admd1in"
res = re.findall(r"d\d", text)
print(res)  # ['d3', 'd3', 'd1']
import re

text = "root-ad32min-add3-admd1in"
# \d+ takes one or more consecutive digits after the "d".
res = re.findall(r"d\d+", text)
print(res)  # ['d32', 'd3', 'd1']
-
\s
# \s stands for any whitespace character, including spaces and tabs.
import re

text = "root admin add admin"
res = re.findall(r"a\w+\s\w+", text)
print(res)  # ['admin add']
2. 数量相关
-
*
# * repeats the preceding item zero or more times.
import re

text = "他是大B个,确实是个大2B。"
res = re.findall(r"大\d*B", text)
print(res)  # ['大B', '大2B']
-
+
# + repeats the preceding item one or more times.
import re

text = "他是大B个,确实是个大2B,大3B,大66666B。"
res = re.findall(r"大\d+B", text)
print(res)  # ['大2B', '大3B', '大66666B']
-
?
# ? repeats the preceding item zero or one time.
import re

text = "他是大B个,确实是个大2B,大3B,大66666B。"
res = re.findall(r"大\d?B", text)
print(res)  # ['大B', '大2B', '大3B']
-
{n}
# {n} repeats the preceding item exactly n times.
import re

text = "他是大B个,确实是个大2B,大3B,大66666B。"
res = re.findall(r'大\d{5}B', text)
print(res)  # ['大66666B']
-
{n,}
# {n,} repeats the preceding item n or more times (here n is 0).
import re

text = "他是大B个,确实是个大2B,大325B,大66666B。"
res = re.findall(r'大\d{0,}B', text)
print(res)  # ['大B', '大2B', '大325B', '大66666B']
-
{n,m}
# {n,m} repeats the preceding item between n and m times (here 3 to 5).
import re

text = "他是大B个,确实是个大2B,大325B,大66666B。"
res = re.findall(r'大\d{3,5}B', text)
print(res)  # ['大325B', '大66666B']
3. 括号(分组)
-
提取数据区域
import re

text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
# No groups in the pattern: findall returns the whole matched text.
res = re.findall(r"151312\d{5}", text)
print(res)  # ['15131255789']
import re

text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来15131266666呀"
# Two groups: findall returns one (group1, group2) tuple per match and
# drops the parts of the match that are outside the parentheses.
res = re.findall(r"1513(12)(\d{5})", text)
print(res)  # [('12', '55789'), ('12', '66666')]
import re

text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
# Nested groups: the outer group captures the full number, the inner one
# only its last five digits.
res = re.findall(r"(151312(\d{5}))", text)
print(res)  # [('15131255789', '55789')]
-
获取指定区域 + 或条件
import re

text = "楼主15131root太牛15131alex逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
# Only "15131" followed by six digits matches; "15131root" and "15131alex" do not.
res = re.findall(r"(15131(\d{6}))", text)
print(res)  # [('15131255789', '255789')]
import re

text = "楼主15131root太牛15131alex逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
# | inside the group is an "or": after "15131" accept either six digits
# or "r", some word characters, and then "太".
res = re.findall(r"(15131(\d{6}|r\w+太))", text)
print(res)  # [('15131root太', 'root太'), ('15131255789', '255789')]
4. 起始和结束
上述示例中都是去一段文本中提取数据,只要文本中存在即可。
但,如果要求用户输入的内容必须以指定的内容开头和结尾,那就需要用到如下两个字符。
-
^
开始
-
$
# $ marks the end of the string (^ marks the start).
import re

text = "啊442662578@qq.com我靠"
# re.ASCII restricts \w to [a-zA-Z0-9_], so the leading "啊" cannot match at
# ^ and the trailing "我靠" cannot match before $. The dot is escaped so it
# only matches a literal "." rather than any character.
email_list = re.findall(r"^\w+@\w+\.\w+$", text, re.ASCII)
print(email_list)  # []
import re

text = "442662578@qq.com"
# ^ and $ force the pattern to cover the whole string, not just a part of it.
email_list = re.findall(r"^\w+@\w+\.\w+$", text, re.ASCII)
print(email_list)  # ['442662578@qq.com']
这种一般用于对用户输入数据格式的校验比较多,例如:
import re

text = input("请输入邮箱:")
# The dot is escaped so it only matches a literal "."; left unescaped it
# matches any character and would accept input such as "a@bcd".
email = re.findall(r"^\w+@\w+\.\w+$", text, re.ASCII)
if not email:
    print("邮箱格式错误")
else:
    print(email)
5. 特殊字符
由于正则表达式中 * . \ { } ( )
等都具有特殊的含义,所以如果想要在正则中匹配这种指定的字符,需要转义,例如:
# Incorrect attempt: an unescaped {5} is read as a repetition count, so the
# pattern asks for "你" five times in a row instead of the literal text "{5}".
import re

text = "我是你{5}哈哈"
pattern = re.compile("你{5}哈哈")
res = pattern.findall(text)
print(res)  # []
import re

text = "我是你{5}哈哈"
# \{ and \} escape the braces so they match literally. The raw string keeps
# the backslashes from being treated as (invalid) string escape sequences.
res = re.findall(r"你\{5\}哈", text)
print(res)  # ['你{5}哈']