Python正则表达式
Python正则表达式语法及应用
一、re模块常用方法:
1、match(pattern, string, flags=0)
从字符串的起始位置开始,按照模式(pattern)匹配指定内容,只返回第一个匹配结果;如果起始位置不匹配则返回 None
- pattern:正则表达式
- string:要匹配的字符串
- flags:标志位,用于控制正则表达式的匹配方式
import re
obj = re.match('\d+', '123uuasf')
if obj:
    print(obj.group())
flags 常用取值:re.I(忽略大小写)、re.M(多行模式)、re.S(使 . 也能匹配换行符)、re.X(允许在正则中写注释和空白)
2、search(pattern, string, flags=0)
在整个字符串中按照模式(pattern)查找指定内容,只返回第一个匹配结果(不要求从起始位置开始匹配)
import re
obj = re.search('\d+', 'u123uu888asf')
if obj:
    print(obj.group())
3、group和groups
a = "123abc456" print re.search("([0-9]*)([a-z]*)([0-9]*)", a).group() print re.search("([0-9]*)([a-z]*)([0-9]*)", a).group(0) print re.search("([0-9]*)([a-z]*)([0-9]*)", a).group(1) print re.search("([0-9]*)([a-z]*)([0-9]*)", a).group(2) print re.search("([0-9]*)([a-z]*)([0-9]*)", a).groups()
4、findall(pattern, string, flags=0)
上述两种方式(match 和 search)均用于匹配单值,即:只能匹配字符串中的一个,如果想要匹配到字符串中所有符合条件的元素,则需要使用 findall。
import re
obj = re.findall('\d+', 'fa123uu888asf')
print(obj)  # ['123', '888']
5、sub(pattern, repl, string, count=0, flags=0)
用于替换匹配的字符串
content = "123abc456"
new_content = re.sub('\d+', 'sb', content)
# new_content = re.sub('\d+', 'sb', content, 1)
print(new_content)  # sbabcsb
相比于str.replace功能更加强大
6、split(pattern, string, maxsplit=0, flags=0)
根据指定的匹配模式对字符串进行分割
content = "'1 - 2 * ((60-30+1*(9-2*5/3+7/3*99/4*2998+10*568/14))-(-4*3)/(16-3*2) )'" new_content = re.split('\*', content) # new_content = re.split('\*', content, 1) print(new_content) ["'1 - 2 ", ' ((60-30+1', '(9-2', '5/3+7/3', '99/4', '2998+10', '568/14))-(-4', '3)/(16-3', "2) )'"] content = "'1 - 2 * ((60-30+1*(9-2*5/3+7/3*99/4*2998+10*568/14))-(-4*3)/(16-3*2) )'" new_content = re.split('[\+\-\*\/]+', content) # new_content = re.split('\*', content, 1) print(new_content) ["'1 ", ' 2 ', ' ((60', '30', '1', '(9', '2', '5', '3', '7', '3', '99', '4', '2998', '10', '568', '14))', '(', '4', '3)', '(16', '3', "2) )'"] inpp = '1-2*((60-30 +(-40-5)*(9-2*5/3 + 7 /3*99/4*2998 +10 * 568/14 )) - (-4*3)/ (16-3*2))' inpp = re.sub('\s*','',inpp) new_content = re.split('\(([\+\-\*\/]?\d+[\+\-\*\/]?\d+){1}\)', inpp, 1) print(new_content) ['1-2*((60-30+', '-40-5', '*(9-2*5/3+7/3*99/4*2998+10*568/14))-(-4*3)/(16-3*2))']
相比于str.split更加强大
二、单字符匹配
In [8]: ma = re.match(r'.','b') In [9]: ma.gro ma.group ma.groupdict ma.groups In [9]: ma.group() Out[9]: 'b' In [10]: ma = re.match(r'.','0') In [11]: ma.grou ma.group ma.groupdict ma.groups In [11]: ma.group() Out[11]: '0' In [12]: clear In [13]: ma = re.match(r'{.}','{a}') In [14]: ma.group() Out[14]: '{a}' In [15]: ma = re.match(r'{.}','{0}') In [16]: ma.grou ma.group ma.groupdict ma.groups In [16]: ma.group() Out[16]: '{0}' In [17]: ma = re.match(r'{..}','{01}') In [18]: ma.group() Out[18]: '{01}' In [19]: ma = re.match(r'{[abc]}','{a}') In [20]: ma.group() Out[20]: '{a}' In [21]: ma = re.match(r'{[a-z]}','{d}') In [22]: ma.group() Out[22]: '{d}' In [23]: ma = re.match(r'{[a-zA-Z]}','{A}') In [24]: ma.group() Out[24]: '{A}' In [25]: ma = re.match(r'{[a-zA-Z0-9]}','{0}') In [26]: ma.group() Out[26]: '{0}' In [27]: ma = re.match(r'{[\w]}','{ }') In [28]: ma In [29]: ma = re.match(r'{[\W]}','{ }') In [30]: ma Out[30]: <_sre.SRE_Match object; span=(0, 3), match='{ }'> In [31]: ma.group() Out[31]: '{ }' In [32]: ma = re.match(r'{[\W]}','{9}') In [33]: ma.group() --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-33-7c62fc675aee> in <module>() ----> 1 ma.group() AttributeError: 'NoneType' object has no attribute 'group' In [34]: ma In [35]: ma = re.match(r'[[\w]]','[a]') In [36]: ma In [37]: ma = re.match(r'\[[\w]\]','[a]') In [38]: ma.group() Out[38]: '[a]' In [39]: ma = re.match(r'\[[\w]\]','[0]') In [40]: ma.group() Out[40]: '[0]'
三、表示数量
In [1]: import re In [2]: ma = re.match(r'[A-Z][a-z]','Aa') In [3]: ma.grou ma.group ma.groupdict ma.groups In [3]: ma.group() Out[3]: 'Aa' In [4]: ma = re.match(r'[A-Z][a-z]','A') In [6]: ma In [8]: In [8]: ma = re.match(r'[A-Z][a-z]*','A') In [9]: ma Out[9]: <_sre.SRE_Match object; span=(0, 1), match='A'> In [10]: ma.group() Out[10]: 'A' In [12]: ma = re.match(r'[A-Z][a-z]*','Asdsdwqass') In [14]: ma. ma.end ma.group ma.lastgroup ma.re ma.start ma.endpos ma.groupdict ma.lastindex ma.regs ma.string ma.expand ma.groups ma.pos ma.span In [14]: ma.group() Out[14]: 'Asdsdwqass' In [15]: ma = re.match(r'[A-Z][a-z]*','1Asdsdwqass') In [16]: ma In [17]: ma = re.match(r'[A-Z][a-z]*','Asd1sdwqass') In [18]: ma.group() Out[18]: 'Asd' In [19]: ma = re.match(r'[_a-zA-Z]+[_\w]*','10') In [20]: ma In [21]: ma = re.match(r'[_a-zA-Z]+[_\w]*','_ht11') In [22]: ma.group() Out[22]: '_ht11' In [23]: ma = re.match(r'[1-9]?[0-9]','99') In [24]: ma.group() Out[24]: '99' In [25]: ma = re.match(r'[1-9]?[0-9]','90') In [26]: ma.group() Out[26]: '90' In [27]: ma = re.match(r'[1-9]?[0-9]','9') In [28]: ma.group() Out[28]: '9' In [29]: ma = re.match(r'[1-9]?[0-9]','0') In [30]: ma.group() Out[30]: '0' In [31]: ma = re.match(r'[1-9]?[0-9]','09') In [32]: ma.group() Out[32]: '0' In [33]: ma = re.match(r'[[a-zA-Z0-9]{6}','abc123') In [34]: ma.group() Out[34]: 'abc123' In [35]: ma = re.match(r'[[a-zA-Z0-9]{6}','abc1234') In [36]: ma.group() Out[36]: 'abc123' In [37]: ma = re.match(r'[[a-zA-Z0-9]{6}','abc1__') In [38]: ma In [39]: ma = re.match(r'[[a-zA-Z0-9]{6}@163.com','abc123@163.com') In [40]: ma.group() Out[40]: 'abc123@163.com' In [41]: ma = re.match(r'[[a-zA-Z0-9]{6,10}@163.com','abc1234@163.com') In [42]: ma.grou ma.group ma.groupdict ma.groups In [42]: ma.group() Out[42]: 'abc1234@163.com' In [43]: ma = re.match(r'[0-9][a-z]*?','1bc') In [44]: ma.group() Out[44]: '1' In [45]: ma = re.match(r'[0-9][a-z]*','1bc') In [46]: ma.group() Out[46]: '1bc'
四、表示边界
In [48]: ma = re.match(r'[[a-zA-Z0-9]{6,10}@163.com','abc1234@163.comabc') In [49]: ma.group() Out[49]: 'abc1234@163.com' In [50]: ma = re.match(r'[[a-zA-Z0-9]{6,10}@163.com$','abc1234@163.comabc') In [51]: ma In [52]: ma = re.match(r'^[[a-zA-Z0-9]{6,10}@163.com$','abc1234@163.com') In [53]: ma.group() Out[53]: 'abc1234@163.com' In [54]: ma = re.match(r'\Aimooc[\w]*','imoocpython') In [55]: ma.group() Out[55]: 'imoocpython' In [56]: ma = re.match(r'\Aimooc[\w]*','iimooc') In [57]: ma.group() --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-57-7c62fc675aee> in <module>() ----> 1 ma.group() AttributeError: 'NoneType' object has no attribute 'group'
# 手机号匹配 result = re.match(r'1[35678]\d{9}$','15735177116') result <_sre.SRE_Match object; span=(0, 11), match='15735177116'> result.group() '15735177116'
五、分组匹配
In [59]: ma = re.match(r'abc|d','abc') In [60]: ma.group() Out[60]: 'abc' In [61]: ma = re.match(r'abc|d','d') In [62]: ma.group() Out[62]: 'd' In [63]: ma = re.match(r'[1-9]?\d$','9') In [64]: ma.group() Out[64]: '9' In [65]: ma = re.match(r'[1-9]?\d$','99') In [66]: ma.group() Out[66]: '99' In [67]: ma = re.match(r'[1-9]?\d$','09') In [68]: ma In [69]: ma = re.match(r'[1-9]?\d$','100') In [70]: ma.group() --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-70-7c62fc675aee> in <module>() ----> 1 ma.group() AttributeError: 'NoneType' object has no attribute 'group' In [71]: ma = re.match(r'[1-9]?\d$|100','100') In [72]: ma.group() Out[72]: '100' In [73]: ma = re.match(r'[1-9]?\d$|100','99') In [74]: ma.group() Out[74]: '99' In [75]: ma = re.match(r'[\w]{4,6}@163.com','imooc@163.com') In [76]: ma.group() Out[76]: 'imooc@163.com' In [77]: ma = re.match(r'[\w]{4,6}@(163,123).com','imooc@163.com') In [78]: ma = re.match(r'[\w]{4,6}@(163,123).com','imooc@123.com') In [79]: ma.group() --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-79-7c62fc675aee> in <module>() ----> 1 ma.group() AttributeError: 'NoneType' object has no attribute 'group' In [80]: ma = re.match(r'[\w]{4,6}@(163|123).com','imooc@123.com') In [81]: ma.group() Out[81]: 'imooc@123.com' In [82]: ma = re.match(r'<[\w]+>','<book>') In [83]: ma.group() Out[83]: '<book>' In [84]: ma = re.match(r'<([\w]+>)','<book>') In [85]: ma.group() Out[85]: '<book>' In [86]: ma = re.match(r'<([\w]+>)\1','<book>') In [87]: ma.groups() --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-87-f4e4ca66607d> in <module>() ----> 1 ma.groups() AttributeError: 'NoneType' object has no attribute 'groups' In [88]: ma = re.match(r'<([\w]+>)\1','<book>book>') In [89]: 
ma.groups() Out[89]: ('book>',) In [90]: ma.group() Out[90]: '<book>book>' In [91]: ma = re.match(r'<([\w]+>\1','<book>book>') In [3]: ma = re.match(r'<([\w]+>)[\w]+</\1','<book>python</book>') In [4]: ma.group() Out[4]: '<book>python</book>' In [5]: ma = re.match(r'<([\w]+>)[\w]+</\1','<book>python</book1>') In [6]: ma In [9]: ma = re.match(r'<(?P<mark>[\w]+>)[\w]+</(?P=mark)','<book>python</book>') In [10]: ma.group() Out[10]: '<book>python</book>'
# 匹配邮箱 p = '(\w+)@(163|126|gmail|qq)\.(com|cn|net)$' r = re.match(p,'zhang@qq.com') r <_sre.SRE_Match object; span=(0, 12), match='zhang@qq.com'> r.group() 'zhang@qq.com'
六、python贪婪和非贪婪
Python里数量词默认是贪婪的(在少数语言里也可能是默认非贪婪),总是尝试匹配尽可能多的字符;非贪婪则相反,总是尝试匹配尽可能少的字符。
在"*","?","+","{m,n}"后面加上?,使贪婪变成非贪婪。
s = 'this is a number 234-235-22-432' r = re.match(r'.+(\d+-\d+-\d+-\d+)',s) r.group(1) Out[32]: '4-235-22-432'
咦?怎么和我们想的不一样啊?这是因为Python的数量词默认是贪婪的:开头的 .+ 会尽可能多地匹配字符,只给后面的第一个 \d+ 留下满足匹配所需的最少一位数字 4,所以分组结果是 '4-235-22-432'。解决方法:在 * 或 + 后面加上 ?,使其变为非贪婪匹配。
r = re.match(r'(.+?)(\d+-\d+-\d+-\d+)',s) r.groups() Out[33]: ('this is a number ', '234-235-22-432') r = re.match(r'(.+?)(\d+-\d+-\d+-\d+)',s) r.group(1) Out[34]: 'this is a number ' r = re.match(r'(.+?)(\d+-\d+-\d+-\d+)',s) r.group(2) Out[35]: '234-235-22-432'