第1章 1.9 深入研究正则表达式
示例如下:
>>> import re
>>> match = re.search(r'the phone number is ([\d-]+)', '37: the phone number is 1234-567-890')
#\d: 标记任何数字(0-9)。
>>> match.group()
'the phone number is 1234-567-890'
>>> match.group(0)
'the phone number is 1234-567-890'
#group()或group(0)总是代表整个匹配部分的内容。
>>>
>>> match.group(1)
'1234-567-890'
#group(1)指第一个括号分组(捕获组)所匹配的内容。
>>>
>>> pattern = re.compile(r'The answer to question (\w+) is (yes|no)', re.IGNORECASE)
#\w: 标记任何字母、数字或下划线(不包括句号等标点字符); '(yes|no)'表示匹配'yes'或者'no'; re.IGNORECASE指不区分大小写。
>>>
>>> pattern.search('Naturaly, the answer to question 3b is YES')
<re.Match object; span=(10, 42), match='the answer to question 3b is YES'>
>>>
>>> _.groups()
('3b', 'YES')
#'_'是交互式解释器中保存上一次表达式结果的特殊变量。执行'_.groups()'之后,'_'变成了返回的元组,所以再次执行'_.groups()'就会报错。
>>> _.groups()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'tuple' object has no attribute 'groups'
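>>> pattern.search('Naturaly, the answer to question 3b is YES')
<re.Match object; span=(10, 42), match='the answer to question 3b is YES'>
>>> _.groups(1)
('3b', 'YES')
#此处'_.groups(1)'中的参数1只是未参与匹配的分组的默认值,不同于'match.group(1)'(返回第一个分组的内容)。注意:报错不会更新'_',所以必须先重新执行search()。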
>>> pattern.search('Naturaly, the answer to question 3b is YES')
<re.Match object; span=(10, 42), match='the answer to question 3b is YES'>
>>> _.groups(2)
('3b', 'YES')
#此处'_.groups(2)'中的参数2同样只是未参与匹配的分组的默认值,不同于'match.group(2)'(返回第二个分组的内容)。
>>> pattern.search('Naturaly, the answer to question 3b is YES').group()
'the answer to question 3b is YES'
>>> pattern.search('Naturaly, the answer to question 3b is YES').group(1)
'3b'
#此处的'group(1)'指第一个括号分组所匹配的内容。
>>> pattern.search('Naturaly, the answer to question 3b is YES').group(2)
'YES'
#此处的'group(2)'指第二个括号分组所匹配的内容;如果指定的分组不存在就会报错。如下:
>>> pattern.search('Naturaly, the answer to question 3b is YES').group(3)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
IndexError: no such group
>>> PATTERN = re.compile(r'([A-Z][\w\s]+).(TX|OR|OH|MI)')
#\s: 标记任何空白字符,包括制表符和其他显示为空白的特殊字符(与'\S'相反,'\S'标记任何非空白字符)。'.': 标记除换行符以外的任何字符。'([A-Z][\w\s]+)'指以大写字母开头,后面跟一个或多个字母、数字、下划线或空白字符,但不接受句号或逗号等标点符号。
>>> TEXT='the jackalopes are the team of Odessa,TX while the knights are native of Corvallis,OR and the mud hens come from Toledo.OH; the whitecaps have their base in Grand Rapids,MI'
>>> list(PATTERN.finditer(TEXT))
[<re.Match object; span=(31, 40), match='Odessa,TX'>, <re.Match object; span=(73, 85), match='Corvallis,OR'>, <re.Match object; span=(113, 122), match='Toledo.OH'>, <re.Match object; span=(157, 172), match='Grand Rapids,MI'>]
>>>
>>> TEXT='the jackalopes are the team of Odessa,TX while the knights are native of Corvallis OR and the mud hens come from Toledo.OH; the whitecaps have their base in Grand Rapids,MI'
>>> list(PATTERN.finditer(TEXT))
[<re.Match object; span=(31, 40), match='Odessa,TX'>, <re.Match object; span=(73, 122), match='Corvallis OR and the mud hens come from Toledo.OH'>, <re.Match object; span=(157, 172), match='Grand Rapids,MI'>]
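>>> TEXT='the jackalopes are the team of Odessa,TX while the knights are native of Corvallis-OR and the mud hens come from Toledo.OH; the whitecaps have their base in Grand Rapids,MI'
>>> list(PATTERN.finditer(TEXT))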
[<re.Match object; span=(31, 40), match='Odessa,TX'>, <re.Match object; span=(73, 85), match='Corvallis-OR'>, <re.Match object; span=(113, 122), match='Toledo.OH'>, <re.Match object; span=(157, 172), match='Grand Rapids,MI'>]
>>> list(PATTERN.findall(TEXT))
[('Odessa', 'TX'), ('Corvallis', 'OR'), ('Toledo', 'OH'), ('Grand Rapids', 'MI')]
#findall()方法只返回匹配到的字符串(模式中有多个分组时,返回由各分组内容组成的元组),而finditer()方法返回的是迭代器,其中每个元素都是完整的匹配对象。
>>> PATTERN.search(TEXT)
<re.Match object; span=(31, 40), match='Odessa,TX'>
#search()中返回第一个匹配对象。
#\D: 标记任何非数字字符。
#\W: 标记任何非单词字符(即除字母、数字和下划线以外的字符)。
#\B: 匹配不在单词开头或结尾的位置(零宽断言,本身不消耗任何字符)。
>>> TEXT='the jackalopes are the team of Odessa,TX while the knights are native of Corvallis-OR and the mud hens come from Toledo.OH; the whitecaps have their base in Grand Rapids,MI'
>>> PATTERN = re.compile(r'(?P<city>[A-Z][\w\s]+?).(?P<state>TX|OR|OH|MI)')
#可以为分组命名,语法为:(?P<groupname>PATTERN),之后可通过名称访问该分组。
>>>
>>> PATTERN.search(TEXT)
<re.Match object; span=(31, 40), match='Odessa,TX'>
>>> match = PATTERN.search(TEXT)
>>> match.groupdict()
{'city': 'Odessa', 'state': 'TX'}
#使用groupdict()返回字典。
>>> match.group('city')
'Odessa'
>>> match.group('state')
'TX'
>>> match.group(1), match.group(2)
('Odessa', 'TX')