Python中re模块的使用
Python的re模块
#预备知识点
#正则表达式regex
#特殊符号和字符 ---> 元字符
通配符 | 含义 | 正则示例 | 匹配结果 |
---|---|---|---|
reg1\|reg2 | 匹配正则表达式reg1或reg2 | foo\|bar | foo |
. | 匹配任何字符(\n除外) | a.a | abc |
^ | 匹配字符串起始部分 | ^a | ab.... |
$ | 匹配字符串终止部分 | .txt$ | a.txt |
* | 匹配0次或者多次前面出现的正则表达式 | a* | aaaaa |
+ | 匹配1次或者多次前面出现的正则表达式 | [a-z]+ | aasx |
? | 匹配0次或者1次前面出现的正则表达式 | first? | first |
{N} | 匹配N次前面出现的正则表达式 | .*\.c{2} | first.cc abc.cc |
{M,N} | 匹配M~N次前面出现的正则表达式 | .*\.c{0,1} | one.c |
[...] | 匹配来自字符集的任意单个字符 | [abc] | b |
[...x-y...] | 匹配x~y范围中的任意单个字符 | [0-9] | 9 |
[^...] | 匹配不在此字符集中的任意单个字符 | [^0-9] | a |
(\*\|+\|?\|{})? | 匹配上面重复符号(*、+、?、{})的非贪婪版本 | .*?[a-z] | 123a(在字符串123abc中) |
(...) | 匹配封闭的正则表达式,然后另存为子组 | ([0-1][0-9])? | 12 |
\d | 匹配任何十进制数字 | \d.txt | 1.txt |
\w | 匹配任何字母数字字符(包括下划线) | \w{2}txt | a1txt |
\s | 匹配任何空格字符 | a\sb | a b |
\b | 匹配任何单词边界 | \bThe\b | The |
\N | 匹配已保存的第N个子组 | ([0-9])\1 | 11 |
\. | 匹配"."这个字符 | a\.txt | a.txt |
正则表达式 | 描述 | 匹配结果 |
---|---|---|
\d+(\.\d*)? | 任意整数和浮点数 | 0.004 2 75. |
\b[^\Wa-z0-9_][^\WA-Z0-9_]*\b | 首字母只能大写 | Boo Foo |
^http:\/\/([\w-]+(\.[\w-]+)+(\/[\w-.\/\?%&=\u4e00-\u9fa5]*)?)?$ | 验证网址 | http://www.baidu.com/?id=1 |
^[\u4e00-\u9fa5]{0,}$ | 验证汉字 | 汉字汉字 |
\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)* | 验证电子邮件 | example@163.com |
^[1-9]([0-9]{16}|[0-9]{13})[xX0-9]$ | 验证身份证 | 14525419951215445X |
^13[0-9]{1}[0-9]{8}|^15[9]{1}[0-9]{8} | 验证手机号 | 138459572*** |
^(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])$ | 验证IP | 192.168.1.1 |
^[a-zA-Z0-9]+([a-zA-Z0-9\-\.]+)?\.(com|org|net|cn|com\.cn|edu\.cn)$ | 验证域名 | baidu.com |
^([a-zA-Z]\:|\\)\\([^\\]+\\)*[^\/:*?"<>|]+\.txt(l)?$ | 验证文件路径 | C:\user\wo.txt |
<(.*)>(.*)<\/(.*)>|<(.*)\/> | HTML标签匹配 | <a>xxxx</a> |
#re模块
#常用的方法
compile(pattern, flags = 0) 使用任何可选的标记来编译正则表达式的模式,然后返回一个正则表达式对象
match(pattern, string, flags = 0) 使用带有可选标记的正则表达式的模式来匹配字符串。如果匹配成功,返回匹配对象,否则返回None
search(pattern, string ,flags = 0) 使用可选标记搜索字符串中第一次出现的正则表达式模式。如果匹配成功,则返回匹配对象,否则返回None
findall(pattern, string[,flags] ) 查找字符串中所有(非重叠)出现的正则表达式模式,并返回一个匹配列表
finditer(pattern, string[,flags] ) 与findall()相同,但返回的是一个迭代器。对于每一次匹配,迭代器都能返回一个匹配对象
split(pattern, string, maxsplit = 0) 根据正则表达式的模式分隔符,split函数将字符串分割为列表并返回该列表,最多分割maxsplit次(默认为0,表示在所有匹配位置分割)
group(num = 0) 匹配对象的方法:返回整个匹配到的字符串(num为0时),或者编号为num的特定子组
import re

# search() scans the whole string. match() would find nothing here, because
# it only tries to match at the very start of the string (the first 'a').
m = re.search('foo', 'asdasdfooasd')
if m is not None:
    print(m.group())

# Likewise, only search() can locate the tag: the text does not start with '<'.
regex = r'<(.*)>(.*)<\/(.*)>|<(.*)\/>'
m = re.search(regex, "aa<a>aaaa</a>")
if m is not None:
    print(m.group())

# group(n) returns a single subgroup; groups() returns all subgroups as a tuple.
regex = r'(foo\w)(\w)'
m = re.match(regex, 'fooasdfooasd')
if m is not None:
    print(m.group(1))
    print(m.groups())
# Output:
# fooa
# ('fooa', 's')

# findall() returns every non-overlapping match as a list of strings.
regex = 'apple'
m = re.findall(regex, 'apple1 apple2 apple3')
print(m)
# Output:
# ['apple', 'apple', 'apple']

# finditer() yields match objects instead; group() extracts the matched text.
regex = 'apple'
m = [g.group() for g in re.finditer(regex, 'apple1 apple2 apple3')]
print(m)
# Output:
# ['apple', 'apple', 'apple']

# split() on either ", " or a single space that is followed by three
# lowercase letters.
lines = [
    'aaa, bbb ccc',
    'ddd, eee fff',
]
for line in lines:
    print(re.split(', |(?= (?:[a-z]{3})) ', line))
# Output:
# ['aaa', 'bbb', 'ccc']
# ['ddd', 'eee', 'fff']
re模块小实例:
__author__ = 'cq'

import re
from random import randrange, choice, randint
from string import ascii_lowercase as lc
from time import ctime

# Default location of the generated data file, shared by every function below.
DATA_FILE = './data.txt'

# Top-level domains a generated e-mail address may end with.
TLDS = ('com', 'edu', 'net', 'org', 'gov')


def _find_all(regex, path):
    """Return ``re.findall(regex, text)`` over the full contents of *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return re.findall(regex, f.read())


def generate_data(path=DATA_FILE):
    """Write 20-30 random records to *path* and print each one.

    Every record has the form
    ``<ctime string>::<login>@<domain>.<tld>::<timestamp>-<login len>-<domain len>``
    followed by a newline.

    Args:
        path: File to (over)write; defaults to ``./data.txt``.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for _ in range(randint(20, 30)):
            dtint = randint(100000000, 1200000000)  # random timestamp
            dtstr = ctime(dtint)  # timestamp rendered as a date/time string
            llen = randrange(4, 8)  # login length
            login = ''.join(choice(lc) for _ in range(llen))
            dlen = randrange(llen, 13)  # domain length, at least the login length
            dom = ''.join(choice(lc) for _ in range(dlen))
            data_line = "%s::%s@%s.%s::%d-%d-%d\n" % (
                dtstr, login, dom, choice(TLDS), dtint, llen, dlen)
            f.write(data_line)
            print(data_line)  # echo every record as it is written


def match_date(path=DATA_FILE):
    """Print and return ``(weekday, rest_of_line)`` for each weekday match.

    Args:
        path: Data file to scan; defaults to ``./data.txt``.

    Returns:
        The list of 2-tuples produced by ``re.findall``.
    """
    regex = r'(Mon|Tue|Wed|Thu|Fri|Sat|Sun)(.*)'
    matches = _find_all(regex, path)
    for record in matches:
        print(record)
    return matches


def match_time_slot(path=DATA_FILE):
    """Print and return records dated on day 20-31 of a year in 2000-2020.

    Args:
        path: Data file to scan; defaults to ``./data.txt``.

    Returns:
        A list of ``(day, year, rest_of_line)`` string tuples that passed
        the day and year range checks.
    """
    regex = r' ([0-9]{1,2}) .*([0-9]{4})::(.*)'
    selected = []
    for record in _find_all(regex, path):
        day, year = int(record[0]), int(record[1])
        if 2000 <= year <= 2020 and 20 <= day <= 31:
            print(record)
            selected.append(record)
    return selected


def match_name(path=DATA_FILE):
    """Print and return ``(login, domain, tld)`` for every address found.

    Args:
        path: Data file to scan; defaults to ``./data.txt``.

    Returns:
        The list of 3-tuples produced by ``re.findall``.
    """
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # without triggering an invalid-escape-sequence warning.
    regex = r'::([a-z]{2,13})@([a-z]{2,13})\.(com|edu|net|org|gov)'
    matches = _find_all(regex, path)
    for record in matches:
        print(record)
    return matches


def main():
    """Generate the data file, then run each matcher under a banner line."""
    generate_data()
    print("\n---------------match_date--------------------\n")
    match_date()
    print("\n---------------match_time_slot--------------------\n")
    match_time_slot()
    print("\n---------------match_name--------------------\n")
    match_name()


if __name__ == '__main__':
    main()
输出结果
Sun Mar 5 00:55:55 1989::qvnc@ygeowwaf.com::605033755-4-8 Mon Oct 17 17:16:31 2005::yene@rtewqvvyfe.edu::1129540591-4-10 Tue Oct 7 06:33:30 2003::wlyi@coagmnososzy.edu::1065479610-4-12 Mon Oct 16 00:01:06 2006::zsgok@jkpiplcm.edu::1160928066-5-8 Wed Mar 15 06:37:35 2000::paok@anpekysphicu.com::953073455-4-12 Wed Mar 26 12:27:25 1980::bodqoe@iydohek.org::322892845-6-7 Mon Jun 5 13:54:28 1989::fgiy@oppcjnafx.gov::613029268-4-9 Sun Jul 25 05:27:23 2004::agmljfx@qvxgjqtkiwnl.org::1090704443-7-12 Mon Nov 14 16:15:36 2005::tctz@bcikib.gov::1131956136-4-6 Sun Jan 14 23:20:42 2007::qqlfkf@isslbh.com::1168788042-6-6 Sun Jul 27 02:00:13 1980::cpiqwau@drbpfsfglip.edu::333482413-7-11 Sun Feb 20 16:10:34 2005::aguqfd@hnrcaged.com::1108887034-6-8 Wed Jun 27 06:13:05 1979::kowyk@ruoackjavkpq.net::299283185-5-12 Wed Oct 12 19:52:54 1994::kqaol@mzewoas.edu::781962774-5-7 Thu Aug 23 01:46:59 1973::uofpdq@zdeidbobin.org::114889619-6-10 Sat Dec 21 11:36:20 1991::hodw@wfbw.org::693286580-4-4 Tue Jun 22 14:42:19 1993::azgagm@nfmguh.org::740731339-6-6 Sun Feb 23 04:50:57 2003::cysfu@fnzdo.com::1045947057-5-5 Fri Jun 10 13:38:02 1983::qdhqw@fcdsvlmnhx.net::424071482-5-10 Sat Jan 24 21:56:37 1998::dfyicjw@fklbymd.org::885650197-7-7 Sun Jun 3 07:48:45 2007::wptuyjk@tsngnle.edu::1180828125-7-7 Mon Nov 19 00:34:41 2001::ocjlb@nusyk.net::1006101281-5-5 Sat Dec 1 21:01:23 1973::bvhx@lmir.net::123598883-4-4 Sun Dec 16 17:42:51 1979::rpgs@hppau.org::314185371-4-5 Mon Jul 21 23:46:13 1986::fnsro@nmbcwdmie.org::522344773-5-9 ---------------match_date-------------------- ('Sun', ' Mar 5 00:55:55 1989::qvnc@ygeowwaf.com::605033755-4-8') ('Mon', ' Oct 17 17:16:31 2005::yene@rtewqvvyfe.edu::1129540591-4-10') ('Tue', ' Oct 7 06:33:30 2003::wlyi@coagmnososzy.edu::1065479610-4-12') ('Mon', ' Oct 16 00:01:06 2006::zsgok@jkpiplcm.edu::1160928066-5-8') ('Wed', ' Mar 15 06:37:35 2000::paok@anpekysphicu.com::953073455-4-12') ('Wed', ' Mar 26 12:27:25 1980::bodqoe@iydohek.org::322892845-6-7') ('Mon', ' Jun 5 
13:54:28 1989::fgiy@oppcjnafx.gov::613029268-4-9') ('Sun', ' Jul 25 05:27:23 2004::agmljfx@qvxgjqtkiwnl.org::1090704443-7-12') ('Mon', ' Nov 14 16:15:36 2005::tctz@bcikib.gov::1131956136-4-6') ('Sun', ' Jan 14 23:20:42 2007::qqlfkf@isslbh.com::1168788042-6-6') ('Sun', ' Jul 27 02:00:13 1980::cpiqwau@drbpfsfglip.edu::333482413-7-11') ('Sun', ' Feb 20 16:10:34 2005::aguqfd@hnrcaged.com::1108887034-6-8') ('Wed', ' Jun 27 06:13:05 1979::kowyk@ruoackjavkpq.net::299283185-5-12') ('Wed', ' Oct 12 19:52:54 1994::kqaol@mzewoas.edu::781962774-5-7') ('Thu', ' Aug 23 01:46:59 1973::uofpdq@zdeidbobin.org::114889619-6-10') ('Sat', ' Dec 21 11:36:20 1991::hodw@wfbw.org::693286580-4-4') ('Tue', ' Jun 22 14:42:19 1993::azgagm@nfmguh.org::740731339-6-6') ('Sun', ' Feb 23 04:50:57 2003::cysfu@fnzdo.com::1045947057-5-5') ('Fri', ' Jun 10 13:38:02 1983::qdhqw@fcdsvlmnhx.net::424071482-5-10') ('Sat', ' Jan 24 21:56:37 1998::dfyicjw@fklbymd.org::885650197-7-7') ('Sun', ' Jun 3 07:48:45 2007::wptuyjk@tsngnle.edu::1180828125-7-7') ('Mon', ' Nov 19 00:34:41 2001::ocjlb@nusyk.net::1006101281-5-5') ('Sat', ' Dec 1 21:01:23 1973::bvhx@lmir.net::123598883-4-4') ('Sun', ' Dec 16 17:42:51 1979::rpgs@hppau.org::314185371-4-5') ('Mon', ' Jul 21 23:46:13 1986::fnsro@nmbcwdmie.org::522344773-5-9') ---------------match_time_slot-------------------- ('25', '2004', 'agmljfx@qvxgjqtkiwnl.org::1090704443-7-12') ('20', '2005', 'aguqfd@hnrcaged.com::1108887034-6-8') ('23', '2003', 'cysfu@fnzdo.com::1045947057-5-5') ---------------match_name-------------------- ('qvnc', 'ygeowwaf', 'com') ('yene', 'rtewqvvyfe', 'edu') ('wlyi', 'coagmnososzy', 'edu') ('zsgok', 'jkpiplcm', 'edu') ('paok', 'anpekysphicu', 'com') ('bodqoe', 'iydohek', 'org') ('fgiy', 'oppcjnafx', 'gov') ('agmljfx', 'qvxgjqtkiwnl', 'org') ('tctz', 'bcikib', 'gov') ('qqlfkf', 'isslbh', 'com') ('cpiqwau', 'drbpfsfglip', 'edu') ('aguqfd', 'hnrcaged', 'com') ('kowyk', 'ruoackjavkpq', 'net') ('kqaol', 'mzewoas', 'edu') ('uofpdq', 'zdeidbobin', 'org') 
('hodw', 'wfbw', 'org') ('azgagm', 'nfmguh', 'org') ('cysfu', 'fnzdo', 'com') ('qdhqw', 'fcdsvlmnhx', 'net') ('dfyicjw', 'fklbymd', 'org') ('wptuyjk', 'tsngnle', 'edu') ('ocjlb', 'nusyk', 'net') ('bvhx', 'lmir', 'net') ('rpgs', 'hppau', 'org') ('fnsro', 'nmbcwdmie', 'org') Process finished with exit code 0