正则表达式python

一、re的正则方法简介

import re

"""
小结:
re.match()     # 只从字符串的开头开始匹配(即使指定 re.MULTILINE,也不会匹配其他行的行首)
re.search()    # 扫描整个字符串,返回第一个匹配。不局限于字符串开头
re.findall()  # 匹配多次,并返回匹配值的列表(正则中有分组时,返回的是分组内容的列表)
re.split()     # 按正则regx分割字串,并返回分割后的字串列表
re.sub()       # 按正则regx找到多个子字串值a1、a2、an,并用指定字串b统一替换a1、a2、an,并返回替换后的整体新字串

说明:ret是re.match()、re.search()的返回结果
ret.group()     # 获取整体匹配上的结果
ret.group(1)    # 获取第1个分组的结果。就是正则regx中第1个小括号中匹配上的值
ret.group(2)    # 获取第2个分组的结果。就是正则regx中第2个小括号中匹配上的值
ret.group(3)    # 获取第3个分组的结果。就是正则regx中第3个小括号中匹配上的值,如果超过了分组数量,就会抛出 IndexError。
"""
# Part 1: re.match() only succeeds when the pattern matches at the very
# beginning of the string.
# 1) Matching without groups (the pattern contains no parentheses).
ret = re.match(pattern=r"H", string="Hello Python")
print(ret[0])

# 2) Matching with groups: each pair of parentheses is one group, and group
# numbers start at 1. Both re.match() and re.search() support groups.
ret = re.match(pattern=r"([^-]*)-(\d+)", string="010-12345678")
# Printed one per line: whole match, group 1, group 2
#   010-12345678
#   010
#   12345678
print(ret[0], ret[1], ret[2], sep="\n")

# Non-greedy mode: appending "?" to "*", "?", "+" or "{m,n}" makes the
# quantifier match as little as possible instead of as much as possible.
s = "aa2343ddd"
r = re.match(pattern=r"aa(\d+?)", string=s)
# Printed one per line: 'aa2' (whole match), then '2' (group 1)
print(r[0], r[1], sep="\n")

# Backreference: \1 must repeat whatever group 1 captured. Use a raw string
# (r"...") so that the backslash reaches the regex engine unchanged.
ret = re.match(pattern=r"<([a-zA-Z]*)>\w*</\1>", string="<html>hh</html>")
# Printed one per line: '<html>hh</html>' (whole match), then 'html' (group 1)
print(ret[0], ret[1], sep="\n")
# NOTE: ret.group(2) would raise IndexError here, because the pattern has
# only one group.

# Part 2: re.search() scans the string and returns the first match, which
# does not have to be at the beginning.
ret = re.search(pattern=r"\d+", string="阅读次数为 9999")
print(ret[0])  # 9999

# Part 3: re.findall() collects every non-overlapping match into a list.
ret_list = re.findall(
    pattern=r"\d+",
    string="python = 9999, c = 7890, c++ = 12345",
)
print(ret_list)  # ['9999', '7890', '12345']

# Part 4: re.split() cuts the string at every match of the pattern
# (here: a colon or a single space).
ret_list = re.split(pattern=r":| ", string="info:xiaoZhang 33 shandong")
print(ret_list)  # ['info', 'xiaoZhang', '33', 'shandong']

# Part 5: re.sub() replaces every match and returns the resulting string.
ret_new_str = re.sub(
    pattern=r"\d+",
    repl='998',
    string="python = 997 python = 997",
)
# First line: python = 998 python = 998
# Second line: the same value with a label in front
print(ret_new_str, 'ret_new_str=====: %s' % ret_new_str, sep="\n")

print('=' * 50)
print('案例:域名相关的正则匹配')
# Goal: extract the scheme and host part of a URL, e.g.
# 'http://www.freebuf.com' (plus an optional trailing slash).
# Other sample inputs that can be tried with the same pattern:
#   'http://www.freebuf.com'
#   'https://freebuf.com/articles/es/123%e7%b1%b3%e9%9b%aa%e5%84%bf'
#   'http://www.freebuf.com/157843sdf.html'
url = 'http://www.freebuf.com/author/%e7%b1%b3%e9%9b%aa%e5%84%bf'
ret = re.search(r'https?://(\w+?\.)+\w+\/?', url)
# re.search() returns None when nothing matches, so check the result BEFORE
# calling .group() on it; otherwise a non-matching URL raises AttributeError
# and the fallback message is never reached.
if ret is not None:
    print(ret.group())  # 'http://www.freebuf.com/'
    # A repeated group keeps only its last repetition, hence 'freebuf.'
    # rather than 'www.'.
    print(ret.group(1))  # 'freebuf.'
    print(ret.group())
else:
    print('ret = None')

# Goal: match a URL that points at a directory, e.g.
# https://www.freebuf.com/articles/es
url = 'https://www.freebuf.com/articles/es'
ret = re.search(r'https?://(\w+?\.)+\w+(\/\w+)*(\/\w+\/?)', url)
# Check for None first: calling .group() on a failed search raises
# AttributeError, which would make the fallback branch unreachable.
if ret is not None:
    print(ret.group())  # 'https://www.freebuf.com/articles/es'
    # Repeated groups keep only their last repetition.
    print(ret.group(1))  # 'freebuf.'
    print(ret.group(2))  # '/articles'
    # Group 3 is the final path segment (optionally followed by a slash).
    print(ret.group(3))  # '/es'
    print(ret.group())  # 'https://www.freebuf.com/articles/es'
else:
    print('ret = None')

# Goal: match a URL that points at a file, e.g.
# http://www.freebuf.com/news/157843.html
url = 'http://www.freebuf.com/news/157843.html'
ret = re.search(r'https?://(\w+?\.)+\w+(\/\w+)*(\/\w+\.\w+)', url)
# Test the match object itself (not an unrelated variable) and do it before
# any .group() call, because re.search() returns None when nothing matches.
if ret is not None:
    print(ret.group())  # 'http://www.freebuf.com/news/157843.html'
    # Repeated groups keep only their last repetition.
    print(ret.group(1))  # 'freebuf.'
    print(ret.group(2))  # '/news'
    # Group 3 is the file name: "/<name>.<extension>".
    print(ret.group(3))  # '/157843.html'
    print(ret.group())
else:
    print('ret = None')

  

二、re的预编译方式,结合前面的正则方式使用

1、先编译一次,后面就不编译了

import re

# Named page_url rather than `str`: binding the name `str` at module level
# would hide the builtin str type for everything that runs afterwards.
page_url = 'https://www.freebuf.com/page/357'

# Compile once and reuse the Pattern object for every later search.
# Group 1 is the last letter of the scheme ('p' for http, 's' for https);
# group 2 is the host name after "www." up to the first slash.
pattern = re.compile(r'(p|s)\:\/\/www\.(.+?\..+?)\/+?', re.DOTALL)
# With more than one group, findall() returns a list of tuples, one tuple
# of group values per match.
match = pattern.findall(page_url)

print(type(match))  # <class 'list'>
print(match)  # [('s', 'freebuf.com')]

  

2、忽略大小写

在正则表达式前面加(?i),或者在 re.compile() 时传入 re.IGNORECASE(简写 re.I)标志

import re

# Sample web-server log text. The protocol token appears in mixed case
# ("http/1.1", "HTtP/1.1", "HTTP/1.1"), and in the first record the status
# code sits a few lines below the protocol token.
str1 = """
201.158.69.116 - - [03/Jan/2013:21:17:20 -0600] fwf[-] tip[-] 127.0.0.1:9000 0.007 0.007 MX pythontab.com GET /html/test.html http/1.1 "


200" 2426
"http://a.com" "es-ES,es;q=0.8" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11"


172.16.119.8 - admin [15/Aug/2011:18:17:50 +0800] "PROPFIND /svn/EAGLE HTtP/1.1" 201 649
172.16.119.8 - admin [15/Aug/2011:18:17:50 +0800] "PROPFIND /svn/EAGLE/!svn/vcc/default HTTP/1.1" 207 401
172.16.119.8 - admin [15/Aug/2011:18:17:50 +0800] "PROPFIND /svn/EAGLE/!svn/bln/31 HTTP/1.1" 207 454
172.16.119.8 - admin [15/Aug/2011:18:17:50 +0800] "PROPFIND /svn/EAGLE HTTP/1.1" 207 649
172.16.119.8 - admin [15/Aug/2011:18:17:50 +0800] "PROPFIND /svn/EAGLE/!svn/vcc/default HTTP/1.1" 207 454
"""

# The leading (?i) turns on case-insensitive matching for the whole pattern;
# re.DOTALL additionally lets "." match newline characters.
# The single capture group is a three-digit status code starting with 2-5.
pattern = re.compile(r'(?i)HTTP/.+?\b[^\d]+?([2345]\d{2})', flags=re.DOTALL)
# Walk the non-overlapping matches and keep the captured status code of each.
ret = [hit.group(1) for hit in pattern.finditer(str1)]
print(ret)  # ['200', '201', '207', '207', '207', '207']

 

3、点号默认不匹配换行,要想匹配换行,需要设置re.DOTALL

# re.DOTALL makes "." match newline characters too; without the flag, "."
# matches any character except a newline.
pattern = re.compile(
    r'(?i)HTTP/.+?\b[^\d]+?([2345]\d{2})',
    flags=re.DOTALL,
)

  

posted @ 2018-01-30 19:49  安迪9468  阅读(224)  评论(0)  编辑  收藏  举报