# 正则 # 正则 ''' 正则就是带语法的字符串, 用来匹配目标字符串得到想要得字符串结果 ''' # 部分语法: # 1.单独字符 # \d 范围 [0-9] # \D 范围刨除 [^0-9] # \w 字母+数字+_ # ... # 2.多个字符 # zo* == zo{0,} # zo+ == zo{1,} # zo? == zo{0,1} # ---------------------------------------------------------------------------------------------- # 3.多行 # re.M # ^以什么开头 # $以什么结尾 # re.S # 换行符也可以进行匹配\n # re.I # 不区分大小写 import re # --(re模块,正则)-------------------------------------------------------------------------------------------------- # --(import re)-------------------------------------------------------------------------------------------------- # 有语法的字符串,用来匹配获取目标字符串中指定需求的字符串 # \一个杠用来转义,\\俩个杠代表一个\ # Ex # (find all,查找全部) # x = 'asdas64fd65q4f65a4s6d//asdqwr' # res = re.findall(r'//',x) # print(res) # | 代表或,or # r''原义内容 # \+转义内容 # --(大写都是小写的反取)-------------------------------------------------------------------------------------- # r'[A-Za-z0-9哈]' 区间方法[A-Z]|[a-z]|[0-9]|'哈' # r'a' --- 字符串 # r'ad' --- 字符串 # r'\D' --- 非数字 # r'\d' --- 数字 # r'\w' --- 数字+字母+下划线 # r'\W' --- 刨除(数字+字母+下划线) # r'\s' --- 所有空白 制表符 换行符 # r'\S' --- 刨除(所有空白 制表符 换行符) # r'.' --- 所有单个字符,刨除 \n 换行符 # --(Ex)----------------------------------------------------------------------------------------------- # print(re.findall(r'需求',r'查找对象')) # print(re.findall(r'需求',查找对象变量名)) # print(re.findall(r'[条件范围]{个数}',r'查找对象')) # print(re.findall(r'[a-z]{2}',r'asdlasjdnsb')) # ['as', 'dl', 'as', 'jd', 'ns'] # --(贪婪匹配 条件{n,n})----------------------------------------------------------------------------------------------- # --( * + ? )----------------------------------------------------------------------------------------------- # print(re.findall(r'条件{个数起数,个数止数}',r'查找对象')) # print(re.findall(r'o{2,}',r'ooaoobooodooqooooooo')) # ['oo', 'oo', 'ooo', 'oo', 'ooooooo'] # --(多行匹配 条件 re.M)-- # --------------------------------------------------------------------------------------------- # 格式: print(re.findall(r'^开头符','fasdasd\nefasd\nffwe\nofo',re.M多行)) # ^以后面条件 为开头 # ^.+ 匹配剩余 # re.M 多行(无视\n换行符) 结合^$ # re.S 将\n也能被.匹配 # re.I 不区分大小写 # Ex # print(re.findall(r'^f.+','faa\nefbb\nffcc\nofo',re.M)) # ['faa', 'ffcc'] # --(分组匹配)----------------------------------------------------------------------------------------------- # ?:取消分组(取消当前分组) # 1.从左往右数,进行编号 # baidu = 'http://www.baidu.com/' # regexp = re.compile('((http://)(.+))') # 生成正则对象 comile编译 # print(regexp) # re.compile('((http://)(.+)/)') # print(regexp.search(baidu)) # re.compile('((http://)(.+)/)') # target = regexp.match(baidu) # match 进行配对处理 # print(target) # <_sre.SRE_Match object; span=(0, 21), match='http://www.baidu.com/'> # print(target.group(0),id(target.group(0))) # http://www.baidu.com/ 自身 # print(target.group(1),id(target.group(1))) # http://www.baidu.com/ # print(target.group(2)) # http:// # print(target.group(3)) # www.baidu.com # --(替换)------------------------------------------------------------------- # 不参与匹配的条件用( ?:) # 参与匹配的都会被替换为指定字符串( r'\3\2\1' 内部按照分组重新输出 | r'\' # 在指定字符串值\num拿到具体分组 # 其他字符串都是原样字符串 # print(re.sub('([a-z]+)(\d+)(.+)',r'\3\2\1','abc123你好')) # 你好123abc # print(re.sub('([a-z]+)(\d+)(.+)',r'\1','abc123你好')) # abc # print(re.sub('([a-z]+)(\d+)(.+)',r'\2','abc123你好')) # 123 # print(re.sub('([a-z]+)(\d+)(.+)',r'\3','abc123你好')) # 你好 # # baidu = 'http://_.www.baidu.com' # SOHO = 'http://www.soho.com' # GOOLE = 'goole' # 拿到外部变量名: # 在r原义模式下字符串拼接 'sth'+变量名+'sth' # print(re.sub('(http://)(www.)(\w+)(.com)',r'\1\2'+GOOLE+'\\4',baidu)) # http://www.goole.com # 在r原义模式下 %s 占位符替换 # print(re.sub('(http://)(www\.)(\w+)(\.com)',r'\1\2%s\4'%GOOLE,baidu)) # http://www.goole.com # 在转义模式下,使用占位符 %s # print(re.sub('(http://)(www\.)(\w+)(\.com)','\\1\\2%s\\4\\n%s'%(GOOLE,baidu),baidu)) # http://www.goole.com # http://www.baidu.com # 非贪婪匹配的引用场景:结合特定的开头与结尾 # ------------------------------------------------------------------------------------------------------------------------ # Ex # print(re.findall(r'f+','faa\nefbb\nffcc\nofo')) # # (+ 相同字符尽可能多)f+ ['faa', 'fbb', 'ffcc', 'fo'] # print(re.findall(r'f.+','faa\nefbb\nffcc\nofo')) # # (.+ f .表示一个字符,(.+ = ..........) # # 后面有多少加多少)['faa', 'fbb', 'ffcc', 'fo'] # res = re.findall('.+','<a>abc</a>') # # print(res) #['<a>abc</a>'] # res = re.findall('.*','<a>abc</a>') # # print(res) # ['<a>abc</a>', ''] # res = re.findall('.*?','<a>abc</a>') # # print(res) # ['', '', '', '', '', '', '', '', '', '', ''] # res = re.findall('<.*>','<a>abc</a>') # # print(res) # ['<a>abc</a>'] # res = re.findall('.*?>','<a>abc</a>') # # print(res) # ['<a>', 'abc</a>'] # res = re.findall('\w*?>','<a>abc</a>') # # print(res) # ['a>', 'a>'] # res = re.findall('(\w*?)>','<a>abc</a>') # # print(res) # ['a', 'a'] # --------------------------------------------------------------------------------------------------------- # 操作分组的方法 # ?P<name> : 有名分组 # # res = re.match('(?P<left>\d{3})(?P<center>\d{3})(\d{3})','123456789') # print(res.group(1)) # 123 # print(res.group('left')) # 123 # print(res.group(2)) # 456 # print(res.group('center')) # 456 # print(res.group(3)) # 789 # print(res.group(0)) # 123456789