Python基础语法21:re模块补充(正则表达式)
正则表达式
# Demonstration script for the standard-library ``re`` module.
#
# Each section prints the result of one regular-expression feature so the
# output can be compared with the pattern that produced it.  Patterns that
# contain regex escapes (\w, \d, \A, ...) are written as raw strings so Python
# does not try to interpret the backslash itself; subject strings that really
# should contain a newline or a tab keep their normal '\n' / '\t' escapes.
import re

# ---------------------------------------------------------------------------
# Part 1: character classes and anchors
# ---------------------------------------------------------------------------

# \w matches one word character (letter, digit or underscore).
print(re.findall(r'\w', 'hello 123_ */-='))
print(len(re.findall(r'\w', 'hello 123_ */-=')))

# \W matches one character that is NOT a word character.
print(re.findall(r'\W', 'hello 123_ */-='))

# \s matches one whitespace character (space, \n, \t, ...).
print(re.findall(r'\s', 'hell\no 12\t3_ */-='))

# \S matches one non-whitespace character.
print(re.findall(r'\S', 'hell\no 12\t3_ */-='))

# \d matches one digit, \D matches one non-digit.
print(re.findall(r'\d', 'hell\no 12\t3_ */-='))
print(re.findall(r'\D', 'hell\no 12\t3_ */-='))

# Literal newline, literal tab and a plain literal character.
print(re.findall('\n', 'hell\no 12\t3_ */-='))
print(re.findall('\t', 'hell\no 12\t3_ */-='))
print(re.findall('l', 'hell\no 12\t3_ */-='))

# Plain text, ^ (start of string) and $ (end of string).
print(re.findall('tank', 'my name is tank, tank is handsome'))
print(re.findall('^tank', 'tank my name is tank, tank is handsome'))
print(re.findall('tank$', 'tank my name is tank,tank is handsome tank'))

# ---------------------------------------------------------------------------
# Repetition
# ---------------------------------------------------------------------------

# . matches any single character except a newline.
# ['abc', 'a1c', 'aac', 'aac', 'a*c', 'a+c']
print(re.findall('a.c', 'abc a1c aac asd aaaaac a*c a+c abasd'))
# With re.DOTALL the dot also matches a newline, so 'a\nc' is found too.
print(re.findall('a.c', 'abc a1c aac a\nc asd aaaaac a*c a+c abasd', re.DOTALL))

# []: matches exactly one character taken from the set inside the brackets.
print(re.findall('a..c', 'abc a1 c aac asd aaaaac a *c a+c abasd ='))
print(re.findall('a.c', 'abc a1 c aac aAc aBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'))
print(re.findall('a[a-z]c', 'abc a1 c aac aAc aBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'))
print(re.findall('a[A-Z]c', 'abc a1 c aac aAc aBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'))

# Inside [] the characters + and * are literals, not quantifiers.
print(re.findall('a[-+*/]c', 'abc a1 c aac aAc aBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'))
print(re.findall('a[a-z][a-z]c', 'abc a1 c aac aAc aBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'))
# A leading ^ inside [] negates the set.
print(re.findall('a[^a-z]c', 'abc a1 c aac aAc aBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'))

# *: must follow another token; that token may occur zero or more times.
# ab* -> an 'a' followed by zero or more 'b' characters.
print(re.findall('ab*', 'a ab abbb abbbb a1bbbb a-123'))
# ['a', 'ab', 'abbb', 'abbbb', 'a', 'a']
# * is equivalent to {0,}
print(re.findall('ab{0,}', 'a ab abbb abbbb a1bbbb a-123'))

# ?: must follow another token; that token may occur zero times or once.
print(re.findall('ab?', 'a ab abbb abbbb a1bbbb a-123'))
# ab?
# ['a', 'ab', 'ab', 'ab', 'a', 'a']
# ? is equivalent to {0,1}
print(re.findall('ab{0,1}', 'a ab abbb abbbb a1bbbb a-123'))

# +: must follow another token; that token must occur one or more times.
# ab+
print(re.findall('ab+', 'a ab abbb abbbb a1bbbb a-123'))
# ['ab', 'abbb', 'abbbb']
# + is equivalent to {1,}
print(re.findall('ab{1,}', 'a ab abbb abbbb a1bbbb a-123'))

# {n,m}: must follow another token.
# ab{1,3} -> 'b' occurs between 1 and 3 times.
print(re.findall('ab{1,3}', 'a ab abbb abb abbbb a1bbbb a-123'))
# ['ab', 'abbb', 'abb', 'abbb']

# .*: greedy match.
# a.*d -> the match runs up to the LAST 'd' that can end it.
print(re.findall('a.*d', 'ab123adfc1134124123aasfc123123'))

# .*?: non-greedy match.
# a.*?c -> the match stops at the FIRST 'c' that can end it.
print(re.findall('a.*?c', 'ab123adfc1134124123adasfc123123'))

# ---------------------------------------------------------------------------
# Groups and alternation
# ---------------------------------------------------------------------------

# (): capturing group -- findall returns only the captured text.
# expression=".*?"
print(re.findall('expression="(.*?)"', 'expression="1+2+3/4*5" tank="handsome"'))
print(re.findall('href="(.*?)"',
                 '<p>段落</p><a href="https://www.sb.com">点我啊</a><h1>标题</h1><a href="https://www.sb.com">点我啊</a>'))

# a|b: alternation, matches either side.
print(re.findall('a|b', 'ab123abasdfaf'))

# companies / company
# (?:...) is a non-capturing group.  The only difference from a capturing
# group is that the text it matches is not stored, so findall returns the
# whole match ("compan" joined with "ies" or "y").
print(re.findall('compan(?:ies|y)',
                 'Too many companies have gone bankrupt, and the next one is my company'))

# Literal marker text around a capturing group (only the group is returned).
print(re.findall('ale(x)', 'alex is SB,alex is bigSB'))
print(re.search('alex', 'alex is SB,alex is bigSB').group())
# search returns None when nothing matches.
print(re.search('abcdefg', 'alex is SB,alex is bigSB'))

# match only looks at the start of the string, like search with a leading ^.
print(re.search('^alex', '123alex is SB,alex is bigSB'))
print(re.match('alex', '123alex is SB,alex is bigSB'))

# ---------------------------------------------------------------------------
# split / sub / compile
# ---------------------------------------------------------------------------

l = 'tank:17:male'.split(':')
print(l)
# For reference: split on any of " ", ":", "/" or "-".
l1 = re.split('[ :/-]', 'a-b/c tank:17:male xxx')
print(l1)

# [a-z]+xx
# For reference: sub replaces every match of the pattern found in the third
# argument with the second argument.
print(re.sub('[a-z]+xx', 'yxp', 'lxx is good,sb is lllxx wxx is good cxx is good'))

# For reference: compile turns a pattern string into a reusable compiled
# pattern object.
pattern = re.compile('alex')  # returns a compiled pattern object
print(pattern)
print(pattern.findall('alex is SB,alex is bigSB'))
print(re.findall('alex', 'alex is SB,alex is bigSB'))

print(pattern.search('alex is SB,alex is bigSB').group())  # alex

# ---------------------------------------------------------------------------
# Part 2: the same features, summarised on one subject string
# ---------------------------------------------------------------------------

str1 = '1abc a1 c aac aAc\n \taBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'
print(re.findall(r'\w', str1))  # \w: letters, digits and underscore
print(re.findall(r'\W', str1))  # \W: everything else, including \n and \t
print(re.findall(r'\s', str1))  # any whitespace character: \n \t \r \f ...
print(re.findall(r'\S', str1))  # any non-whitespace character
print(re.findall(r'\d', str1))  # a digit, same as [0-9]
print(re.findall(r'\D', str1))  # any non-digit
print(re.findall(r'\Aac', str1))  # \A anchors at the very start of the string
print(re.findall(r'\n\Z', str1))  # \Z anchors at the very end of the string
print(re.findall('\n', str1))  # newline
print(re.findall('\t', str1))  # tab
print(re.findall('^1abc', str1))  # string starts with ...
print(re.findall('c$', str1))  # string ends with ...

str1 = '1abbb a1 a\nbc aac aAc\n \taBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'
print(re.findall('a.b', str1))  # any character in the middle except newline
print(re.findall('a.b', str1, re.S))  # any character in the middle, newline included
print(re.findall('a.b', str1, re.DOTALL))  # re.DOTALL is the long name of re.S
print(re.findall('ab*', str1))  # zero or more 'b'
print(re.findall('ab+', str1))  # one or more 'b'
print(re.findall('ab?', str1))  # zero or one 'b'
print(re.findall('ab?a', str1))  # the ? applies only to the 'b'
print(re.findall('ab{2}', 'abbb aabxbaa'))  # one 'a' followed by exactly two 'b'
print(re.findall('a[1*-]b', 'a1b a\nb a*b a-b'))  # ['a1b', 'a*b', 'a-b']
print(re.findall('a[^1*-]b', 'a1b a*b a-b a=b'))  # ^ inside [] negates the set
print(re.findall('a[0-9]b', 'a1b a*b a-b a=b'))  # ['a1b']
print(re.findall('a[a-z]b', 'a1b a*b a-b a=b aeb'))  # ['aeb']
print(re.findall('a[a-zA-Z]b', 'a1b a*b a-b a=b aeb aEb'))  # ['aeb', 'aEb']
# The pattern needs an escaped backslash to match one literal backslash.
print(re.findall(r'a\\c', 'a\\c'))
# With a repeated capturing group findall returns only the last repetition.
print(re.findall('(ab)+123', 'ababab123'))
print(re.findall('(?:ab)+123', 'xxxaab123'))  # ['ab123']
print(re.findall('(?:ab)+123', '12abab123'))  # ['abab123'], adjacent 'ab's are matched together
print(re.findall('compan(?:ies|y)', 'Too many companies have gone bankrupt, and the next one is my company'))
print(re.findall('href="(.*?)"',
                 '<p>段落</p><a href="https://www.sb.com">点我啊</a><h1>标题</h1><a href="https://www.sb.com">点我啊</a>'))
print(re.findall('a|b', 'ab123abasdfaf'))
print(re.split('ab', 'abcd'))  # ['', 'cd']
print(re.split('[ab]', 'abcd'))  # ['', '', 'cd']; the result is a list, so index into it
print('===>', re.sub('a', 'A', 'alex make love'))  # ===> Alex mAke love; without count every match is replaced
# count is passed by keyword: passing it positionally is deprecated.
print('===>', re.sub('a', 'A', 'alex make love', count=1))  # ===> Alex make love

obj = re.compile(r'\d{3}')  # three consecutive digits
print(obj.search('abc123eee1e').group())  # 123
print(obj.findall('abc123eeee'))  # ['123'], reusing obj

# | has the lowest precedence: this is "a,b" OR "c".
print(re.findall('a,b|c', 'ac,a,b,accc'))
print(re.findall('ab?', 'a'))

# ---------------------------------------------------------------------------
# Part 3: named group with a back-reference
# ---------------------------------------------------------------------------

# (?P<name>...) names a group; (?P=name) must match the same text again.
print(re.findall(r"<(?P<tag_name>\w+)>\w+</(?P=tag_name)>", "<h1>hello</h1>"))  # ['h1']

# ---------------------------------------------------------------------------
# Part 4: escaping metacharacters
# ---------------------------------------------------------------------------

str1 = '<h1>www.oldboyedu.*+com<h1>'

# \. matches a literal '.', whereas a bare . matches any character;
# \* and \+ likewise match the literal characters.
# The result is printed so the example actually shows its output.
print(re.findall(r'www.*\.\*\+', str1))  # ['www.oldboyedu.*+']