正则表达式
import re

# Regular-expression basics: literals, the dot, and character classes.
#
# re.match  -- only matches at the beginning of the string.
# re.search -- returns the first match found anywhere in the string.
#
# Every pattern containing a backslash is written as a raw string (r'...')
# so that sequences such as \d reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes.

# Match a literal string.
text = 'hello'
ret = re.match('he', text)
print('>>' + ret.group())  # >>he

# A dot (.) matches any single character (except a newline).
text = 'hello'
ret = re.match('.ello', text)
print('>>' + ret.group())  # >>hello

# .+ matches one or more arbitrary characters.
text = 'hello'
ret = re.match('.+llo', text)
print('>>' + ret.group())  # >>hello

# \d matches any single digit.
text = '1234'
ret = re.match(r'\d', text)
print('>>' + ret.group())  # >>1

# \D matches any single non-digit character.
text = 'f1234'
ret = re.match(r'\D', text)
print('>>' + ret.group())  # >>f

# \s matches a whitespace character (\n, \t, \r, space).
text = ' ab'
ret = re.match(r'\s', text)
print('>>' + ret.group())  # >> (a single space)

# \w matches a-z, A-Z, digits and the underscore.
text = 'Zab'
ret = re.match(r'\w', text)
print('>>' + ret.group())  # >>Z

# A bracket expression that is the ASCII equivalent of \w.
text = '_Zab_'
ret = re.match('[a-zA-Z0-9_]', text)
print('>>' + ret.group())  # >>_

# \W is the opposite of \w.
text = '$+'
ret = re.match(r'\W', text)
print('>>' + ret.group())  # >>$

# [] combines alternatives: any character listed inside the brackets matches.
text = '0733-888888abc'
ret = re.match(r'[\d\-]+', text)
print('>>' + ret.group())  # >>0733-888888

# A bracket expression that replaces \d.
text = '09345'
ret = re.match('[0-9]+', text)
print('>>' + ret.group())  # >>09345

# [^...] matches any character NOT listed inside the brackets.
text = 'abc45'
ret = re.match('[^0-9]+', text)
print('>>' + ret.group())  # >>abc

# [^...] again: everything except word characters (equivalent to \W+).
text = '$@!_'
ret = re.match('[^a-zA-Z0-9_]+', text)
print('>>' + ret.group())  # >>$@!
# Quantifiers, validation patterns, anchors, greedy vs. lazy matching,
# escaping, groups, and the findall / sub / split / compile helpers.
#
# Every pattern containing a backslash is written as a raw string (r'...')
# so that sequences such as \d reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes.

# * matches zero or more repetitions.
text = '0731ab31'
ret = re.match(r'\d*', text)
print('>>' + ret.group())  # >>0731

# * also succeeds with zero repetitions (empty match).
text = 'abc'
ret = re.match(r'\d*', text)
print('>>' + ret.group())  # >>

# + matches one or more repetitions.
text = '_ab#abc'
ret = re.match(r'\w+', text)
print('>>' + ret.group())  # >>_ab

# ? matches zero or one repetition.
text = '_ab#abc'
ret = re.match(r'\w?', text)
print('>>' + ret.group())  # >>_

# ? with zero repetitions (empty match).
text = '#ab#abc'
ret = re.match(r'\w?', text)
print('>>' + ret.group())  # >>

# {m} matches exactly m repetitions.
text = 'abcdabc'
ret = re.match(r'\w{3}', text)
print('>>' + ret.group())  # >>abc

# {m,n} matches between m and n repetitions (as many as possible).
text = 'abcdefg'
ret = re.match(r'\w{1,5}', text)
print('>>' + ret.group())  # >>abcde

# Validate a mobile number: starts with 1, second digit is one of
# 3/4/5/7/8, followed by nine more digits.
text = '13426262626'
ret = re.match(r'1[34578]\d{9}', text)
print('>>' + ret.group())  # >>13426262626

# Validate an e-mail address of the form xxxx@xx.xx.
text = 'hello@126.com'
ret = re.match(r'\w+@[a-z0-9]+\.[a-z]+', text)
print('>>' + ret.group())  # >>hello@126.com

# Validate a URL: http/https/ftp://xxxxxx.
# (The scheme alternation previously misspelled "https" as "htts", so
# https URLs could never match.)
text = 'http://www.baidu.com/'
ret = re.match(r'(https?|ftp)://[^\s]+', text)
print('>>' + ret.group())  # >>http://www.baidu.com/

# Validate an ID-card number: 17 digits followed by a digit or x/X.
text = '36011112222222222X'
ret = re.match(r'\d{17}[xX\d]', text)
print('>>' + ret.group())  # >>36011112222222222X

# ^ (caret) anchors at the start of the string; inside brackets it negates.
text = 'hello'
ret = re.search('^he', text)
print('>>' + ret.group())  # >>he

# $ anchors at the end of the string.
# The dot is escaped so that it only matches a literal '.'.
text = 'xxx@126.com'
ret = re.match(r'\w+@126\.com$', text)
print('>>' + ret.group())  # >>xxx@126.com

# | matches any one of several strings or sub-expressions.
text = 'http'
ret = re.match('ftp|http|https', text)
print('>>' + ret.group())  # >>http

# Greedy mode: + consumes as many characters as possible.
text = '0123456'
ret = re.match(r'\d+', text)
print('>>' + ret.group())  # >>0123456

# Non-greedy (lazy) mode: +? consumes as few characters as possible.
text = '0123456'
ret = re.match(r'\d+?', text)
print('>>' + ret.group())  # >>0

text = '<h1>标题</h1>'
ret = re.match('<.+?>', text)
print('>>' + ret.group())  # >><h1>

# Match an integer between 1 and 100.
# Accepted: 1, 2, 3, 10 ... 99, 100
# Rejected: 0, 09, 012, 101
text = '17'
ret = re.match(r'[1-9]\d?$|100$', text)  # \d? is one optional digit
print('>>' + ret.group())  # >>17

# \ is the escape character.
text = 'applepad is worth $9999'
ret = re.search(r'\$\d+', text)
print('>>' + ret.group())  # >>$9999

# Without raw strings a literal backslash has to be escaped twice: once
# for the Python string literal and once more for the regex engine.
text = '\\n'  # '\\n' == r'\n'
ret = re.search('\\\\n', text)
print('>>' + ret.group())  # >>\n

# The same match written with raw strings.
text = r'\n'  # '\\n' == r'\n'
ret = re.search(r'\\n', text)
print('>>' + ret.group())  # >>\n

# () creates capture groups.
text = 'applepad is worth $333, applewatch is worth $444'
ret = re.search(r'.*(\$\d+).*(\$\d+)', text)  # .* is zero or more of any character; .+ works too
print('>>' + ret.group())  # >>applepad is worth $333, applewatch is worth $444
print('>>' + ret.group(0))  # same as ret.group()
print('>>' + ret.group(1))  # >>$333
print('>>' + ret.group(2))  # >>$444
print(ret.group(1, 2))  # ('$333', '$444')
print(ret.groups())  # ('$333', '$444')

# findall returns every non-overlapping match as a list.
text = 'applepad is worth $333, applewatch is worth $444'
ret = re.findall(r'\$\d+', text)
print(ret)  # ['$333', '$444']

# sub replaces every match with the replacement string.
text = 'applepad is worth $333, applewatch is worth $444'
ret = re.sub(r'\$\d+', '0', text)
print(ret)  # applepad is worth 0, applewatch is worth 0

# Strip the tags and keep only the text content.
html = """
<dd class="job_bt">
    <h3 class="des>职位描述:</h3>
    <div>
        <p>职位1</p>
        <p>职位2</p>
        <p>职位3</p>
        <p>职位4</p>
    </div>
</dd>
"""
ret = re.sub('<.+?>', '', html)  # lazy, so each match stops at the first '>'
print(ret)

# split cuts the string at every match of the pattern.
text = 'hello world&ni hao shijie'
ret = re.split(' |&', text)
print(ret)  # ['hello', 'world', 'ni', 'hao', 'shijie']

text = 'hello world&ni hao shijie'
ret = re.split('[^a-zA-Z]', text)  # every non-letter acts as a separator
print(ret)  # ['hello', 'world', 'ni', 'hao', 'shijie']

# compile builds a reusable pattern object; call its methods directly.
text = 'the numbers is 50.12'
rex = re.compile(r'\d+\.?\d*')
ret = rex.search(text)
print(ret.group())  # 50.12

# re.VERBOSE lets a pattern span several lines and carry its own comments:
# digits before the point, an optional point, then digits after the point.
# Each part must sit on its own line, because a '#' comment runs to the
# end of the line.
text = 'the numbers is 50.12'
rex = re.compile(r"""
    \d+ #小数点前的一个或多个数字
    \.? #0个或1一个.
    \d* #小数后的0个或多个数字
""", re.VERBOSE)
ret = rex.search(text)
print(ret.group())  # 50.12
posted on 2021-12-29 17:29 Shine-Zhong 阅读(59) 评论(0) 编辑 收藏 举报