正则表达式

  1 import re
  2 
  3 # match:从字符串的起始位置开始匹配
  4 # search:在字符串中找到第一个满足条件的字符串
  5 # 匹配某个字符串
  6 text = 'hello'
  7 ret = re.match('he', text)
  8 print('>>' + ret.group())  # >>he
  9 
 10 # 点.匹配任意的一个字符
 11 text = 'hello'
 12 ret = re.match('.ello', text)
 13 print('>>' + ret.group())  # >>hello
 14 
 15 # .+匹配一个或多个任意字符(换行符除外)
 16 text = 'hello'
 17 ret = re.match('.+llo', text)
 18 print('>>' + ret.group())  # >>hello
 19 
 20 # \d匹配任意的一个数字
 21 text = '1234'
 22 ret = re.match('\d', text)
 23 print('>>' + ret.group())  # >>1
 24 
 25 # \D匹配任意的一个非数字
 26 text = 'f1234'
 27 ret = re.match('\D', text)
 28 print('>>' + ret.group())  # >>f
 29 
 30 # \s匹配空白字符(\n,\t,\r,空格)
 31 text = ' ab'
 32 ret = re.match('\s', text)
 33 print('>>' + ret.group())  # >>
 34 
 35 # \w匹配的是a-z、A-Z以及数字和下划线
 36 text = 'Zab'
 37 ret = re.match('\w', text)
 38 print('>>' + ret.group())  # >>Z
 39 
 40 # []的形式代替\w
 41 text = '_Zab_'
 42 ret = re.match('[a-zA-Z0-9_]', text)
 43 print('>>' + ret.group())  # >>_
 44 
 45 # \W 与 \w相反
 46 text = '$+'
 47 ret = re.match('\W', text)
 48 print('>>' + ret.group())  # >>$
 49 
 50 # []组合的方式,只要满足括号中的字符,就可以匹配
 51 text = '0733-888888abc'
 52 ret = re.match('[\d\-]+', text)
 53 print('>>' + ret.group())  # >>0733-888888
 54 
 55 # 中括号的形式代替\d
 56 text = '09345'
 57 ret = re.match('[0-9]+', text)
 58 print('>>' + ret.group())  # >>09345
 59 
 60 # [^...]非中括号里面的字符
 61 text = 'abc45'
 62 ret = re.match('[^0-9]+', text)
 63 print('>>' + ret.group())  # >>abc
 64 
 65 # [^...]非中括号里面的字符
 66 text = '$@!_'
 67 ret = re.match('[^a-zA-Z0-9_]+', text)
 68 print('>>' + ret.group())  # >>$@!
 69 
 70 # *匹配0个或多个字符
 71 text = '0731ab31'
 72 ret = re.match('\d*', text)
 73 print('>>' + ret.group())  # >>0731
 74 
 75 # *匹配0个或多个字符
 76 text = 'abc'
 77 ret = re.match('\d*', text)
 78 print('>>' + ret.group())  # >>
 79 
 80 # +匹配一个或多个字符
 81 text = '_ab#abc'
 82 ret = re.match('\w+', text)
 83 print('>>' + ret.group())  # >>_ab
 84 
 85 # ?匹配0个或一个字符
 86 text = '_ab#abc'
 87 ret = re.match('\w?', text)
 88 print('>>' + ret.group())  # >>_
 89 
 90 # ?匹配0个或一个字符
 91 text = '#ab#abc'
 92 ret = re.match('\w?', text)
 93 print('>>' + ret.group())  # >>
 94 
 95 # {m}匹配m个字符
 96 text = 'abcdabc'
 97 ret = re.match('\w{3}', text)
 98 print('>>' + ret.group())  # >>abc
 99 
100 # {m,n}匹配m-n个字符
101 text = 'abcdefg'
102 ret = re.match('\w{1,5}', text)
103 print('>>' + ret.group())  # >>abcde
104 
105 # 验证手机号码,1开头,第二位数3/4/5/7/8,再接着后面9位数
106 text = '13426262626'
107 ret = re.match('1[34578]\d{9}', text)
108 print('>>' + ret.group())  # >>13426262626
109 
110 # 验证邮箱,格式xxxx@xx.xx
111 text = 'hello@126.com'
112 ret = re.match('\w+@[a-z0-9]+\.[a-z]+', text)
113 print('>>' + ret.group())  # >>hello@126.com
114 
115 # 验证URL,http/https/ftp://xxxxxx
116 text = 'http://www.baidu.com/'
117 ret = re.match('(http|https|ftp)://[^\s]+', text)
118 print('>>' + ret.group())  # >>http://www.baidu.com/
119 
120 # 验证身份证号码
121 text = '36011112222222222X'
122 ret = re.match('\d{17}[xX\d]', text)
123 print('>>' + ret.group())  # >>36011112222222222X
124 
125 # ^脱字号,以xxx开始,在中括号里代表取反的作用
126 text = 'hello'
127 ret = re.search('^he', text)
128 print('>>' + ret.group())  # >>he
129 
130 # $:表示以xxx结尾
131 text = 'xxx@126.com'
132 ret = re.match('\w+@126.com$', text)
133 print('>>' + ret.group())  # >>xxx@126.com
134 
135 # |匹配多个字符串或者表达式
136 text = 'http'
137 ret = re.match('ftp|http|https', text)
138 print('>>' + ret.group())  # >>http
139 
140 # 贪婪模式,+尽量匹配多个字符
141 text = '0123456'
142 ret = re.match('\d+', text)
143 print('>>' + ret.group())  # >>0123456
144 
145 # 非贪婪模式,只匹配最小的条件字符
146 text = '0123456'
147 ret = re.match('\d+?', text)
148 print('>>' + ret.group())  # >>0
149 
150 text = '<h1>标题</h1>'
151 ret = re.match('<.+?>', text)
152 print('>>' + ret.group())  # >><h1>
153 
154 # 匹配1-100之间的整数
155 # 可以出现1,2,3,10....99,100
156 # 不可以出现0,09,012,101
157 text = '17'
158 ret = re.match('[1-9]\d?$|100$', text)  # \d?一位数字,且这个数字可有可无
159 print('>>' + ret.group())  # >>17
160 
161 # \转义字符
162 text = 'applepad is worth $9999'
163 ret = re.search('\$\d+', text)
164 print('>>' + ret.group())  # >>$9999
165 
166 text = '\\n'  # '\\n'== r'\n'
167 ret = re.search('\\\\n', text)
168 print('>>' + ret.group())  # >>\n
169 
170 # 使用r原生
171 text = r'\n'  # '\\n'== r'\n'
172 ret = re.search(r'\\n', text)
173 print('>>' + ret.group())  # >>\n
174 
175 # ()分组
176 text = 'applepad is worth $333, applewatch is worth $444'
177 ret = re.search('.*(\$\d+).*(\$\d+)', text)  # .*代表0个或多个任意字符,用.+也可以
178 print('>>' + ret.group())  # >>applepad is worth $333, applewatch is worth $444
179 print('>>' + ret.group(0))  # >>applepad is worth $333, applewatch is worth $444 等价于ret.group()
180 print('>>' + ret.group(1))  # >>$333
181 print('>>' + ret.group(2))  # >>$444
182 print(ret.group(1, 2))  # ('$333', '$444')
183 print(ret.groups())  # ('$333', '$444')
184 
185 # findall函数
186 text = 'applepad is worth $333, applewatch is worth $444'
187 ret = re.findall('\$\d+', text)
188 print(ret)  # >> ['$333', '$444']
189 
190 # sub函数
191 text = 'applepad is worth $333, applewatch is worth $444'
192 ret = re.sub('\$\d+', '0', text)
193 print(ret)  # >> applepad is worth 0, applewatch is worth 0
194 
195 # 剔除标签,只保留文字内容
196 html = """
197 <dd class="job_bt">
198     <h3 class="des>职位描述:</h3>
199     <div>
200         <p>职位1</p>
201         <p>职位2</p>
202         <p>职位3</p>
203         <p>职位4</p>
204     </div>
205 </dd>
206 """
207 ret = re.sub('<.+?>', '', html)  # 非贪婪模式
208 print(ret)
209 
210 # split函数
211 text = 'hello world&ni hao shijie'
212 ret = re.split(' |&', text)
213 print(ret)  # >> ['hello', 'world', 'ni', 'hao', 'shijie']
214 
215 text = 'hello world&ni hao shijie'
216 ret = re.split('[^a-zA-Z]', text)  # 非英文字符当作分隔符
217 print(ret)  # >> ['hello', 'world', 'ni', 'hao', 'shijie']
218 
219 # compile函数,编译后放入内存
220 text = 'the numbers is 50.12'
221 rex = re.compile('\d+\.?\d*')  #
222 ret = re.search(rex, text)
223 print(ret.group())  # 50.12
224 
225 # 正则表达式加注释
226 text = 'the numbers is 50.12'
227 rex = re.compile(r"""
228     \d+ #小数点前的一个或多个数字
229     \.? #0个或1个.
230     \d* #小数后的0个或多个数字
231     """, re.VERBOSE)
232 ret = re.search(rex, text)
233 print(ret.group())  # 50.12

 

posted on 2021-12-29 17:29  Shine-Zhong  阅读(59)  评论(0)  编辑  收藏  举报

导航