python基础之re模块
就其本质而言,正则表达式(或 RE)是一种小型的、高度专业化的编程语言,它内嵌在Python中,并通过 re 模块实现。正则表达式模式被编译成一系列的字节码,然后由用 C 编写的匹配引擎执行。
正则就是用来对字符串进行操作的。
爬虫里会大量用到字符串,要处理的数据基本上都是字符串。
正则表达式是模糊匹配,这就是正则表达式的真正关键所在。
匹配是一个一个对应的关系,匹配上就放进自己的列表中。
字符匹配(普通字符,元字符):
1 普通字符:大多数字符和字母都会和自身匹配
>>> re.findall('alvin','yuanaleSxalvinwupeiqi')
['alvin']
2 元字符:. ^ $ * + ? { } [ ] | ( ) \ #共11个元字符
def findall(pattern, string, flags=0): """Return a list of all non-overlapping matches in the string. If one or more capturing groups are present in the pattern, return a list of groups; this will be a list of tuples if the pattern has more than one group. Empty matches are included in the result.""" return _compile(pattern, flags).findall(string)
re.findall(pattern,string) #找到所有的匹配元素,返回列表
(1) . : 匹配除\n以外的任意符号
print(re.findall("a.+d","abcd"))
(2)^ :从字符串开始位置匹配
print(re.findall("^luchuan","luchuan123asd"))
(3)* + ? {} :重复
print(re.findall("[0-9]{4}","asd1231asd123")) print(re.findall("[0-9]{1,}","asd1231asd123"))
贪婪匹配: #用得比较多
print(re.findall("\d+","af5324jh523hgj34gkhg53453"))
非贪婪匹配:
print(re.findall("\d+?","af5324jh523hgj34gkhg53453")) print(re.findall("\d","af5324jh523hgj34gkhg53453"))
(4)字符集[]:匹配方括号内的任意一个字符,起"或者"的作用
print(re.findall("a[bc]d","hasdabdjhacd"))
注意:在字符集[]中,* + . 等元字符都是普通符号,只有 - ^ \ 仍然有特殊含义:
print(re.findall("[0-9]+","dash234sdfj223")) print(re.findall("\d+","dash234sdfj223")) print(re.findall("[a-z]+","dash234sdfj223")) print(re.findall("[^2]","d2a2")) print(re.findall("[^\d]","d2a2")) print(re.findall("[^\d]+","d2a24sdf2ff23df21sfsf32d2d21d"))
(5)():分组
print(re.findall("(ad)+","addd"))
print(re.findall("(ad)+luchuan","adddluchuangfsdui"))
print(re.findall("(ad)+luchuan","adadluchuangfsdui")) #adadluchuan都匹配到了,但是只把ad放进列表里了
print(re.findall("(?:ad)+luchuan","adadluchuangfsdui")) #取消组内优先级,将返回所有匹配到的内容
print(re.findall("(\d)+luchuan","ad12343luchuangfs234dui"))
print(re.findall("(?:\d)+luchuan","ad12343luchuangfs234dui"))
命名分组:
ret=re.findall(r"\w+\.aticles\.\d{2}","lu.aticles.1234") print(ret) ret=re.findall(r"(\w+)\.aticles\.(\d{2})","lu.aticles.1234") print(ret) ret=re.search(r"(?P<author>\w+)\.aticles\.(?P<id>\d{2})","lu.aticles.1234") #命名分组,可以通过别名来取值 print(ret.group("id")) print(ret.group("author"))
(6)| : 或
print(re.findall("www\.(oldboy|baidu)\.com","www.oldboy.com")) #不命名分组 print(re.findall("www\.(?:oldboy|baidu)\.com","www.oldboy.com"))
(7)\ : 转义
1 后面加一个元字符使其变成普通符号 \. \*
2 将一些普通符号变成特殊符号 比如 \d \w
print(re.findall("-?\d+\.?\d*\*\d+\.?\d*","-2*6+7*45+1.4*3-8/4"))
print(re.findall("\w","$da@s4 234"))
print(re.findall("a\sb","a badf"))
print(re.findall(r"\bI","hello I am LIA")) #ASCII码中有\b字符(退格符),所以需要使用原生字符串
print(re.findall("\\bI","hello I am LIA"))
print(re.findall(r"\bI","hello$I am LIA"))
print(re.findall("c\\\\l","abc\l")) #python解释器默认会把\\解释成\,re模块又会把\\解释成\,所以需要四个
print(re.findall(r"c\\l","abc\l")) #r前缀告诉python解释器不要对反斜杠进行转义,原样交给re模块处理。
print(re.findall("\d+\.?\d*\*\d+\.?\d*","3.5*22+3*2+4.5*33-8+2"))
re的方法:
s=re.finditer("\d+","ad324das32") print(s) print(next(s).group()) #next后只是个对象,还需要进行操作 print(next(s).group())
search:只匹配第一个结果
ret=re.search("\d","jksf34asd3") #使用search做计算器
print(ret)
print(ret.group()) #通过group()取值,返回None的话说明匹配未成功
match:只在字符串开始的位置匹配
ret=re.match("\d+","432jksf34asd3") print(ret) print(ret.group())
split:拆分
s2=re.split("\d+","fh233jfd324sfsa213190sdf",2) print(s2) ret3=re.split("l","hello luchuan") print(ret3)
re.sub:替换
ret4=re.sub("\d+","A","hello 234jkhh23") ret4=re.sub("\d+","A","hello 234jkhh23",1) print(ret4)
re.subn:替换,并返回由(替换后的字符串, 替换次数)组成的元组
ret4=re.subn("\d+","A","hello 234jkhh23") print(ret4)
compile :编译方法,只匹配一次的话,没什么意义,用同一个模式匹配多个字符串时就有意义了
c=re.compile("\d+") ret5=c.findall("hello32world53") print(ret5)
链接:http://www.cnblogs.com/yuanchenqi/articles/5732581.html