# 02 re模块 (the re module)
"""Notes on the ``re`` module, followed by a small movie-page scraper.

The first part of the file is a commented-out reference of common ``re``
calls. The runnable part downloads one movie detail page, extracts a few
fields with a regular expression and prints them.

Run as a script to perform the download; importing the module has no side
effects, so ``parse_movies`` can be used on HTML obtained elsewhere.
"""

import re
from urllib.request import urlopen

# ---------------------------------------------------------------------------
# Reference examples (kept commented out; uncomment to experiment).
# Patterns are written as raw strings so that "\d" is not treated as a
# (deprecated) string escape.
# ---------------------------------------------------------------------------
#
# findall: return every match as a list of strings.
# result = re.findall(r"\d+", "baby的电话号是: 185123456789")
# print(result)
#
# finditer: return an iterator of match objects.
# it = re.finditer(r"\d+", "baby123456789的电话号是: 185123456789")
# for el in it:
#     print(el.group())  # the matched text
#
# search: scan the string and return the first match as soon as it is found;
# returns None when nothing matches (so .group() would then fail).
# result = re.search(r"\d", "宝宝的电话是")
# print(result)
# print(result.group())
#
# match: only matches at the start of the string, as if the pattern began
# with ^.
# result = re.match(r"\d+", "157宝宝的电话是:")
# print(result.group())
#
# search vs. match: search looks anywhere and returns the first hit;
# match only tries at the beginning of the string.
#
# Pitfall, and a key tool for scraping: the lazy quantifier .*?
# result = re.finditer(r"姓名:(?P<name>.*?), 爱好:(?P<hobby>.*?),", "姓名:宝宝, 爱好:女,")
# for el in result:
#     print(el.group("name"), el.group("hobby"))
#
# Common regex operations.
# result = re.split(r"\d+", "宝宝110来找你了. 你回头收拾收拾去119报道")
# print(result)
#
# Substitution with a regex.
# s = re.sub(r"\d+", "__sb__", "宝宝110来找你了. 你回头收拾收拾去119报道")
# print(s)
# s = re.subn(r"\d+", "__sb__", "宝宝110来找你了. 你回頭收拾收拾去119报道")  # also reports how many replacements were made
# print(s)
#
# The builtin compile()/exec() pair, for comparison with re.compile().
# code = "for i in range(10): print(i)"
# c = compile(code, "", "exec")  # compile
# exec(c)  # run the compiled code
#
# re.compile: build the pattern once and reuse it.
# reg = re.compile(r"\d+")
# lst = reg.findall("呵呵, 宝宝才不去110呢.他要去120了")
# print(lst)
#
# re.findall(r"\d+", "呵呵, 宝宝才不去110呢.他要去120了")
#
# (?:...) turns the parentheses into a plain, non-capturing regex group
# instead of a Python capture group.
# lst = re.findall(r"a(?:\d+)c", "a123456c")
# print(lst)

# ---------------------------------------------------------------------------
# Scraper for a dytt8 movie detail page.
# ---------------------------------------------------------------------------

MOVIE_URL = "https://www.dytt8.net/html/gndy/dyzz/20181219/57954.html"

# re.S makes "." match newlines too, so ".*?" can span several lines of HTML.
# NOTE(review): the spacing inside the labels (e.g. between the two characters
# of the title label) is matched literally, exactly as written here - confirm
# it against the padding used by the live page.
MOVIE_PATTERN = re.compile(
    r'<div id="Zoom">.*?片 名(?P<name>.*?)<br />◎年 代(?P<years>.*?)<br />.*?◎上映日期(?P<date>.*?)<br />'
    r'.*?◎主 演(?P<main>.*?)◎简 介.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)">',
    re.S,
)


def fetch_page(url: str, encoding: str = "gbk", timeout: float = 30) -> str:
    """Download *url* and return its body decoded with *encoding*.

    The response is used as a context manager so the connection is always
    closed, and *timeout* (seconds) keeps a stalled server from blocking
    forever.

    Raises:
        urllib.error.URLError: if the request fails or times out.
        UnicodeDecodeError: if the body is not valid *encoding*.
    """
    with urlopen(url, timeout=timeout) as response:
        return response.read().decode(encoding)


def parse_movies(html: str) -> list:
    """Return one dict per movie block found in *html*.

    Each dict has the keys ``name``, ``years``, ``date``, ``main`` and
    ``download`` holding the raw captured text. An empty list is returned
    when the page does not contain a matching block.
    """
    return [found.groupdict() for found in MOVIE_PATTERN.finditer(html)]


def movie_lines(movie: dict) -> list:
    """Return the printable lines for one dict produced by ``parse_movies``.

    The cast list has its ``<br /> `` separators replaced by ``, `` so that it
    prints on a single line.
    """
    return [
        movie["name"],
        movie["years"],
        movie["date"],
        movie["main"].replace("<br /> ", ", "),
        movie["download"],
    ]


def main() -> None:
    """Fetch the movie page and print the extracted fields, one per line."""
    content = fetch_page(MOVIE_URL)
    for movie in parse_movies(content):
        for line in movie_lines(movie):
            print(line)


if __name__ == "__main__":
    main()