正则练习
# Regex practice: fetch one page of the Douban Top 250 movie list, extract the
# rank, title, rating and review count of every entry, and print the result.
import re
import ssl  # NOTE(review): not used below; kept because the script imported it
from urllib.request import urlopen  # opens a connection and reads the page source

# NOTE(review): HTTPS certificate verification is left at the interpreter
# default. The statement that used to sit here assigned
# ``ssl._create_default_https_context`` to itself, which has no effect, so
# verification was never actually disabled. Replacing the hook with
# ``ssl._create_unverified_context`` would switch verification off for every
# HTTPS request in the process; only do that deliberately.

PAGE_SIZE = 25  # the ``start`` query parameter advances by this much per page
PAGES = 1       # pages fetched per run (the original note mentions 10 pages)

# Raw strings keep ``\d`` a regex escape instead of an invalid string escape.
# re.S lets ``.`` span newlines, since one entry covers many lines of HTML.
_MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?'
    r'<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num".*?>(?P<rating_num>.*?)</span>.*?'
    r'<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def getPage(url):
    """Fetch *url* and return the response body decoded as UTF-8 text."""
    # The context manager closes the HTTP response even if read/decode raises.
    with urlopen(url) as response:
        return response.read().decode("utf-8")


def parsePage(s):
    """Extract movie entries from the page source *s*.

    Returns a list of ``(id, title, rating_num, comment_num)`` string tuples,
    one per match, in document order. ``comment_num`` is whatever text precedes
    "评价" inside the matched ``<span>``. Returns an empty list when nothing
    matches.
    """
    return _MOVIE_PATTERN.findall(s)


def main(num):
    """Fetch the list page whose ``start`` offset is *num* and print its entries."""
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)  # HTML source of the page
    ret = parsePage(response_html)
    print(ret)


if __name__ == "__main__":
    # Guarded so that importing this module does not trigger network requests.
    for start in range(0, PAGES * PAGE_SIZE, PAGE_SIZE):
        main(start)
写入文件版
# Write-to-file version: fetch pages of the Douban Top 250 movie list, extract
# each entry as a dict, and append one record per line to the file "move".
import re
import ssl  # NOTE(review): not used below; kept because the script imported it
from urllib.request import urlopen

# NOTE(review): HTTPS certificate verification is left at the interpreter
# default. The statement that used to sit here assigned
# ``ssl._create_default_https_context`` to itself, which has no effect, so
# verification was never actually disabled. Replacing the hook with
# ``ssl._create_unverified_context`` would switch verification off for every
# HTTPS request in the process; only do that deliberately.

PAGE_SIZE = 25  # the ``start`` query parameter advances by this much per page
PAGES = 5       # number of consecutive pages fetched per run

# Raw strings keep ``\d`` a regex escape instead of an invalid string escape.
# re.S lets ``.`` span newlines, since one entry covers many lines of HTML.
_MOVIE_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?'
    r'<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>'
    r'(?P<comment_num>.*?)评价</span>',
    re.S,
)


def getPage(url):
    """Fetch *url* and return the response body decoded as UTF-8 text."""
    # The context manager closes the HTTP response even if read/decode raises.
    with urlopen(url) as response:
        return response.read().decode("utf-8")


def parsePage(s):
    """Lazily yield one dict per movie entry found in the page source *s*.

    Each dict has the string keys ``id``, ``title``, ``rating_num`` and
    ``comment_num``, holding the text captured by the same-named regex groups.
    ``comment_num`` is whatever text precedes "评价" inside the matched
    ``<span>``. Nothing is yielded when the pattern does not match.
    """
    for match in _MOVIE_PATTERN.finditer(s):
        yield {
            "id": match.group("id"),
            "title": match.group("title"),
            "rating_num": match.group("rating_num"),
            "comment_num": match.group("comment_num"),
        }


def main(num):
    """Fetch the page at ``start`` offset *num* and append its entries to "move".

    Each entry is written as ``str(dict)`` followed by a newline.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    # Append mode: successive calls accumulate records in the same file. The
    # ``with`` block flushes and closes the file even if parsing raises.
    with open("move", "a", encoding="utf-8") as f:
        for obj in parsePage(response_html):
            f.write(str(obj) + "\n")


if __name__ == "__main__":
    # Guarded so that importing this module does not trigger network requests.
    # The offset advances by PAGE_SIZE each iteration; without that step every
    # call would re-fetch the first page and append duplicate records.
    for start in range(0, PAGES * PAGE_SIZE, PAGE_SIZE):
        main(start)