036 re模块的小练习
1.匹配标签
import re

# Exercise 1: match an HTML-like element whose closing tag equals its opening tag.

# (?P<tag_name>...) gives the group a name and (?P=tag_name) demands the very
# same text again, so '<h1>...</h2>' would not match.
# The pattern is a raw string so that \w reaches the regex engine untouched
# instead of being treated as an (invalid) string escape.
ret = re.search(r'<(?P<tag_name>\w+)>\w+</(?P=tag_name)>', '<h1>hello</h1>')
# The text captured by a named group is read back with group('name').
print(ret.group())
print(ret.group('tag_name'))

# -------------------------
# Without a name, \1 refers back to the first group by position: the text
# found there must repeat what that group matched.
ret = re.search(r'<(\w+)>\w+</\1>', '<h1>hello</h1>')
# The text captured by a numbered group is read back with group(number).
print(ret.group())
print(ret.group(1))
2.匹配整数
import re

# Exercise 2: pull the integers out of an arithmetic expression.

# Naive attempt: every run of digits, so signs are lost and 40.35 is split
# into '40' and '35'.
ret = re.findall(r'\d+', "1-2*(60+(-40.35/5)-(-4*3))")
print(ret)

# The float alternative comes first and is NOT inside the capture group, so a
# float is consumed but reported by findall as ''. Integers are captured by
# the group and reported as their text.
ret = re.findall(r'-?\d+\.\d*|(-?\d+)', "1-2*(60+(-40.35/5)-(-4*3))")
print(ret)

# Drop every '' placeholder left by a float. list.remove('') would delete only
# the first one and raise ValueError when the expression contains no float.
ret = [number for number in ret if number]
print(ret)
3.数字匹配
import re

# Exercise 3: number matching.
# Stated goal: match the date string on each line of a text, e.g. '1990-07-12'.
# The patterns below cover the individual pieces (month, day) plus two
# unrelated drills (QQ number, float).

# Month of the year: 1-9 with an optional leading zero, or 10-12.
MONTH_RE = re.compile(r'^(0?[1-9]|1[0-2])$')
# Day of the month: 1-9 with an optional leading zero, 10-29, 30 or 31.
DAY_RE = re.compile(r'^((0?[1-9])|((1|2)[0-9])|30|31)$')
# QQ number: a non-zero first digit followed by 4 to 10 more digits.
QQ_RE = re.compile(r'^[1-9][0-9]{4,10}$')
# Float: optional minus sign, digits, optional decimal point, optional digits.
FLOAT_RE = re.compile(r'-?\d+\.?\d*')


def ask_until_valid(pattern, prompt, ok_text, bad_text):
    """Prompt repeatedly until the typed text matches *pattern*.

    Prints *bad_text* after every rejected entry and *ok_text* after the
    accepted one, then returns the accepted text. Returning on success is
    what lets the rest of the script run; an endless loop here would make
    everything after it unreachable.
    """
    while True:
        cmd = input(prompt)
        if pattern.match(cmd):
            print(ok_text)
            return cmd
        print(bad_text)


if __name__ == '__main__':
    # 1. Validate a month typed by the user.
    ask_until_valid(MONTH_RE, '请输入月份:>>', '格式正确', '格式错误')

    # 2. The same month pattern against a fixed value.
    ret = MONTH_RE.match('11')
    print(ret.group())

    # The 31 days of a month (match() anchors at the start of the string).
    ret = DAY_RE.match('31')
    print(ret.group())

    # Validate a QQ number typed by the user.
    ask_until_valid(QQ_RE, '请输入你扣扣号:>>', '输入正确', '输入错误')

    # Floating-point number.
    ret = FLOAT_RE.match('21.5')
    print(ret.group())
4.爬虫练习
import json
import re

# One match per '<div class="item">' entry of the listing page. Named groups:
#   id          - digits right after the <em ...> tag (presumably the ranking
#                 number; confirm against the live page)
#   title       - text of the first <span class="title">
#   rating_num  - text of <span class="rating_num" ...>
#   comment_num - text preceding "评价" inside the following plain <span>
# re.S lets '.' cross newlines, since one entry spans many lines of HTML.
# Raw strings keep \d from being read as an invalid string escape.
_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def getPage(url, timeout=10):
    """Download *url* and return the response body as text.

    *timeout* (seconds) is handed to ``requests.get`` so that an unresponsive
    server cannot block the script forever.
    """
    # Imported here because only this function needs the third-party package;
    # parsePage can then be used on saved HTML without requests installed.
    import requests

    response = requests.get(url, timeout=timeout)
    return response.text


def parsePage(s):
    """Yield one dict per movie entry found in the HTML text *s*.

    Each dict has the string-valued keys "id", "title", "rating_num" and
    "comment_num", taken from the named groups of the item pattern. Nothing
    is yielded when *s* contains no matching entry.
    """
    for match in _ITEM_PATTERN.finditer(s):
        yield {
            "id": match.group("id"),
            "title": match.group("title"),
            "rating_num": match.group("rating_num"),
            "comment_num": match.group("comment_num"),
        }


def main(num):
    """Fetch the listing page at offset *num* and append its entries to disk.

    Every parsed entry is printed and appended to the file "move_info7" as a
    single JSON line (non-ASCII characters are written as-is).
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    # Prints the generator object's repr, not the parsed entries.
    print(ret)

    # The context manager closes the file even if parsing or writing fails.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + "\n")


if __name__ == '__main__':
    # Ten pages of 25 entries each: offsets 0, 25, ..., 225.
    for start in range(0, 250, 25):
        main(start)