正则表达式
^ ```以某个开头```
$ 以某个结尾
.* ```.代表任意单个字符,*代表前面的某个出现任意多次,大于等于0, !!! .号默认不匹配\n(加re.S标志后才匹配)```
? 跟在量词(*、+、{ })后面时,让某个取消贪婪匹配,可以理解为改为从左到右匹配到某个为止;单独使用时表示某个出现0次或1次
+ 某个至少为一次,大于等于1
{ }例,{2,5},某个出现2到5次.....{2},{2,}等
| 或者(a|b),选a或者b
[ ]有三种意思,1.[13567]中括号的任选一个-------2.[0-9],[a-z]-------3.[.]就代表.号,不代表任意字符了
[^] ```例,[^1]非1```
[a-z]同上
\s空白字符(空格、\t、\n等)
\S非空白字符
\w代表[A-Za-z0-9_](Python3的str模式下默认还会匹配汉字等Unicode文字字符,加re.ASCII标志后才严格等于[A-Za-z0-9_])
\W代表非\w,即非[A-Za-z0-9_](同样受re.ASCII标志影响)
[\u4E00-\u9FA5]代表汉字
( )略
\d数字
例1
"""去贪婪匹配"""
import re

a = "zooooozzoz"
regex_str = ".*(z.*z).*"
# Compile once, then collect every capture-group hit in the sample string.
m = re.compile(regex_str).findall(a)
print(m)
# Output: ['zoz']
# The leading `.*` is greedy: it swallows as much of the string as it can
# while still leaving room for `z.*z`, so "zoooooz" and "zz" are consumed by
# that prefix and only the final "zoz" ends up in the group.
import re

c = "zooooozzozy"
regex_str = "(z.+?z).*"
# `.+?` is the lazy form of `.+`: it stops at the first `z` that completes
# the group instead of running on to the last one.
match_obj = re.compile(regex_str).match(c)
print(match_obj[1])
# Output: zoooooz
例2
import re

line1 = "你出生在2016-09-01"
line2 = "你出生在2016-9-1"
line3 = "你出生在2016/09/01"
line4 = "你出生在2016年9月1号"
line5 = "你出生在2016-09"
# Raw string: `\d` must reach the regex engine untouched. In a normal string
# literal `\d` is an invalid escape sequence (a warning on modern Python).
# Pattern layout: 4-digit year, a separator (- / 年), a 1-2 digit month, then
# either end-of-string or a separator (月 / -) plus a 1-2 digit day that is
# followed by end-of-string or 号.
regex_str = r"(你出生在\d{4}(-|/|年)\d{1,2}($|(月|/|-)\d{1,2}($|号)))"
match_obj = re.match(regex_str, line5)
if match_obj:
    print(match_obj.group(1))
# All five sample lines (line1..line5) match this pattern.
简单的正则爬虫
```
# -*- coding: utf-8 -*-
# @Author: Lai
import re
import os
import requests
BASE_PATH = "E:/EW/"
def get_block_link(url):
    """Download ``url`` and return all regex matches found in its HTML text.

    Args:
        url: Address of the page fetched with ``requests.get``.

    Returns:
        The list produced by ``re.findall`` over the response text.
    """
    # NOTE(review): this pattern has no capture group and matches only the
    # literal text, so it looks truncated — confirm against the original.
    pattern = '<img onerror'
    response = requests.get(url)
    # re.S only changes how `.` behaves, so it has no effect on this pattern.
    return re.findall(pattern, response.text, re.S)
得到标题和多章节的链接
def get_child_link(url):
    """Collect the first regex match from every page listed by ``get_block_link``.

    Args:
        url: Address passed straight through to ``get_block_link``.

    Returns:
        A list holding, for each entry yielded by ``get_block_link(url)``, the
        first ``re.findall`` result on that entry's downloaded text.
    """
    # NOTE(review): '.*?' can only ever match the empty string, so this
    # pattern looks truncated — confirm against the original.
    pattern = '.*?'
    return [
        re.findall(pattern, requests.get(block_url).text)[0]
        for block_url in get_block_link(url)
    ]
def get_charterName_and_content(url):
html_obj = requests.get(url)
html_obj.encoding = "GBK"
reg_content = 'id="content">(.*?)