2-3-07-01 作业-爬虫邮箱
4. 作业
4.1 身份证号码
import re
# Sample text containing two 18-character ID numbers; the second ends in "X".
text = "dsf130429191912015219k13042919591219521Xkk"

# All patterns below are raw strings (r"...") so that "\d" reaches the regex
# engine unchanged instead of being parsed as an invalid Python string escape
# (a SyntaxWarning on recent Python versions).

# No capture group: findall returns every whole match.
# [\dX] is a character class -- exactly one character, either a digit or "X".
data_list = re.findall(r"\d{17}[\dX]", text)
print(data_list)
# ['130429191912015219', '13042919591219521X']

# One capture group: findall returns only the text captured by that group,
# i.e. just the final check character.
data_list = re.findall(r"\d{17}(\d|X)", text)
print(data_list)
# ['9', 'X']

# Nested groups: the outer group captures the whole number and the inner group
# the check character, so findall returns one (outer, inner) tuple per match.
data_list = re.findall(r"(\d{17}(\d|X))", text)
print(data_list)
# [('130429191912015219', '9'), ('13042919591219521X', 'X')]

# Several groups split the number into fixed-width fields of 6, 4, 2, 2 and 3
# digits followed by the check character; findall returns one tuple per match.
data_list = re.findall(r"(\d{6})(\d{4})(\d{2})(\d{2})(\d{3})([0-9]|X)", text)
print(data_list)
# [('130429', '1919', '12', '01', '521', '9'), ('130429', '1959', '12', '19', '521', 'X')]
4.2 手机号
import re
text = "我的手机哈是15133377892,你的手机号是1171123啊?"

# Pattern: a literal "1", then one digit in the range 3-9, then nine more
# digits -- 11 digits in total. "1171123" has only 7 digits, so it is skipped.
# The pattern is a raw string so "\d" is not parsed as a Python string escape.
data_list = re.findall(r"1[3-9]\d{9}", text)
print(data_list)
# ['15133377892']
4.3 邮箱地址
re.ASCII:让 \w、\d、\s 等只匹配 ASCII 字符,因此 \w 不再匹配中文。
import re
text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"

# All patterns are raw strings so "\w" and "\." reach the regex engine as-is.

# Without re.ASCII, \w also matches Chinese characters, so the first match
# runs on through "和xxxxx" and the second address is swallowed.
email_list = re.findall(r"\w+@\w+\.\w+", text)
print(email_list)
# ['442662578@qq.com和xxxxx']

# Explicit ASCII character classes stop the match at the first Chinese
# character. (re.ASCII has no effect on these classes; it is kept as written.)
email_list = re.findall(r"[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+", text, re.ASCII)
print(email_list)
# ['442662578@qq.com', 'xxxxx@live.com']

# Same result with \w once re.ASCII restricts it to [a-zA-Z0-9_].
email_list = re.findall(r"\w+@\w+\.\w+", text, re.ASCII)
print(email_list)
# ['442662578@qq.com', 'xxxxx@live.com']

text = "楼主太牛44266-2578@qq.com逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"

# A fuller pattern that allows "-", "+" and "." inside the local part and
# "-" and "." inside the domain. Because the pattern contains groups, findall
# returns one tuple per match: (whole address, group 2, group 3, group 4),
# with '' for a group that did not take part in the match.
email_list = re.findall(r"(\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*)", text, re.ASCII)
print(email_list)
# [('44266-2578@qq.com', '-2578', '', ''), ('442662578@qq.com', '', '', ''), ('xxxxx@live.com', '', '', '')]
爬虫邮箱
实现获取页面上的所有评论(已实现),并提取里面的邮箱。
# 先安装两个模块
pip3 install requests
pip3 install beautifulsoup4
import re
import requests
from bs4 import BeautifulSoup
# Download the discussion page, sending a browser-like User-Agent header.
res = requests.get(
    url="https://www.douban.com/group/topic/79870081/",
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    },
    # Without a timeout requests may wait forever on an unresponsive server.
    timeout=10,
)

# Parse the HTML and select the <p> elements with class "reply-content",
# which this exercise treats as the comments on the page.
bs_object = BeautifulSoup(res.text, "html.parser")
comment_object_list = bs_object.find_all("p", attrs={"class": "reply-content"})

for comment_object in comment_object_list:
    # .text is the element's text content with the HTML tags stripped.
    text = comment_object.text
    print(text)
    # TODO (exercise): add code here to extract the email addresses in text.
作业:
import re
import requests
from bs4 import BeautifulSoup
# Simple email pattern, compiled once outside the loop. re.ASCII keeps \w from
# matching Chinese characters adjacent to an address; the raw string keeps
# "\w" and "\." from being parsed as Python string escapes.
EMAIL_PATTERN = re.compile(r"\w+@\w+\.\w+", re.ASCII)

# Download the discussion page, sending a browser-like User-Agent header.
res = requests.get(
    url="https://www.douban.com/group/topic/79870081/",
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    },
    # Without a timeout requests may wait forever on an unresponsive server.
    timeout=10,
)

# Parse the HTML and select the <p> elements with class "reply-content",
# which this exercise treats as the comments on the page.
bs_object = BeautifulSoup(res.text, "html.parser")
comment_object_list = bs_object.find_all("p", attrs={"class": "reply-content"})

for comment_object in comment_object_list:
    text = comment_object.text
    # Extract the email addresses from this comment; print only non-empty results.
    email_list = EMAIL_PATTERN.findall(text)
    if email_list:
        print(email_list)