2-3-07-01 作业-爬虫邮箱

4. 作业

4.1 身份证号码

   # 4.1 demo: extract 18-character ID numbers (17 digits + a final digit or "X").
# The code below sits at column 0 so the snippet runs as-is; indented top-level
# statements raise IndentationError.
import re

text = "dsf130429191912015219k13042919591219521Xkk"

# No capture group: findall returns each whole match.
# [\dX] is a character class (like [abc]): one digit or a literal "X".
# Patterns are raw strings so "\d" reaches the regex engine untouched instead
# of being parsed as an (invalid) string escape.
data_list = re.findall(r"\d{17}[\dX]", text)
print(data_list)
# ['130429191912015219', '13042919591219521X']

# One capture group: findall returns only what the group captured.
data_list = re.findall(r"\d{17}(\d|X)", text)
print(data_list)
# ['9', 'X']

# Nested groups: findall returns one tuple per match, outer group first.
data_list = re.findall(r"(\d{17}(\d|X))", text)
print(data_list)
# [('130429191912015219', '9'), ('13042919591219521X', 'X')]

# Several groups: split each number into 6 + 4 + 2 + 2 + 3 + 1 characters.
data_list = re.findall(r"(\d{6})(\d{4})(\d{2})(\d{2})(\d{3})([0-9]|X)", text)
print(data_list)
# [('130429', '1919', '12', '01', '521', '9'), ('130429', '1959', '12', '19', '521', 'X')]

4.2 手机号

# 4.2 demo: pull mobile phone numbers out of free text.
import re

text = "我的手机哈是15133377892,你的手机号是1171123啊?"

# 11 characters in total: a leading "1", a second digit in 3-9, then nine more
# digits. "1171123" is rejected: its second digit is 1 and it is too short.
# Raw string so "\d" is passed to the regex engine rather than being treated
# as an (invalid) string escape.
data_list = re.findall(r"1[3-9]\d{9}", text)
print(data_list)
# ['15133377892']

4.3 邮箱地址

re.ASCII:加上该标志后,\w、\d 等只匹配 ASCII 字符,不再匹配中文。

# 4.3 demo: extract email addresses from mixed Chinese/ASCII text.
# All patterns are raw strings so "\w" and "\." reach the regex engine
# untouched instead of being parsed as (invalid) string escapes.
import re

text = "楼主太牛逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"

# Without re.ASCII, \w also matches Chinese characters, so the first address,
# the character between the two addresses and the second local part merge
# into one wrong match.
email_list = re.findall(r"\w+@\w+\.\w+", text)
print(email_list)
# ['442662578@qq.com和xxxxx']


# Option 1: spell out the allowed ASCII characters explicitly.
email_list = re.findall(r"[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+", text, re.ASCII)
print(email_list)
# ['442662578@qq.com', 'xxxxx@live.com']

# Option 2: keep \w and let re.ASCII restrict it to [a-zA-Z0-9_].
email_list = re.findall(r"\w+@\w+\.\w+", text, re.ASCII)
print(email_list)
# ['442662578@qq.com', 'xxxxx@live.com']


# A fuller pattern that also allows "-", "+" and "." inside the local part and
# "-" / "." inside the domain. Because it contains capture groups, findall
# returns one tuple per match: (whole address, group 2, group 3, group 4),
# with '' for groups that did not take part in the match.
text = "楼主太牛44266-2578@qq.com逼了,在线想要 442662578@qq.com和xxxxx@live.com谢谢楼主,手机号也可15131255789,搞起来呀"
email_list = re.findall(r"(\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*)", text, re.ASCII)
print(email_list)
# [('44266-2578@qq.com', '-2578', '', ''), ('442662578@qq.com', '', '', ''), ('xxxxx@live.com', '', '', '')]

爬虫邮箱

实现获取页面上的所有评论(已实现),并提取里面的邮箱。

   # Install these two modules first
   pip3 install requests
   pip3 install beautifulsoup4
import re
import requests
from bs4 import BeautifulSoup

# Exercise template: download one Douban group topic page and print the text
# of every comment on it. Extracting the emails is left to the reader.
res = requests.get(
    url="https://www.douban.com/group/topic/79870081/",
    # Send a desktop-browser User-Agent header with the request.
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    },
    # requests has no default timeout; without one a stalled connection
    # blocks the script forever.
    timeout=10,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
res.raise_for_status()

bs_object = BeautifulSoup(res.text, "html.parser")
# Select every <p class="reply-content"> element and walk over them.
comment_object_list = bs_object.find_all("p", attrs={"class": "reply-content"})
for comment_object in comment_object_list:
    text = comment_object.text
    print(text)
    # TODO: continue here and extract the email addresses from text

作业:

import re
import requests
from bs4 import BeautifulSoup

# Matches complete addresses, including "-", "+" and "." inside the local part
# and "-" / "." inside the domain, e.g. "44266-2578@qq.com" (the simpler
# r"\w+@\w+\.\w+" would cut that down to "2578@qq.com").
# Groups are non-capturing so findall returns whole addresses as strings;
# re.ASCII stops \w from matching Chinese characters next to an address.
# Compiled once here because it is reused for every comment in the loop.
EMAIL_PATTERN = re.compile(
    r"\w+(?:[-+.]\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*",
    re.ASCII,
)

# Download one Douban group topic page.
res = requests.get(
    url="https://www.douban.com/group/topic/79870081/",
    # Send a desktop-browser User-Agent header with the request.
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    },
    # requests has no default timeout; without one a stalled connection
    # blocks the script forever.
    timeout=10,
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
res.raise_for_status()

bs_object = BeautifulSoup(res.text, "html.parser")
# Select every <p class="reply-content"> element and scan each one for emails.
comment_object_list = bs_object.find_all("p", attrs={"class": "reply-content"})
for comment_object in comment_object_list:
    text = comment_object.text
    email_list = EMAIL_PATTERN.findall(text)
    # Only print comments that actually contain at least one address.
    if email_list:
        print(email_list)
posted @ 2022-10-26 23:34  布丁家的苏苏  Views(2)  Comments(0)    收藏  举报