Scraping Email Addresses with Python
The following code implements a crawler in Python that scrapes email addresses from websites.
""" 网络爬虫爬邮箱 """ from bs4 import BeautifulSoup import requests import requests.exceptions from urllib.parse import urlsplit from collections import deque import re import os import csv class EmailCrawler: """ 邮箱爬虫 """ # 邮箱正则表达式 __email_addr_pattern = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" def crawl(self, urls): """ 爬取 \n参数: urls - 网址列表或者文件(.txt,.csv) """ new_urls = deque() # 网址列表 processed_urls = set() # 已爬的网址 emails = set() # 邮箱地址 if type(urls) is deque: new_urls = urls elif type(urls) is list: new_urls = deque(urls) elif type(urls) is str: data = list() if os.path.exists(urls): data = self.__readCSVData(urls) else: data = urls.split(',') new_urls = deque(data) else: print("不支持的参数!") return emails """ 开始爬取 """ # 遍历网址直到结束 while len(new_urls): # 从队列头部推出一个网址 url = new_urls.popleft() processed_urls.add(url) # 提取基本网址与路径已解决相对链接 parts = urlsplit(url) base_url = "{0.scheme}://{0.netloc}".format(parts) path = url[:url.rfind('/')+1] if '/' in parts.path else url # 获取网址内容 print("Processing %s" %url) try: response = requests.get(url) except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError): # 忽略页面错误 continue # 提取页面中的所有email,并且将它们添加到结果集 new_emails = set(re.findall(self.__email_addr_pattern, response.text, re.I)) if len(new_emails) > 0: emails.update(new_emails) print(new_emails) # 给文档创建beautiful soup soup = BeautifulSoup(response.text, features="lxml") # 找到并处理文档中所有的锚 for anchor in soup.find_all('a'): # 从锚中提取链接 link = anchor.attrs['href'] if 'href' in anchor.attrs else '' # 处理内部链接 if link.startswith('/'): link = base_url + link elif not link.startswith('http'): link = path + link # 添加新链接 if not link in new_urls and not link in processed_urls: new_urls.append(link)return emails def __readCSVData(self, filename): """ 读取文件 """ data = list() with open(filename, 'r') as f: f_csv = csv.reader(f) for row in f_csv: data.append(row[0]) return dataif __name__ == '__main__': # urls = 'http://www.themoscowtimes.com' # urls = ['http://www.themoscowtimes.com'] urls = 'urls.txt' emailCrawl = EmailCrawler() emails = emailCrawl.crawl(urls)
The code above is adapted from: http://scraping.pro/simple-email-crawler-python/
There is also an open-source project for extracting email addresses from text: https://pypi.org/project/email-scraper/
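As a minimal sketch of how that project can be used, assuming the scrape_emails function documented on its PyPI page, it extracts addresses from text already in hand (here paired with requests to fetch a page first):

import requests
from email_scraper import scrape_emails  # pip install email-scraper

# Fetch a page and extract email addresses from its raw text;
# scrape_emails is assumed from the project's documentation.
html = requests.get('http://www.themoscowtimes.com').text
print(scrape_emails(html))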