Crawling Email Addresses with Python

This post implements a crawler in Python that extracts email addresses from websites. The code is shown below.

""" 网络爬虫爬邮箱 """

from bs4 import BeautifulSoup
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
import re
import os
import csv

class EmailCrawler:
    """ 邮箱爬虫 """

    # Regular expression for email addresses
    __email_addr_pattern = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
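    # e.g. the pattern picks up addresses such as "support@example.com" or "first.last+tag@mail.example.org";
    # it is applied case-insensitively below via re.I, so uppercase addresses are caught as well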

    def crawl(self, urls):
        """ 
        爬取
        \n参数: urls - 网址列表或者文件(.txt,.csv)
         """
        new_urls = deque()      # URLs waiting to be crawled
        processed_urls = set()  # URLs already crawled
        emails = set()          # collected email addresses

        if type(urls) is deque:
            new_urls = urls
        elif type(urls) is list:
            new_urls = deque(urls)
        elif type(urls) is str:
            data = list()
            if os.path.exists(urls):
                data = self.__readCSVData(urls)
            else:
                data = urls.split(',')
            new_urls = deque(data)
        else:
            print("不支持的参数!")
            return emails        

        """ 开始爬取 """
        # 遍历网址直到结束
        while len(new_urls):
            # Pop the next URL from the front of the queue
            url = new_urls.popleft()
            processed_urls.add(url)

            # Extract the base URL and current path so relative links can be resolved
            parts = urlsplit(url)
            base_url = "{0.scheme}://{0.netloc}".format(parts)
            path = url[:url.rfind('/')+1] if '/' in parts.path else url
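            # e.g. for url = "http://example.com/a/page.html": base_url = "http://example.com", path = "http://example.com/a/"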

            # Fetch the page content
            print("Processing %s" % url)
            try:
                response = requests.get(url)
            except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
                # Ignore pages that cannot be fetched
                continue

            # Extract all email addresses on the page and add them to the result set
            new_emails = set(re.findall(self.__email_addr_pattern, response.text, re.I))
            if len(new_emails) > 0:
                emails.update(new_emails)
                print(new_emails)

            # Create a BeautifulSoup object for the document
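            # note: features="lxml" needs the third-party lxml parser installed; the built-in "html.parser" also works here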
            soup = BeautifulSoup(response.text, features="lxml")

            # Find and process all anchors in the document
            for anchor in soup.find_all('a'):
                # Extract the link target from the anchor
                link = anchor.attrs['href'] if 'href' in anchor.attrs else ''
                # Resolve relative links against the base URL or current path
                if link.startswith('/'):
                    link = base_url + link
                elif not link.startswith('http'):
                    link = path + link
        
                # Queue the link if it has not been seen yet
                if link not in new_urls and link not in processed_urls:
                    new_urls.append(link)

        return emails

    def __readCSVData(self, filename):
        """ 读取文件 """
        data = list()
        with open(filename, 'r') as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                data.append(row[0])
        return data


if __name__ == '__main__':

    # urls = 'http://www.themoscowtimes.com'
    # urls = ['http://www.themoscowtimes.com']
    urls = 'urls.txt'
    emailCrawl = EmailCrawler()
    emails = emailCrawl.crawl(urls)
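
The emails set returned by crawl() is not saved anywhere in the script above. A minimal sketch of persisting it with the csv module follows; the helper name save_emails and the file name emails.csv are illustrative choices, not part of the original code:

import csv  # already imported at the top of the script

def save_emails(emails, filename='emails.csv'):
    """ Write the collected addresses to a CSV file, one address per row """
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        for email in sorted(emails):
            writer.writerow([email])

# e.g. save_emails(emails) after the crawl above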

The code above is adapted from: http://scraping.pro/simple-email-crawler-python/

There is also an open-source project for extracting email addresses from text: https://pypi.org/project/email-scraper/
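
A rough usage sketch of that package is shown below. It assumes email-scraper exposes a scrape_emails(text) function as outlined on its PyPI page, so treat the exact import and function name as assumptions rather than a verified API:

import requests
from email_scraper import scrape_emails  # assumption: function name taken from the project's PyPI description

html = requests.get('http://www.themoscowtimes.com').text  # example URL reused from the script above
print(scrape_emails(html))  # assumption: returns the addresses found in the text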

posted @ 2020-04-10 13:11  pry_up