【python】python每天抓取一篇英语美文,发送到邮箱

import requests,os,time
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText

# Browser-like User-Agent passed as the request headers when fetching listing pages.
header = {
    'User-Agent': ' '.join((
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/67.0.3396.99',
        'Safari/537.36',
    )),
}
# Walk the listing pages and hand the first not-yet-recorded essay to downloadEssay
def findEssay(rootUrl,pages,recordsPath):
    """Find the first essay that has not been handled yet and process it.

    Starting at ``rootUrl``, each listing page is fetched and its
    ``.node_list a`` links are scanned in order.  The first link whose href is
    not in ``pages`` is passed to ``downloadEssay``; afterwards the href is
    added to ``pages`` and appended to the records file, and the function
    returns.  If every link on a page is already recorded, the next listing
    page (second-to-last ``.page a`` link) is visited.

    Args:
        rootUrl: URL of the listing page to start from.
        pages: Set of essay hrefs already handled; mutated in place.
        recordsPath: Path of the text file that receives one href per line.

    Returns:
        None.  Any exception is caught and reported on stdout.
    """
    # Local import: urljoin resolves hrefs as URLs, whereas os.path.join uses
    # the OS path separator and discards the host for root-relative hrefs.
    from urllib.parse import urljoin

    visited = set()  # listing pages already fetched, guards against pager loops
    pageUrl = rootUrl
    try:
        # Iterate instead of recursing once per page.
        while pageUrl not in visited:
            visited.add(pageUrl)
            response = requests.get(pageUrl, headers=header, timeout=30)
            response.raise_for_status()
            # Site prefix that downloadEssay prepends to an essay href:
            # the page URL with its last two path components removed.
            dirUrl = os.path.dirname(os.path.dirname(pageUrl))
            # Hand the raw bytes to BeautifulSoup so it detects the charset
            # itself; this also works when the response declares no encoding.
            soup = BeautifulSoup(response.content, 'html.parser')
            for essayTag in soup.select('.node_list a'):
                essayUrl = essayTag.get('href')
                if not essayUrl:
                    continue  # anchor without a link target
                if essayUrl not in pages:
                    downloadEssay(dirUrl, essayUrl, essayTag.text)
                    pages.add(essayUrl)
                    with open(recordsPath, 'a+') as attach:
                        attach.write(str(essayUrl) + '\n')
                        print('写入记录成功')
                    return
            pagerLinks = soup.select('.page a')
            if len(pagerLinks) < 2:
                # No usable "next page" link on this page: nothing left to scan.
                print('没有找到下一页链接')
                return
            nextPageBaseUrl = pagerLinks[-2].get('href')
            pageUrl = urljoin(pageUrl, nextPageBaseUrl)
    except Exception as e:
        print('根链接出现错误'+str(e))
# Download one essay page and mail its paragraphs
def downloadEssay(dirUrl,essayUrl,essayName):
    """Fetch a single essay and pass its paragraphs to ``mailTo``.

    Args:
        dirUrl: Site prefix that ``essayUrl`` is appended to.
        essayUrl: Href of the essay as found on the listing page.
        essayName: Link text of the essay, used as the mail heading.

    On any failure the essay is marked as handled (in the in-memory ``pages``
    set and in the records file) and ``findEssay`` is called again so another
    essay can be tried.  The failure path relies on the module-level names
    ``pages``, ``recordsPath`` and ``rootUrl`` that the ``__main__`` block of
    this script defines.
    """
    try:
        # Same headers as the listing-page request, plus a timeout so a
        # stalled connection cannot hang the run.
        response = requests.get(dirUrl + essayUrl, headers=header, timeout=30)
        response.raise_for_status()
        # Raw bytes let BeautifulSoup detect the page charset itself.
        soup = BeautifulSoup(response.content, 'html.parser')
        paras = soup.select('#dede_content div')
        mailTo(essayName,paras)
    except Exception as e:
        print('下载文章失败 '+str(e))
        # Remember the failed href in memory *before* retrying; otherwise
        # findEssay would pick the very same essay again and recurse forever.
        pages.add(essayUrl)
        with open(recordsPath, 'a+') as attach:
            attach.write(str(essayUrl) + '\n')
        findEssay(rootUrl, pages, recordsPath)
# Build an HTML mail from the essay paragraphs and send it via the 163 SMTP server
def mailTo(essayName,paras):
    """Send one essay as an HTML e-mail.

    Args:
        essayName: Essay title, rendered as the ``<h1>`` heading of the mail.
        paras: Iterable of parsed elements exposing ``getText()``; each one
            becomes a ``<p>`` paragraph in the mail body.

    The sender password is read interactively with ``input``.  SMTP errors are
    caught and reported on stdout; nothing is returned.
    """
    from html import escape  # local import: scraped text must be HTML-escaped

    # Escape the plain text so characters such as "<" or "&" in the essay do
    # not corrupt the generated markup; join once instead of growing a string.
    content = ''.join(
        '<p>' + escape(para.getText(), quote=False) + '</p>' for para in paras
    )
    # Sender address
    sender = '发件人@163.com'
    # Sender password
    pwd = input('Password: ')
    receivers = ['收件人1@qq.com','收件人2@qq.com']  # addresses that receive the mail

    # Mail body, recipients and sender
    mail_message = '<html><body><h1>' + escape(essayName, quote=False) + '</h1>' + \
                   '<article>' + content + '</article>' + \
                   '</body></html>'
    message = MIMEText(mail_message, 'html', 'utf-8')  # HTML mail
    # Address lists in mail headers are comma-separated, not semicolon-separated.
    message['To'] = ', '.join(receivers)
    message['From'] = sender

    # Subject carries the current date as yymmdd
    today = time.strftime('%y%m%d')
    subject = '今日美文'+today
    message['Subject'] = subject

    try:
        # SSL connection to the 163 mail service on port 465; the context
        # manager closes the connection even when login or sending fails.
        with smtplib.SMTP_SSL('smtp.163.com', 465) as smtpObj:
            smtpObj.login(sender, pwd)
            smtpObj.sendmail(sender, receivers, message.as_string())
            print('邮件发送成功!')
    except smtplib.SMTPException as e:
        print('邮件发送失败,失败原因:', e)

if __name__ == '__main__':
    # Listing page the crawl starts from.
    rootUrl = 'http://www.enread.com/essays/index.html'
    # Text file holding one already-handled essay href per line.
    recordsPath = 'C:\\enEssaysToLH.txt'
    if not os.path.exists(recordsPath):
        # First run: create an empty records file.
        with open(recordsPath,'w'):
            print('创建记录文件')
    # Load the recorded hrefs, dropping trailing whitespace/newlines.
    with open(recordsPath,'r') as readFile:
        pages = {record.rstrip() for record in readFile}
    findEssay(rootUrl,pages,recordsPath)

 

补充说明:测试时发送了很多次邮件,只要用英文做邮件主题(subject),就会出现 554 错误;把主题统一换成中文后,同一篇文章就能正常发送出去。这可能涉及编码问题,留待以后研究。

 

posted @ 2018-10-11 19:40  HanJunOvO  阅读(572)  评论(0)  编辑  收藏  举报