Python爬取网上笑话并定时邮件发送--P.S.想经常收笑话的同学,可以邮件我
效果如下图
Author : Leon
Email : yangli0534@yahoo.com
Description: 1 grab a joke from the Internet 2 email to someone on schedule
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Author      : Leon
Email       : yangli0534@yahoo.com
Description : 1 grab a joke from the Internet
              2 email it to someone on a schedule
              3 the page-scraping part is based on code shared by Li Pengfei
                (thanks; it will be removed on request)

The code below runs on both Python 2.7 and Python 3.
'''
__author__ = 'Leon'

import datetime
import re
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request exposes the same names used here
    import urllib.request as urllib2

import schedule


class randomJoke:
    """Fetch the "random" page of lengxiaohua.com and extract its jokes.

    Attributes:
        url: page that is downloaded on every ``setContent`` call.
        headers: HTTP headers sent with the request (only a User-Agent).
        timeout: network timeout in seconds for the download.
        content: list of regex match tuples from the last ``setContent``
            call; ``joke[0] + joke[1]`` is the joke text and ``joke[2]``
            the author.
    """

    # NOTE(review): the leading space and the trailing ``.*?(.*?)`` are kept
    # exactly as in the original pattern.  The last group is lazy and
    # unanchored, so it always captures an empty string (joke[3] == '').
    _JOKE_PATTERN = re.compile(
        ' <pre.*?js="joke_summary".*?"first_char">(.*?)</span>(.*?)</pre>'
        '.*?class="user_info">.*?<a.*?>(.*?)</a>.*?(.*?)',
        re.S)

    def __init__(self, timeout=10):
        """Set up the target URL, request headers and an empty joke list.

        Args:
            timeout: seconds to wait for the web server before giving up.
        """
        self.url = 'http://lengxiaohua.com/random'
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.timeout = timeout
        self.content = []

    def getSourceCode(self):
        """Download ``self.url`` and return it decoded as UTF-8.

        Returns:
            The page text, or None when the download or decoding fails
            (the reason is printed).
        """
        try:
            request = urllib2.Request(url=self.url, headers=self.headers)
            # A timeout keeps a stalled server from blocking the scheduler.
            response = urllib2.urlopen(request, timeout=self.timeout)
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except (urllib2.URLError, EnvironmentError, UnicodeDecodeError) as e:
            print(u"网络错误... %r" % (getattr(e, 'reason', e),))
            return None

    def setContent(self):
        """Download the page and store every parsed joke in ``self.content``.

        On failure a message is printed and ``self.content`` is left empty;
        the process is NOT terminated, so a scheduled caller keeps running.
        """
        sourceCode = self.getSourceCode()
        if not sourceCode:
            print(u'获取网页内容失败~!')
            self.content = []
            return
        self.content = self._JOKE_PATTERN.findall(sourceCode)

    def getContent(self):
        """Return the list of jokes parsed by the last ``setContent`` call."""
        return self.content

    def printAJoke(self, number):
        """Print author, publish field and text of joke ``number``.

        Raises:
            IndexError: if ``number`` is outside ``self.content``.
        """
        joke = self.content[number]
        print(u"作者:%s" % (joke[2],))
        print(u'发表于:' + joke[3])
        # joke[0] (first character) and joke[1] (the rest) form the full text.
        print(joke[0] + joke[1])

    def getAJoke(self, number):
        """Return the text of joke ``number``.

        Raises:
            IndexError: if ``number`` is outside ``self.content``.
        """
        joke = self.content[number]
        # joke[0] (first character) and joke[1] (the rest) form the full text.
        return joke[0] + joke[1]


def job():
    """Scheduled task: fetch a fresh joke and e-mail it.

    Reads the module-level names ``myRandomJoke``, ``fromaddr``, ``toaddr``
    and ``password``.  Failures are reported on stdout and never propagate,
    so the scheduler loop that calls this keeps running.
    """
    stamp = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")

    myRandomJoke.setContent()
    jokes = myRandomJoke.getContent()
    if not jokes:
        print("failed! no joke could be fetched")
        return
    # The original always used the third joke; fall back to the last one
    # when the page yields fewer than three.
    index = min(2, len(jokes) - 1)

    content = (u"你好,这里是随机笑话!" + u"It's " + stamp + u'\n'
               + myRandomJoke.getAJoke(index))
    print(content)

    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['To'] = toaddr
    msg['Subject'] = "Leon send a joke for u on " + stamp
    # The subtype must be the string 'plain'; the original passed the
    # builtin ``format``, which made every send fail.
    msg.attach(MIMEText(content, 'plain', 'utf-8'))

    server = None
    try:
        server = smtplib.SMTP("smtp.126.com", timeout=30)
        server.login(fromaddr, password)
        server.sendmail(fromaddr, toaddr, msg.as_string())
    except Exception as exc:  # scheduler boundary: report, do not crash
        print("failed! %r" % (exc,))
    else:
        print("send email successfully")
    finally:
        if server is not None:
            try:
                server.quit()
            except (smtplib.SMTPException, EnvironmentError):
                pass  # connection is already gone; nothing left to clean up
# ---- configuration (read by job() as module-level names) ----
toaddr = "somebody@yahoo.com"  # address the joke is sent to

fromaddr = "yourname@126.com"  # sender address, also used as the SMTP login
password = "xxxxxxxxxxx"  # placeholder; do not commit a real password
# Other SMTP endpoints tried during development:
#   smtplib.SMTP('smtp.yahoo.com', 587, None, 30)
#   smtplib.SMTP_SSL('smtp.googlemail.com', 465)
#   smtplib.SMTP_SSL("smtp.qq.com", 465)

myRandomJoke = randomJoke()

if __name__ == '__main__':
    job()  # send one joke immediately
    schedule.every(2).minutes.do(job)  # then run every 2 minutes
    try:
        while True:
            schedule.run_pending()
            time.sleep(10)
    except KeyboardInterrupt:
        # Ctrl-C is the only way out of the loop.  job() opens and closes
        # its own SMTP connection, so there is nothing to shut down here.
        pass
遇到的问题有:1. SMTP 的问题,与具体使用的邮箱相关。
2.汉字编码。使用utf-8编码的内容,在一些客户端中无法显示,修改为gbk编码后解决。
经验:尽量使用开发、测试完善的 package,会更稳定。比如这个定时程序,也可以用 datetime 获取时间然后自行判断来实现,但性能不如使用 schedule 模块。
更新
因为发现上述抓取笑话的网站更新不及时,于是更新为糗事百科上抓取文本,更新后的代码如下:
# -*- coding: utf-8 -*-
"""Scraper for text-only jokes on qiushibaike.com (adapted from code shared
online).  Runs on both Python 2.7 and Python 3."""

import random
import re
import time
import urllib

try:  # Python 2
    import thread
    import urllib2
except ImportError:  # Python 3 equivalents under the same local names
    import _thread as thread
    import urllib.request as urllib2

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input


class qiushibaike:
    """Download random pages of text jokes and hand them out one at a time.

    Attributes:
        headers: HTTP headers sent with every request (only a User-Agent).
        timeout: network timeout in seconds for a page download.
        stories: queue of loaded pages; each element is one page, i.e. a
            list of ``[author, text, number]`` entries.
        pageStories: entries of the page that is currently in use.
        enable: True while loading is allowed (see ``loadPage``).
    """

    _ITEM_PATTERN = re.compile(
        '<div class="author clearfix">.*?href.*?<img src.*?title=.*?'
        '<h2>(.*?)</h2>.*?<div class="content">(.*?)</div>'
        '.*?<i class="number">(.*?)</i>',
        re.S)
    _BR_TAG = re.compile(r'<br\s*/?>', re.I)
    _ANY_TAG = re.compile(r'<[^>]+>')

    def __init__(self, timeout=10):
        """Initialise request headers and the empty story buffers.

        Args:
            timeout: seconds to wait for the web server before giving up.
        """
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.timeout = timeout
        self.stories = []
        self.enable = False
        self.pageStories = []

    def getPage(self):
        """Download one randomly chosen listing page (index 2..35).

        Returns:
            The page decoded as UTF-8, or None when the download or the
            decoding fails (the reason is printed).
        """
        pageIndex = random.randint(2, 35)
        url = 'https://www.qiushibaike.com/text/page/' + str(pageIndex) + '/'
        try:
            request = urllib2.Request(url, headers=self.headers)
            # A timeout keeps a stalled server from blocking the caller.
            response = urllib2.urlopen(request, timeout=self.timeout)
            try:
                return response.read().decode('utf-8')
            finally:
                response.close()
        except (urllib2.URLError, EnvironmentError, UnicodeDecodeError) as e:
            print(u"连接糗事百科失败,错误原因 %r" % (getattr(e, 'reason', e),))
            return None

    def getPageItems(self):
        """Load a page and store its entries in ``self.pageStories``.

        Each entry is ``[author, text, number]`` where ``number`` is the
        content of the first ``<i class="number">`` following the joke.
        ``<br>`` tags in the text become newlines, all other markup is
        removed and surrounding whitespace is stripped.

        ``self.pageStories`` is emptied first, so after a failed download it
        is an empty list and never a stale earlier page.
        """
        self.pageStories = []
        pageCode = self.getPage()
        if not pageCode:
            print(u"页面加载失败....")
            return None
        for item in self._ITEM_PATTERN.findall(pageCode):
            text = self._BR_TAG.sub(u"\n", item[1])
            text = self._ANY_TAG.sub(u"", text).strip()
            self.pageStories.append([item[0], text, item[2]])
        return None

    def loadPage(self):
        """Queue one more page in ``self.stories``.

        Does nothing unless ``self.enable`` is True and fewer than two pages
        are queued.  A page without any entries is not queued.
        """
        if not self.enable or len(self.stories) >= 2:
            return
        self.getPageItems()
        if self.pageStories:
            self.stories.append(self.pageStories)

    def getOneStory(self):
        """Interactive step: wait for Enter and print one random story.

        Entering ``Q`` clears ``self.enable`` and returns without printing.
        If no story is available at all, ``self.enable`` is cleared too.
        """
        current = self.pageStories
        self.loadPage()
        # Prefer the freshly loaded page; fall back to the page the caller
        # selected when the load failed.
        candidates = self.pageStories or current
        if not candidates:
            self.enable = False
            return
        story = random.choice(candidates)
        print(u'回车看下一个,Q退出')
        answer = _read_line()
        if answer == "Q":
            self.enable = False
            return
        print(u'%s' % story[1])

    def start(self):
        """Run the interactive reader until the user enters ``Q``.

        The loop also ends when no page can be loaded, where the original
        code would spin forever on an empty queue.
        """
        print(u"正在读取糗事百科,按回车查看新段子,Q退出")
        self.enable = True
        self.loadPage()
        while self.enable:
            if not self.stories:
                self.loadPage()
                if not self.stories:
                    print(u"页面加载失败....")
                    self.enable = False
                    break
            # Take the oldest queued page out of the queue and show from it.
            self.pageStories = self.stories.pop(0)
            self.getOneStory()

    def getAJoke(self, max_retries=3):
        """Return the text of one random joke, for non-interactive use.

        Args:
            max_retries: how many page loads to attempt before giving up.

        Returns:
            The joke text, or an empty string when no page could be loaded
            or parsed within ``max_retries`` attempts.
        """
        self.enable = True
        for _ in range(max_retries):
            self.loadPage()
            if self.stories:
                # Queued pages are never empty (see loadPage).
                self.pageStories = self.stories.pop(0)
                return random.choice(self.pageStories)[1]
        return u''


# Example usage:
#   spider = qiushibaike()
#   spider.start()             # interactive reader
#   print(spider.getAJoke())   # single joke
使用方法如下:
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Author      : Leon
Email       : yangli0534@yahoo.com
Description : 1 grab a joke from the Internet
              2 email it to someone on a schedule
              3 based in part on code shared online
                (thanks; it will be removed on request)

The code below runs on both Python 2.7 and Python 3.
'''
__author__ = 'Leon'

import datetime
import re
import smtplib
import time
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import schedule

from qiushibaike import qiushibaike


def job():
    """Scheduled task: fetch one joke from qiushibaike and e-mail it.

    Reads the module-level names ``myQiuBai``, ``fromaddr``, ``password``
    and the address lists ``toaddr``, ``ccaddr`` and ``bccaddr``.  Failures
    are reported on stdout and never propagate, so the scheduler loop that
    calls this keeps running.
    """
    stamp = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")

    try:
        joke = myQiuBai.getAJoke()
    except (IndexError, ValueError) as exc:
        # Raised by the scraper when no page could be loaded or parsed.
        print("failed! %r" % (exc,))
        return
    if not joke:
        print("failed! no joke could be fetched")
        return

    content = u'笑口常开!' + u"It's " + stamp + u'\n' + joke
    print(content)

    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['To'] = ','.join(toaddr)
    msg['Cc'] = ','.join(ccaddr)
    # Deliberately no 'Bcc' header: blind recipients must not show up in the
    # message.  They are reached through the envelope recipient list below.
    msg['Subject'] = u"Leon send a joke to you on " + stamp

    # Some mail clients could not display UTF-8 bodies, so the body is sent
    # as GBK.  Characters GBK cannot represent are replaced, and the charset
    # is declared so the bytes are not mislabelled as us-ascii.
    body = content.encode('gbk', 'replace').decode('gbk')
    msg.attach(MIMEText(body, 'plain', 'gbk'))

    recipients = toaddr + ccaddr + bccaddr
    server = None
    try:
        server = smtplib.SMTP("smtp.139.com", 25, timeout=30)
        server.login(fromaddr, password)
        server.sendmail(fromaddr, recipients, msg.as_string())
    except Exception as exc:  # scheduler boundary: report, do not crash
        print("failed! %r" % (exc,))
    else:
        print("send email successfully")
    finally:
        if server is not None:
            try:
                server.quit()
            except (smtplib.SMTPException, EnvironmentError):
                pass  # connection is already gone; nothing left to clean up
# ---- configuration (read by job() as module-level names) ----
toaddr = ['1184802734@qq.com', '18811007706@139.com']  # recipients
ccaddr = ['502327976@qq.com']  # carbon copy
bccaddr = ['15210579762@139.com']  # blind carbon copy
fromaddr = 'china__mobile@139.com'  # sender address, also the SMTP login
password = "xxxxxxx"  # placeholder; do not commit a real password
# Other SMTP endpoints tried during development:
#   smtplib.SMTP('smtp.yahoo.com', 587, None, 30)
#   smtplib.SMTP_SSL('smtp.googlemail.com', 465)
#   smtplib.SMTP_SSL("smtp.qq.com", 465)

myQiuBai = qiushibaike()

if __name__ == '__main__':
    job()  # send one joke immediately
    schedule.every(2).minutes.do(job)  # then send an e-mail every 2 minutes
    try:
        while True:
            schedule.run_pending()
            time.sleep(10)
    except KeyboardInterrupt:
        # Ctrl-C is the only way out of the loop.  job() opens and closes
        # its own SMTP connection, so there is nothing to shut down here.
        pass
OPTIMISM, PASSION & HARDWORK