我的新博客

Python爬取网上笑话并定时邮件发送--P.S.想经常收笑话的同学,可以邮件我

我的博客

效果如下图

1

2

Author : Leon
Email  : yangli0534@yahoo.com
Description: 1. grab a joke from the Internet; 2. email it to someone on a schedule

 

  1 # -*- coding:gb2312 -*-
  2 
  3 #!/usr/bin/python
  4 __author__ = 'Leon'
  5 
  6 '''
  7     Author : Leon
  8     Email  : yangli0534@yahoo.com
  9     Description: 1 grab a joke from the Internet 
 10                  2 email to someone on schedule 
 11                  3 参考了网友李鹏飞关于抓取网页的部分 ,感谢。侵权删
 12 '''
 13 import smtplib
 14 from email.MIMEMultipart import MIMEMultipart
 15 from email.MIMEText import MIMEText
 16 import urllib2
 17 import re
 18 import schedule
 19 import time
 20 import datetime
 21 
 22 class randomJoke:
 23     # Scrapes jokes from lengxiaohua.com/random and keeps the regex matches in self.content.
 24     # Initializer: stores the target URL, the request headers and an empty result list.
 25     def __init__(self):
 26         self.url = 'http://lengxiaohua.com/random'
 27         self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
 28         # Request headers: only the User-Agent is set.
 29         self.headers = { 'User-Agent' : self.user_agent }
 30         # Scraped jokes: the list of tuples produced by re.findall in setContent().
 31         self.content = []
 32
 33     # Download self.url and return the page text decoded from UTF-8; returns None on urllib2.URLError.
 34     def getSourceCode(self):
 35         try:
 36             request = urllib2.Request(url = self.url, headers=self.headers)
 37             response = urllib2.urlopen(request)
 38             sourceCode = response.read().decode('utf-8')
 39             return sourceCode
 40         except urllib2.URLError, e:
 41             if hasattr(e,"reason"):  # without a `reason` the method falls through and still returns None
 42                 print u"网络错误...",e.reason
 43                 return None
 44
 45     # Fetch the page and store every regex match in self.content.
 46     def setContent(self):
 47         sourceCode = self.getSourceCode()
 48         if not sourceCode:
 49             print('获取网页内容失败~!')
 50             quit()  # NOTE(review): quit() raises SystemExit, so a failed fetch stops the caller unless it catches that
 51         pattern = re.compile(' <pre.*?js="joke_summary".*?"first_char">(.*?)</span>(.*?)</pre>.*?class="user_info">.*?<a.*?>(.*?)</a>.*?(.*?)',re.S)
 52         items = re.findall(pattern,sourceCode)
 53         self.content = items
 54         # Each match is a 4-tuple: first_char span text, rest of the <pre>, first <a> text after class="user_info", and a trailing lazy group that always matches ''.
 55
 56     # Return the list of scraped matches.
 57     def getContent(self):
 58         return self.content
 59
 60     # Print the joke at index `number`: author line, "posted at" line, then the text.
 61     def printAJoke(self,number):
 62         joke = self.content[number]
 63         print u"作者:%s" %(joke[2])
 64         print u'发表于:'+ joke[3]
 65         # joke[0] and joke[1] together make up the complete text
 66         print joke[0]+joke[1]
 67
 68     def getAJoke(self,number):  # Return the text of the joke at index `number`; IndexError if fewer matches were scraped
 69         joke = self.content[number]
 70         content = ""
 71         #content = content+ u"作者:" %(joke[2])
 72         #print u'发表于:'+ joke[3]
 73         # joke[0] and joke[1] together make up the complete text
 74         content =  joke[0]+joke[1]
 75         return content
 76 ## run the task on schedule
 77 def job():
 78     global myRandomJoke
 79     #global server
 80     global toaddr
 81     global fromaddr
 82     global password
 83     t = datetime.datetime.now()
 84     content = ""
 85     content = content+ u"你好,这里是随机笑话!"
 86     content = content+ "It's "
 87     content = content+ t.strftime("%A, %d. %B %Y %I:%M%p")+'\n'
 88     myRandomJoke.setContent()
 89     #myRandomJoke.printAJoke(2)
 90     content = content+myRandomJoke.getAJoke(2)
 91     print content
 92     msg = MIMEMultipart()
 93     msg['From'] = fromaddr
 94     msg['To'] = toaddr
 95     msg['Subject'] = "Leon send a joke for u on"+t.strftime("%A, %d. %B %Y %I:%M%p")
 96     
 97     try:
 98         body = "YOUR MESSAGE HERE"
 99         #msg.attach(MIMEText(content, 'plain'))
100         #msg.attach(MIMEText(content, 'plain'))
101         msg.attach(MIMEText(content,format,'utf-8'))
102         text = msg.as_string()
103         #server = smtplib.SMTP_SSL("smtp.126.com", 25)# connect to email server
104         server = smtplib.SMTP("smtp.126.com")# connect to email server
105         server.login(fromaddr,password)
106         server.sendmail(fromaddr, toaddr, text)
107         server.quit()
108         print "send email successfully"
109     except:
110         print "failed!"
111 
112 toaddr = "somebody@yahoo.com" # email address to send
113 
114 fromaddr = "yourname@126.com"
115 password = "xxxxxxxxxxx"#
116 #server = smtplib.SMTP('smtp.yahoo.com', 587, None, 30)
117 #server = smtplib.SMTP_SSL('smtp.googlemail.com', 465)
118 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
119 
120 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
121 #server.login(fromaddr,password)
122 myRandomJoke = randomJoke()
123 job()
124 schedule.every(2).minutes.do(job)#run every 2 minutes
125 #notQuit = True
126 #print u"你好,这里是随机笑话!"
127 while True:
128     schedule.run_pending()
129     time.sleep(10)
130     
131 
132 
133 server.quit()
134 quit()

 遇到的问题有:1. SMTP的问题,和具体使用的邮箱相关。

2.汉字编码。使用utf-8编码的内容,在一些客户端中无法显示,修改为gbk编码后解决。

经验:尽量使用开发测试完善的package,会更稳定。比如这个定时程序,也可以用datetime获取时间然后自行判断来实现,但性能不如直接使用schedule模块。

 

更新

因为发现上述抓取笑话的网站更新不及时,于是更新为糗事百科上抓取文本,更新后的代码如下:

  1                      # -*- coding:utf-8 -*-
  2 
  3 import urllib
  4 import urllib2
  5 import re
  6 import thread
  7 import time
  8 import random
  9 #糗事百科爬虫类,在网友代码基础上修改
 10 class qiushibaike:
 11     # Qiushibaike text-joke scraper: downloads a random listing page and hands out stories from it.
 12     # Initializer: sets up the request headers and the story buffers.
 13     def __init__(self):
 14         #self.pageIndex = 30
 15         # Alternative User-Agent the author found in Firefox's HttpFox headers view:
 16         #self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0)'
 17         self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
 18         # Request headers: only the User-Agent is set.
 19         self.headers = { 'User-Agent' : self.user_agent }
 20         # Buffer of loaded pages; each element is the list of stories from one page.
 21         self.stories = []
 22         # Run flag: loadPage() does nothing while this is False.
 23         self.enable = False
 24         self.pageStories = []  # stories of the page currently in use
 25
 26     # Download one randomly chosen listing page (index 2..35 inclusive) and return it decoded from UTF-8.
 27     def getPage(self):
 28         try:
 29             pageIndex = random.randint(2,35)
 30             url = 'https://www.qiushibaike.com/text/page/' + str(pageIndex)+'/'
 31             # Build the request object
 32             request = urllib2.Request(url,headers = self.headers)
 33             # Fetch the page with urlopen
 34             response = urllib2.urlopen(request)
 35             # Decode the response body as UTF-8
 36             pageCode = response.read().decode('utf-8')
 37             return pageCode
 38
 39         except urllib2.URLError, e:
 40             if hasattr(e,"reason"):  # without a `reason` the method falls through and still returns None
 41                 print u"连接糗事百科失败,错误原因",e.reason
 42                 return None
 43
 44
 45     # Parse a freshly downloaded page into self.pageStories. Returns None in every case; results are stored on self.
 46     def getPageItems(self):
 47         pageCode = self.getPage()
 48         if not pageCode:
 49             print "页面加载失败...."
 50             return None  # self.pageStories keeps its previous value in this case
 51         pattern = re.compile('<div class="author clearfix">.*?href.*?<img src.*?title=.*?<h2>(.*?)</h2>.*?<div class="content">(.*?)</div>.*?<i class="number">(.*?)</i>',re.S)
 52         items = re.findall(pattern,pageCode)
 53         # Each match is a 3-tuple: <h2> text, body of the content <div>, and the <i class="number"> text.
 54         #print "*****items"
 55         # Holds the stories of this page
 56         self.pageStories = []
 57         # Walk over the regex matches
 58         i = 0  # unused
 59         for item in items:
 60             # An image filter used to guard this body; it is disabled, so every match is kept.
 61             #if not haveImg:
 62                 replaceBR = re.compile('<.?span>')
 63                 # Replace <span> / </span> tags with a newline (despite the name, <br/> is not touched)
 64                 text = re.sub(replaceBR,"\n",item[1])
 65                 # Stored as [item[0], cleaned text, item[2]]; presumably poster, content and vote count - TODO confirm against the page markup
 66                 self.pageStories.append([item[0],text,item[2]])
 67
 68                 # (debug prints left by the author follow; no strip() is applied anywhere in this method)
 69         #print item[0] +"-------0t"
 70         #print item[1] +"-------0T"
 71         #print text + "******TEXT"
 72         #print item[2] +"-------0t"
 73
 74         #return pageStories
 75
 76     # Load one more page into self.stories when needed.
 77     def loadPage(self):
 78         # Only acts when enabled and fewer than 2 pages are buffered
 79         if self.enable == True:
 80             if len(self.stories) < 2:
 81                 # Fetch and parse a new page (this overwrites self.pageStories on success)
 82                 #pageStories = self.getPageItems()
 83                 self.getPageItems()
 84                 # Append this page's stories to the page buffer
 85                 if self.pageStories:
 86                     self.stories.append(self.pageStories)
 87                     # Page-index bookkeeping is disabled; getPage() picks pages at random.
 88                     #self.pageIndex += 1
 89
 90                 #print len(self.stories)
 91     # Interactive step: wait for Enter, then print one random story; typing Q clears self.enable instead.
 92     def getOneStory(self):
 93         # Legacy per-story loop, kept commented out by the author:
 94
 95         # for story in pageStories:
 96         #     # wait for user input
 97         #     input = raw_input()
 98         #     # on every Enter, check whether a new page must be loaded
 99         #     self.loadPage()
100         #     # typing Q ends the program
101         #     if input == "Q":
102         #         self.enable = False
103         #         return
104         #     #print "$$$"
105         #     print len(story)
106         #     # the page no longer shows a publication time
107         #     #print u"第%d页\t发布人:%s\t发布时间:%s\t赞:%s\n%s" %(page,story[0],story[2],story[3],story[1])
108         #     print u"第%d页\t发布人:%s\t赞:%s\n%s" %(page,story[0],story[2],story[1])
109         self.loadPage()  # may replace self.pageStories with a newly fetched page
110         len_page = len(self.pageStories)
111         story = self.pageStories[random.randint(0, len_page-1)]  # NOTE(review): randint(0, -1) raises ValueError if the page is empty
112         print u'回车看下一个,Q退出'
113         input = raw_input()
114         # (the story was already picked above; the input only decides
115         #  whether it gets printed)
116         # Typing Q clears the run flag and returns without printing
117         if input == "Q":
118             self.enable = False
119             return
120         #print len(story)
121         print u'%s' %story[1]
122     # Entry point for interactive use: keeps showing stories until the user types Q.
123     def start(self):
124         print u"正在读取糗事百科,按回车查看新段子,Q退出"
125         # Set the run flag so loadPage() and the loop below are active
126         self.enable = True
127
128         # Load a first page
129         self.loadPage()
130
131         # NOTE(review): if the buffer is empty the loop below spins without loading,
132         # because loadPage() is only reached through getOneStory() - confirm intended
133         while self.enable:
134             if len(self.stories)>0:
135                 #print len(self.stories)
136                 #print "-------stories"
137                 # Take the first buffered page
138                 self.pageStories = self.stories[0]
139                 # (page counter disabled)
140                 #nowPage += 1
141                 # Remove that page from the buffer now that it has been taken
142                 del self.stories[0]
143                 #print "---------------------------------"
144                 #print len(pageStories)
145                 #print nowPage
146                 # Show one story from it
147                 self.getOneStory()
148     def getAJoke(self):  # Return the text (story[1]) of one random story from the first buffered page
149         self.enable = True
150         self.loadPage()
151         self.pageStories = self.stories[0]  # NOTE(review): IndexError when the buffer is empty, e.g. after a failed download
152         del self.stories[0]
153         len_page = len(self.pageStories)
154         story = self.pageStories[random.randint(0, len_page-1)]
155         self.enable = True
156         return story[1]
157
158         #print u'回车看下一个,Q退出'
159         #input = raw_input()
160         #     # on every Enter, check whether a new page must be loaded
161         #     self.loadPage()
162         #     # typing Q ends the program
163         #if input == "Q":
164         #   self.enable = False
165         #    return
166         #print len(story)
167         #print u'%s' %story[1]
168 
169 #spider = QSBK()
170 #Aspider.start()
171 #print spider.getAJoke()

使用方法如下:

 1 #-*- coding:utf-8 -*-
 2 
 3 
 4 #!/usr/bin/python
 5 __author__ = 'Leon'
 6 
 7 '''
 8     Author : Leon
 9     Email  : yangli0534@yahoo.com
10     Description: 1 grab a joke from the Internet 
11                  2 email to someone on schedule 
12                  3 参考了部分网友的代码 ,感谢。侵权删
13 '''
14 import smtplib
15 from email.MIMEMultipart import MIMEMultipart
16 from email.MIMEText import MIMEText
17 import re
18 import schedule
19 import time
20 import datetime
21 from qiushibaike import qiushibaike
22 
23 def job():
24     #global myRandomJoke
25     global myQiuBai
26     #global server
27     global toaddr
28     global fromaddr
29     global password
30     t = datetime.datetime.now()
31     content = ''
32     content = content+ u'笑口常开!'
33     content = content+ u"It's "
34     content = content+ t.strftime("%A, %d. %B %Y %I:%M%p")+'\n'
35     #myRandomJoke.setContent()
36     #myRandomJoke.printAJoke(2)
37     #content = content + myRandomJoke.getAJoke(2)
38     content = content + myQiuBai.getAJoke()
39     print content
40     #content = u'''你好,这是一封测试邮件,来自yangli0534@yahooc.com'''
41     #content = content +t.strftime("%A, %d. %B %Y %I:%M%p")
42     msg = MIMEMultipart()
43     msg['From'] =fromaddr
44     msg['To'] =','.join(toaddr)
45     msg['Cc'] = ','.join(ccaddr)
46     msg['Bcc'] = ','.join(ccaddr)
47     msg['Subject'] = u"Leon send a joke to you on"+t.strftime("%A, %d. %B %Y %I:%M%p")
48     
49     try:
50         #body = "YOUR MESSAGE HERE"
51         body = content
52         #msg.attach(MIMEText(content, 'plain'))
53         #msg.attach(MIMEText(content, 'plain'))
54         #msg.attach(MIMEText(content,format,'utf-8'))
55         msg.attach(MIMEText(body.encode('gbk')))
56         text = msg.as_string()
57         #server = smtplib.SMTP_SSL("smtp.126.com", 25)# connect to email server
58         server = smtplib.SMTP("smtp.139.com",25)# connect to email server
59         server.login(fromaddr,password)
60         #server.sendmail(fromaddr, toaddr, text)
61         server.sendmail(fromaddr, toaddr + ccaddr, text)
62         #server.sendmail(fromaddr, fromaddr,text)
63         server.quit()
64         print "send email successfully"
65     except:
66         print "failed!"
67 
68 toaddr = ['1184802734@qq.com','18811007706@139.com'] # email address to send
69 ccaddr = ['502327976@qq.com'] # carbon copy
70 bccaddr = ['15210579762@139.com']#blind carbon copy
71 #toaddr2 = '502327976@qq.com'
72 fromaddr = 'china__mobile@139.com'#send address
73 password = "xxxxxxx"#password
74 #server = smtplib.SMTP('smtp.yahoo.com', 587, None, 30)
75 #server = smtplib.SMTP_SSL('smtp.googlemail.com', 465)
76 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
77 
78 #server = smtplib.SMTP_SSL("smtp.qq.com", 465)# connect to email server
79 #server.login(fromaddr,password)
80 #myRandomJoke = randomJoke()
81 myQiuBai = qiushibaike() # 
82 job()
83 schedule.every(2).minutes.do(job)# send a email every 2 minutes
84 #notQuit = True
85 #print u"你好,这里是随机笑话!"
86 while True:
87     schedule.run_pending()# 
88     time.sleep(10)
89     
90 
91 
92 server.quit()
93 quit()

 

posted @ 2017-07-16 17:46  Leon#0534  阅读(725)  评论(0编辑  收藏  举报

我的新博客

专注天线学习,欢迎交流 yangli0534@gmail.com - 创建于 2010年

我永远是茫茫EE领域的一名小学生。