01爬虫 爬取糗事百科段子

复制代码
 1 # -*- coding:utf-8 -*-
 2 import urllib2
 3 import re
 4 for page in range(1,5):
 5     url = 'http://www.qiushibaike.com/hot/page/' + str(page)
 6     user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
 7     headers = { 'User-Agent' : user_agent }
 8     try:
 9         request = urllib2.Request(url,headers = headers)
10         response = urllib2.urlopen(request)
11         content = response.read().decode('utf-8')
12         pattern = re.compile('<span>(.*?)</span>',re.S)
13         items = re.findall(pattern,content)
14         for item in items:
15                 print item
16     except urllib2.URLError, e:
17         if hasattr(e,"code"):
18             print e.code
19         if hasattr(e,"reason"):
20             print e.reason
复制代码

 

posted @   miao_a_miao  阅读(178)  评论(0)  编辑  收藏  举报
努力加载评论中...
点击右上角即可分享
微信分享提示