初探爬虫小练习,爬取知乎圆桌内容标题

先上代码:

 1 # -*- coding: utf-8 -*-
 2 import urllib
 3 import urllib2
 4 import re
 5 import json
 6 
 7 
 8 class Spider:
 9 
10     def __init__(self):
11         # 记录爬取每页的开始
12         self.offset = 0
13         self.title = []
14 
15     def spider_page(self):
16         url = 'https://www.zhihu.com/r/roundtables?offset={offset}'
17         headers = {
18                       "Host": "www.zhihu.com",
19                       "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0",
20                       "Accept": "*/*",
21                       "Accept-Language": "zh-CN;en-US,en;q=0.5",
22                       "X-Requested-With": "XMLHttpRequest",
23                       "Referer": "https://www.zhihu.com/roundtable",
24                       "Connection": "keep-alive",
25                       "Accept-Charset": "utf-8"
26         }
27 
28         url = url.format(offset=str(self.offset))
29         req = urllib2.Request(url, headers=headers)
30         html = urllib2.urlopen(req).read().decode('utf-8').encode('utf-8')
31         html = json.loads(html)
32 
33         before = self.offset
34         try:
35             next_offset = re.findall(r'offset=(.*)', html['paging']['next'])
36             self.offset = int(next_offset[0])
37         except:
38             self.offset = 0
39 
40         titles = html['htmls']
41         temp = ""
42         for title in titles:
43             temp += title
44 
45         return {"before": before , "temp": temp}
46 
47     def find_title(self, titles):
48         titles = titles[:]
49         title = re.findall(r'<span.*?class="name">(.*?)</span></span></a>', titles)
50 
51         for item in title:
52             self.title.append(item)
53 
54 
if __name__ == '__main__':
    # Crawl six consecutive listing pages, harvesting the titles of each.
    spider = Spider()
    for _ in range(6):
        page = spider.spider_page()
        spider.find_title(page["temp"])

    # Print every collected title on its own line.
    for heading in spider.title:
        print(heading)

总结:

    总体来说,这段代码还是比较好理解的,有以下几点需要注意:

  1.     输出乱码:如果返回报文首部中有 Content-Encoding: "gzip",说明响应体经过了压缩。这时要检查请求首部 Accept-Encoding 中声明的压缩格式自己是否能够解压,或者直接告知服务器不要压缩传输(例如不发送 Accept-Encoding,或将其设为 identity)。
  2.     再就是要对返回的 JSON 字符串调用 json.loads,将其解码为 Python 对象(字典、列表等),否则数据只是以字符串的形式存在。
posted @ 2016-09-11 10:21  魔术师的礼帽  阅读(292)  评论(0编辑  收藏  举报