Python爬虫(六)

源码:

 1 import requests
 2 import re
 3 from my_mysql import MysqlConnect
 4 
 5 
 6 # 获取问答信息
 7 def get_contents(page,headers):
 8     url = 'https://www.zhihu.com/api/v4/members/chen-lu-ya-26/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20&sort_by=created'.format(page)
 9     req = requests.get(url,headers=headers)
10     html_json_dict = req.json()
11     # print(html_json_dict)
12     data_list = html_json_dict['data']
13     contents = []
14     for item in data_list:
15         question = item['question']['title']
16         excerpt = item['excerpt']
17         if '<' in excerpt:
18             pat = r'(.*?)<.*>(.*)'
19             res = re.search(pat, excerpt)
20             front = res.group(1)
21             back = res.group(2)
22             pat = r'<.*?>(.*?)<.*?>'
23             res = re.findall(pat, excerpt)
24             middle = ' '.join(res)
25             excerpt = front + middle + back
26         contents.append((question,excerpt))
27     return contents
28 
if __name__ == '__main__':
    # Script entry point: crawl the answers API batch by batch and store
    # every (question, excerpt) pair through the MysqlConnect helper.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    mc = MysqlConnect('127.0.0.1','root','123456','homework')
    # The statement never changes, so it is defined once up front.
    # NOTE(review): the two %s placeholders are presumably bound to the
    # tuple by exec_data -- confirm against my_mysql.
    insert_sql = 'insert into zhihu values(null,%s,%s)'
    # Offsets 0, 20, ..., 140: eight requests in steps of 20.
    for offset in (20 * batch for batch in range(8)):
        for row in get_contents(offset, headers):
            mc.exec_data(insert_sql, row)
            print(row)

 

posted @ 2018-08-19 15:10  _积木城池  阅读(219)  评论(0)  编辑  收藏  举报