16、Scrape the "title", "excerpt", and "link" of every article by Zhihu writer 张佳玮 (zhang-jia-wei) and save them to local files
# Scrape the "title", "excerpt", and "link" of Zhihu writer 张佳玮's articles and save them to local files
# URL https://www.zhihu.com/people/zhang-jia-wei/posts

import requests
import time
import openpyxl
import csv

headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'cookie': '__DAYU_PP=iJb63REJnnjIMmBvzNMV65ab0a6aae4f; q_c1=d75d908a13c44b95bd75f27578ad2088|1521641428000|1521641428000; _zap=bec28151-809b-4936-971a-d18f5255add0; tgw_l7_route=f2979fdd289e2265b2f12e4f4a478330; _xsrf=wQDRNSLBlRv3aimMzhUNyqg1BpLUnWAr; d_c0="ABDmoGi2RQ-PTp5SSFyQvDgz_QEjeQfCFgk=|1555156366"; capsion_ticket="2|1:0|10:1555156366|14:capsion_ticket|44:MTBhN2FkYjYyNWEyNDFjYWJiYTk2N2E1YTA1NDE4OTk=|084e15694c6993269b3aab564e9ea5d7983782f4b37dbc4537e000aa7b081901"',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

# First request: only used to read the total number of articles from the paging info
res = requests.get('https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20&sort_by=created', headers=headers)

totals = res.json()['paging']['totals']
num = 0

# Excel header row
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = '张佳玮的文章'
sheet['A1'] = '编号'
sheet['B1'] = '标题'
sheet['C1'] = '创建时间'
sheet['D1'] = '链接'
sheet['E1'] = '摘要'

# CSV header row; keep the file open so the loop below can keep writing rows to it
with open('zhihu.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['编号', '标题', '创建时间', '链接', '摘要'])

    # Page through the article list, 20 items per request
    for offset in range(0, totals, 20):
        res = requests.get('https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles?include=data%5B*%5D.comment_count%2Csuggest_edit%2Cis_normal%2Cthumbnail_extra_info%2Cthumbnail%2Ccan_comment%2Ccomment_permission%2Cadmin_closed_comment%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%2Creview_info%2Cis_labeled%2Clabel_info%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20&sort_by=created'.format(offset), headers=headers)
        items = res.json()['data']
        for item in items:
            num = num + 1
            # The API returns a Unix timestamp; convert it to a readable date string
            timeArray = time.localtime(item['created'])
            created = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
            print(num, end='\t')
            print(created, end='\t')
            print(item['title'])

            # Excel data row
            sheet.append([num, item['title'], created, item['url'], item['excerpt']])

            # CSV data row
            writer.writerow([num, item['title'], created, item['url'], item['excerpt']])

# Save the Excel workbook
wb.save('zhihu.xlsx')
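The request URL above is long because the include fields are percent-encoded by hand. As a minimal sketch (not part of the original script), the same request can be built by passing a params dict and letting requests do the encoding; whether the Zhihu endpoint accepts every field encoded this way is an assumption, so the hand-built URL above remains the reference version.

# Sketch only: build the same query with a params dict; the field list below is the
# decoded form of the include parameter used in the script above.
import requests

api = 'https://www.zhihu.com/api/v4/members/zhang-jia-wei/articles'
params = {
    'include': ('data[*].comment_count,suggest_edit,is_normal,thumbnail_extra_info,'
                'thumbnail,can_comment,comment_permission,admin_closed_comment,content,'
                'voteup_count,created,updated,upvoted_followees,voting,review_info,'
                'is_labeled,label_info;data[*].author.badge[?(type=best_answerer)].topics'),
    'offset': 0,
    'limit': 20,
    'sort_by': 'created',
}
headers = {'user-agent': 'Mozilla/5.0'}  # in practice, reuse the full headers (including the cookie) from the script above
res = requests.get(api, params=params, headers=headers)
print(res.json()['paging']['totals'])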
The screenshot below shows the result of one run in which only a single page was crawled.
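To inspect the same result without a screenshot, the two output files can be read back directly. This is a small sketch using the file names from the script above (zhihu.csv and zhihu.xlsx); it only prints the first few rows of each file.

# Sketch only: print the header plus the first two data rows of each output file
import csv
import openpyxl

with open('zhihu.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)
        if i >= 2:
            break

wb = openpyxl.load_workbook('zhihu.xlsx')
sheet = wb.active
for row in sheet.iter_rows(min_row=1, max_row=3, values_only=True):
    print(row)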