爬取知乎热门保存为 CSV
requests_(sava_csv_list)zhihu_com_collection_hot.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:zhihu_com_collection_hot.py
# Author:LGSP_Harold
"""Fetch https://www.zhihu.com/collection/hot and save its cards as CSV.

For every collection card found on the page, one row with the columns
``question``, ``link``, ``author`` and ``answer`` is written to
``./files/zhihu_com_collection_hot/zhihu_com_collection_hot.csv``.
Fields are separated by a backtick (`` ` ``) instead of the default comma.
"""
import csv
import os

import requests
from lxml import etree

file_dir = './files/zhihu_com_collection_hot'

url = 'https://www.zhihu.com/collection/hot'

# NOTE(review): 'authority', 'method', 'path' and 'scheme' look like HTTP/2
# pseudo-headers; requests sends them as ordinary header fields. They are
# kept unchanged here -- confirm whether the site needs them at all.
# NOTE(review): 'accept-encoding' advertises 'br'; presumably requests can
# only decode that when a brotli package is installed -- verify.
headers = {
    'authority': 'www.zhihu.com',
    'method': 'GET',
    'path': '/collection/hot',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

# Seconds to wait for the server; requests waits indefinitely when no
# timeout is given.
_REQUEST_TIMEOUT = 10

_CSV_COLUMNS = ['question', 'link', 'author', 'answer']

_CARD_XPATH = '//div[@class="CollectionListCard CollectionHotListPage-collectionCard"]'
_AUTHOR_XPATH = './/span[@class="CollectionListCard-creatorName"]/text()'
_QUESTION_XPATH = './/a[@class="CollectionListCard-contentTitle"]/text()'
_LINK_XPATH = './/a[@class="CollectionListCard-contentTitle"]/@href'
_ANSWER_XPATH = './/div[@class="CollectionListCard-contentExcerpt"]/text()'


def _first(matches, default=''):
    """Return the first XPath match, or *default* when nothing matched."""
    return matches[0] if matches else default


def _card_to_row(card):
    """Build one ``[question, link, author, answer]`` row from a card node.

    A field whose element is missing from the card becomes an empty string,
    so a single incomplete card does not abort the whole export.
    """
    return [
        _first(card.xpath(_QUESTION_XPATH)),
        _first(card.xpath(_LINK_XPATH)),
        _first(card.xpath(_AUTHOR_XPATH)),
        _first(card.xpath(_ANSWER_XPATH)),
    ]


def _parse_rows(html_text):
    """Parse the page HTML and return one row per collection card."""
    html_obj = etree.HTML(html_text)
    return [_card_to_row(card) for card in html_obj.xpath(_CARD_XPATH)]


def _save_rows(rows, file_path):
    """Write the header line followed by *rows* to *file_path*."""
    # newline='' is required by the csv module; without it extra blank
    # lines appear between rows on platforms that translate '\n' to '\r\n'.
    with open(file_path, mode='w', encoding='utf-8', newline='') as file:
        # The csv default separator is ','; delimiter overrides it.
        writer = csv.writer(file, delimiter='`')
        writer.writerow(_CSV_COLUMNS)
        writer.writerows(rows)


def main():
    """Download the page, extract the cards and write the CSV file.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(file_dir, exist_ok=True)

    response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()

    rows = _parse_rows(response.text)
    _save_rows(rows, os.path.join(file_dir, 'zhihu_com_collection_hot.csv'))


if __name__ == '__main__':
    main()
略懂,略懂....