Crawling article data from a Zhihu column (Python Chinese community)
Target column: https://zhuanlan.zhihu.com/zimei
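The crawler below does not parse the column page itself; it pages through Zhihu's column article API and stores each article's summary fields. As a quick sanity check before running the full script, a minimal sketch like the following (assuming the endpoint and headers behave as they do in the script below) fetches a single page and shows what comes back:

import requests

# Same endpoint and headers as the full script below; limit/offset control paging.
url = 'https://www.zhihu.com/api/v4/columns/zimei/articles?limit=10&offset=0'
headers = {
    'referer': 'https://zhuanlan.zhihu.com/zimei',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

resp = requests.get(url, headers=headers)
if resp.status_code == 200:
    payload = resp.json()
    # The full script reads a 'data' list of articles from this response.
    print(list(payload.keys()))
    print(len(payload.get('data', [])))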
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
from pymongo import MongoClient
import json
import time

base_url = 'https://www.zhihu.com/api/v4/columns/zimei/articles?limit=10&'
headers = {
    'authority': 'www.zhihu.com',
    'referer': 'https://zhuanlan.zhihu.com/zimei',
    'origin': 'https://zhuanlan.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

client = MongoClient()
db = client['zhihu']
collection = db['zhihu']
max_page = 100


def get_page(page):
    """Fetch one page of the column's article list API and return the parsed JSON."""
    params = {
        'offset': page * 10
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json_1, page):
    """Extract the fields we care about from each article in the API response."""
    if json_1:
        items = json_1.get('data')
        for item in items:
            if page == 1:
                # Skip the items of the first requested page, as in the original script.
                continue
            zhihu = {}
            zhihu['name'] = item.get('author').get('name')
            zhihu['title'] = item.get('title')
            zhihu['text'] = pq(item.get('excerpt')).text()   # strip HTML tags from the excerpt
            zhihu['comments'] = item.get('comment_count')
            zhihu['reposts'] = item.get('voteup_count')      # upvote count
            zhihu['date'] = time.strftime('%Y-%m-%d %H:%M', time.localtime(item.get('updated')))
            yield zhihu


def write_to_file(content):
    """Append one record per line to a local JSON-lines file."""
    with open('zhihu.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def save_to_mongo(result):
    """Insert one record into the MongoDB collection."""
    if collection.insert_one(result):
        print('Saved to Mongo')


if __name__ == '__main__':
    for page in range(1, max_page + 1):
        json_1 = get_page(page)
        results = parse_page(json_1, page)
        for result in results:
            print(result)
            write_to_file(result)
            save_to_mongo(result)
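After a run, the records are written both to zhihu.json and to the zhihu collection of the local MongoDB instance. A minimal sketch to spot-check what was stored (assuming a default local MongoDB on localhost:27017, which is what MongoClient() with no arguments implies):

from pymongo import MongoClient

client = MongoClient()                    # same default connection as the crawler
collection = client['zhihu']['zhihu']

print('documents stored:', collection.count_documents({}))
# Show a few saved articles, projecting only the fields written by the crawler.
for doc in collection.find({}, {'_id': 0, 'title': 1, 'comments': 1, 'reposts': 1, 'date': 1}).limit(5):
    print(doc)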