使用requests爬取懂球D新闻资讯入库mongodb
requests_dongqiudi_com.py
#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:requests_dongqiudi_com.py
# Author:LGSP_Harold
"""Crawl the dongqiudi.com article-list API with requests and store it in MongoDB.

The script asks for a number of pages, fetches each page of the JSON
article feed, extracts a few fields per article and inserts one MongoDB
document per article.
"""

import functools
from typing import Any, Dict, Iterator, Optional

import pymongo
import requests

base_url = 'https://www.dongqiudi.com/api/app/tabs/web/56.json?'

# Request headers copied from a browser session.
# - The HTTP/2 pseudo-headers (authority / method / path / scheme) were
#   dropped: they are not regular header fields, and the copied ``path``
#   value carried a stale, hard-coded query string.
# - ``br`` was dropped from accept-encoding: requests only decodes Brotli
#   when an optional Brotli package is installed, so advertising it can
#   make ``response.json()`` fail on an undecoded body.
headers = {
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'referer': 'https://www.dongqiudi.com/articlesList/56',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}

_MONGO_URI = 'mongodb://admin:admin@localhost:27017'

# Seconds to wait for the server before giving up on a page; without a
# timeout requests may block indefinitely.
_REQUEST_TIMEOUT = 10


def get_page(page: int) -> None:
    """Fetch ``page`` pages of the article feed and store every article.

    Pages are requested in order starting at 1. The ``min`` value of each
    JSON response is sent as the ``after`` parameter of the next request.
    A failure on one page is printed and does not stop the remaining pages.

    Args:
        page: Total number of pages to crawl. Values below 1 crawl nothing.
    """
    cursor = ''  # value of the previous response's 'min' field
    for num in range(1, page + 1):
        params = {
            'after': cursor,
            'child_tab_id': 0,
            'page': num,
        }
        # Per-page boundary of a best-effort crawler: report and move on.
        try:
            response = requests.get(
                url=base_url,
                headers=headers,
                params=params,
                timeout=_REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                print('Error: unexpected status', response.status_code,
                      'for page', num)
                continue
            payload = response.json()
            cursor = payload.get('min')
            for data in parse_page(payload):
                handle_mongodb(data)
        except Exception as e:
            print('Error:', e.args)


def parse_page(json: Optional[Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
    """Yield one flat record per entry of ``json['articles']``.

    Args:
        json: Decoded response body. A falsy value, or a body whose
            ``articles`` entry is missing or empty, yields nothing.

    Yields:
        Dicts with the keys ``id``, ``title``, ``url``, ``published_at``
        and ``comments_total``; a key absent from the article maps to None.
    """
    if not json:
        return
    for item in json.get('articles') or []:
        yield {
            'id': item.get('id'),
            'title': item.get('title'),
            'url': item.get('url'),
            'published_at': item.get('published_at'),
            'comments_total': item.get('comments_total'),
        }


@functools.lru_cache(maxsize=None)
def _get_collection():
    """Return the target collection, creating the MongoClient only once."""
    client = pymongo.MongoClient(_MONGO_URI)
    return client['db_dongqiudi_com']['collection_list']


def handle_mongodb(data: Dict[str, Any]) -> None:
    """Insert one article record into ``db_dongqiudi_com.collection_list``.

    Args:
        data: Record to store. A copy is inserted, so the caller's dict is
            not mutated by the driver (which adds an ``_id`` key).
    """
    # Reuse one client for the whole run instead of opening a new
    # connection pool for every document.
    result = _get_collection().insert_one(dict(data))
    # insert_one raises on failure; the result object itself is always
    # truthy, so the meaningful check is whether the write was acknowledged.
    if not result.acknowledged:
        print('not save to mongo')


if __name__ == '__main__':
    page = int(input('输入您要爬取的总页码数:'))
    get_page(page)
略懂,略懂....