使用 requests 爬取懂球帝新闻资讯并存入 MongoDB

 requests_dongqiudi_com.py

#!/usr/bin/env python3
# coding=utf-8
# Version:python3.6.1
# File:requests_dongqiudi_com.py
# Author:LGSP_Harold
import pymongo
import requests

# Endpoint of the article-list API for tab 56. The query string
# (after / child_tab_id / page) is supplied per request through ``params``.
base_url = 'https://www.dongqiudi.com/api/app/tabs/web/56.json?'

# Headers sent with every API request.
#
# The former 'authority' / 'method' / 'path' / 'scheme' entries were removed:
# they are request-line fields (apparently copied from a browser's network
# panel), not real header fields, and 'path' was pinned to one specific
# ``page=2`` URL regardless of the page actually requested.
headers = {
    'accept': 'application/json, text/plain, */*',
    # 'br' is intentionally not advertised: requests can only decode Brotli
    # when an optional brotli package is installed, and an undecoded body
    # would make ``response.json()`` fail.
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'referer': 'https://www.dongqiudi.com/articlesList/56',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_page(page):
    """Fetch the article list page by page and store every article in MongoDB.

    Args:
        page: Total number of pages to fetch. Pages ``1`` through ``page``
            (inclusive) are requested in order.

    Returns:
        None. Each parsed article is passed to ``handle_mongodb``.

    Errors on a single page (network failure, bad JSON, storage error) are
    printed and the loop moves on to the next page, so one failure does not
    abort the whole crawl.
    """
    # Pagination cursor: the ``min`` value of the previous response is sent
    # back as ``after``. It is empty for the very first request.
    after = ''
    # ``page + 1`` so that the last requested page is included.
    for num in range(1, page + 1):
        try:
            params = {
                'after': after,
                'child_tab_id': 0,
                # Current page number, not the requested total.
                'page': num,
            }
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                url=base_url, headers=headers, params=params, timeout=10,
            )
            if response.status_code != 200:
                print('Error: unexpected status', response.status_code, 'on page', num)
                continue
            # Decode the body once and reuse it for cursor and articles.
            payload = response.json()
            after = payload.get('min')
            for data in parse_page(payload):
                handle_mongodb(data)
        except Exception as e:
            # Best-effort crawl: report the failure and keep going.
            print('Error:', e.args)


def parse_page(json):
    """Yield one flat record per article contained in an API response.

    Args:
        json: Decoded response body (a dict). May be empty or ``None``.
            The parameter name is kept for backward compatibility even
            though it matches the name of the standard ``json`` module.

    Yields:
        dict: A record with the keys ``id``, ``title``, ``url``,
        ``published_at`` and ``comments_total``. A key that is missing
        from the article maps to ``None``.
    """
    if not json:
        return
    # 'articles' may be absent or null; treat both as "no articles" instead
    # of failing while iterating over None.
    for item in json.get('articles') or []:
        yield {
            'id': item.get('id'),
            'title': item.get('title'),
            'url': item.get('url'),
            'published_at': item.get('published_at'),
            'comments_total': item.get('comments_total'),
        }


# Collection handle shared by all inserts; created on first use so that
# importing this module does not open a database connection.
_collection = None


def _get_collection():
    """Return the shared target collection, connecting on the first call."""
    global _collection
    if _collection is None:
        # NOTE(review): credentials are hard-coded in the URI; consider
        # reading them from configuration instead.
        client = pymongo.MongoClient('mongodb://admin:admin@localhost:27017')
        _collection = client['db_dongqiudi_com']['collection_list']
    return _collection


def handle_mongodb(data):
    """Insert one article record into MongoDB.

    Args:
        data: Mapping describing a single article. It is copied before
            insertion so the caller's object is not modified.

    Returns:
        None. A message is printed when the server did not acknowledge
        the write; driver errors propagate to the caller.
    """
    # Reuse one client for the whole run instead of opening (and never
    # closing) a new connection for every record.
    collection = _get_collection()
    result = collection.insert_one(dict(data))
    # The result object itself is always truthy; ``acknowledged`` is the
    # flag that actually reports whether the write was confirmed.
    if not result.acknowledged:
        print('not save to mongo')


if __name__ == '__main__':
    # Read the number of pages to crawl from stdin and start the crawl.
    get_page(int(input('输入您要爬取的总页码数:')))

 

posted @ 2021-08-18 15:45  嘆世殘者——華帥  阅读(39)  评论(0)  编辑  收藏  举报