Scraping NetEase News

1. Scraping NetEase News


import re
import requests

"""
@author RansySun
@create 2019-07-23-9:24
"""
count = 0
for i in ['nba', 'cba', 'china']:
    # Request the NetEase news list page for this category
    response = requests.get(f"https://sports.163.com/{i}/")
    data = response.text
    # <a href="https://sports.163.com/19/0723/07/EKOJ4J0P0005877U.html">邓肯重返马刺当助理教练 波波:现在轮到他报答我了</a>
    url_res = re.findall('<a href="(https://sports.163.com/.*?)"', data)
    # Deduplicate the extracted URLs with a set
    url_res = set(url_res)

    # Second request: fetch each individual article page
    for res in url_res:

        url_response = requests.get(res)
        url_data = url_response.text
        # Extract the article body paragraphs
        new_res = re.findall('<p>(.*?)</p>', url_data)
        # Extract the article title
        new_title = re.findall('<h1>(.*?)</h1>', url_data)[0]
        # Strip punctuation and whitespace from the title so it can be used as a filename
        new_title = re.sub(r'[!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~,…]|\s', '', new_title)

        # print(new_title)

        # print(res)
        str_ = ""
        for para in new_res:
            # Strip any remaining HTML tags from the paragraph text
            para = re.sub("<.*?>", "", para)
            str_ += f'{para}\n'

        # print(str_)
        # Write the article to a numbered text file; the with-block closes the file automatically
        with open(f'{count}_{new_title}.txt', 'w', encoding='utf8') as fw:
            fw.write(str_)
        count += 1
        print(f'Saved article {count}: {new_title}')
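As a quick, self-contained check, the two regular expressions used above can be tested in isolation. This is a minimal sketch: the sample anchor and title are taken from the comment in the script, and the outputs shown in the comments are only illustrative.

import re

# Sample anchor copied from the comment in the script above
html = '<a href="https://sports.163.com/19/0723/07/EKOJ4J0P0005877U.html">邓肯重返马刺当助理教练 波波:现在轮到他报答我了</a>'

# Extract article links that point at sports.163.com
links = re.findall('<a href="(https://sports.163.com/.*?)"', html)
print(links)  # ['https://sports.163.com/19/0723/07/EKOJ4J0P0005877U.html']

# Strip punctuation and whitespace from a title before using it as a filename
title = '邓肯重返马刺当助理教练 波波:现在轮到他报答我了'
clean_title = re.sub(r'[!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~,…]|\s', '', title)
print(clean_title)  # 邓肯重返马刺当助理教练波波现在轮到他报答我了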

    

Result:
[Screenshot: NetEase news articles saved as local text files]
