爬取网易新闻
一、爬取网易新闻(网易体育频道 sports.163.com)
import re
import requests
"""
@author RansySun
@create 2019-07-23-9:24
"""
# Scrape article pages from three NetEase sports sections (nba/cba/china),
# strip the HTML, and save each article's paragraphs to a numbered text file.
count = 0
for category in ['nba', 'cba', 'china']:
    # Request the section index page for this category.
    response = requests.get(f"https://sports.163.com/{category}/")
    data = response.text
    # Example anchor on the index page:
    # <a href="https://sports.163.com/19/0723/07/EKOJ4J0P0005877U.html">邓肯重返马刺当助理教练 波波:现在轮到他报答我了</a>
    article_urls = re.findall(r'<a href="(https://sports.163.com/.*?)"', data)
    # Deduplicate with a set (the index page links each article several times).
    article_urls = set(article_urls)
    # Second round of requests: fetch each individual article page.
    for article_url in article_urls:
        url_data = requests.get(article_url).text
        # Article body paragraphs.
        paragraphs = re.findall(r'<p>(.*?)</p>', url_data)
        # Article title; guard against pages that have no <h1> (the original
        # code raised IndexError here).
        titles = re.findall(r'<h1>(.*?)</h1>', url_data)
        if not titles:
            continue
        # Remove punctuation/whitespace from the title so it is a safe filename.
        new_title = re.sub(r'[!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~,…]|\s', '', titles[0])
        # Strip any remaining inline tags from each paragraph, then join once
        # (avoids quadratic string concatenation).
        cleaned = (re.sub(r"<.*?>", "", p) for p in paragraphs)
        body = "".join(f'{p}\n' for p in cleaned)
        # Context manager guarantees the file is closed (original leaked the handle).
        with open(f'{count}_{new_title}.txt', 'w', encoding='utf8') as fw:
            fw.write(body)
        count += 1
        print(f'成功保存第{count}_{new_title}down.....')
效果:
在当下的阶段,必将由程序员来主导,甚至比以往更甚。