爬虫之网易新闻（体育）

https://sports.163.com/{i} i in ['nba','cba','china']

import os
import re
import requests

if not os.path.exists('网易新闻'): # 生成文件夹
    os.mkdir('网易新闻')

count = 0
for i in ['nba','cba','china']:
    # 获取所有的url
    response = requests.get(f'https://sports.163.com/{i}/')
    data = response.text
    url_res = re.findall('href="(https://sports.163.com/.*?)"', data)
    url_res = set(url_res)

    # 针对单个url
    for url in url_res:
        url_response = requests.get(url)
        url_data = url_response.text

        try:
            title = re.findall('<h1>(.*?)</h1>', url_data, re.S)[0]
            news_res = \
                re.findall('<div class="post_text" id="endText" style="border-top:1px solid #ddd;">(.*?责任编辑：.*?)</span>',
                           url_data, re.S)[0]  #
            news_res = re.sub('<.*?>', '', news_res)
        except:
            continue

        title = re.sub('[!"#$%&()*+,-./:;<=>?@[\\]^_‘{|}~，…]|\s', '', title)  # 除掉标题所有的脏字符
        title_path = os.path.join('网易新闻', f'{title}.txt')  # 拼接出新闻的路径
        f = open(title_path, 'w', encoding='utf8')
        f.write(news_res)
        f.flush()
        f.close()
        count += 1

        print(f'完成{count}篇, {title} done...')

posted @ 2019-07-23 16:21 SweetJack 阅读(447) 评论(0) 编辑收藏举报

刷新页面返回顶部

大大尊者wang

梦里青春可得追，欲将诗句绊馀晖。酒阑病客惟思睡，蜜熟黄蜂亦懒飞。芍药樱桃俱扫地，鬓丝禅榻两忘机。凭君借取法界观，一洗人间万事非。

爬虫之网易新闻（体育）

爬虫之网易新闻（体育）

公告

大大尊者wang

梦里青春可得追，欲将诗句绊馀晖。酒阑病客惟思睡，蜜熟黄蜂亦懒飞。芍药樱桃俱扫地，鬓丝禅榻两忘机。凭君借取法界观，一洗人间万事非。

爬虫之 网易新闻（体育）

爬虫之 网易新闻（体育）

公告

爬虫之网易新闻（体育）

爬虫之网易新闻（体育）