Python之py9-py9博客情况获取

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import datetime
import requests

# Default roster, written to '博客地址.txt' by get_name_url_dict() when that
# file does not exist yet. One entry per line: a name, whitespace, then the
# blog home URL. Every URL ends with '/', which for_every_name_urls() relies
# on when it builds the '<home_url>p/' prefix used to recognise post links.
url_name_str='''朱子超  https://www.cnblogs.com/heroknot/
赵嘉豪 https://www.cnblogs.com/zhoajiahao/
巩景云 https://www.cnblogs.com/gongjingyun123--/
李琦 https://www.cnblogs.com/1naonao/
潘立府 https://www.cnblogs.com/plf-Jack/
胡凯琴 https://www.cnblogs.com/863652104kai/
雷俊 https://www.cnblogs.com/lucky75/
刘闯 https://www.cnblogs.com/miaowugulu/
毛毅智 https://www.cnblogs.com/acate/
葛林丽 https://www.cnblogs.com/geyatou322/
朱缘应 https://www.cnblogs.com/zhuyuanying123--/
雷鸣 https://www.cnblogs.com/leimingqq2/
赵刚 https://www.cnblogs.com/zhaogang0104/
吴锡 https://www.cnblogs.com/ZDQ1/
张岩 https://www.cnblogs.com/zuihoudebieli/
高化焱 https://www.cnblogs.com/gaohuayan/
孔凡平 https://www.cnblogs.com/WilliamKong94/
王强 https://www.cnblogs.com/bruce123/
杨文益 https://www.cnblogs.com/pythonywy/
伍开日 https://www.cnblogs.com/clarence203/
朱竹平 https://www.cnblogs.com/Hades123/
周瑞星 https://www.cnblogs.com/zrx19960128/
许长义 https://www.cnblogs.com/xcyandwxl/
储皖浏 https://www.cnblogs.com/chuwanliu/
陈石 https://www.cnblogs.com/chencharry/
徐浩 https://www.cnblogs.com/einsam/
吴奇宇 https://www.cnblogs.com/blog5434/
张天承 https://www.cnblogs.com/bladecheng/
赵志强 https://www.cnblogs.com/wsxiaoyao/
朱健 https://www.cnblogs.com/masterjian924/
魏义军 https://www.cnblogs.com/Dr-wei/
曹降祥 https://www.cnblogs.com/fengxuemuyangren/
陈跃春 https://www.cnblogs.com/chenych/
黄云 https://www.cnblogs.com/yellowcloud/
段力钢 https://www.cnblogs.com/raynduan/
刘金 https://www.cnblogs.com/itboy-newking/
'''



def get_name_url_dict():
    """Load the name -> blog home URL mapping from '博客地址.txt'.

    If the file does not exist in the current working directory, it is first
    created from the module-level ``url_name_str`` default roster.

    Each line is expected to hold a name followed by a URL, separated by
    whitespace. Blank lines and lines with fewer than two fields are skipped
    rather than raising ``IndexError``; fields after the second are ignored.
    If a name occurs more than once, the last URL wins.

    Returns:
        dict: name (str) -> home URL (str), in file order.
    """
    if not os.path.exists('博客地址.txt'):
        with open('博客地址.txt', 'w', encoding='utf8') as fw:
            fw.write(url_name_str)
        # Report success only after the ``with`` block has closed the file.
        print('写入文件成功...')

    name_url_dict = {}
    # 'utf-8-sig' reads plain UTF-8 as well, but also strips a leading BOM that
    # some editors add, which would otherwise be glued to the first name.
    with open('博客地址.txt', 'r', encoding='utf-8-sig') as fr:
        for line in fr:
            fields = line.split()
            if len(fields) < 2:
                # Blank or malformed line: nothing usable to record.
                continue
            name, url = fields[0], fields[1]
            name_url_dict[name] = url

    print(f'同学数:{len(name_url_dict)}')

    return name_url_dict


def request_next_url_data(next_url, url_list):
    """Follow the chain of "下一页" (next page) links starting at ``next_url``.

    Each page is fetched with ``requests.get``; every ``(href, link text)``
    pair matched on it is appended to ``url_list``. The walk stops when a page
    has no matching next-page link, or when the link points at a page that was
    already visited (which would otherwise loop forever).

    Args:
        next_url: URL of the first follow-up page to fetch.
        url_list: list of ``(href, text)`` tuples gathered so far; it is
            extended in place.

    Returns:
        The same ``url_list`` object, now including the links of every
        visited page.

    Raises:
        Whatever ``requests.get`` raises, including a timeout error when the
        server does not respond within the timeout below.
    """
    visited = set()

    # Iterative instead of recursive: one stack frame regardless of how many
    # pages the blog has.
    while next_url is not None and next_url not in visited:
        visited.add(next_url)

        # Without a timeout a stalled server would block the script forever.
        next_data = requests.get(next_url, timeout=30).text
        url_list.extend(re.findall('href="(.*?)">(.*?)</a>', next_data))

        # Raw string: same pattern as before, without invalid-escape warnings.
        re_next_url = re.findall(
            r'<a href="(https://www.cnblogs.com/.{0,30}/default\.html\?page=\d+)">下一页</a>',
            next_data,
        )
        next_url = re_next_url[0] if re_next_url else None

    return url_list


def for_every_name_urls(name_url_dict):
    """Crawl every blog in ``name_url_dict`` and build the text report.

    For each ``name -> home_url`` entry the home page is fetched, and, if it
    has a "下一页" (next page) link, the remaining pages are collected through
    ``request_next_url_data``. Links whose href starts with ``<home_url>p/``
    and ends with ``html`` are treated as posts.

    The report contains, per person: one ``"<name> <home_url>"`` line, one
    ``"<name> <post url> <link text>"`` line per distinct ``(url, text)``
    pair, and a trailing blank line. Post lines are sorted so the report is
    identical from run to run (set iteration order is not).

    Args:
        name_url_dict: mapping of name (str) to blog home URL (str). The URL
            must end with '/' for the post-prefix check to match.

    Returns:
        str: the complete report text.

    Raises:
        Whatever ``requests.get`` raises, including a timeout error when a
        server does not respond within the timeout below.
    """
    # Collect pieces and join once; re-building the whole string per line is
    # quadratic in the size of the report.
    parts = []

    for name, home_url in name_url_dict.items():
        # Header line for this person.
        parts.append(f'{name} {home_url}\n')
        print(name, home_url)

        # First page. Without a timeout a stalled server would block forever.
        data = requests.get(home_url, timeout=30).text
        url_list = re.findall('href="(.*?)">(.*?)</a>', data)

        # Follow pagination if the first page links to a next page.
        # Raw string: same pattern as before, without invalid-escape warnings.
        next_url = re.findall(
            r'[^;]<a href="(https://www.*?/default\.html\?page=\d+)">下一页</a>',
            data,
        )
        if next_url:
            url_list = request_next_url_data(next_url[0], url_list)

        # Keep only post links and drop exact (href, text) duplicates.
        post_prefix = f'{home_url}p/'
        url_set = {
            url for url in url_list
            if url[0].startswith(post_prefix) and url[0].endswith('html')
        }
        print(url_set)

        for href, title in sorted(url_set):
            parts.append(f'{name} {href} {title}\n')

        # Blank line between people.
        parts.append('\n')

    return ''.join(parts)


def save_file(s_sum):
    """Write the report to '<YYYY-MM-DD>-py9博客情况汇总.txt'.

    The file is created in the current working directory, named after today's
    local date, encoded as UTF-8, and overwritten if it already exists.

    Args:
        s_sum: the report text to write.
    """
    day_time = datetime.date.today().isoformat()
    # ``with`` guarantees the handle is closed even if the write fails.
    with open(f'{day_time}-py9博客情况汇总.txt', 'w', encoding='utf8') as f:
        f.write(s_sum)


if __name__ == '__main__':
    # Script entry point: load the roster, crawl every blog, then echo the
    # report to stdout and save it to disk.
    summary = for_every_name_urls(get_name_url_dict())
    print(summary)
    save_file(summary)
posted @ 2019-04-30 08:29  B站-水论文的程序猿  阅读(385)  评论(0)  编辑  收藏  举报