#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import datetime
import requests
url_name_str='''朱子超 https://www.cnblogs.com/heroknot/
赵嘉豪 https://www.cnblogs.com/zhoajiahao/
巩景云 https://www.cnblogs.com/gongjingyun123--/
李琦 https://www.cnblogs.com/1naonao/
潘立府 https://www.cnblogs.com/plf-Jack/
胡凯琴 https://www.cnblogs.com/863652104kai/
雷俊 https://www.cnblogs.com/lucky75/
刘闯 https://www.cnblogs.com/miaowugulu/
毛毅智 https://www.cnblogs.com/acate/
葛林丽 https://www.cnblogs.com/geyatou322/
朱缘应 https://www.cnblogs.com/zhuyuanying123--/
雷鸣 https://www.cnblogs.com/leimingqq2/
赵刚 https://www.cnblogs.com/zhaogang0104/
吴锡 https://www.cnblogs.com/ZDQ1/
张岩 https://www.cnblogs.com/zuihoudebieli/
高化焱 https://www.cnblogs.com/gaohuayan/
孔凡平 https://www.cnblogs.com/WilliamKong94/
王强 https://www.cnblogs.com/bruce123/
杨文益 https://www.cnblogs.com/pythonywy/
伍开日 https://www.cnblogs.com/clarence203/
朱竹平 https://www.cnblogs.com/Hades123/
周瑞星 https://www.cnblogs.com/zrx19960128/
许长义 https://www.cnblogs.com/xcyandwxl/
储皖浏 https://www.cnblogs.com/chuwanliu/
陈石 https://www.cnblogs.com/chencharry/
徐浩 https://www.cnblogs.com/einsam/
吴奇宇 https://www.cnblogs.com/blog5434/
张天承 https://www.cnblogs.com/bladecheng/
赵志强 https://www.cnblogs.com/wsxiaoyao/
朱健 https://www.cnblogs.com/masterjian924/
魏义军 https://www.cnblogs.com/Dr-wei/
曹降祥 https://www.cnblogs.com/fengxuemuyangren/
陈跃春 https://www.cnblogs.com/chenych/
黄云 https://www.cnblogs.com/yellowcloud/
段力钢 https://www.cnblogs.com/raynduan/
刘金 https://www.cnblogs.com/itboy-newking/
'''
def get_name_url_dict():
"""读取文件"""
if not os.path.exists('博客地址.txt'):
with open('博客地址.txt', 'w', encoding='utf8') as fw:
fw.write(url_name_str)
fw.flush()
print('写入文件成功...')
with open('博客地址.txt', 'r', encoding='utf8') as fr:
name_urls = fr.readlines()
name_url_dict = dict()
for name_url in name_urls:
name_url_split = name_url.split()
name = name_url_split[0]
url = name_url_split[1]
name_url_dict[name] = url
print(f'同学数:{len(name_url_dict)}')
return name_url_dict
def request_next_url_data(next_url, url_list):
"""请求下一个网页"""
next_response = requests.get(next_url)
next_data = next_response.text
next_url_list = re.findall('href="(.*?)">(.*?)</a>', next_data)
url_list.extend(next_url_list)
re_next_url = re.findall('<a href="(https://www.cnblogs.com/.{0,30}/default\.html\?page=\d+)">下一页</a>', next_data)
if re_next_url:
re_next_url = re_next_url[0]
request_next_url_data(re_next_url, url_list)
return url_list
def for_every_name_urls(name_url_dict):
"""循环爬取所有人的博客信息"""
s_sum = ''
for name, home_url in name_url_dict.items():
# 拼接主页
s_sum = f'{s_sum}{name} {home_url}\n'
print(name, home_url)
# 获取第一页的内容
response = requests.get(home_url)
data = response.text
url_list = re.findall('href="(.*?)">(.*?)</a>', data)
# 判断是否存在下一页
next_url = re.findall('[^;]<a href="(https://www.*?/default\.html\?page=\d+)">下一页</a>', data)
if next_url:
next_url = next_url[0]
url_list = request_next_url_data(next_url, url_list)
# 去重处理
url_set = set()
for url in url_list:
if url[0].startswith(f'{home_url}p/') and url[0].endswith('html'):
url_set.add(url)
print(url_set)
for url in url_set:
s = f'{name} {url[0]} {url[1]}'
s_sum = f'{s_sum}{s}\n'
s_sum = f'{s_sum}\n'
return s_sum
def save_file(s_sum):
day_time = str(datetime.datetime.now()).split(' ')[0]
f = open(f'{day_time}-py9博客情况汇总.txt', 'w', encoding='utf8')
f.write(s_sum)
f.close()
if __name__ == '__main__':
name_url_dict = get_name_url_dict()
s_sum = for_every_name_urls(name_url_dict)
print(s_sum)
save_file(s_sum)