import json
import re

import requests
from bs4 import BeautifulSoup

# Output file: one crawled post per line (URL, title, digg count).
fOut = open("blog_crawl_post_titles_and_urls.txt", "w", encoding="utf8")
# Crawl the first 20 pages of the cnblogs homepage post list.
for idx in range(20):
    print("#" * 50, idx + 1)
    url = "https://www.cnblogs.com/AggSite/AggSitePostList"
    data = {
        "CategoryType": "SiteHome",
        "ParentCategoryId": 0,
        "CategoryId": 808,
        "PageIndex": idx + 1,
        "TotalPostCount": 4000,
        "ItemListActionName": "AggSitePostList"
    }
    headers = {
        "Content-Type": "application/json; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47"
    }
    # POST the JSON payload; the endpoint responds with an HTML fragment of post items.
    resp = requests.post(url, data=json.dumps(data), headers=headers, timeout=3)
    # print(resp.text)
    if resp.status_code != 200:
        print(resp.status_code)
        raise Exception("page %d request failed with HTTP %d" % (idx + 1, resp.status_code))
    soup = BeautifulSoup(resp.text, "html.parser")
    post_items = soup.find_all("article", class_="post-item")
    for post_item in post_items:
        link = post_item.find("a", class_="post-item-title")
        href = link.get("href")
        text = link.get_text()
        # The digg (upvote) count sits in a span whose id starts with "digg".
        span = post_item.find("span", id=re.compile(r"^digg"))
        number = span.get_text()
        print(href, text, number)
        # Write one tab-separated record per post.
        fOut.write("%s\t%s\t%s\n" % (href, text, number))
        fOut.flush()
        print("success:%s, %s, %s" % (href, text, number))
fOut.close()
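
# For reference, a minimal sketch (not part of the crawler itself) of reading the
# tab-separated output back in, assuming the three-column "href<TAB>title<TAB>diggs"
# records written above:
with open("blog_crawl_post_titles_and_urls.txt", encoding="utf8") as f:
    for line in f:
        href, title, diggs = line.rstrip("\n").split("\t")
        print("%s (diggs: %s) -> %s" % (title, diggs, href))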