import csv
import requests
from bs4 import BeautifulSoup
def fetch_page_data(page_number):
    base_url = 'https://regengbaike.com/'
    load_more_url_template = base_url + '?page={}'
    url = load_more_url_template.format(page_number)
    try:
        response = requests.get(url, timeout=10)  # time out rather than hang forever
        response.raise_for_status()  # check whether the request succeeded
        return response.content
    except requests.RequestException as e:
        print(f"Request error: {e}")
        return None
def parse_article(article):
    # class_='article-title' matches both title variants seen on the site
    # ('article-title pl-3' and 'article-title mb-3 pl-3')
    title_element = article.find('h2', class_='article-title')
    title = title_element.a.text.strip() if title_element and title_element.a else ''
    # the explanation text lives in a <p> inside div.article-text
    explanation_element = article.find('div', class_='article-text')
    explanation = explanation_element.p.text.strip() if explanation_element and explanation_element.p else ''
    publish_time_element = article.find('time', {'itemprop': 'pubDate'})
    publish_time = publish_time_element.text.strip() if publish_time_element else ''
    return title, explanation, publish_time
def main():
    total_pages = 175
    output_file = 'output.csv'  # write the results as a CSV file
    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['title', 'explanation', 'publish_time'])  # header row
        for page in range(1, total_pages + 1):
            content = fetch_page_data(page)
            if content:
                soup = BeautifulSoup(content, 'html.parser')
                articles = soup.find_all('article', {'class': 'article-item'})
                for article in articles:
                    title, explanation, publish_time = parse_article(article)
                    if title or explanation or publish_time:
                        # write one row per article
                        writer.writerow([title, explanation, publish_time])
                    else:
                        print(f'No valid data found (page {page})')
                print(f"Finished reading page {page}")
            else:
                print(f"Failed to fetch page {page}")
    print("All pages have been scraped.")
if __name__ == '__main__':
    main()
Second version of the code: it writes a properly formatted CSV file directly.
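As a quick sanity check on the generated file (a minimal sketch, assuming the script above has already produced output.csv in the working directory), the first few rows can be read back with the standard csv module:

import csv

# Read back the first five rows of the scraped file to verify the layout.
with open('output.csv', encoding='utf-8', newline='') as f:
    for i, row in enumerate(csv.reader(f)):
        print(row)  # each row: [title, explanation, publish_time]
        if i >= 4:
            break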