import csv
import requests
from bs4 import BeautifulSoup
def fetch_page_data(page_number, timeout=10):
    """Fetch the raw HTML of one listing page from regengbaike.com.

    Args:
        page_number: 1-based page index, passed as the ``?page=N`` query.
        timeout: seconds to wait for the server before aborting; the
            original call had no timeout, so a stalled server would hang
            the whole scrape indefinitely.

    Returns:
        The response body as bytes on success, or ``None`` on any
        request/HTTP error (the error is printed, not raised).
    """
    base_url = 'https://regengbaike.com/'
    load_more_url_template = base_url + '?page={}'
    url = load_more_url_template.format(page_number)
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into RequestException, caught below.
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        print(f"请求出错: {e}")
        return None
def parse_article(article):
    """Extract (title, explanation, publish_time) from one <article> tag.

    Args:
        article: a BeautifulSoup ``Tag`` for an ``<article class="article-item">``
            element of the listing page.

    Returns:
        Tuple of three stripped strings; any field whose element is
        missing comes back as ``''``.
    """
    # The site uses two variants of the title class; match either one.
    title_element = article.find('h2', {'class': ['article-title pl-3', 'article-title mb-3 pl-3']})
    title = title_element.a.text.strip() if title_element and title_element.a else ''
    # Bug fix: the original used {'article-text': True}, which matches divs
    # carrying an *attribute* named "article-text" (never present in normal
    # HTML) instead of the CSS class "article-text" — so the explanation
    # column was always empty. Match on the class, like the other lookups.
    explanation_element = article.find('div', {'class': 'article-text'})
    explanation = explanation_element.p.text.strip() if explanation_element and explanation_element.p else ''
    publish_time_element = article.find('time', {'itemprop': 'pubDate'})
    publish_time = publish_time_element.text.strip() if publish_time_element else ''
    return title, explanation, publish_time
def main(total_pages=175, output_file='output.csv'):
    """Scrape pages 1..total_pages and stream rows into a CSV file.

    Args:
        total_pages: number of listing pages to fetch (was hard-coded 175;
            now a parameter with the same default, so existing callers of
            ``main()`` are unaffected).
        output_file: path of the CSV file to overwrite.

    Each article found yields one row: [title, explanation, publish_time].
    """
    # newline='' stops the csv module from doubling line endings on Windows.
    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        for page in range(1, total_pages + 1):
            content = fetch_page_data(page)
            if not content:
                # fetch_page_data already printed the underlying error.
                print(f"第 {page} 页数据获取失败")
                continue
            soup = BeautifulSoup(content, 'html.parser')
            articles = soup.find_all('article', {'class': 'article-item'})
            for article in articles:
                title, explanation, publish_time = parse_article(article)
                if title or explanation or publish_time:
                    writer.writerow([title, explanation, publish_time])
                else:
                    print(f'未找到有效数据 (Page {page}, Article)')
            print(f"第 {page} 页数据读取完成")
    print("所有页面数据已抓取完毕。")
# Run the scraper only when executed as a script, never on import.
if __name__ == '__main__':
    main()
# Version 2 of the script: writes a properly formatted CSV file directly.