网络热词分析一(用python爬取数据)

import csv
import requests
from bs4 import BeautifulSoup

def fetch_page_data(page_number, timeout=10):
    """Fetch the raw HTML of one listing page from regengbaike.com.

    Args:
        page_number: 1-based index of the listing page to request.
        timeout: seconds to wait for the server before aborting
            (the original call had no timeout, so one stalled response
            could hang the entire multi-page scrape indefinitely).

    Returns:
        The response body as bytes, or None if the request failed.
    """
    base_url = 'https://regengbaike.com/'
    url = (base_url + '?page={}').format(page_number)

    try:
        # A timeout turns a hung connection into a handled failure.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # raise on 4xx/5xx status codes
        return response.content
    except requests.RequestException as e:
        print(f"请求出错: {e}")
        return None

def parse_article(article):
    """Extract (title, explanation, publish_time) from one article tag.

    Each field falls back to '' when its element is missing, so the
    caller can detect and skip fully-empty rows.

    Args:
        article: a BeautifulSoup Tag for one 'article-item' element.

    Returns:
        Tuple of three strings: (title, explanation, publish_time).
    """
    # The site uses two different class spellings for the title heading,
    # so both exact class strings are accepted.
    title_element = article.find('h2', {'class': ['article-title pl-3', 'article-title mb-3 pl-3']})
    title = title_element.a.text.strip() if title_element and title_element.a else ''

    # FIX: the original {'article-text': True} matched an *attribute*
    # named 'article-text', not the CSS class, unlike every other find
    # in this function — presumably the class was intended. Verify
    # against the live page markup.
    explanation_element = article.find('div', {'class': 'article-text'})
    explanation = explanation_element.p.text.strip() if explanation_element and explanation_element.p else ''

    publish_time_element = article.find('time', {'itemprop': 'pubDate'})
    publish_time = publish_time_element.text.strip() if publish_time_element else ''

    return title, explanation, publish_time

def main():
    """Scrape all listing pages and write the extracted rows to a CSV.

    Iterates pages 1..total_pages, parses every 'article-item' on each
    page, and appends one CSV row per article that yielded any data.
    Pages that fail to download are reported and skipped.
    """
    total_pages = 175
    output_file = 'output.csv'

    # newline='' stops the csv module from emitting blank lines on Windows.
    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        # Header row so the CSV is self-describing when opened directly.
        writer.writerow(['title', 'explanation', 'publish_time'])

        for page in range(1, total_pages + 1):
            content = fetch_page_data(page)
            if not content:
                print(f"第 {page} 页数据获取失败")
                continue  # skip this page, keep scraping the rest

            soup = BeautifulSoup(content, 'html.parser')
            articles = soup.find_all('article', {'class': 'article-item'})

            for article in articles:
                title, explanation, publish_time = parse_article(article)
                if title or explanation or publish_time:
                    writer.writerow([title, explanation, publish_time])
                else:
                    print(f'未找到有效数据 (Page {page}, Article)')

            print(f"第 {page} 页数据读取完成")

    print("所有页面数据已抓取完毕。")


if __name__ == '__main__':
    main()

第二版的代码：直接输出排版好的 CSV 文件
posted @ 2024-09-06 19:18  财神给你送元宝  阅读(12)  评论(0编辑  收藏  举报