import csv
import requests
from bs4 import BeautifulSoup
def fetch_page_data(page_number, timeout=10):
    """Fetch the raw HTML of one listing page from regengbaike.com.

    Args:
        page_number: 1-based page index, passed as the ``?page=N`` query.
        timeout: seconds to wait for the server before aborting; the
            original call had no timeout, so a stalled server would hang
            the whole scrape indefinitely.

    Returns:
        The response body as bytes on success, or ``None`` on any
        request/HTTP error (the error is printed, not raised).
    """
    base_url = 'https://regengbaike.com/'
    load_more_url_template = base_url + '?page={}'
    url = load_more_url_template.format(page_number)
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into RequestException, caught below.
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        print(f"请求出错: {e}")
        return None
def parse_article(article):
    """Extract (title, explanation, publish_time) from one <article> tag.

    Args:
        article: a BeautifulSoup ``Tag`` for an ``<article class="article-item">``
            element of the listing page.

    Returns:
        Tuple of three stripped strings; any field whose element is
        missing comes back as ``''``.
    """
    # The site uses two variants of the title class; match either one.
    title_element = article.find('h2', {'class': ['article-title pl-3', 'article-title mb-3 pl-3']})
    title = title_element.a.text.strip() if title_element and title_element.a else ''
    # Bug fix: the original used {'article-text': True}, which matches divs
    # carrying an *attribute* named "article-text" (never present in normal
    # HTML) instead of the CSS class "article-text" — so the explanation
    # column was always empty. Match on the class, like the other lookups.
    explanation_element = article.find('div', {'class': 'article-text'})
    explanation = explanation_element.p.text.strip() if explanation_element and explanation_element.p else ''
    publish_time_element = article.find('time', {'itemprop': 'pubDate'})
    publish_time = publish_time_element.text.strip() if publish_time_element else ''
    return title, explanation, publish_time
def main(total_pages=175, output_file='output.csv'):
    """Scrape pages 1..total_pages and stream rows into a CSV file.

    Args:
        total_pages: number of listing pages to fetch (was hard-coded 175;
            now a parameter with the same default, so existing callers of
            ``main()`` are unaffected).
        output_file: path of the CSV file to overwrite.

    Each article found yields one row: [title, explanation, publish_time].
    """
    # newline='' stops the csv module from doubling line endings on Windows.
    with open(output_file, mode='w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        for page in range(1, total_pages + 1):
            content = fetch_page_data(page)
            if not content:
                # fetch_page_data already printed the underlying error.
                print(f"第 {page} 页数据获取失败")
                continue
            soup = BeautifulSoup(content, 'html.parser')
            articles = soup.find_all('article', {'class': 'article-item'})
            for article in articles:
                title, explanation, publish_time = parse_article(article)
                if title or explanation or publish_time:
                    writer.writerow([title, explanation, publish_time])
                else:
                    print(f'未找到有效数据 (Page {page}, Article)')
            print(f"第 {page} 页数据读取完成")
    print("所有页面数据已抓取完毕。")
# Run the scraper only when executed as a script, never on import.
if __name__ == '__main__':
    main()
# Version 2 of the script: writes a properly formatted CSV file directly.