每日总结（Python 文本分析）

读取文本文档并将内容输出到终端

# Python 3.x版本

import os

# Build the path of pinglun.txt relative to the current working directory.
root_path = "./"
file_path = os.path.join(root_path, "pinglun.txt")

try:
    # Load the whole file as UTF-8 text; the handle is closed when the
    # ``with`` block ends, before the content is echoed.
    with open(file_path, mode="r", encoding="utf-8") as file:
        content = file.read()

    # Echo the file content to the terminal.
    print(content)

except FileNotFoundError:
    # The file is missing: report the path that was tried.
    print("文件不存在：", file_path)
except Exception as err:
    # Any other failure is reported with its message instead of a traceback.
    print("读取文件时发生错误：", str(err))

爬取豆瓣电影评论

import requests, re
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Request headers shared by every call to getHTMLText. The User-Agent value is
# read from ``ua.random`` once, when this module-level code runs, so all
# requests made afterwards send the same value.
ua = UserAgent()
headers = {"User-Agent": ua.random}

def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and a 30-second
    timeout, and the response encoding is forced to UTF-8 before the text is
    read.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or ``""`` when the request fails or the server
        answers with an HTTP error status.
    """
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = "utf-8"
        return r.text
    except requests.RequestException:
        # Best-effort fetch: network and HTTP failures yield an empty page.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt, SystemExit
        # and programming errors propagate instead of being hidden.
        return ""

# Captures the digits of a rating marker such as "allstar50 rating".
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
_RATING_PATTERN = re.compile(r'allstar(\d+) rating')


def fillMoviedata(soup, moviedata):
    """Append one ``[username, comment, rating_matches]`` row per comment.

    Args:
        soup: Parsed page offering ``find_all(tag, css_class)``; the
            ``span.comment-info`` and ``span.short`` elements are read.
        moviedata: List that receives the rows; it is modified in place.

    Each row holds the text of the first ``<a>`` inside the comment-info span
    (``""`` when there is no link), the ``.string`` of the matching short
    comment span, and the list of rating digit strings found in the
    comment-info markup (empty when no rating marker is present).
    """
    commentinfo = soup.find_all('span', 'comment-info')
    comments = soup.find_all('span', 'short')

    # zip() stops at the shorter list, so only positions that have both a
    # comment-info element and a comment text are processed.
    for info, comment in zip(commentinfo, comments):
        rating = _RATING_PATTERN.findall(str(info))
        username = info.a.string if info.a else ""
        moviedata.append([username, comment.string, rating])

def printList(moviedata, num):
    """Print at most ``num`` rows of ``moviedata`` to standard output.

    Args:
        moviedata: List of ``[username, comment, rating_matches]`` rows, where
            ``rating_matches`` is a list of digit strings (possibly empty).
        num: Maximum number of rows to print; zero or a negative value prints
            nothing.

    The star count is the first rating value divided by 10 and truncated. A
    row without a rating prints an empty star value; a row whose rating cannot
    be read as an integer is printed without the rating line.
    """
    for index, row in enumerate(moviedata[:max(num, 0)], start=1):
        try:
            rating = row[2]
            # The rating text comes from a scraped web page, so it is parsed
            # with int() and never executed with eval().
            stars = int(rating[0]) // 10 if rating else ""
        except (IndexError, TypeError, ValueError):
            print("序号: {}\n用户名: {}\n评论内容: {}\n".format(index, row[0], row[1]))
            continue
        print("序号: {}\n用户名: {}\n评论内容: {}\n评分: {}星\n".format(index, row[0], row[1], stars))

def fetch_movie_comments(movieid, num_comments, start_page=1, limit_per_page=20, method='new_score'):
    """Download comment pages for a movie and print the collected comments.

    Args:
        movieid: Subject id placed in the comments URL.
        num_comments: Number of comments to print; it also determines how
            many pages are requested.
        start_page: 1-based number of the first page to request.
        limit_per_page: Value of the ``limit`` query parameter, also used to
            compute each page's ``start`` offset.
        method: Value of the ``sort`` query parameter.

    Pages are fetched with ``getHTMLText``, parsed by ``fillMoviedata`` and
    the result is printed with ``printList``. Nothing is returned.
    """
    movie_data = []
    # Ceiling division: the number of pages needed to hold num_comments.
    total_pages = -(-num_comments // limit_per_page)

    # range() excludes its upper bound, so start_page + total_pages already
    # covers exactly total_pages pages; an extra "+ 1" would request one page
    # more than needed. Page numbers are additionally kept below 100 as a
    # safety cap on the number of requests.
    last_page = min(start_page + total_pages, 100)

    for page in range(start_page, last_page):
        offset = (page - 1) * limit_per_page
        url = f'https://movie.douban.com/subject/{movieid}/comments?start={offset}&limit={limit_per_page}&sort={method}&status=P'
        html = getHTMLText(url)
        # A failed fetch yields "", which parses to a page with no comments.
        soup = BeautifulSoup(html, 'html.parser')
        fillMoviedata(soup, movie_data)

    printList(movie_data, num_comments)

# Fetch and print 20 comments for movie id 34805219, starting at page 1.
fetch_movie_comments(34805219, 20, 1)

读取文件，使用 SnowNLP 进行情感分析，并绘制直方图与饼图

import os
from snownlp import SnowNLP

import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
# Placeholder path: replace it with a font file on this system that supports
# Chinese characters.
font_path = 'path/to/your/font.ttf'
if not os.path.exists(font_path):
    print("指定的字体文件不存在，请确认路径是否正确！")
else:
    # NOTE(review): fontprop is not referenced again in the visible code.
    fontprop = FontProperties(fname=font_path)

# Global matplotlib settings, applied before any figure is drawn.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],   # use SimHei if it is installed
    'axes.unicode_minus': False,     # fixes the display of the minus sign
})

def count_binary_sentiments(scores, threshold=0.5):
    """Return ``[negative, positive]`` counts for a list of sentiment scores.

    A score below ``threshold`` counts as negative; every other score counts
    as positive.
    """
    negative = sum(1 for score in scores if score < threshold)
    return [negative, len(scores) - negative]


# Read pinglun.txt relative to the current working directory.
root_path = "./"
file_path = os.path.join(root_path, 'pinglun.txt')

try:
    with open(file_path, 'r', encoding='utf-8') as f:
        text_content = f.read()

    # Each non-empty line is treated as one sentence.
    sentences = [line.strip() for line in text_content.split('\n') if line.strip()]

    if not sentences:
        # Nothing to score: skip plotting rather than drawing empty charts.
        print("文件中没有可分析的内容：", file_path)
    else:
        # Score every sentence, then label it 0 (negative, score < 0.5) or
        # 1 (positive, otherwise).
        sentiment_scores = [SnowNLP(sentence).sentiments for sentence in sentences]
        binary_sentiments = [0 if score < 0.5 else 1 for score in sentiment_scores]

        # Histogram of the raw sentiment scores, saved as a PNG.
        fig_hist = plt.figure()
        plt.hist(sentiment_scores, bins=10, edgecolor='black')
        plt.xlabel('情感得分')
        plt.ylabel('句子数量')
        plt.title('根目录下pinglun文件的情感分析 - 直方图')
        plt.grid(True)
        plt.savefig(os.path.join(root_path, 'sentiment_histogram.png'))

        # Pie chart of the negative/positive split, saved as a PNG. The pie
        # needs one wedge size per label, so the per-sentence 0/1 list is
        # reduced to two counts instead of being passed in directly.
        labels = ['负面', '正面']
        sentiment_counts = count_binary_sentiments(sentiment_scores)
        fig_pie = plt.figure()
        plt.pie(sentiment_counts, labels=labels, autopct='%1.1f%%', startangle=90)
        plt.title('根目录下pinglun文件的情感分析 - 饼图')
        plt.axis('equal')  # keep the pie circular
        plt.savefig(os.path.join(root_path, 'sentiment_pie_chart.png'))

        # Show the figures (optional when running from a command line).
        plt.show()

except FileNotFoundError:
    print("文件不存在：", file_path)
except Exception as e:
    print("处理文件时发生错误：", str(e))