6.5博客

Python和工程数学俩实验真累啊

Python学习:

import re

from collections import Counter

import requests

from lxml import etree

import pandas as pd

import jieba

import matplotlib.pyplot as plt

from wordcloud import WordCloud

# HTTP headers sent with every request; the User-Agent string is a desktop
# Edge/Chrome browser identifier.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"
    ),
}

# Module-level accumulators filled in by getComments():
# one row per scraped comment ...
comments = []
# ... and every cleaned word segmented out of the comment texts.
words = []

# All patterns are compiled once at import time rather than on every call.

# Leading "<digits>::" prefix.
_USERNAME_RE = re.compile(r"^\d+::")

# URL-like runs. [a-zA-Z0-9] is used instead of \w so that Chinese
# characters are never swallowed by the match.
_URL_RE = re.compile(
    r"""
    (https?://)?
    ([a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)*
    (/[a-zA-Z0-9]+)*
    """,
    re.VERBOSE | re.IGNORECASE,
)

# Date fragments: year/month/day markers and weekday names (Mon-Sun).
_DATE_RE = re.compile("周[一二三四五六日]|[年月日]")

# A non-letter character followed by digits.
# NOTE(review): this also consumes the character in front of the digits and
# leaves a lone single digit untouched; kept as-is because the intended
# behaviour is unclear -- confirm before tightening.
_DECIMAL_RE = re.compile(r"[^a-zA-Z]\d+")

# Any run of whitespace.
_SPACE_RE = re.compile(r"\s+")

# Single characters to drop: newline, quotes, punctuation, the space and a
# few very common Chinese function words. Add a character to the class to
# strip it as well.
# NOTE(review): the original class listed "," ";" and "'" twice, which
# suggests full-width variants were lost somewhere -- confirm.
_NOISE_RE = re.compile("[\n”“|,;'/?! 。的了是]")


def regex_change(line):
    """Return *line* with URLs, noise characters, dates and numbers removed.

    The cleaning steps run in this order:

    1. URL-like runs (done first, while ``/`` is still present so that the
       scheme and path parts of the URL pattern can actually match),
    2. noise characters (punctuation, quotes, common function words),
    3. a leading ``<digits>::`` prefix,
    4. date fragments (年/月/日 and weekday names),
    5. a non-letter character followed by digits,
    6. any remaining whitespace.

    Args:
        line: The text to clean.

    Returns:
        The cleaned string; may be empty.
    """
    line = _URL_RE.sub("", line)
    line = _NOISE_RE.sub("", line)
    line = _USERNAME_RE.sub("", line)
    line = _DATE_RE.sub("", line)
    line = _DECIMAL_RE.sub("", line)
    return _SPACE_RE.sub("", line)

# Rating labels (the title attribute of the rating span) mapped to a 1-5 score.
_SCORE_BY_LABEL = {"力荐": 5, "推荐": 4, "还行": 3, "较差": 2, "很差": 1}


def getComments(url):
    """Scrape one page of short comments into the module-level accumulators.

    For every ``div.comment`` element on the page a row
    ``[name, content, time, score, votes]`` is appended to the global
    ``comments`` list, and every cleaned token of the comment text that is
    at least two characters long is appended to the global ``words`` list.

    Args:
        url: Address of the comment page to download.

    Returns:
        None. Results are delivered through ``comments`` and ``words``.

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (including a timeout).
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10).text
    html = etree.HTML(resp)
    if html is None:
        # lxml yields None for an empty document; there is nothing to parse.
        return

    for comment in html.xpath(".//div[@class='comment']"):
        # First link text is the user name, second one the comment time.
        info_links = comment.xpath(".//span[@class='comment-info']/a/text()")
        short_text = comment.xpath(
            ".//p[@class='comment-content']/span[@class='short']/text()"
        )
        if len(info_links) < 2 or not short_text:
            # Skip entries without the expected fields instead of crashing
            # the whole run with an IndexError.
            continue

        name = str(info_links[0])
        posted_at = str(info_links[1])
        content = str(short_text[0]).strip()

        # The rating is optional; a missing or unknown label scores 0 rather
        # than inheriting the previous comment's score.
        mark = comment.xpath(".//span[@class='comment-info']/span/@title")
        status = str(mark[-1]) if mark else ""
        score = _SCORE_BY_LABEL.get(status, 0)

        # Number of "useful" votes; treated as 0 when the counter is absent.
        votes = comment.xpath(
            ".//span[@class='comment-vote']/span[@class='vote-count']/text()"
        )
        good = int(votes[0]) if votes else 0

        comments.append([name, content, posted_at, score, good])

        for token in jieba.cut(content, cut_all=False, HMM=False):
            cleaned = regex_change(token)  # clean once, reuse the result
            if len(cleaned) >= 2:
                words.append(cleaned)

def getWordCloud(words):
    """Print the ten most frequent words and render them as a word cloud.

    The cloud is written to ``wordcloud.png`` and then displayed with
    matplotlib.

    Args:
        words: List of words to count and plot.

    Returns:
        None.
    """
    if not words:
        # WordCloud.generate() cannot build a cloud from empty text.
        print("没有可用于生成词云的词语")
        return

    print("热词前10位:")
    # most_common() copes with fewer than ten distinct words, unlike
    # indexing the first ten entries of a sorted list.
    for entry in Counter(words).most_common(10):
        print(entry)

    text = ' '.join(words)
    w = WordCloud(
        background_color='white',
        width=1000,
        height=700,
        font_path='simhei.ttf',
        margin=10,
    ).generate(text)

    w.to_file('wordcloud.png')
    # Draw the image first, then show it; showing first displays nothing.
    plt.imshow(w)
    plt.show()

# Interactive entry point: ask which comment listing to scrape, download the
# first three pages (20 comments each), print the first ten rows and the ten
# most-voted rows, then build the word cloud.
print("请选择以下选项:")
print(" 1.热门评论")
print(" 2.最新评论")

try:
    info = int(input())
except ValueError:
    # Non-numeric input is handled as an invalid option below.
    info = None

title = ['用户名', '短评内容', '评论时间', '评分', '点赞数']

# Menu option -> value of the "sort" query parameter (1: hot, 2: latest).
_SORT_BY_OPTION = {1: "new_score", 2: "time"}
_sort_key = _SORT_BY_OPTION.get(info)

if _sort_key is None:
    print("无效选项,请输入1或2")
else:
    print("前10位短评信息:")
    # Start from empty accumulators so earlier results are not mixed in.
    comments = []
    words = []
    for i in range(0, 60, 20):
        url = (
            "https://book.douban.com/subject/10517238/comments/"
            "?start={}&limit=20&status=P&sort={}"
        ).format(i, _sort_key)
        getComments(url)

    df = pd.DataFrame(comments, columns=title)
    print(df.head(10))

    print("点赞数前10位的短评信息:")
    df = df.sort_values(by='点赞数', ascending=False)
    print(df.head(10))

    getWordCloud(words)
posted @   张佳木  阅读(4)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· 震惊!C++程序真的从main开始吗?99%的程序员都答错了
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 单元测试从入门到精通
· winform 绘制太阳,地球,月球 运作规律
· 上周热点回顾(3.3-3.9)
点击右上角即可分享
微信分享提示