Python数据处理训练

（一）中国大学排名数据分析与可视化（写到实验报告中）

【源代码程序】

import requests

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

# URL template; the ranking year is substituted into the path.
URL_TEMPLATE = "https://www.shanghairanking.cn/rankings/bcur/{}"


# Scrape the ranking page for one year.
def fetch_rankings(year):
    """Fetch up to ten ranking rows for *year*.

    Args:
        year: Value substituted into ``URL_TEMPLATE``.

    Returns:
        A list of ``(rank, university, score)`` string tuples taken from the
        first three cells of each table row.  An empty list is returned when
        the request fails, the status code is not 200, or no table with
        class ``rk-table`` is present.
    """
    url = URL_TEMPLATE.format(year)
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for year {year}. Error: {exc}")
        return []

    # Check the response status code.
    if response.status_code != 200:
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"class": "rk-table"})

    # Check that the table was actually found.
    if not table:
        print(f"Failed to find the ranking table for year {year}.")
        return []

    rankings = []
    # Rows 1..10: the first <tr> is skipped (presumably the header row).
    for row in table.find_all("tr")[1:11]:
        cols = row.find_all("td")
        if len(cols) < 3:
            # Malformed or non-data row: skip instead of raising IndexError.
            continue
        # NOTE(review): the third cell is stored as "score"; verify against
        # the actual column layout of the page.
        rank, university, score = (col.text.strip() for col in cols[:3])
        rankings.append((rank, university, score))

    return rankings

# Print the ranking table for one year.
def print_rankings(rankings, year):
    """Print *rankings* for *year* as a left-aligned text table.

    Args:
        rankings: Iterable of ``(rank, university, score)`` tuples.
        year: Year shown in the heading and in the "no data" message.

    Prints a single "no data" line and returns when *rankings* is empty.
    """
    if not rankings:
        print(f"No data available for year {year}.")
        return

    print(f"\n{year} 年前 10 名大学排名：")
    print(f"{'排名':<5} {'大学':<20} {'得分':<10}")
    print("-" * 40)
    for rank, university, score in rankings:
        print(f"{rank:<5} {university:<20} {score:<10}")

# Visualisation.
def plot_rankings(rankings_dict):
    """Plot each university's rank across the years in *rankings_dict*.

    Args:
        rankings_dict: Mapping of year -> list of
            ``(rank, university, score)`` tuples; ``rank`` must be
            convertible with ``int()``.

    One line is drawn per university.  Years in which a university does not
    appear contribute ``None`` to its series.  Prints a message and returns
    without opening a figure when no year contains any data.
    """
    # Font setup so that Chinese labels render.
    plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font
    plt.rcParams['axes.unicode_minus'] = False  # fix minus-sign rendering

    years = list(rankings_dict.keys())

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # order of plotted lines and legend entries is reproducible (a set is
    # unordered).
    universities = list(dict.fromkeys(
        university
        for year in years
        for _, university, _ in rankings_dict[year]
    ))
    if not universities:
        print("No ranking data available to plot.")
        return

    plt.figure(figsize=(10, 6))
    for university in universities:
        ranks = [
            next(
                (int(rank) for rank, uni, _ in rankings_dict[year] if uni == university),
                None,
            )
            for year in years
        ]
        # Only universities ranked (<= 10) in the last year get a label; the
        # empty label is intended to keep the other lines out of the legend.
        label = university if ranks[-1] and ranks[-1] <= 10 else ""
        plt.plot(years, ranks, marker='o', label=label)

    plt.gca().invert_yaxis()  # rank 1 at the top
    plt.xticks(years)
    plt.xlabel('年份')
    plt.ylabel('排名')
    plt.title('2015-2019年前10大学排名变化')
    plt.legend()
    plt.show()

# Interactive rank lookup.
def query_ranking(rankings_dict):
    """Repeatedly prompt for a university and a year and print its rank.

    Args:
        rankings_dict: Mapping of ``int`` year -> list of
            ``(rank, university, score)`` tuples.

    The loop ends when the user answers anything other than ``y``
    (case-insensitive) to the "continue?" prompt.  An invalid year prints an
    error and restarts the prompts.
    """
    while True:
        university = input("请输入大学名称：").strip()
        year_text = input("请输入年份（2015-2019）：").strip()

        # int() is the authority on what is a valid number: str.isdigit()
        # accepts characters such as '²' that int() rejects, which used to
        # crash the loop with ValueError.
        try:
            year = int(year_text)
        except ValueError:
            year = None
        if year not in rankings_dict:
            print("年份输入有误，请重新输入。")
            continue

        rank_info = next(
            (rank for rank, uni, _ in rankings_dict[year] if uni == university),
            None,
        )
        if rank_info:
            print(f"{year} 年 {university} 排名：{rank_info}")
        else:
            print(f"{year} 年没有找到 {university} 的排名信息。")

        cont = input("是否继续查询？(y/n): ")
        if cont.strip().lower() != 'y':
            break

if __name__ == "__main__":
    # Scrape and print each year in turn, then plot and start the
    # interactive query loop over everything collected.
    rankings_dict = {}
    for year in range(2015, 2019 + 1):
        rankings_dict[year] = fetch_rankings(year)
        print_rankings(rankings_dict[year], year)

    plot_rankings(rankings_dict)
    query_ranking(rankings_dict)

（二）豆瓣图书评论数据分析与可视化（写到实验报告中）

【源代码程序】

import requests

from bs4 import BeautifulSoup

import jieba

from wordcloud import WordCloud

import matplotlib.pyplot as plt

# Scrape one page of short comments.
def fetch_comments(book_id, start=0, limit=20, status='P', sort_by='time'):
    """Fetch one page of short comments for a Douban book.

    Args:
        book_id: Subject ID inserted into the URL path.
        start: Value of the ``start`` query parameter.
        limit: Value of the ``limit`` query parameter.
        status: Value of the ``status`` query parameter.
        sort_by: Value of the ``sort`` query parameter.

    Returns:
        A list of dicts with keys ``username``, ``content``, ``date``,
        ``rating`` and ``votes`` (``votes`` is an ``int``).  An empty list is
        returned when the request fails or the status code is not 200.
    """
    url = f"https://book.douban.com/subject/{book_id}/comments/?start={start}&limit={limit}&status={status}&sort={sort_by}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }

    try:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve comments for book {book_id}. Error: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve comments for book {book_id}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    comment_data = []
    for comment in soup.find_all("div", class_="comment"):
        info = comment.find("span", class_="comment-info")
        user_link = info.find("a") if info else None
        short = comment.find("span", class_="short")
        if user_link is None or short is None:
            # Not a complete comment entry; skip rather than crash on None.
            continue

        time_tag = comment.find("a", class_="comment-time")
        date = time_tag.text.strip() if time_tag else ""

        # The star level is the digit after "allstar" (e.g. "allstar40" ->
        # "4").  Search all classes for it instead of assuming it is the
        # first one, which yields garbage when another class precedes it.
        rating_tag = comment.find("span", class_="rating")
        rating = "无评分"
        if rating_tag:
            rating = next(
                (cls[7:8] for cls in rating_tag.get("class", []) if cls.startswith("allstar")),
                "无评分",
            )

        vote_tag = comment.find("span", class_="vote-count")
        vote_text = vote_tag.text.strip() if vote_tag else ""
        # Fall back to 0 when the count is missing or not a plain number.
        votes = int(vote_text) if vote_text.isdecimal() else 0

        comment_data.append({
            "username": user_link.text,
            "content": short.text,
            "date": date,
            "rating": rating,
            "votes": votes,
        })

    return comment_data

# Scrape several consecutive pages.
def fetch_multiple_pages(book_id, start=0, limit=20, status='P',sort_by='score', num_pages=3):
    """Fetch up to *num_pages* consecutive comment pages and concatenate them.

    Args:
        book_id: Passed through to ``fetch_comments``.
        start: Offset of the first page.
        limit: Page size; page ``k`` starts at ``start + k * limit``.
        status: Passed through to ``fetch_comments``.
        sort_by: Passed through to ``fetch_comments``.
        num_pages: Maximum number of pages to request.

    Returns:
        A single list with the comments of all fetched pages, in page order.
    """
    all_comments = []
    for page in range(num_pages):
        start_page = start + page * limit
        comments = fetch_comments(book_id, start=start_page, limit=limit, status=status, sort_by=sort_by)
        if not comments:
            # An empty page means there is nothing further to read; stop
            # instead of issuing more requests.
            break
        all_comments.extend(comments)
    return all_comments

# Print the first N comments.
def print_top_comments(comments, top_n=10):
    """Print the first *top_n* entries of *comments*, numbered from 1.

    Args:
        comments: Sequence of dicts with keys ``username``, ``date``,
            ``rating``, ``votes`` and ``content``.
        top_n: Maximum number of comments to print.
    """
    for i, comment in enumerate(comments[:top_n], 1):
        print(
            f"{i}. 用户名: {comment['username']}, 评论时间: {comment['date']}, 评分: {comment['rating']}, 点赞数: {comment['votes']}")
        print(f" 短评: {comment['content']}")

# Print the N most-voted comments.
def print_top_comments_by_votes(comments, top_n=10):
    """Print the *top_n* comments with the highest ``votes`` value.

    Args:
        comments: Iterable of comment dicts; each must have a ``votes`` key.
        top_n: Maximum number of comments to print.

    The input is not modified; a sorted copy (descending by votes) is passed
    to ``print_top_comments``.
    """
    sorted_comments = sorted(comments, key=lambda x: x['votes'], reverse=True)
    print_top_comments(sorted_comments, top_n)

# Text analysis and word-cloud generation.
def generate_wordcloud(comments, font_path='msyh.ttc'):
    """Show a word cloud of the comment texts and print the ten top words.

    Args:
        comments: Iterable of dicts with a ``content`` key.
        font_path: Font file handed to ``WordCloud``.  Defaults to the
            original hard-coded ``'msyh.ttc'``; pass another font on systems
            where that file is not available.

    Prints a message and returns early (no cloud, no statistics) when the
    segmented text is empty.  Only words longer than one character are
    counted in the frequency statistics.
    """
    from collections import Counter

    text = " ".join(comment['content'] for comment in comments)

    # Segment once and reuse the tokens for both the cloud and the counts.
    tokens = jieba.lcut(text)
    words = " ".join(tokens)
    if not words.strip():
        print("没有足够的评论内容生成词云。")
        return

    wordcloud = WordCloud(font_path=font_path, width=800, height=400, background_color='white').generate(words)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    # Word-frequency statistics.
    word_freq = Counter(word for word in tokens if len(word) > 1)

    print("前10位词频统计结果：")
    for word, freq in word_freq.most_common(10):
        print(f"{word}: {freq}")

def total(book_id, sort_by):
    """Scrape, print and analyse the comments of one book for one sort order.

    Args:
        book_id: Douban subject ID (string).
        sort_by: ``'time'`` for newest-first; any other value is reported as
            the "hot" ordering.

    For each reading status (P, N, F) three pages are fetched and the first
    ten comments printed.  All comments are then merged, the ten most-voted
    are printed, and the word cloud / word-frequency analysis is run.
    """
    start = 0  # starting offset
    limit = 20  # page size
    # Reading status: P = read, N = reading, F = want to read.
    status_labels = {'P': '读过', 'N': '在读', 'F': '想读'}

    # Fetch and store the comments per status.
    all_comments = {
        status: fetch_multiple_pages(book_id, start=start, limit=limit, status=status, sort_by=sort_by, num_pages=3)
        for status in status_labels
    }

    # NOTE(review): the original compared against '36781566', which matches
    # neither ID passed by the entry script ("36701566", "36721763"), so the
    # first title could never be printed.  Assumed to be a typo for
    # '36701566' -- confirm the correct Douban subject ID.
    if book_id == '36701566':
        print("《叙事本能》")
    else:
        print("《暗处的女儿》")

    if sort_by == 'time':
        print("最新排序前10位短评信息：")
    else:
        print("热门排序前10位短评信息：")

    # Print the first ten comments for each status.
    for status, comments in all_comments.items():
        print(f"读书状态:{status_labels[status]}")
        print_top_comments(comments)
        print()

    # Flatten with a comprehension; sum(lists, []) re-copies the accumulator
    # on every step.
    all_comments_merged = [
        comment for comments in all_comments.values() for comment in comments
    ]

    # The header was previously printed without the listing it announces.
    print("\n按点赞数排序前10位短评信息：")
    print_top_comments_by_votes(all_comments_merged)

    # Text analysis and word-cloud generation.
    generate_wordcloud(all_comments_merged)

if __name__ == "__main__":
    books_id = ["36701566","36721763"]
    sorts = ['time','score']  # 'time' for newest, 'score' for hot

    # Scrape and report every book under every sort order.
    for book_id in books_id:
        for sort in sorts:
            total(book_id,sort)

    # NOTE(review): this header is printed after all work is done and
    # nothing follows it; total() already prints its own headers.  It looks
    # like a leftover -- confirm whether it can be removed.
    print("\n热门排序前10位短评信息：")

（三）函数图形1绘制（写到实验报告中）

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

# Sample [0, 10) with a step of 0.0001.
x = np.arange(0, 10, 0.0001)

# The three curves: x squared, cos(2x), and their product.
y1 = x ** 2
y2 = np.cos(x * 2)
y3 = y1 * y2

# Draw all three on the same axes, each with its own line style.
for series, dash in ((y1, '-.'), (y2, ':'), (y3, '--')):
    plt.plot(x, series, linestyle=dash)

# Save before show(): the file is written from the current figure.
plt.savefig("3-1.png")
plt.show()

import matplotlib.pyplot as plt

import numpy as np

# 2x2 grid of axes; x, y1, y2 and y3 come from the preceding script section.
fig, subs = plt.subplots(2, 2)

# Fill the first three cells in row-major order; the fourth stays empty.
for axes, series in zip(subs.flat, (y1, y2, y3)):
    axes.plot(x, series)

plt.savefig("3-2.png")
plt.show()

（四）函数图形2绘制（写到实验报告中）

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

# Sample [-2, 2) with a step of 0.0001.
x = np.arange(-2, 2, 0.0001)
x_squared = x ** 2

# Upper and lower boundary curves of the shape saved as heart.png.
y1 = np.sqrt(2 * np.sqrt(x_squared) - x_squared)
y2 = (-2.14) * np.sqrt(np.sqrt(2) - np.sqrt(np.abs(x)))

# Outline both curves in red, then fill the region between them in pink.
plt.plot(x, y1, 'r')
plt.plot(x, y2, 'r')
plt.fill_between(x, y1, y2, facecolor='pink')

plt.savefig("heart.png")
plt.show()

posted @ 2024-05-26 23:37 不会JAVA的小袁 阅读(24) 评论(0) 编辑 收藏 举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· Python可视化训练

· 上机实验：数据准备与模型评估

· 2023年5月5日(软件工程日报)

· 日报 python

· Python作业6--中国大学排名数据分析与可视化、豆瓣图书评论数据分析与可视化

阅读排行：
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了
· 记一次.NET内存居高不下排查解决与启示

历史上的今天：
2023-05-26 回文数
2023-05-26 自守数
2023-05-26 亲密数
2023-05-26 完数
2023-05-26 马克思手稿中的数学题

公告

昵称：不会JAVA的小袁
园龄： 1年10个月
粉丝： 6
关注： 3

+加关注

2025年3月

日

一

二

三

四

五

六

yuanxinglan

Python数据处理训练

公告

搜索

常用链接

我的标签

随笔分类

随笔档案

阅读排行榜

评论排行榜

推荐排行榜

最新评论