python04

Python数据处理训练

班级: 信2205-2班 学号: 20224082 姓名: 艾鑫

一实验目的

• 使学生熟练安装扩展库numpy、requests、bs4、pandas、seaborn、matplotlib等；

• 使学生熟悉使用标准库csv操作文件；

• 使学生熟悉使用pandas进行数据分析的基本操作；

• 使学生了解使用seaborn绘制热力图的方法；

• 使学生熟练使用matplotlib进行数据可视化；

• 使学生熟练使用numpy进行科学计算；

• 使学生熟练运用requests库和bs4库进行基本的数据爬取。

二实验环境及实验准备

• 所需硬件环境为微机；

• 所需软件环境为Python 3.X等；

• 掌握Python下numpy、requests、bs4、pandas、seaborn、matplotlib、csv等的使用。

三实验内容

（一）、中国大学排名数据分析与可视化；（写到实验报告中）

【源代码程序】

import requests

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

# URL template for the ranking pages; the "{}" placeholder is filled in with
# the year being crawled.

URL_TEMPLATE = "https://www.shanghairanking.cn/rankings/bcur/{}"

def fetch_rankings(year):
    """Download one year's ranking page and parse its first ten table rows.

    Args:
        year: Value substituted into ``URL_TEMPLATE`` to build the page URL.

    Returns:
        A list of ``(rank, university, score)`` string tuples taken from the
        first three cells of each row. An empty list is returned (after
        printing a message) when the request fails, the status code is not
        200, or no ``<table class="rk-table">`` is present in the page.
    """
    url = URL_TEMPLATE.format(year)
    try:
        # A timeout keeps the script from hanging forever on a stalled server.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for year {year}. Error: {exc}")
        return []

    # Anything other than 200 is treated as "no data for this year".
    if response.status_code != 200:
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"class": "rk-table"})
    if not table:
        print(f"Failed to find the ranking table for year {year}.")
        return []

    rankings = []
    # Skip the first <tr> (presumably the header row) and keep the next ten.
    for row in table.find_all("tr")[1:11]:
        cols = row.find_all("td")
        if len(cols) < 3:
            # A row without three data cells cannot yield a complete record.
            continue
        rank, university, score = (cell.text.strip() for cell in cols[:3])
        rankings.append((rank, university, score))
    return rankings

def print_rankings(rankings, year):
    """Print one year's ranking rows as a fixed-width text table.

    Args:
        rankings: Iterable of ``(rank, university, score)`` tuples.
        year: Year shown in the heading (or in the "no data" message).

    When ``rankings`` is empty only a "no data" line is printed.
    """
    if rankings:
        # Heading, column titles and a separator line, then one line per row.
        print(f"\n{year} 年前 10 名大学排名：")
        print(f"{'排名':<5} {'大学':<20} {'得分':<10}")
        print("-" * 40)
        for entry in rankings:
            rank, university, score = entry
            print(f"{rank:<5} {university:<20} {score:<10}")
    else:
        print(f"No data available for year {year}.")

def plot_rankings(rankings_dict):
    """Draw a line chart of every university's rank across the given years.

    Args:
        rankings_dict: Mapping of year -> list of ``(rank, university, score)``
            tuples. ``rank`` must be convertible with ``int()``.

    A university missing from a year's list gets ``None`` for that year.
    Only universities whose last-year rank is within the top 10 receive a
    legend label. If no year contains any row, a message is printed and no
    figure is created.
    """
    years = list(rankings_dict)

    # Sorted so that line colours and legend order are identical on every run;
    # iterating a bare set of strings yields a run-dependent order.
    universities = sorted(
        {university for year in years for _, university, _ in rankings_dict[year]}
    )
    if not universities:
        print("No ranking data available to plot.")
        return

    # SimHei is configured as the sans-serif font for the Chinese labels.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Keeps the minus sign rendering correctly with that font setting.
    plt.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(10, 6))
    for university in universities:
        ranks = [
            next(
                (int(rank) for rank, uni, _ in rankings_dict[year] if uni == university),
                None,
            )
            for year in years
        ]
        # An empty label keeps universities outside the final top 10 out of
        # the legend.
        label = university if ranks[-1] and ranks[-1] <= 10 else ""
        plt.plot(years, ranks, marker='o', label=label)

    # Rank 1 is the best, so it should appear at the top of the chart.
    plt.gca().invert_yaxis()
    plt.xticks(years)
    plt.xlabel('年份')
    plt.ylabel('排名')
    plt.title('2015-2019年前10大学排名变化')
    plt.legend()
    plt.show()

def query_ranking(rankings_dict):
    """Interactively look up a university's rank for a chosen year.

    Args:
        rankings_dict: Mapping of integer year -> list of
            ``(rank, university, score)`` tuples.

    The loop prompts for a university name and a year, prints the stored rank
    (or a "not found" message), then asks whether to continue. Any answer
    other than ``'1'`` ends the loop. An invalid year restarts the prompts.
    """
    while True:
        university = input("请输入大学名称：")
        year = input("请输入年份（2015-2019）：")

        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. "²", which would make int() raise ValueError.
        if not year.isdecimal() or int(year) not in rankings_dict:
            print("年份输入有误，请重新输入。")
            continue

        year = int(year)
        # The university name must match the stored name exactly.
        rank_info = next(
            (rank for rank, uni, _ in rankings_dict[year] if uni == university),
            None,
        )
        if rank_info:
            print(f"{year} 年 {university} 排名：{rank_info}")
        else:
            print(f"{year} 年没有找到 {university} 的排名信息。")

        cont = input("是否继续查询？(1/0): ")
        if cont != '1':
            break

if __name__ == "__main__":
    # Crawl and print each year from 2015 through 2019, then plot the
    # collected data and start the interactive query loop.
    rankings_dict = {}
    for year in range(2015, 2020):
        yearly_rows = fetch_rankings(year)
        rankings_dict[year] = yearly_rows
        print_rankings(yearly_rows, year)

    plot_rankings(rankings_dict)
    query_ranking(rankings_dict)

【运行测试】

（二）、豆瓣图书评论数据分析与可视化；（写到实验报告中）

【源代码程序】

import re

from collections import Counter

import requests

# from lxml import etree

from lxml import etree

import pandas as pd

import jieba

import matplotlib.pyplot as plt

from wordcloud import WordCloud

# HTTP headers passed to requests.get() in getComments; carries a
# browser-like User-Agent string.
headers = {

# Alternative (Windows / Edge) User-Agent, currently disabled:
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"

"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36"

}

# Rows appended by getComments: [user name, comment text, time, score, votes].
comments = []

# Cleaned tokens (length >= 2) appended by getComments.
words = []

# The patterns below are compiled once at import time instead of on every
# call, because regex_change is invoked for each token.

# Leading "<digits>::" prefix.
_USERNAME_RE = re.compile(r"^\d+::")

# URL-like text. [a-zA-Z0-9] is used instead of \w so that Chinese characters
# are not swallowed.
_URL_RE = re.compile(r"""
    (https?://)?
    ([a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)*
    (/[a-zA-Z0-9]+)*
""", re.VERBOSE | re.IGNORECASE)

# Date words: year / month / day markers and Monday through Saturday.
_DATE_RE = re.compile(r"""
    年 |
    月 |
    日 |
    (周一) |
    (周二) |
    (周三) |
    (周四) |
    (周五) |
    (周六)
""", re.VERBOSE)

# A non-letter character followed by one or more digits.
# NOTE(review): this also deletes the character in front of the digits
# (e.g. a Chinese character) and leaves a lone single digit untouched.
# Kept as-is to preserve existing output; confirm whether that is intended.
_DECIMAL_RE = re.compile(r"[^a-zA-Z]\d+")

# Any run of whitespace.
_SPACE_RE = re.compile(r"\s+")

# Single characters removed outright: newline, quotes, punctuation, space and
# the characters 的 / 了 / 是.
_NOISE_RE = re.compile("[\n”“|,，；;''/?! 。的了是]")


def regex_change(line):
    """Strip noise from a piece of text and return what is left.

    The substitutions run in a fixed order: noise characters, a leading
    ``<digits>::`` prefix, URL-like text, date words, digit runs, and finally
    whitespace. Each match is replaced with the empty string.

    Args:
        line: The text to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    for pattern in (
        _NOISE_RE,
        _USERNAME_RE,
        _URL_RE,
        _DATE_RE,
        _DECIMAL_RE,
        _SPACE_RE,
    ):
        line = pattern.sub("", line)
    return line

# Rating title shown on a comment -> numeric score.
_RATING_SCORES = {"力荐": 5, "推荐": 4, "还行": 3, "较差": 2, "很差": 1}


def getComments(url):
    """Fetch one page of short comments and record them in the module lists.

    For every ``<div class="comment">`` found in the page, one row
    ``[user name, comment text, time, score, vote count]`` is appended to the
    module-level ``comments`` list. The comment text is segmented with jieba,
    each token is cleaned with ``regex_change``, and cleaned tokens of length
    two or more are appended to the module-level ``words`` list.

    Args:
        url: Address of the comment page to download.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    resp = requests.get(url, headers=headers, timeout=10).text
    html = etree.HTML(resp)

    for comment in html.xpath(".//div[@class='comment']"):
        # The code reads the first <a> text as the user name and the second
        # as the comment time.
        info_links = comment.xpath(".//span[@class='comment-info']/a/text()")
        name = info_links[0]
        comment_time = info_links[1]

        content = comment.xpath(
            ".//p[@class='comment-content']/span[@class='short']/text()"
        )[0]
        content = str(content).strip()

        # Reset for each comment so that an unrecognised title cannot inherit
        # the previous comment's score; 0 means "no rating".
        score = 0
        for mark in comment.xpath(".//span[@class='comment-info']/span/@title"):
            score = _RATING_SCORES.get(str(mark), score)

        # Vote count ("useful" count) shown next to the comment.
        good = comment.xpath(
            ".//span[@class='comment-vote']/span[@class='vote-count']/text()"
        )[0]

        comments.append([str(name), content, str(comment_time), score, int(good)])

        for token in jieba.cut(content, cut_all=False, HMM=False):
            # Clean once and reuse the result for both the check and the append.
            cleaned = regex_change(token)
            if len(cleaned) >= 2:
                words.append(cleaned)

def getWordCloud(words):
    """Print the most frequent words, then render, save and show a word cloud.

    Args:
        words: List of word strings. Frequencies are counted over this list
            and the cloud is generated from the words joined by spaces.

    The image is written to ``wordcloud.png`` and displayed with matplotlib.
    When ``words`` is empty a message is printed and nothing else happens.
    """
    if not words:
        # Nothing to count or draw.
        print("没有可用于生成词云的词语。")
        return

    print("热词前10位：")
    # most_common(10) simply returns fewer pairs when there are fewer than ten
    # distinct words, so short inputs no longer raise IndexError.
    for pair in Counter(words).most_common(10):
        print(pair)

    text = ' '.join(words)
    w = WordCloud(
        background_color='white',
        width=1000,
        height=700,
        font_path='simhei.ttf',
        margin=10,
    ).generate(text)
    w.to_file('wordcloud.png')

    # Draw the image before calling show(); the previous order displayed the
    # figure before anything had been drawn on it.
    plt.imshow(w)
    plt.show()

# Interactive entry point: choose hot or latest comments, crawl the first
# three pages, print the results and build the word cloud.
print("请选择以下选项:")
print(" 1.热门评论")
print(" 2.最新评论")

# Menu choice -> value of the "sort" query parameter in the comments URL.
_SORT_BY_CHOICE = {1: "new_score", 2: "time"}

try:
    info = int(input())
except ValueError:
    # Non-numeric input is handled like any other unknown option.
    info = None

title = ['用户名', '短评内容', '评论时间', '评分', '点赞数']

if info in _SORT_BY_CHOICE:
    print("前10位短评信息：")
    # getComments appends to these module-level lists, so start from empty.
    comments = []
    words = []
    # start=0, 20, 40 -> the first three pages of 20 comments each.
    for i in range(0, 60, 20):
        url = "https://book.douban.com/subject/10517238/comments/?start={}&limit=20&status=P&sort={}".format(
            i, _SORT_BY_CHOICE[info])
        getComments(url)

    df = pd.DataFrame(comments, columns=title)
    print(df.head(10))

    print("点赞数前10位的短评信息：")
    df = df.sort_values(by='点赞数', ascending=False)
    print(df.head(10))

    getWordCloud(words)
else:
    print("无效的选项，请输入 1 或 2。")

【运行测试】

（三）、函数图形1绘制；（写到实验报告中）

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

# Sample the interval [0, 10) with a step of 0.0001.
x = np.arange(0, 10, 0.0001)

# The three curves: x^2, cos(2x) and their product.
y1 = x ** 2
y2 = np.cos(x * 2)
y3 = y1 * y2

# Draw the curves on one set of axes, each with its own line style.
for _curve, _line_style in ((y1, '-.'), (y2, ':'), (y3, '--')):
    plt.plot(x, _curve, linestyle=_line_style)

# Save before show().
plt.savefig("3-1.png")
plt.show()

import matplotlib.pyplot as plt

import numpy as np

# 2x2 grid of axes; the three curves go into three of the cells and the
# bottom-right cell is left empty.
fig, subs = plt.subplots(2, 2)
subs[0][0].plot(x, y1)
subs[0][1].plot(x, y2)
subs[1][0].plot(x, y3)

# Save before show().
plt.savefig("3-2.png")
plt.show()

【运行测试】

（四）、函数图形2绘制；（写到实验报告中）

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

# Sample the interval [-2, 2) with a step of 0.0001.
x = np.arange(-2, 2, 0.0001)

# Upper outline: sqrt(2|x| - x^2), with |x| written as sqrt(x^2).
y1 = np.sqrt(2 * np.sqrt(x ** 2) - x ** 2)
# Lower outline: -2.14 * sqrt(sqrt(2) - sqrt(|x|)).
y2 = (-2.14) * np.sqrt(np.sqrt(2) - np.sqrt(np.abs(x)))

# Draw both outlines in red and fill the area between them.
plt.plot(x, y1, 'r', x, y2, 'r')
plt.fill_between(x, y1, y2, facecolor='red')

# Save before show().
plt.savefig("heart.png")
plt.show()

【运行测试】

Python数据处理训练

班级: 信2205-2班 学号: 20224082 姓名: 艾鑫

实验自评

实验内容	自评结果（在对应格内打√）
实验内容	不熟练	一般	比较熟练	熟练
Python下数据爬取及应用				√
Python下科学计算及数据分析				√
Python下可视化展示				√

实验体会

第一题：

爬取数据时，需要注意网站的反爬虫机制，合理设置请求头等信息。

数据清洗和整理是数据分析的重要步骤，需要确保数据的准确性和一致性。

可视化能够直观地展示数据的变化趋势，有助于发现数据中的规律和异常。

第二题：

跨页连续爬取时，需要注意处理分页逻辑和防止重复爬取。

文本分析能够揭示用户对于图书的情感倾向和关注点。

词云图形能够直观地展示文本中的关键词汇，有助于快速了解文本的主要内容。

第三题:

绘制函数图形时，需要注意选择合适的x值范围和步长，以确保图形的平滑性和准确性。

分段函数的处理需要仔细定义每个区间的函数表达式，并确保在区间交界处函数的连续性。

填充图形的绘制能够更直观地展示函数曲线所包围的区域。

posted @ 2024-05-26 22:47 艾鑫4646 阅读(15) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

aixin52129211

python04

公告