python04

Python数据处理训练

 

班级: 2205-2         学号: 20224082        姓名:艾鑫

实验目的

l 使学生熟练安装扩展库numpy、requests、bs4、pandas、seaborn、matplotlib等;

使学生熟悉使用标准库csv操作文件;

l 使学生熟悉使用pandas进行数据分析的基本操作

l 使学生了解使用seaborn绘制热力图的方法;

使学生熟练使用matplotlib进行数据可视化

使学生熟练使用numpy进行科学计算

使学生熟练运用requests库和bs4库进行基本的数据爬取

实验环境及实验准备

l 所需硬件环境为微机;

所需软件环境为Python 3.X等;

掌握Python、numpy、requests、bs4、pandas、seaborn、matplotlib、csv等的使用;

实验内容

(一)、中国大学排名数据分析与可视化(写到实验报告中)

【源代码程序】

 import requests

from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

 

# URL 模板,按年份爬取数据

URL_TEMPLATE = "https://www.shanghairanking.cn/rankings/bcur/{}"

 

 

# 爬取数据函数

def fetch_rankings(year):
    """Fetch the top-10 entries of the Shanghai Ranking BCUR table for one year.

    Args:
        year: ranking year, interpolated into URL_TEMPLATE.

    Returns:
        A list of (rank, university, score) string tuples; empty list on any
        network failure, non-200 response, or missing table.
    """
    url = URL_TEMPLATE.format(year)
    try:
        # A timeout is mandatory: requests.get without one can block forever
        # if the server stops responding.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for year {year}: {exc}")
        return []

    # 检查响应状态码 -> bail out on any non-OK status
    if response.status_code != 200:
        print(f"Failed to retrieve data for year {year}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", {"class": "rk-table"})

    # The page layout may change; fail soft instead of raising.
    if not table:
        print(f"Failed to find the ranking table for year {year}.")
        return []

    rows = table.find_all("tr")[1:11]  # skip the header row, keep the top 10

    rankings = []
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 3:
            # Malformed/decorative row: skip it rather than IndexError.
            continue
        rank = cols[0].text.strip()
        university = cols[1].text.strip()
        score = cols[2].text.strip()
        rankings.append((rank, university, score))

    return rankings

 

 

# 打印排名信息

def print_rankings(rankings, year):
    """Print one year's ranking list as an aligned three-column table.

    Prints a "no data" notice and returns early when the list is empty.
    """
    if not rankings:
        print(f"No data available for year {year}.")
        return

    header_lines = [
        f"\n{year} 年前 10 名大学排名:",
        f"{'排名':<5} {'大学':<20} {'得分':<10}",
        "-" * 40,
    ]
    for header in header_lines:
        print(header)

    for entry in rankings:
        rank, university, score = entry
        print(f"{rank:<5} {university:<20} {score:<10}")

 

 

# 可视化函数

def plot_rankings(rankings_dict):
    """Plot each university's rank trajectory across all years in rankings_dict.

    rankings_dict maps year -> list of (rank, university, score) tuples.
    Missing years for a university are plotted as gaps (None).
    """
    # Font configuration so CJK labels render and the minus sign is not garbled.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    years = list(rankings_dict.keys())

    # Collect every university that appears in any year's list.
    all_unis = set()
    for yr in years:
        for _, uni, _ in rankings_dict[yr]:
            all_unis.add(uni)

    plt.figure(figsize=(10, 6))

    for uni in all_unis:
        trajectory = []
        for yr in years:
            found = None
            for rank, name, _ in rankings_dict[yr]:
                if name == uni:
                    found = int(rank)
                    break
            trajectory.append(found)
        # Label only curves whose final-year rank is inside the top 10,
        # keeping the legend readable.
        labelled = trajectory[-1] and trajectory[-1] <= 10
        plt.plot(years, trajectory, marker='o', label=uni if labelled else "")

    plt.gca().invert_yaxis()  # rank 1 belongs at the top of the chart
    plt.xticks(years)
    plt.xlabel('年份')
    plt.ylabel('排名')
    plt.title('2015-2019年前10大学排名变化')
    plt.legend()
    plt.show()

 

 

# 查询排名信息

def query_ranking(rankings_dict):
    """Interactive lookup loop: ask for a university and year, print its rank.

    Repeats until the user answers anything other than '1' at the prompt.
    """
    while True:
        university = input("请输入大学名称:")
        year = input("请输入年份(2015-2019):")

        # Reject non-numeric input or years we did not scrape.
        if not year.isdigit() or int(year) not in rankings_dict:
            print("年份输入有误,请重新输入。")
            continue

        year = int(year)
        rank_info = None
        for rank, uni, _ in rankings_dict[year]:
            if uni == university:
                rank_info = rank
                break

        if rank_info:
            print(f"{year} 年 {university} 排名:{rank_info}")
        else:
            print(f"{year} 年没有找到 {university} 的排名信息。")

        if input("是否继续查询?(1/0): ").lower() != '1':
            break

 

 

if __name__ == "__main__":
    # Scrape 2015-2019, printing each year's table as it arrives,
    # then show the trend chart and enter the interactive query loop.
    rankings_dict = {}

    for year in range(2015, 2020):
        yearly = fetch_rankings(year)
        rankings_dict[year] = yearly
        print_rankings(yearly, year)

    plot_rankings(rankings_dict)

    query_ranking(rankings_dict)

运行测试

 

 

(二)、豆瓣图书评论数据分析与可视化(写到实验报告中)

【源代码程序】

import re

from collections import Counter

import requests

# from lxml import etree

from lxml import etree

import pandas as pd

import jieba

import matplotlib.pyplot as plt

from wordcloud import WordCloud

# HTTP request headers: a real desktop-browser User-Agent so douban.com
# serves the page instead of rejecting the scraper.
headers = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39"
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36"
}
# Module-level accumulators filled by getComments():
# comments holds [name, content, time, score, upvotes] rows,
# words holds cleaned tokens for the word cloud.
comments = []
words = []

def regex_change(line):
    """Strip noise from one token/line of review text.

    Removes username prefixes ("123::"), URLs, weekday markers, digit runs,
    whitespace and a set of common punctuation/stop characters.

    Args:
        line: raw string to clean.

    Returns:
        The cleaned string; may be empty.
    """
    # Leading "12345::" style username prefixes.
    username_regex = re.compile(r"^\d+::")
    # URLs; [a-zA-Z0-9] instead of \w so Chinese characters are not swallowed.
    url_regex = re.compile(r"""
        (https?://)?
        ([a-zA-Z0-9]+)
        (\.[a-zA-Z0-9]+)
        (\.[a-zA-Z0-9]+)*
        (/[a-zA-Z0-9]+)*
        """, re.VERBOSE | re.IGNORECASE)
    # Weekday markers. NOTE: the original pattern contained empty alternation
    # branches ("| | |"), which made the regex match the empty string at every
    # position, so re.sub never actually removed the weekday names. The empty
    # branches are dropped here so the intended removal happens.
    data_regex = re.compile(r"""
        (周一) |
        (周二) |
        (周三) |
        (周四) |
        (周五) |
        (周六)
        """, re.VERBOSE)
    # Digit runs not directly preceded by an ASCII letter.
    decimal_regex = re.compile(r"[^a-zA-Z]\d+")
    # Whitespace runs.
    space_regex = re.compile(r"\s+")
    # Newline, CJK quotes/punctuation and frequent function words (的/了/是);
    # add any further characters to strip inside the brackets.
    stop_chars = "[\n”“|,,;;''/?! 。的了是]"

    # Apply the filters in the original order; order matters because the
    # stop-character pass also deletes spaces before the URL pass runs.
    line = re.sub(stop_chars, "", line)
    line = username_regex.sub("", line)
    line = url_regex.sub("", line)
    line = data_regex.sub("", line)
    line = decimal_regex.sub("", line)
    line = space_regex.sub("", line)
    return line

def getComments(url):
    """Scrape one douban comments page: append rows to the module-level
    `comments` list and cleaned tokens to the module-level `words` list.

    NOTE(review): the pasted source lost all indentation; the nesting below is
    a reconstruction from the statement order — confirm against the original.
    """
    score = 0
    resp = requests.get(url, headers=headers).text
    html = etree.HTML(resp)
    comment_list = html.xpath(".//div[@class='comment']")
    for comment in comment_list:
        status = ""
        name = comment.xpath(".//span[@class='comment-info']/a/text()")[0]  # username
        content = comment.xpath(".//p[@class='comment-content']/span[@class='short']/text()")[0]  # short-review text
        content = str(content).strip()
        # Tokenize the review; cut_all=False is precise mode, HMM disabled.
        word = jieba.cut(content, cut_all=False, HMM=False)
        time = comment.xpath(".//span[@class='comment-info']/a/text()")[1]  # review timestamp
        mark = comment.xpath(".//span[@class='comment-info']/span/@title")  # star-rating label
        if len(mark) == 0:
            score = 0  # unrated review
        else:
            # `status` ends up as the LAST title attribute found.
            for i in mark:
                status = str(i)
            # Map the Chinese rating label to a 1-5 score.
            if status == "力荐":
                score = 5
            elif status == "推荐":
                score = 4
            elif status == "还行":
                score = 3
            elif status == "较差":
                score = 2
            elif status == "很差":
                score = 1
        good = comment.xpath(".//span[@class='comment-vote']/span[@class='vote-count']/text()")[0]  # upvote count
        comments.append([str(name), content, str(time), score, int(good)])
        # Keep only cleaned tokens of length >= 2 for the word cloud.
        for i in word:
            if len(regex_change(i)) >= 2:
                words.append(regex_change(i))

def getWordCloud(words):
    """Print the 10 most frequent tokens, then render and save a word cloud.

    Args:
        words: list of cleaned tokens (may contain duplicates).

    Side effects: prints to stdout, opens a matplotlib window, and writes
    'wordcloud.png' to the working directory.
    """
    word_counts = Counter(words)
    bow_words = sorted(word_counts.items(), key=lambda d: d[1], reverse=True)
    print("热词前10位:")
    # Slicing (instead of range(10) indexing) avoids an IndexError when
    # there are fewer than 10 distinct words.
    for pair in bow_words[:10]:
        print(pair)

    text = ' '.join(words)
    w = WordCloud(background_color='white',
                  width=1000,
                  height=700,
                  font_path='simhei.ttf',  # CJK-capable font, required for Chinese
                  margin=10).generate(text)
    # Draw the image BEFORE showing the figure; the original called plt.show()
    # first, which displayed an empty window and never showed the cloud.
    plt.imshow(w)
    plt.show()
    w.to_file('wordcloud.png')

print("请选择以下选项:")
print(" 1.热门评论")
print(" 2.最新评论")
info = int(input())
print("前10位短评信息:")
title = ['用户名', '短评内容', '评论时间', '评分', '点赞数']
# The original duplicated the whole pipeline in both branches; they differed
# only in the URL's sort parameter, so choose it once and share the rest.
if info in (1, 2):
    sort_key = "new_score" if info == 1 else "time"  # 1 = hottest, 2 = newest
    comments = []
    words = []
    # First 3 pages, 20 reviews per page.
    for start in range(0, 60, 20):
        url = ("https://book.douban.com/subject/10517238/comments/"
               "?start={}&limit=20&status=P&sort={}").format(start, sort_key)
        getComments(url)
    df = pd.DataFrame(comments, columns=title)
    print(df.head(10))
    print("点赞数前10位的短评信息:")
    df = df.sort_values(by='点赞数', ascending=False)
    print(df.head(10))
    getWordCloud(words)

 

 

运行测试

 

 

(三)、函数图形1绘制(写到实验报告中)

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

 

# Dense x grid so the curves render smoothly.
x = np.arange(0, 10, 0.0001)
y1 = x ** 2
y2 = np.cos(x * 2)
y3 = y1 * y2

# Draw the three curves with distinct dash styles, save, then display.
for curve, dash in ((y1, '-.'), (y2, ':'), (y3, '--')):
    plt.plot(x, curve, linestyle=dash)
plt.savefig("3-1.png")
plt.show()

 

 

 

import matplotlib.pyplot as plt

import numpy as np

# 2x2 subplot grid; the bottom-right panel intentionally stays empty.
# Reuses x, y1, y2, y3 defined by the previous script section.
fig, subs = plt.subplots(2, 2)
subs[0][0].plot(x, y1)
subs[0][1].plot(x, y2)
subs[1][0].plot(x, y3)
plt.savefig("3-2.png")
# The original line read "plt.show()python" — the stray pasted word made it
# a syntax error; removed.
plt.show()

 

 

运行测试

 

 

(四)、函数图形2绘制(写到实验报告中)

【源代码程序】

import matplotlib.pyplot as plt

import numpy as np

 

x = np.arange(-2, 2, 0.0001)
# Heart curve: y1 is the upper lobe sqrt(2|x| - x^2) (real for |x| <= 2),
# y2 is the lower cusp.
y1 = np.sqrt(2 * np.sqrt(x ** 2) - x ** 2)
y2 = (-2.14) * np.sqrt(np.sqrt(2) - np.sqrt(np.abs(x)))
plt.plot(x, y1, 'r', x, y2, 'r')
plt.fill_between(x, y1, y2, facecolor='red')
plt.savefig("heart.png")
# The original line read "plt.show()python" — the stray pasted word made it
# a syntax error; removed.
plt.show()

 

运行测试

 

 

 

 

 

 

 


Python数据处理训练

 

班级: 2205-2        学号:  20224082       姓名:艾鑫

实验自评

实验内容

自评结果(在对应格内打√)

不熟练

一般

比较熟练

熟练

Python下数据爬取及应用

 

 

 

√

Python下科学计算及数据分析

 

 

 

√

Python下可视化展示

 

 

 

√

实验体会

第一题:

爬取数据时,需要注意网站的反爬虫机制,合理设置请求头等信息。

数据清洗和整理是数据分析的重要步骤,需要确保数据的准确性和一致性。

可视化能够直观地展示数据的变化趋势,有助于发现数据中的规律和异常。

第二题:

跨页连续爬取时,需要注意处理分页逻辑和防止重复爬取。

文本分析能够揭示用户对于图书的情感倾向和关注点。

词云图形能够直观地展示文本中的关键词汇,有助于快速了解文本的主要内容。

第三题:

绘制函数图形时,需要注意选择合适的x值范围和步长,以确保图形的平滑性和准确性。

分段函数的处理需要仔细定义每个区间的函数表达式,并确保在区间交界处函数的连续性。

填充图形的绘制能够更直观地展示函数曲线所包围的区域。

 

 

 

 

 

posted @ 2024-05-26 22:47  艾鑫4646  阅读(16)  评论(0编辑  收藏  举报